diff --git a/.gitattributes b/.gitattributes
index 8ae3c80128..1215d42fca 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1 @@
-cunumeric/_version.py export-subst
+cupynumeric/_version.py export-subst
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5ac9b710d8..b310312985 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,4 @@
# Code Ownership
-.github @marcinz @m3vaz @sandeepd-nv @mag1cp1n
-continuous_integration @marcinz @m3vaz @sandeepd-nv @mag1cp1n
-conda @marcinz @m3vaz @sandeepd-nv @mag1cp1n
+.github @nv-legate/devops-reviewers
+continuous_integration @nv-legate/devops-reviewers
+conda @nv-legate/devops-reviewers
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 39f252254f..74fb1d45b1 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -8,7 +8,7 @@ body:
value: "# Bug report"
- type: markdown
attributes:
- value: Thank you for reporting a bug and helping us improve Cunumeric!
+ value: Thank you for reporting a bug and helping us improve cuPyNumeric!
- type: markdown
attributes:
value: >
@@ -29,7 +29,7 @@ body:
Platform : Linux-6.8.0-40-generic-x86_64-with-glibc2.35
Legion : (failed to detect)
Legate : 24.05.00+255.g2656afbd
- Cunumeric : 24.05.00+132.gc4741d57
+        cuPyNumeric  : 24.05.00+132.gc4741d57
Numpy : 1.26.4
Scipy : 1.13.1
Numba : (failed to detect)
diff --git a/.github/workflows/ci-gh-docs.yml b/.github/workflows/ci-gh-docs.yml
new file mode 100644
index 0000000000..349cf79e95
--- /dev/null
+++ b/.github/workflows/ci-gh-docs.yml
@@ -0,0 +1,46 @@
+---
+name: Docs
+
+concurrency:
+ group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-docs-on-{0}-from-{1}', github.event_name, github.ref_name) }}
+ cancel-in-progress: true
+
+on:
+ push:
+ branches:
+ - "pull-request/[0-9]+"
+ - "branch-*"
+ - "main"
+ merge_group:
+
+jobs:
+ build-and-test:
+ name: Build documentation (${{ matrix.platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}, ucx enabled)
+ strategy:
+ fail-fast: false
+ matrix:
+ platform:
+ - linux
+ target-device:
+ - gpu
+ build-mode:
+ - release
+ uses:
+ ./.github/workflows/gh-build-docs.yml
+ with:
+ platform: ${{ matrix.platform }}
+ target-device: ${{ matrix.target-device }}
+ build-mode: ${{ matrix.build-mode }}
+ build-type: ci
+ upload-docs-to-gh-pages: false
+ secrets: inherit
+
+ docs-pass:
+ if: always()
+ needs:
+ - build-and-test
+ runs-on: linux-amd64-cpu4
+ steps:
+ - name: Check job results
+ if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')
+ run: exit 1
diff --git a/.github/workflows/ci-gh-nightly-release.yml b/.github/workflows/ci-gh-nightly-release.yml
index 0b214d2c63..46b887687c 100644
--- a/.github/workflows/ci-gh-nightly-release.yml
+++ b/.github/workflows/ci-gh-nightly-release.yml
@@ -30,11 +30,42 @@ jobs:
uses:
./.github/workflows/gh-build-and-test.yml
with:
- build-type: release
- dependencies-workflow: ci-gh-nightly-release.yml
+ build-type: nightly
platform: ${{ matrix.platform }}
python-version: ${{ matrix.python-version }}
target-device: ${{ matrix.target-device }}
upload-enabled: ${{ matrix.upload-enabled }}
- waive-gpu-tests: ${{ github.workflow == 'Build Release package' && matrix.platform == 'linux-aarch64' }}
+ refname: ${{ github.ref_name }}
+ default-branch: ${{ github.event.repository.default_branch }}
+ secrets: inherit
+
+ build-nightly-docs:
+ name: Build Nightly documentation (${{ matrix.platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}, ucx enabled)
+ strategy:
+ fail-fast: false
+ matrix:
+ platform:
+ - linux
+ target-device:
+ - gpu
+ build-mode:
+ - release
+ uses:
+ ./.github/workflows/gh-build-docs.yml
+ with:
+ platform: ${{ matrix.platform }}
+ target-device: ${{ matrix.target-device }}
+ build-mode: ${{ matrix.build-mode }}
+ build-type: nightly
+ upload-docs-to-gh-pages: true
+ secrets: inherit
+
+ push_code:
+ name: Nightly source release
+ uses:
+ nv-legate/legate-gh-ci/.github/workflows/gh-push-code.yml@nightly_push_to_external_repo
+ with:
+ runs-on: linux-amd64-cpu4
+ source-repo: "${{ github.repository_owner }}/cupynumeric.internal"
+ dest-repo: "${{ github.repository_owner }}/cupynumeric"
secrets: inherit
diff --git a/.github/workflows/ci-gh-validate-legate-sha.yml b/.github/workflows/ci-gh-validate-legate-sha.yml
index d15982ca3c..9e2309a233 100644
--- a/.github/workflows/ci-gh-validate-legate-sha.yml
+++ b/.github/workflows/ci-gh-validate-legate-sha.yml
@@ -20,7 +20,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
with:
- path: cunumeric.internal
+ path: cupynumeric.internal
- name: Set up environment
run: |
@@ -30,7 +30,7 @@ jobs:
- name: Parse versions.json
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
- DEPENDENCIES_FILE="cunumeric.internal/cmake/versions.json"
+ DEPENDENCIES_FILE="cupynumeric.internal/cmake/versions.json"
GIT_REPO=$(jq -r '.packages.legate.repo' ${DEPENDENCIES_FILE})
GIT_ORG=$(jq -r '.packages.legate.org' ${DEPENDENCIES_FILE})
GIT_TAG=$(jq -r '.packages.legate.git_tag' ${DEPENDENCIES_FILE})
diff --git a/.github/workflows/ci-gh-release.yml b/.github/workflows/ci-gh.yml
similarity index 67%
rename from .github/workflows/ci-gh-release.yml
rename to .github/workflows/ci-gh.yml
index 654fad29ef..4bb50dd233 100644
--- a/.github/workflows/ci-gh-release.yml
+++ b/.github/workflows/ci-gh.yml
@@ -1,4 +1,4 @@
-name: Build Release package
+name: Build CI package
concurrency:
group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }}
@@ -33,10 +33,21 @@ jobs:
uses:
./.github/workflows/gh-build-and-test.yml
with:
- build-type: release
+ build-type: ci
platform: ${{ matrix.platform }}
- python-version: "3.10"
+ python-version: ${{ matrix.python-version }}
target-device: ${{ matrix.target-device }}
upload-enabled: ${{ matrix.upload-enabled }}
- waive-gpu-tests: ${{ github.workflow == 'Build Release package' && matrix.platform == 'linux-aarch64' }}
+ refname: ${{ github.ref_name }}
+ default-branch: ${{ github.event.repository.default_branch }}
secrets: inherit
+
+ tests-pass:
+ if: always()
+ needs:
+ - build-and-test
+ runs-on: linux-amd64-cpu4
+ steps:
+ - name: Check job results
+ if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')
+ run: exit 1
diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index b9890641a0..06bd77b2ff 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -13,18 +13,16 @@ on:
upload-enabled:
type: boolean
required: true
- waive-gpu-tests:
- required: true
- type: boolean
- description: Waive GPU tests based on specific configuration
python-version:
required: false
type: string
default: "3.12"
- dependencies-workflow:
- required: false
+ refname:
+ required: true
+ type: string
+ default-branch:
+ required: true
type: string
- default: ci-gh.yml
jobs:
setup-build:
@@ -51,14 +49,14 @@ jobs:
needs: setup-build
name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, Python ${{ inputs.python-version }})"
uses:
- nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.17
+ nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.29
with:
+ build-has-tests: ${{ !inputs.upload-enabled }}
build-mode: ""
build-type: ${{ inputs.build-type }}
client-repo: ${{ github.event.repository.name }}
- dependencies-file: "cmake/versions.json"
- dependencies-workflow: ${{ inputs.dependencies-workflow }}
- legate-gh-ci-tag: "v1.17"
+ dependencies-file: ""
+ legate-gh-ci-tag: "v1.29"
network: "ucx"
platform: ${{ inputs.platform }}
python-version: ${{ inputs.python-version }}
@@ -68,45 +66,29 @@ jobs:
use-container: ${{ inputs.platform == 'linux' || inputs.platform == 'linux-aarch64' }}
secrets: inherit
- nightly-exists:
- needs: setup-build
- name: "Check if legate.internal nightly exists for SHA specified in versions.json (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }})"
- uses:
- nv-legate/legate-gh-ci/.github/workflows/gh-check-if-nightly-exists-for-all-dependencies.yml@v1.17
- with:
- build-mode: ""
- build-type: ${{ inputs.build-type }}
- client-repo: ${{ github.event.repository.name }}
- dependencies-file: "cmake/versions.json"
- legate-gh-ci-tag: "v1.17"
- network: "ucx"
- platform: ${{ inputs.platform }}
- python-version: ${{ inputs.python-version }}
- runs-on: linux-amd64-cpu4
- target-device: ${{ inputs.target-device }}
- upload-enabled: ${{ inputs.upload-enabled }}
- secrets: inherit
-
upload:
needs: build
if: ${{ github.repository_owner == 'nv-legate' && contains(github.workflow, 'release') && inputs.upload-enabled == true }}
name: Upload package to Server
uses:
- nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.17
+ nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.29
with:
+ build-has-tests: ${{ !inputs.upload-enabled }}
build-mode: ""
build-type: ${{ inputs.build-type }}
client-repo: ${{ github.event.repository.name }}
- legate-gh-ci-tag: "v1.17"
+ legate-gh-ci-tag: "v1.29"
name: Upload package to Server
network: "ucx"
- pkgSubString: "cunumeric-"
+ pkgSubString: "cupynumeric-"
platform: ${{ inputs.platform }}
python-version: ${{ inputs.python-version }}
- repos-Root: "cunumeric"
+ repos-Root: "cupynumeric"
target-device: ${{ inputs.target-device }}
- upload-action: "upload-package"
+ upload-action: "upload-package-Anaconda"
upload-enabled: ${{ inputs.upload-enabled }}
+ refname: ${{ inputs.refname }}
+ default-branch: ${{ inputs.default-branch }}
secrets: inherit
setup-test:
@@ -121,20 +103,25 @@ jobs:
- id: set-matrix
run: |
set -xeuo pipefail
+
MATRIX_JSON='{"include": ['
+
RUNNERS=(
- 'linux-amd64-gpu-v100-latest-1:gpu:gpu:linux' 'linux-amd64-2gpu:gpu:2gpu:linux'
+ 'linux-amd64-gpu-l4-latest-1:gpu:gpu:linux' 'linux-amd64-2gpu:gpu:2gpu:linux'
'linux-amd64-cpu16:cpu:cpu:linux'
'linux-arm64-cpu16:cpu:cpu:linux-aarch64' 'linux-aarch64-2gpu:gpu:2gpu:linux-aarch64' 'linux-aarch64-2gpu:gpu:gpu:linux-aarch64'
'macos-latest:cpu:cpu:mac')
+
TEST_CONFIGS=(
'1 CPU test:test --cpus 1 --debug:cpu'
'1 CPU test:test --cpus 1 --debug:gpu'
'2 CPU test:test --cpus 2 --debug:cpu'
'2 CPU test:test --cpus 2 --debug:gpu'
- # set the number of workers manually because nvidia runners report 6 gpus when onyl one is really available
- # this workaround can be removed when the number of available gpus is reported correctly (when we run on VMs)
- 'GPU test:test --use cuda --gpus 1 -j 7 --debug:gpu'
+ # Set the number of workers manually because nvidia runners report 6
+ # gpus when only one is really available this workaround can be
+ # removed when the number of available gpus is reported correctly
+ # (when we run on VMs)
+ 'GPU test:test --use cuda --gpus 1 --debug:gpu'
'2 GPU test:test --use cuda --gpus 2 --debug:2gpu'
'OpenMP test:test --use openmp --omps 1 --ompthreads 2 --debug:gpu'
'OpenMP test:test --use openmp --omps 1 --ompthreads 2 --debug:cpu'
@@ -143,31 +130,49 @@ jobs:
'Eager execution test:test --use eager --debug:gpu'
'Eager execution test:test --use eager --debug:cpu'
'mypy:mypy:cpu'
- 'Documentation:docs:cpu'
'Unit tests:unit:cpu'
+ 'CPP tests:cpp:cpu'
+ # TODO: Uncomment the following lines once
+ # https://github.com/nv-legate/cupynumeric.internal/issues/654 has
+ # been fixed.
+ # 'CPP tests:cpp:gpu'
+ # 'CPP tests:cpp:2gpu'
)
+
for RUNNER in "${RUNNERS[@]}"; do
IFS=':' read -ra RUNNER_INFO <<< "$RUNNER"
RUNNER_NAME=${RUNNER_INFO[0]}
RUNNER_TYPE=${RUNNER_INFO[1]}
RUNNER_DEVICE=${RUNNER_INFO[2]}
RUNNER_PLATFORM=${RUNNER_INFO[3]}
+
if [[ "$RUNNER_TYPE" == "${{ inputs.target-device }}" && "$RUNNER_PLATFORM" == "${{ inputs.platform }}" ]]; then
+
for TEST_CONFIG in "${TEST_CONFIGS[@]}"; do
IFS=':' read -ra CONFIG_INFO <<< "$TEST_CONFIG"
TEST_NAME=${CONFIG_INFO[0]}
TEST_OPTIONS=${CONFIG_INFO[1]}
TEST_TARGET_DEVICE=${CONFIG_INFO[2]}
+
+ # Note: we don't have enough linux-aarch64 GPU runners to
+ # support per commit testing. This is why these tests are waived
+ # here.
+ WAIVE_TEST="${{ inputs.target-device == 'gpu' && inputs.build-type == 'ci' && inputs.platform == 'linux-aarch64' }}"
+
if [[ "$TEST_TARGET_DEVICE" == "$RUNNER_DEVICE" ]]; then
- if ! [[ "$TEST_NAME" =~ "GPU" && "${{ inputs.waive-gpu-tests }}" == 'true' ]]; then
+ if [[ "${WAIVE_TEST}" == "false" ]]; then
MATRIX_JSON+="{\"runner\": {\"name\": \"$RUNNER_NAME\", \"type\": \"$RUNNER_TYPE\", \"platform\": \"$RUNNER_PLATFORM\"}, \"test-config\": {\"name\": \"$TEST_NAME\", \"test-options\": \"$TEST_OPTIONS\"}},"
fi
fi
done
fi
done
- MATRIX_JSON=$(echo "$MATRIX_JSON" | sed 's/,$//') # Remove the trailing comma
+
+ # Remove the trailing comma
+ MATRIX_JSON=$(echo "$MATRIX_JSON" | sed 's/,$//')
+ # Terminate JSON expression
MATRIX_JSON+=']}'
+
echo "matrix=$MATRIX_JSON" >> $GITHUB_OUTPUT
test:
@@ -180,13 +185,14 @@ jobs:
matrix: ${{fromJson(needs.setup-test.outputs.matrix)}}
uses:
- nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.17
+ nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.29
with:
+ build-has-tests: ${{ !inputs.upload-enabled }}
build-mode: ""
build-type: ${{ inputs.build-type }}
client-repo: ${{ github.event.repository.name }}
has-gpu: ${{ matrix.runner.type == 'gpu' }}
- legate-gh-ci-tag: "v1.17"
+ legate-gh-ci-tag: "v1.29"
name: ${{ matrix.test-config.name }}
network: "ucx"
platform: ${{ inputs.platform }}
@@ -200,21 +206,24 @@ jobs:
updateTestStatus:
needs: test
name: Update Test status on Server
- if: ${{ (github.repository_owner == 'nv-legate') && contains(github.workflow, 'Nightly') && (inputs.upload-enabled == true) }}
+ if: ${{ false }}
uses:
- nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.17
+ nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.29
with:
+ build-has-tests: ${{ !inputs.upload-enabled }}
build-mode: ""
build-type: ${{ inputs.build-type }}
client-repo: ${{ github.event.repository.name }}
- legate-gh-ci-tag: "v1.17"
+ legate-gh-ci-tag: "v1.29"
name: UpdateTestStatus
network: "ucx"
- pkgSubString: "cunumeric-"
+ pkgSubString: "cupynumeric-"
platform: ${{ inputs.platform }}
python-version: ${{ inputs.python-version }}
- repos-Root: "cunumeric"
+ repos-Root: "cupynumeric"
target-device: ${{ inputs.target-device }}
upload-action: "update-test-status"
upload-enabled: true
+ refname: ${{ inputs.refname }}
+ default-branch: ${{ inputs.default-branch }}
secrets: inherit
diff --git a/.github/workflows/gh-build-docs.yml b/.github/workflows/gh-build-docs.yml
new file mode 100644
index 0000000000..57dd3bf54e
--- /dev/null
+++ b/.github/workflows/gh-build-docs.yml
@@ -0,0 +1,122 @@
+---
+on:
+ workflow_call:
+ inputs:
+ platform:
+ type: string
+ required: true
+ target-device:
+ type: string
+ required: true
+ build-mode:
+ type: string
+ required: true
+ build-type:
+ type: string
+ required: true
+ upload-docs-to-gh-pages:
+ type: boolean
+ required: false
+ default: false
+
+jobs:
+ build-cupynumeric:
+ if: ${{ github.repository_owner == 'nv-legate' }}
+ uses:
+ nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.29
+ with:
+ build-has-tests: false
+ client-repo: ${{ github.event.repository.name }}
+ target-device: ${{ inputs.target-device }}
+ runs-on: ${{ (inputs.platform == 'linux' && 'linux-amd64-cpu16') || (inputs.platform == 'mac' && 'macos-latest') }}
+ build-type: ${{ inputs.build-type }}
+ use-container: ${{ inputs.platform == 'linux' }}
+ platform: ${{ inputs.platform }}
+ dependencies-file: ""
+ legate-gh-ci-tag: "v1.29"
+ build-mode: ${{ inputs.build-mode }}
+ upload-enabled: false
+ network: "ucx"
+ secrets: inherit
+
+
+ build-docs:
+ needs:
+ - build-cupynumeric
+ name: Build cupynumeric docs (${{ inputs.platform }}, ${{ inputs.target-device }})
+
+ uses:
+ nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.29
+ with:
+ build-has-tests: false
+ build-mode: ${{ inputs.build-mode }}
+ build-type: ${{ inputs.build-type }}
+ output-build-type: docs
+ client-repo: ${{ github.event.repository.name }}
+ has-gpu: false
+ legate-gh-ci-tag: "v1.29"
+ name: Build documentation
+ network: "ucx"
+ platform: ${{ inputs.platform }}
+      python-version: "3.12"
+ runs-on: ${{ (inputs.platform == 'linux' && 'linux-amd64-gpu-l4-latest-1') || (inputs.platform == 'mac' && 'macos-latest') }}
+ target-device: ${{ inputs.target-device }}
+ test-options: docs
+ upload-enabled: false
+ secrets: inherit
+
+
+ upload-docs-to-gh-pages:
+ if: ${{ inputs.upload-docs-to-gh-pages && github.ref_name == 'main' }}
+ needs:
+ - build-docs
+ runs-on: ${{ (inputs.platform == 'linux' && 'linux-amd64-cpu4') || (inputs.platform == 'mac' && 'macos-latest') }}
+ steps:
+ - name: Set environment variables
+ shell: bash --noprofile --norc -xeuo pipefail {0}
+ run: |
+ echo "${{ needs.build-docs.outputs.output-artifact-name }}"
+
+ ARTIFACTS_DIR=$(realpath "$(pwd)/../artifacts")
+ echo "ARTIFACTS_DIR=${ARTIFACTS_DIR}" >> $GITHUB_ENV
+
+ mkdir -p "${ARTIFACTS_DIR}"
+
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: ${{ needs.build-docs.outputs.output-artifact-name }}
+ path: ${{ env.ARTIFACTS_DIR }}
+
+ - name: Display structure of downloaded artifacts
+ shell: bash --noprofile --norc -xeuo pipefail {0}
+ run: |
+ pwd
+ ls -lahR ${{ env.ARTIFACTS_DIR }}
+
+ - name: Find index.html's parent folder
+ shell: bash --noprofile --norc -xeuo pipefail {0}
+ id: find_docs_dir
+ run: |
+ FILE_PATH="$(
+ find "${{ env.ARTIFACTS_DIR }}" -name "index.html" -printf '%d %p\n' \
+ | sort -nk1 \
+ | cut -d' ' -f2- \
+ | head -n 1
+ )"
+ if [ -z "${FILE_PATH}" ]; then
+ echo "index.html not found" >&2
+ exit 1
+ fi
+ PARENT_DIR=$(dirname "${FILE_PATH}")
+ echo "docs_dir=${PARENT_DIR}" >> "${GITHUB_OUTPUT}"
+
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Deploy
+ uses: JamesIves/github-pages-deploy-action@v4
+ with:
+ folder: ${{ steps.find_docs_dir.outputs.docs_dir }}
+ token: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS }}
+ repository-name: "nv-legate/cupynumeric"
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 0000000000..3fd2c7f62e
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,46 @@
+name: pr
+
+on:
+ push:
+ branches:
+ - "pull-request/[0-9]+"
+ - "branch-*"
+ - "main"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+defaults:
+ run:
+ shell: bash -eou pipefail {0}
+
+jobs:
+ legate-sha:
+ runs-on: linux-amd64-cpu4
+ outputs:
+ LEGATE_SHA: ${{ steps.legate-sha.outputs.sha }}
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Get the Legate SHA
+ id: legate-sha
+ run: |
+          sha=$(jq -r .packages.legate.git_tag cmake/versions.json)
+ echo "sha=$sha" >> $GITHUB_OUTPUT
+ wheels-build:
+ needs: legate-sha
+ secrets: inherit
+ uses: ./.github/workflows/wheels-build.yml
+ with:
+ build-type: pull-request
+ legate-sha: ${{ needs.legate-sha.outputs.LEGATE_SHA }}
+ wheels-test:
+ needs: [wheels-build, legate-sha]
+ secrets: inherit
+ uses: ./.github/workflows/wheels-test.yml
+ with:
+ build-type: pull-request
+ legate-sha: ${{ needs.legate-sha.outputs.LEGATE_SHA }}
diff --git a/.github/workflows/wheels-build.yml b/.github/workflows/wheels-build.yml
new file mode 100644
index 0000000000..fa91eebee0
--- /dev/null
+++ b/.github/workflows/wheels-build.yml
@@ -0,0 +1,134 @@
+on:
+ workflow_call:
+ inputs:
+ build-type:
+ required: true
+ type: string
+ legate-sha:
+ type: string
+ required: true
+ branch:
+ type: string
+ sha:
+ type: string
+ repo:
+ type: string
+ node_type:
+ type: string
+ default: "cpu16"
+ cuda_ver:
+ type: string
+ default: "12.5.1"
+ linux_ver:
+ type: string
+ default: "rockylinux8"
+ script:
+ type: string
+ default: "continuous_integration/scripts/build_wheel_linux.bash"
+ matrix_filter:
+ type: string
+ default: "."
+
+defaults:
+ run:
+ shell: bash -eou pipefail {0}
+
+permissions:
+ actions: read
+ checks: none
+ contents: read
+ deployments: none
+ discussions: none
+ id-token: write
+ issues: none
+ packages: read
+ pages: none
+ pull-requests: read
+ repository-projects: none
+ security-events: none
+ statuses: none
+
+jobs:
+ compute-matrix:
+ runs-on: linux-amd64-cpu4
+ outputs:
+ MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
+ steps:
+ - name: Compute Build Matrix
+ id: compute-matrix
+ run: |
+ set -eo pipefail
+
+ # please keep the matrices sorted in ascending order by the following:
+ #
+ # [ARCH, PY_VER, CUDA_VER, LINUX_VER]
+ #
+ export MATRIX="
+ # amd64
+ - { ARCH: 'amd64', PY_VER: '3.10', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ - { ARCH: 'amd64', PY_VER: '3.11', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ - { ARCH: 'amd64', PY_VER: '3.12', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ # arm64
+ - { ARCH: 'arm64', PY_VER: '3.10', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ - { ARCH: 'arm64', PY_VER: '3.11', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ - { ARCH: 'arm64', PY_VER: '3.12', TARGET_DEV: 'gpu', BUILD_MODE: 'release' }
+ "
+
+ MATRIX="$(
+ yq -n -o json 'env(MATRIX)' | \
+ jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end'
+ )"
+
+ echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+ build:
+ name: ${{ matrix.ARCH }}, py${{ matrix.PY_VER }}, ${{ matrix.TARGET_DEV }}, ${{ matrix.BUILD_MODE }}
+ needs: compute-matrix
+ timeout-minutes: 480
+ strategy:
+ fail-fast: false
+ matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
+ runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}"
+ container:
+ image: rapidsai/ci-wheel:cuda${{ inputs.cuda_ver }}-${{ inputs.linux_ver }}-py${{ matrix.PY_VER }}
+ env:
+ BUILD_MODE: ${{ matrix.BUILD_MODE }}
+ steps:
+ - name: Get the SHA
+ id: get-sha
+ run: |
+ sha=$(echo ${{github.sha}} | head -c 10)
+ echo "sha=$sha" >> $GITHUB_OUTPUT
+ - if: github.repository_owner == 'nv-legate'
+ name: Get AWS credentials for sccache bucket
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ aws-region: us-east-2
+ role-duration-seconds: 28800 # 8 hours
+ role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-nv-legate
+ - uses: actions/checkout@v4
+ with:
+ repository: ${{ inputs.repo }}
+ ref: ${{ inputs.sha }}
+ fetch-depth: 0
+ - name: Add default paths to the env
+ run: |
+ echo "$(pwd)"/continuous_integration/scripts/tools >> "${GITHUB_PATH}"
+ - name: Download the legate wheel
+ env:
+ BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda${{ inputs.cuda_ver }}-py${{ matrix.PY_VER }}
+ GH_TOKEN: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS_RO }}
+ run: |
+ legate-gh-download-artifact ${{ inputs.legate-sha }} "legate-wheel-${{ env.BUILD_NAME }}" "wheel"
+ - name: Wheel build
+ run: ${{ inputs.script }}
+ env:
+ STEP_NAME: "C++ build"
+ GH_TOKEN: ${{ github.token }}
+ - name: Wheel upload
+ env:
+ BUILD_SHA: ${{ steps.get-sha.outputs.sha }}
+ BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda${{ inputs.cuda_ver }}-py${{ matrix.PY_VER }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: cupynumeric-wheel-${{ env.BUILD_NAME }}-g${{ env.BUILD_SHA }}
+ path: final-dist/*.whl
diff --git a/.github/workflows/wheels-test.yml b/.github/workflows/wheels-test.yml
new file mode 100644
index 0000000000..a0db1b5145
--- /dev/null
+++ b/.github/workflows/wheels-test.yml
@@ -0,0 +1,129 @@
+on:
+ workflow_call:
+ inputs:
+ build-type:
+ required: true
+ type: string
+ legate-sha:
+ type: string
+ required: true
+ branch:
+ type: string
+ sha:
+ type: string
+ repo:
+ type: string
+ node_type:
+ type: string
+ default: "cpu16"
+ cuda_ver:
+ type: string
+ default: "12.8.0"
+ script:
+ type: string
+ default: "continuous_integration/scripts/test_wheel_linux.bash"
+ matrix_filter:
+ type: string
+ default: "."
+
+defaults:
+ run:
+ shell: bash -eou pipefail {0}
+
+permissions:
+ actions: read
+ checks: none
+ contents: read
+ deployments: none
+ discussions: none
+ id-token: write
+ issues: none
+ packages: read
+ pages: none
+ pull-requests: read
+ repository-projects: none
+ security-events: none
+ statuses: none
+
+jobs:
+ compute-matrix:
+ runs-on: linux-amd64-cpu4
+ outputs:
+ MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
+ steps:
+ - name: Compute Build Matrix
+ id: compute-matrix
+ run: |
+ set -eo pipefail
+
+ # please keep the matrices sorted in ascending order by the following:
+ #
+ # [ARCH, PY_VER, CUDA_VER, LINUX_VER]
+ #
+ export MATRIX="
+ # amd64
+ - { ARCH: 'amd64', PY_VER: '3.10', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu22.04' }
+ - { ARCH: 'amd64', PY_VER: '3.11', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu22.04' }
+ - { ARCH: 'amd64', PY_VER: '3.12', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu24.04' }
+ # arm64 - disabled due to ARM GPU runner availability
+ # - { ARCH: 'arm64', PY_VER: '3.10', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu22.04' }
+ # - { ARCH: 'arm64', PY_VER: '3.11', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu22.04' }
+ # - { ARCH: 'arm64', PY_VER: '3.12', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu24.04' }
+ "
+
+ MATRIX="$(
+ yq -n -o json 'env(MATRIX)' | \
+ jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end'
+ )"
+
+ echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+
+ build:
+ name: ${{ matrix.ARCH }}, py${{ matrix.PY_VER }}, ${{ matrix.LINUX_VER }}, ${{ matrix.GPU }}
+ needs: compute-matrix
+ timeout-minutes: 60
+ strategy:
+ fail-fast: false
+ matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
+ runs-on: ${{ matrix.ARCH == 'arm64' && 'linux-aarch64-2gpu' || format('linux-{0}-gpu-{1}-latest-1', matrix.ARCH, matrix.GPU) }}
+ container:
+ image: rapidsai/citestwheel:cuda${{ inputs.cuda_ver }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}
+ env:
+ NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+ steps:
+ - name: Get the SHA
+ id: get-sha
+ run: |
+ sha=$(echo ${{github.sha}} | head -c 10)
+ echo "sha=$sha" >> $GITHUB_OUTPUT
+ - uses: actions/checkout@v4
+ with:
+ repository: ${{ inputs.repo }}
+ ref: ${{ inputs.sha }}
+ fetch-depth: 0
+ - name: Add default paths to the env
+ run: |
+ echo $(pwd)/continuous_integration/scripts/tools >> "${GITHUB_PATH}"
+ - name: Run nvidia-smi to make sure GPU is working
+ run: nvidia-smi
+ - name: Setup proxy cache
+ uses: nv-gha-runners/setup-proxy-cache@main
+ continue-on-error: true
+ # Skip the cache on RDS Lab nodes
+ if: ${{ matrix.GPU != 'v100' && matrix.GPU != 'a100' }}
+ - name: Download the legate wheel
+ env:
+ BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda12.5.1-py${{ matrix.PY_VER }}
+ GH_TOKEN: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS_RO }}
+ run: |
+ legate-gh-download-artifact ${{ inputs.legate-sha }} "legate-wheel-${{ env.BUILD_NAME }}" "wheel"
+ - name: Download the wheel from the build job
+ env:
+ BUILD_SHA: ${{ steps.get-sha.outputs.sha }}
+ BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda12.5.1-py${{ matrix.PY_VER }}
+ uses: actions/download-artifact@v4
+ with:
+ path: final-dist
+ name: cupynumeric-wheel-${{ env.BUILD_NAME }}-g${{ env.BUILD_SHA }}
+ - name: Run tests
+ run: ${{ inputs.script }}
diff --git a/.gitignore b/.gitignore
index 84244ce827..d4ccc950aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,11 +27,11 @@ legion
gasnet*
legion_defines.h
realm_defines.h
-cunumeric/install_info.py
+cupynumeric/install_info.py
/build/*
-/docs/cunumeric/build
-/docs/cunumeric/source/api/generated
-/docs/cunumeric/source/comparison/comparison_table.rst.inc
+/docs/cupynumeric/build
+/docs/cupynumeric/source/api/generated
+/docs/cupynumeric/source/comparison/comparison_table.rst.inc
*.egg-info
.cache
.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c8b84bdb52..db16bb5d34 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,25 +1,26 @@
repos:
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: 'v1.5.1'
+ rev: 'v1.15.0'
hooks:
- id: mypy
language: system
pass_filenames: false
- args: ['cunumeric']
+ args: ['cupynumeric']
- repo: https://github.com/PyCQA/isort
- rev: 5.12.0
+ rev: 6.0.1
hooks:
- - id: isort
+ - id: isort
- repo: https://github.com/psf/black
- rev: 23.9.1
+ rev: 25.1.0
hooks:
- - id: black
+ - id: black
+ args: ["--target-version", "py310"]
- repo: https://github.com/PyCQA/flake8
- rev: 6.1.0
+ rev: 7.2.0
hooks:
- - id: flake8
+ - id: flake8
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: 'v16.0.6' # Use the sha / tag you want to point at
+ rev: 'v20.1.0' # Use the sha / tag you want to point at
hooks:
- id: clang-format
files: \.(cu|cuh|h|cc|inl)$
@@ -40,7 +41,7 @@ repos:
'types_or': [c++, c, cuda]
require_serial: false
stages: [pre-commit]
- exclude: '^src/cunumeric/cunumeric_c\.h$'
+ exclude: '^src/cupynumeric/cupynumeric_c\.h$'
ci:
skip: [mypy]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55cd0547c2..866be2eab4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,32 +14,10 @@
# limitations under the License.
#=============================================================================
-cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
-if(POLICY CMP0077)
- cmake_policy(SET CMP0077 NEW)
- set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-endif()
-
-if(POLICY CMP0096)
- cmake_policy(SET CMP0096 NEW)
- set(CMAKE_POLICY_DEFAULT_CMP0096 NEW)
-endif()
-
-if(POLICY CMP0135)
- # make the timestamps of ExternalProject_ADD match the download time
- # https://cmake.org/cmake/help/latest/policy/CMP0135.html
- cmake_policy(SET CMP0135 NEW)
- set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
-endif()
-
-if(POLICY CMP0132)
- # Avoid an inconsistency, where cmake would only set the CC/CXX env vars on
- # the first run, but not subsequent ones. This would come up when building
- # TBLIS.
- cmake_policy(SET CMP0132 NEW)
- set(CMAKE_POLICY_DEFAULT_CMP0132 NEW)
-endif()
+cmake_path(SET CUPYNUMERIC_CMAKE_DIR NORMALIZE "${CMAKE_CURRENT_LIST_DIR}/cmake")
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
set(CMAKE_CXX_STANDARD 17 CACHE STRING "" FORCE)
set(CMAKE_CXX_STANDARD_REQUIRED ON CACHE STRING "" FORCE)
@@ -50,8 +28,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON CACHE STRING "" FORCE)
##############################################################################
# - Download and initialize RAPIDS CMake helpers -----------------------------
-set(rapids-cmake-version 24.04)
-set(rapids-cmake-sha "365322aca32fd6ecd7027f5d7ec7be50b7f3cc2a")
+set(rapids-cmake-version 24.12)
+set(rapids-cmake-sha "4cb2123dc08ef5d47ecdc9cc51c96bea7b5bb79c")
if(NOT EXISTS ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${rapids-cmake-version}/RAPIDS.cmake
${CMAKE_BINARY_DIR}/RAPIDS.cmake)
@@ -63,7 +41,7 @@ include(rapids-cuda)
include(rapids-export)
include(rapids-find)
-set(cunumeric_version 24.09.00)
+set(cupynumeric_version 25.05.00)
# For now we want the optimization flags to match on both normal make and cmake
# builds so we override the cmake defaults here for release, this changes
@@ -78,40 +56,40 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g")
set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g")
if(NOT SKBUILD)
- project(cunumeric VERSION ${cunumeric_version} LANGUAGES C CXX)
- include(cunumeric_cpp.cmake)
+ project(cupynumeric VERSION ${cupynumeric_version} LANGUAGES C CXX)
+ include(cupynumeric_cpp.cmake)
else()
project(
- cunumeric_python
- VERSION ${cunumeric_version}
+ cupynumeric_python
+ VERSION ${cupynumeric_version}
LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C
# language to be enabled here. The test project that is built in scikit-build to verify
# various linking options for the python library is hardcoded to build with C, so until
# that is fixed we need to keep C.
C CXX)
- include(cunumeric_python.cmake)
+ include(cupynumeric_python.cmake)
endif()
if(CMAKE_GENERATOR STREQUAL "Ninja")
- function(add_touch_cunumeric_ninja_build_target)
+ function(add_touch_cupynumeric_ninja_build_target)
set(_suf )
if(SKBUILD)
set(_suf "_python")
endif()
- add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL
+ add_custom_target("touch_cupynumeric${_suf}_ninja_build" ALL
COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja"
COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild"
VERBATIM
)
- foreach(_dep IN ITEMS cunumeric cunumeric_python
+ foreach(_dep IN ITEMS cupynumeric cupynumeric_python
legate legate_python
Legion LegionRuntime
Realm RealmRuntime
Regent)
if(TARGET ${_dep})
- add_dependencies("touch_cunumeric${_suf}_ninja_build" ${_dep})
+ add_dependencies("touch_cupynumeric${_suf}_ninja_build" ${_dep})
endif()
endforeach()
endfunction()
- add_touch_cunumeric_ninja_build_target()
+ add_touch_cupynumeric_ninja_build_target()
endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e083cc3c0c..b4ac11a6a5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,16 +1,16 @@
-# Contributing to cuNumeric
+# Contributing to cuPyNumeric
-CuNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community.
+CuPyNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community.
## How to begin
-Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cunumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea.
+Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cupynumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea.
Once you are ready to start development, we ask you to work on a [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) of our repository. The next step is to create a (pull request)[https://help.github.com/en/articles/about-pull-requests]. Feel free to open the pull request as soon as you begin your development (just mark it [as a draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/)) or when you are ready to have your contribution merged.
## The Legalese: Developer Certificate of Origin
-CuNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute. To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/):
+CuPyNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute. To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/)):
````
Developer Certificate of Origin
@@ -61,12 +61,12 @@ Please use your real name and a valid email address at which you can be reached.
## Review Process
-We are really grateful that you are thinking of contributing to cuNumeric. We will make every effort to review your contributions as soon as possible.
+We are really grateful that you are thinking of contributing to cuPyNumeric. We will make every effort to review your contributions as soon as possible.
As we suggested at the beginning of this document, it will be really helpful to start with an issue unless your proposed change is really trivial. An issue will help to save work in the review process (e.g., maybe somebody is already working on exactly the same thing you want to work on). After you open your pull request (PR), there usually will be a community feedback that often will require further changes to your contribution (the usual open-source process). Usually, this will conclude in the PR being merged by a maintainer, but on rare occasions a PR may be rejected. This may happen, for example, if the PR appears abandoned (no response to the community feedback) or if the PR does not seem to be approaching community acceptance in a reasonable time frame. In any case, an explanation will always be given why a PR is closed. Even if a PR is closed for some reason, it may always be reopened if the situation evolves (feel free to comment on closed PRs to discuss reopening them).
## Code Formatting Requirements
-CuNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. Once pre-commit is installed in the cuNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation.
+CuPyNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. Once pre-commit is installed in the cuPyNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation.
We hope that the automation of our formatting checks will make it easy to comply with our coding standards. If you encounter problems with code formatting, however, please let us know in a comment on your PR, and we will do our best to help.
diff --git a/MANIFEST.in b/MANIFEST.in
index 8f77ed2002..3eb2279b7b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,3 @@
include versioneer.py
-include cunumeric/_version.py
-include cunumeric/py.typed
+include cupynumeric/_version.py
+include cupynumeric/py.typed
diff --git a/README.md b/README.md
index 7e42b9e92d..97428aacf7 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,19 @@ limitations under the License.
-->
-[](https://github.com/nv-legate/cunumeric.internal/actions/workflows/ci-gh-nightly-release.yml)
+[](https://github.com/nv-legate/cupynumeric.internal/actions/workflows/ci-gh-nightly-release.yml)
-# cuNumeric
+# cuPyNumeric
-cuNumeric is a [Legate](https://github.com/nv-legate/legate.core) library
-that aims to provide a distributed and accelerated drop-in replacement for the
-[NumPy API](https://numpy.org/doc/stable/reference/) on top of the
-[Legion](https://legion.stanford.edu) runtime. Using cuNumeric you can do things like run
+cuPyNumeric is a library that aims to provide a distributed and accelerated
+drop-in replacement for [NumPy](https://numpy.org/) built on top of the
+[Legate](https://github.com/nv-legate/legate) framework.
+
+With cuPyNumeric you can write code productively in Python, using the familiar
+NumPy API, and have your program scale with no code changes from single-CPU
+computers to multi-node-multi-GPU clusters.
+
+For example, you can run
[the final example of the Python CFD course](https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb)
completely unmodified on 2048 A100 GPUs in a
[DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/)
@@ -30,7 +35,7 @@ and achieve good weak scaling.
-cuNumeric works best for programs that have very large arrays of data
+cuPyNumeric works best for programs that have very large arrays of data
that cannot fit in the memory of a single GPU or a single node and need
to span multiple nodes and GPUs. While our implementation of the current
NumPy API is still incomplete, programs that use unimplemented features
@@ -39,16 +44,16 @@ canonical NumPy implementation.
## Installation
-cuNumeric is available from [conda](https://docs.conda.io/projects/conda/en/latest/index.html)
-on the [legate channel](https://anaconda.org/legate/cunumeric).
-See https://docs.nvidia.com/cunumeric/latest/installation.html for
+cuPyNumeric is available from [conda](https://docs.conda.io/projects/conda/en/latest/index.html)
+on the [legate channel](https://anaconda.org/legate/cupynumeric).
+See https://docs.nvidia.com/cupynumeric/latest/installation.html for
details about different install configurations, or building
-cuNumeric from source.
+cuPyNumeric from source.
## Documentation
-The cuNumeric documentation can be found
-[here](https://docs.nvidia.com/cunumeric).
+The cuPyNumeric documentation can be found
+[here](https://docs.nvidia.com/cupynumeric).
## Contributing
@@ -56,7 +61,10 @@ See the discussion on contributing in [CONTRIBUTING.md](CONTRIBUTING.md).
## Contact
-For technical questions about Cunumeric and Legate-based tools, please visit
+For technical questions about cuPyNumeric and Legate-based tools, please visit
the [community discussion forum](https://github.com/nv-legate/discussion).
If you have other questions, please contact us at legate(at)nvidia.com.
+
+## Note
+*This project, i.e., cuPyNumeric, is separate and independent of the CuPy project. CuPy is a registered trademark of Preferred Networks.*
diff --git a/cmake/generate_install_info_py.cmake b/cmake/generate_install_info_py.cmake
index 190641a463..724640cbb7 100644
--- a/cmake/generate_install_info_py.cmake
+++ b/cmake/generate_install_info_py.cmake
@@ -17,8 +17,8 @@
execute_process(
COMMAND ${CMAKE_C_COMPILER}
-E -DLEGATE_USE_PYTHON_CFFI
- -I "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric"
- -P "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric/cunumeric_c.h"
+ -I "${CMAKE_CURRENT_LIST_DIR}/../src/cupynumeric"
+ -P "${CMAKE_CURRENT_LIST_DIR}/../src/cupynumeric/cupynumeric_c.h"
ECHO_ERROR_VARIABLE
OUTPUT_VARIABLE header
COMMAND_ERROR_IS_FATAL ANY
@@ -26,6 +26,6 @@ execute_process(
set(libpath "")
configure_file(
- "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py.in"
- "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py"
+ "${CMAKE_CURRENT_LIST_DIR}/../cupynumeric/install_info.py.in"
+ "${CMAKE_CURRENT_LIST_DIR}/../cupynumeric/install_info.py"
@ONLY)
diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake
index b8fcb1c356..7951bd2919 100644
--- a/cmake/thirdparty/get_legate.cmake
+++ b/cmake/thirdparty/get_legate.cmake
@@ -14,17 +14,61 @@
# limitations under the License.
#=============================================================================
+# This is based on the similar function for Legion in the Legate code
+function(cupynumeric_maybe_override_legate user_repository user_branch user_version)
+ # CPM_ARGS GIT_TAG and GIT_REPOSITORY don't do anything if you have already overridden
+ # those options via a rapids_cpm_package_override() call. So we have to conditionally
+ # override the defaults (by creating a temporary json file in build dir) only if the
+ # user sets them.
+
+ # See https://github.com/rapidsai/rapids-cmake/issues/575. Specifically, this function
+ # is pretty much identical to
+ # https://github.com/rapidsai/rapids-cmake/issues/575#issuecomment-2045374410.
+ cmake_path(SET legate_overrides_json NORMALIZE
+ "${CUPYNUMERIC_CMAKE_DIR}/versions.json")
+ if(user_repository OR user_branch OR user_version)
+ # The user has set at least one of these, time to create our kludge.
+ file(READ "${legate_overrides_json}" default_legate_json)
+ set(new_legate_json "${default_legate_json}")
+
+ if(user_repository)
+ string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "git_url"
+ "\"${user_repository}\"")
+ endif()
+
+ if(user_branch)
+ string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "git_tag"
+ "\"${user_branch}\"")
+ endif()
+
+ if(user_version)
+ string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "version"
+ "\"${user_version}\"")
+ endif()
+
+ string(JSON eq_json EQUAL "${default_legate_json}" "${new_legate_json}")
+ if(NOT eq_json)
+ cmake_path(SET legate_overrides_json NORMALIZE
+ "${CMAKE_CURRENT_BINARY_DIR}/versions.json")
+ file(WRITE "${legate_overrides_json}" "${new_legate_json}")
+ endif()
+ endif()
+ rapids_cpm_package_override("${legate_overrides_json}")
+endfunction()
+
function(find_or_configure_legate)
+ set(options)
set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL)
+ set(multiValueArgs)
cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- include("${rapids-cmake-dir}/export/detail/parse_version.cmake")
- rapids_export_parse_version(${PKG_VERSION} legate PKG_VERSION)
+ cupynumeric_maybe_override_legate("${PKG_REPOSITORY}" "${PKG_BRANCH}" "${PKG_VERSION}")
include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all)
- set(version ${PKG_VERSION})
+ string(REPLACE "00" "0" version "${version}")
+
set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL})
if(PKG_BRANCH)
set(git_branch "${PKG_BRANCH}")
@@ -35,16 +79,17 @@ function(find_or_configure_legate)
set(FIND_PKG_ARGS
GLOBAL_TARGETS legate::legate
- BUILD_EXPORT_SET cunumeric-exports
- INSTALL_EXPORT_SET cunumeric-exports)
+ BUILD_EXPORT_SET cupynumeric-exports
+ INSTALL_EXPORT_SET cupynumeric-exports)
# First try to find legate via find_package()
# so the `Legion_USE_*` variables are visible
# Use QUIET find by default.
set(_find_mode QUIET)
- # If legate_DIR/legate_ROOT are defined as something other than empty or NOTFOUND
- # use a REQUIRED find so that the build does not silently download legate.
- if(legate_DIR OR legate_ROOT)
+ # If legate_DIR/legate_ROOT or CUPYNUMERIC_BUILD_PIP_WHEELS are defined as
+ # something other than empty or NOTFOUND use a REQUIRED find so that the
+ # build does not silently download legate.
+ if(legate_DIR OR legate_ROOT OR CUPYNUMERIC_BUILD_PIP_WHEELS)
set(_find_mode REQUIRED)
endif()
rapids_find_package(legate ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS})
@@ -55,11 +100,11 @@ function(find_or_configure_legate)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake)
get_cpm_git_args(legate_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch})
- message(VERBOSE "cunumeric: legate version: ${version}")
- message(VERBOSE "cunumeric: legate git_repo: ${git_repo}")
- message(VERBOSE "cunumeric: legate git_branch: ${git_branch}")
- message(VERBOSE "cunumeric: legate exclude_from_all: ${exclude_from_all}")
- message(VERBOSE "cunumeric: legate legate_cpm_git_args: ${legate_cpm_git_args}")
+ message(VERBOSE "cupynumeric: legate version: ${version}")
+ message(VERBOSE "cupynumeric: legate git_repo: ${git_repo}")
+ message(VERBOSE "cupynumeric: legate git_branch: ${git_branch}")
+ message(VERBOSE "cupynumeric: legate exclude_from_all: ${exclude_from_all}")
+ message(VERBOSE "cupynumeric: legate legate_cpm_git_args: ${legate_cpm_git_args}")
rapids_cpm_find(legate ${version} ${FIND_PKG_ARGS}
CPM_ARGS
@@ -70,35 +115,33 @@ function(find_or_configure_legate)
endif()
set(Legion_USE_CUDA ${Legion_USE_CUDA} PARENT_SCOPE)
+ set(Legion_CUDA_ARCH ${Legion_CUDA_ARCH} PARENT_SCOPE)
set(Legion_USE_OpenMP ${Legion_USE_OpenMP} PARENT_SCOPE)
set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS} PARENT_SCOPE)
message(VERBOSE "Legion_USE_CUDA=${Legion_USE_CUDA}")
+ message(VERBOSE "Legion_CUDA_ARCH=${Legion_CUDA_ARCH}")
message(VERBOSE "Legion_USE_OpenMP=${Legion_USE_OpenMP}")
message(VERBOSE "Legion_BOUNDS_CHECKS=${Legion_BOUNDS_CHECKS}")
endfunction()
-foreach(_var IN ITEMS "cunumeric_LEGATE_VERSION"
- "cunumeric_LEGATE_BRANCH"
- "cunumeric_LEGATE_REPOSITORY"
- "cunumeric_EXCLUDE_LEGATE_FROM_ALL")
+foreach(_var IN ITEMS "cupynumeric_LEGATE_VERSION"
+ "cupynumeric_LEGATE_BRANCH"
+ "cupynumeric_LEGATE_REPOSITORY"
+ "cupynumeric_EXCLUDE_LEGATE_FROM_ALL")
if(DEFINED ${_var})
- # Create a cunumeric_LEGATE_BRANCH variable in the current scope either from the existing
+ # Create a cupynumeric_LEGATE_BRANCH variable in the current scope either from the existing
# current-scope variable, or the cache variable.
set(${_var} "${${_var}}")
- # Remove cunumeric_LEGATE_BRANCH from the CMakeCache.txt. This ensures reconfiguring the same
- # build dir without passing `-Dcunumeric_LEGATE_BRANCH=` reverts to the value in versions.json
- # instead of reusing the previous `-Dcunumeric_LEGATE_BRANCH=` value.
+ # Remove cupynumeric_LEGATE_BRANCH from the CMakeCache.txt. This ensures reconfiguring the same
+ # build dir without passing `-Dcupynumeric_LEGATE_BRANCH=` reverts to the value in versions.json
+ # instead of reusing the previous `-Dcupynumeric_LEGATE_BRANCH=` value.
unset(${_var} CACHE)
endif()
endforeach()
-if(NOT DEFINED cunumeric_LEGATE_VERSION)
- set(cunumeric_LEGATE_VERSION "${cunumeric_VERSION}")
-endif()
-
-find_or_configure_legate(VERSION ${cunumeric_LEGATE_VERSION}
- REPOSITORY ${cunumeric_LEGATE_REPOSITORY}
- BRANCH ${cunumeric_LEGATE_BRANCH}
- EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_LEGATE_FROM_ALL}
+find_or_configure_legate(VERSION ${cupynumeric_LEGATE_VERSION}
+ REPOSITORY ${cupynumeric_LEGATE_REPOSITORY}
+ BRANCH ${cupynumeric_LEGATE_BRANCH}
+ EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_LEGATE_FROM_ALL}
)
diff --git a/cmake/thirdparty/get_openblas.cmake b/cmake/thirdparty/get_openblas.cmake
index d4e4454a09..b384ecf024 100644
--- a/cmake/thirdparty/get_openblas.cmake
+++ b/cmake/thirdparty/get_openblas.cmake
@@ -22,7 +22,7 @@ function(find_or_configure_OpenBLAS)
set(BLAS_name "OpenBLAS")
set(BLAS_target "openblas")
- # cuNumeric presently requires OpenBLAS
+ # cuPyNumeric presently requires OpenBLAS
set(BLA_VENDOR OpenBLAS)
# TODO: should we find (or build) 64-bit BLAS?
@@ -35,8 +35,8 @@ function(find_or_configure_OpenBLAS)
set(FIND_PKG_ARGS ${PKG_VERSION}
GLOBAL_TARGETS ${BLAS_target}
- BUILD_EXPORT_SET cunumeric-exports
- INSTALL_EXPORT_SET cunumeric-exports)
+ BUILD_EXPORT_SET cupynumeric-exports
+ INSTALL_EXPORT_SET cupynumeric-exports)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake)
if(PKG_BRANCH)
@@ -50,10 +50,24 @@ function(find_or_configure_OpenBLAS)
set(CMAKE_POLICY_DEFAULT_CMP0048 OLD)
set(CMAKE_POLICY_DEFAULT_CMP0054 NEW)
+ # Force a base CPU type for the openblas build.
+ set(_target HASWELL)
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+ set(_target ARMV8)
+ endif()
+
+ # BLAS emits a bunch of warnings, -w is the "silence all warnings" flag for clang and
+ # GCC
+ if(MSVC)
+ message(FATAL_ERROR "Don't know how to silence warnings with MSVC")
+ endif()
+ set(c_flags "${CMAKE_C_FLAGS} -w")
+ set(f_flags "${CMAKE_Fortran_FLAGS} -w")
rapids_cpm_find(BLAS ${FIND_PKG_ARGS}
CPM_ARGS
${BLAS_cpm_git_args}
EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
+ SYSTEM TRUE
OPTIONS "USE_CUDA 0"
"C_LAPACK ON"
"USE_THREAD ON"
@@ -62,7 +76,10 @@ function(find_or_configure_OpenBLAS)
"BUILD_WITHOUT_CBLAS OFF"
"BUILD_WITHOUT_LAPACK OFF"
"INTERFACE64 ${INTERFACE64}"
- "USE_OPENMP ${Legion_USE_OpenMP}")
+ "TARGET ${_target}"
+ "USE_OPENMP ${Legion_USE_OpenMP}"
+ "CMAKE_C_FLAGS ${c_flags}"
+ "CMAKE_Fortran_FLAGS ${f_flags}")
set(CMAKE_POLICY_DEFAULT_CMP0048 ${CMP0048_orig})
set(CMAKE_POLICY_DEFAULT_CMP0054 ${CMP0054_orig})
@@ -89,7 +106,7 @@ function(find_or_configure_OpenBLAS)
$
# contains cblas.h and f77blas.h
$
- )
+ )
string(JOIN "\n" code_string
"if(NOT TARGET BLAS::BLAS)"
@@ -105,35 +122,35 @@ function(find_or_configure_OpenBLAS)
FINAL_CODE_BLOCK code_string)
# Do `CPMFindPackage(BLAS)` in build dir
- rapids_export_package(BUILD BLAS cunumeric-exports
+ rapids_export_package(BUILD BLAS cupynumeric-exports
VERSION ${PKG_VERSION} GLOBAL_TARGETS ${BLAS_target})
# Tell cmake where it can find the generated blas-config.cmake
include("${rapids-cmake-dir}/export/find_package_root.cmake")
- rapids_export_find_package_root(BUILD BLAS [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cunumeric-exports)
+ rapids_export_find_package_root(BUILD BLAS [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cupynumeric-exports)
endif()
endfunction()
-if(NOT DEFINED cunumeric_OPENBLAS_VERSION)
+if(NOT DEFINED cupynumeric_OPENBLAS_VERSION)
# Before v0.3.18, OpenBLAS's throws CMake errors when configuring
- set(cunumeric_OPENBLAS_VERSION "0.3.20")
+ set(cupynumeric_OPENBLAS_VERSION "0.3.29")
endif()
-if(NOT DEFINED cunumeric_OPENBLAS_BRANCH)
- set(cunumeric_OPENBLAS_BRANCH "")
+if(NOT DEFINED cupynumeric_OPENBLAS_BRANCH)
+ set(cupynumeric_OPENBLAS_BRANCH "")
endif()
-if(NOT DEFINED cunumeric_OPENBLAS_TAG)
- set(cunumeric_OPENBLAS_TAG v${cunumeric_OPENBLAS_VERSION})
+if(NOT DEFINED cupynumeric_OPENBLAS_TAG)
+ set(cupynumeric_OPENBLAS_TAG v${cupynumeric_OPENBLAS_VERSION})
endif()
-if(NOT DEFINED cunumeric_OPENBLAS_REPOSITORY)
- set(cunumeric_OPENBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
+if(NOT DEFINED cupynumeric_OPENBLAS_REPOSITORY)
+ set(cupynumeric_OPENBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
endif()
-find_or_configure_OpenBLAS(VERSION ${cunumeric_OPENBLAS_VERSION}
- REPOSITORY ${cunumeric_OPENBLAS_REPOSITORY}
- BRANCH ${cunumeric_OPENBLAS_BRANCH}
- PINNED_TAG ${cunumeric_OPENBLAS_TAG}
- EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_OPENBLAS_FROM_ALL}
+find_or_configure_OpenBLAS(VERSION ${cupynumeric_OPENBLAS_VERSION}
+ REPOSITORY ${cupynumeric_OPENBLAS_REPOSITORY}
+ BRANCH ${cupynumeric_OPENBLAS_BRANCH}
+ PINNED_TAG ${cupynumeric_OPENBLAS_TAG}
+ EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_OPENBLAS_FROM_ALL}
)
diff --git a/cmake/thirdparty/get_tblis.cmake b/cmake/thirdparty/get_tblis.cmake
index dbe0d4e935..164923601b 100644
--- a/cmake/thirdparty/get_tblis.cmake
+++ b/cmake/thirdparty/get_tblis.cmake
@@ -34,14 +34,14 @@ function(find_or_configure_tblis)
HEADER_NAMES "tblis/tblis.h"
LIBRARY_NAMES "libtblis${lib_suffix}"
NO_CONFIG
- BUILD_EXPORT_SET cunumeric-exports
- INSTALL_EXPORT_SET cunumeric-exports
+ BUILD_EXPORT_SET cupynumeric-exports
+ INSTALL_EXPORT_SET cupynumeric-exports
)
rapids_cpm_find(tblis ${PKG_VERSION}
GLOBAL_TARGETS tblis::tblis
- BUILD_EXPORT_SET cunumeric-exports
- INSTALL_EXPORT_SET cunumeric-exports
+ BUILD_EXPORT_SET cupynumeric-exports
+ INSTALL_EXPORT_SET cupynumeric-exports
CPM_ARGS
${tblis_cpm_git_args}
EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
@@ -95,8 +95,8 @@ function(find_or_configure_tblis)
set(ENV{CC} "${_CC}")
set(ENV{CXX} "${_CXX}")
- message(VERBOSE "cunumeric: ENV{CC}=\"$ENV{CC}\"")
- message(VERBOSE "cunumeric: ENV{CXX}=\"$ENV{CXX}\"")
+ message(VERBOSE "cupynumeric: ENV{CC}=\"$ENV{CC}\"")
+ message(VERBOSE "cupynumeric: ENV{CXX}=\"$ENV{CXX}\"")
set(tblis_verbosity "--enable-silent-rules")
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.25")
@@ -167,20 +167,20 @@ function(find_or_configure_tblis)
endif()
set(tblis_BINARY_DIR ${tblis_BINARY_DIR} PARENT_SCOPE)
- set(cunumeric_INSTALL_TBLIS ${should_build_tblis} PARENT_SCOPE)
+ set(cupynumeric_INSTALL_TBLIS ${should_build_tblis} PARENT_SCOPE)
endfunction()
-if(NOT DEFINED cunumeric_TBLIS_BRANCH)
- set(cunumeric_TBLIS_BRANCH arm-build)
+if(NOT DEFINED cupynumeric_TBLIS_BRANCH)
+ set(cupynumeric_TBLIS_BRANCH arm-build)
endif()
-if(NOT DEFINED cunumeric_TBLIS_REPOSITORY)
- set(cunumeric_TBLIS_REPOSITORY https://github.com/nv-legate/tblis.git)
+if(NOT DEFINED cupynumeric_TBLIS_REPOSITORY)
+ set(cupynumeric_TBLIS_REPOSITORY https://github.com/nv-legate/tblis.git)
endif()
find_or_configure_tblis(VERSION 1.2.0
- REPOSITORY ${cunumeric_TBLIS_REPOSITORY}
- BRANCH ${cunumeric_TBLIS_BRANCH}
- EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_TBLIS_FROM_ALL}
+ REPOSITORY ${cupynumeric_TBLIS_REPOSITORY}
+ BRANCH ${cupynumeric_TBLIS_BRANCH}
+ EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_TBLIS_FROM_ALL}
USE_OPENMP ${Legion_USE_OpenMP}
)
diff --git a/cmake/versions.json b/cmake/versions.json
index 1240dad06f..a8b44699a5 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -1,15 +1,17 @@
{
"packages" : {
"legate" : {
- "repo": "legate.core.internal",
+ "repo": "legate.internal",
"artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<>-python${{ inputs.python-version }}-${{ inputs.target-device }}-release-with_tests-${{ inputs.network }}-<>",
"org": "nv-legate",
+ "artifact_workflow": "ci-gh.yml",
"nightly_workflow": "ci-gh-nightly-release.yml",
- "version": "24.09.00",
- "git_url" : "git@github.com:nv-legate/legate.core.internal.git",
+ "version": "25.05.00",
+ "git_url" : "git@github.com:nv-legate/legate.internal.git",
"git_shallow": false,
"always_download": false,
- "git_tag" : "32137a65cf40c56db1db9f76bb508ade81da000a"
+ "git_tag" : "fe71160b63291c1d073090ad2cb7a11c618d958a",
+ "anaconda_label": "experimental"
}
}
}
diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh
index c78cbbcca9..4cbf5d1afb 100644
--- a/conda/conda-build/build.sh
+++ b/conda/conda-build/build.sh
@@ -26,7 +26,13 @@ if [ -z "$CPU_ONLY" ]; then
else
# When we build without cuda, we need to provide the location of curand
CMAKE_ARGS+="
--Dcunumeric_cuRAND_INCLUDE_DIR=$PREFIX/targets/x86_64-linux/include"
+-Dcupynumeric_cuRAND_INCLUDE_DIR=$PREFIX/targets/x86_64-linux/include"
+fi
+
+# We rely on an environment variable to determine if we need to build cpp tests
+if [[ "$BUILD_TESTS" == "1" ]]; then
+ CMAKE_ARGS+="
+-Dcupynumeric_BUILD_TESTS=ON"
fi
export CMAKE_GENERATOR=Ninja
@@ -45,8 +51,8 @@ cmake --build build -j$CPU_COUNT --verbose
cmake --install build
CMAKE_ARGS="
--DFIND_CUNUMERIC_CPP=ON
--Dcunumeric_ROOT=$PREFIX"
+-DFIND_CUPYNUMERIC_CPP=ON
+-Dcupynumeric_ROOT=$PREFIX"
SKBUILD_BUILD_OPTIONS=-j$CPU_COUNT \
$PYTHON -m pip install \
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index d7db54b8bc..d29e2a1279 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -1,12 +1,16 @@
-{% set name = "cunumeric" %}
+{% set name = "cupynumeric" %}
{% if gpu_enabled == "true" %}
{% set gpu_enabled_bool = true %}
+ {% set cpu_gpu_tag='_gpu' %}
{% elif gpu_enabled == "false" %}
{% set gpu_enabled_bool = false %}
+ {% set cpu_gpu_tag='_cpu' %}
{% else %}
{# We need to have a default value for the initial pass over the recipe #}
{% set gpu_enabled_bool = false %}
+ {% set cpu_gpu_tag='_cpu' %}
{% endif %}
+
{% if upload_build == "true" %}
{% set upload_build_bool = true %}
{% elif upload_build == "false" %}
@@ -15,6 +19,14 @@
{# We need to have a default value for the initial pass over the recipe #}
{% set upload_build_bool = false %}
{% endif %}
+{% if build_tests == "true" %}
+ {% set build_tests_bool = true %}
+{% elif build_tests == "false" %}
+ {% set build_tests_bool = false %}
+{% else %}
+ {# We need to have a default value for the initial pass over the recipe #}
+ {% set build_tests_bool = false %}
+{% endif %}
## The placeholder version is strictly for making two-pass conda build process.
## It should not be used for any other purpose, and this is not a default version.
{% set placeholder_version = '0.0.0.dev' %}
@@ -37,11 +49,13 @@
## Note: default values are only given to make conda build work. They should not be necessary in principle.
{% elif 'dev' in environ.get('GIT_DESCRIBE_TAG', placeholder_version) %}
{% set version = (environ.get('GIT_DESCRIBE_TAG', placeholder_version) ~ environ.get('GIT_DESCRIBE_NUMBER', '')).lstrip('v') %}
- {% set legate_version = (version.rsplit('.',1)[0] ~ ".dev" ~ "|>=" ~ version.rsplit('.',1)[0]) %}
+ {% set legate_version_default = (version.rsplit('.',1)[0] ~ ".dev" ~ "|>=" ~ version.rsplit('.',1)[0]) %}
+ {% set legate_version = os.environ.get("LEGATE_VERSION", legate_version_default) %}
{% else %}
{% set version = environ.get('GIT_DESCRIBE_TAG', placeholder_version).lstrip('v') %}
- {% set legate_version = version %}
+ {% set legate_version = os.environ.get("LEGATE_VERSION", version) %}
{% endif %}
+{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag.strip('_') ]) %}
package:
name: {{ name|lower }}
@@ -61,18 +75,13 @@ build:
number: {{ build_number }}
missing_dso_whitelist:
- '*libcuda.so*'
-{% if gpu_enabled_bool %}
-{% set cpu_gpu_tag='_gpu' %}
-{% else %}
-{% set cpu_gpu_tag='_cpu' %}
-{% endif %}
{% set upload_tag='' if upload_build_bool else '_with_tests' %}
{% if use_local_path is not defined %}
# use git hash
- string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ PKG_BUILDNUM }}{{ cpu_gpu_tag }}{{ upload_tag }}"
+ string: "cuda{{ cuda_major }}_py{{ py_version }}{{ cpu_gpu_tag }}{{ upload_tag }}_{{ GIT_DESCRIBE_HASH }}_{{ PKG_BUILDNUM }}"
{% else %}
# do not use git hash
- string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ PKG_BUILDNUM }}{{ cpu_gpu_tag }}{{ upload_tag }}"
+ string: "cuda{{ cuda_major }}_py{{ py_version }}{{ cpu_gpu_tag }}{{ upload_tag }}_{{ PKG_BUILDNUM }}"
{% endif %}
script_env:
- SCCACHE_BUCKET
@@ -80,8 +89,17 @@ build:
- SCCACHE_IDLE_TIMEOUT
- SCCACHE_S3_KEY_PREFIX
- SCCACHE_S3_KEY_PREFIX
+ - SCCACHE_S3_USE_SSL
+ - SCCACHE_S3_NO_CREDENTIALS
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
+ - AWS_SESSION_TOKEN
+ - CMAKE_C_COMPILER_LAUNCHER
+ - CMAKE_CUDA_COMPILER_LAUNCHER
+ - CMAKE_CXX_COMPILER_LAUNCHER
+{% if build_tests_bool %}
+ - BUILD_TESTS=1
+{% endif %}
{% if not gpu_enabled_bool %}
- CPU_ONLY=1
# The CPU-only packages having more track_features than the GPU builds helps
@@ -115,8 +133,8 @@ requirements:
- python
- scikit-build
- openblas =* =*openmp*
+ - legate ={{ legate_version }}={{ legate_buildstr }}
{% if gpu_enabled_bool %}
- - legate >={{ legate_version }} =*_gpu*
- cuda-cccl
- cutensor >=2.0 =*_*
- libcublas-dev
@@ -125,16 +143,16 @@ requirements:
- libcurand-dev
- libcufile-dev
- cuda-version ={{ cuda_version }}
-{% else %}
- - legate >={{ legate_version }} =*_cpu*
{% endif %}
run:
+ - cffi
- numpy {{ numpy_version }}
- opt_einsum >=3.3
- scipy
- openblas =* =*openmp*
{% if gpu_enabled_bool %}
+ - cupy
- libnvjitlink
- libcusparse
- cutensor >=2.0 =*_*
@@ -148,16 +166,16 @@ requirements:
- __glibc >=2.17 # [linux]
about:
- home: https://github.com/nv-legate/cunumeric
+ home: https://github.com/nv-legate/cupynumeric
license: Apache-2.0
license_file: LICENSE
summary: 'Drop-in Replacment for NumPy'
description: |
- cuNumeric is a Legate library that aims to provide
+ cuPyNumeric is a Legate library that aims to provide
a distributed and accelerated drop-in replacement
for the NumPy API on top of the Legion runtime.
- doc_url: https://github.com/nv-legate/cunumeric
- dev_url: https://github.com/nv-legate/cunumeric
+ doc_url: https://github.com/nv-legate/cupynumeric
+ dev_url: https://github.com/nv-legate/cupynumeric
extra:
recipe-maintainers:
diff --git a/continuous_integration/requirements-build.txt b/continuous_integration/requirements-build.txt
new file mode 100644
index 0000000000..6ce8269a49
--- /dev/null
+++ b/continuous_integration/requirements-build.txt
@@ -0,0 +1,10 @@
+--extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+--extra-index-url=https://pypi.nvidia.com
+cmake>=3.26.4,!=3.30.0
+ninja
+nvidia-nccl-cu12
+cutensor-cu12
+scikit-build
+scikit-build-core[pyproject]>=0.10.0
+setuptools_scm
+cython
diff --git a/continuous_integration/scripts/build b/continuous_integration/scripts/build
index f68c5ba800..8287fc517a 100755
--- a/continuous_integration/scripts/build
+++ b/continuous_integration/scripts/build
@@ -2,22 +2,19 @@
set -x
build_release_product() {
- set -xeo pipefail;
+ set -xeuo pipefail;
echo "RUNNING build_release_product"
mkdir -p /tmp/env_yaml /tmp/conda-build /tmp/out
- cp -r "${ARTIFACTS_DIR}/conda-build/legate" /tmp/conda-build/
-
local conda_build_args=();
- conda_build_args+=(--override-channels);
-
# The channel sequence below needs to be preserved
+ conda_build_args+=(-c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL});
+ conda_build_args+=(-c legate/label/ucc140);
conda_build_args+=(-c conda-forge);
conda_build_args+=(--override-channels);
- conda_build_args+=(-c file:///tmp/conda-build/legate);
- conda_build_args+=(--croot /tmp/conda-build/cunumeric);
+ conda_build_args+=(--croot /tmp/conda-build/cupynumeric);
conda_build_args+=(--numpy 1.22);
conda_build_args+=(--no-test);
conda_build_args+=(--no-verify);
@@ -32,13 +29,29 @@ build_release_product() {
UPLOAD_BUILD=true
[ "${UPLOAD_ENABLED:-}" = "OFF" ] && UPLOAD_BUILD=false
- variantOpts=$(printf "{\"gpu_enabled\": [$GPU_ENABLED], \"upload_build\": [$UPLOAD_BUILD], \"python\": [$PYTHON_VERSION]}")
+ variantOpts=$(printf "{\"gpu_enabled\": [$GPU_ENABLED], \"build_tests\": [$BUILD_TESTS], \"upload_build\": [$UPLOAD_BUILD], \"python\": [$PYTHON_VERSION]}")
conda_build_args+=(--variants "$variantOpts")
- # https://github.com/nv-legate/cunumeric.internal/pull/351#issuecomment-2286922486
+ # https://github.com/nv-legate/cupynumeric.internal/pull/351#issuecomment-2286922486
export CONDA_OVERRIDE_CUDA="${CUDA_VERSION}"
- conda mambabuild "${conda_build_args[@]}" "${REPO_DIR}/conda/conda-build";
+
+ # Use the new .conda format.
+ conda config --set conda_build.pkg_format 2
+
+ # Set up the SCCACHE environment variables
+ export CI=true
+ source "${REPO_DIR}/continuous_integration/scripts/tools/legate-configure-sccache"
+ sccache --zero-stats
+
+ set +u;
+
+ # For whatever reason, the default buffering of conda/mamba is not sufficient, and
+ # leads to garbled output in CI (mixing conda output and whatever build.sh prints). So
+ # we need to force unbuffered output.
+ stdbuf -o0 -e0 conda mambabuild "${conda_build_args[@]}" "${REPO_DIR}/conda/conda-build";
+
+ sccache --show-adv-stats
copy_release_artifacts
}
@@ -52,53 +65,20 @@ copy_release_artifacts() {
ls -lahR $ARTIFACTS_DIR
}
-copy_ci_artifacts() {
- set -xeuo pipefail;
- echo Copying CI artifacts
-
- mkdir -p "$ARTIFACTS_DIR"
-
- cp -r /tmp/out "$ARTIFACTS_DIR"
- cp -r /tmp/conda-build "$ARTIFACTS_DIR"
-}
-
-build_ci_product() {
- set -xeuo pipefail;
-
- printf "\n\n\n\n********* BUILDING CUNUMERIC CPP *********\n"
- build-cunumeric-cpp;
-
- printf "\n\n\n\n********* BUILDING CUNUMERIC WHEEL *********\n"
- build-cunumeric-wheel;
-
- printf "\n\n\n\n********* BUILDING CUNUMERIC CONDA *********\n"
- build-cunumeric-conda;
-
- copy_ci_artifacts;
-}
-
-build_cunumeric_fake() {
- set -xeuo pipefail;
-
- mkdir -p /tmp/out /tmp/conda-build/legate /tmp/conda-build/cunumeric
- touch /tmp/out/legate-23.11.00-dummy.tar.bz2
- touch /tmp/conda-build/legate/dummy.txt
- touch /tmp/conda-build/cunumeric/dummy.txt
-
- copy_ci_artifacts
-}
-
build_project() {
. setup-utils;
init_build_env "$@";
+ . conda-dnld-utils;
+ setup_conda_channel;
+ generate_legate_version
+
case "$BUILD_TYPE" in
- ci) build_ci_product;;
- release) build_release_product;;
+ ci) build_release_product;;
+ nightly) build_release_product;;
*) return 1;;
esac
}
-
(build_project "$@");
diff --git a/continuous_integration/scripts/build-cunumeric-conda b/continuous_integration/scripts/build-cunumeric-conda
deleted file mode 100755
index e1b83ca699..0000000000
--- a/continuous_integration/scripts/build-cunumeric-conda
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env bash
-
-build_cunumeric_conda_package() {
- set -xeuo pipefail;
-
- local python_version="${PYTHON_VERSION:-}";
-
- if [ -z "${python_version}" ]; then
- python_version="$(python3 --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f3 --complement)";
- fi
-
- mkdir -p /tmp/conda-build /tmp/out
- cp -r "${ARTIFACTS_DIR}/conda-build/legate" /tmp/conda-build/
-
- local conda_build_args=();
- conda_build_args+=(--override-channels);
- conda_build_args+=(-c conda-forge);
- # the ucx channel is only necessary as a WAR until the real ucx 1.17 package is available on conda-forge
- conda_build_args+=(-c https://github.com/nv-legate/ucx-package/raw/main);
- conda_build_args+=(-c file:///tmp/conda-build/legate);
- conda_build_args+=(--croot /tmp/conda-build/cunumeric);
- conda_build_args+=(--numpy 1.22);
- conda_build_args+=(--python ${python_version});
- conda_build_args+=(--no-test);
- conda_build_args+=(--no-verify);
- conda_build_args+=(--no-build-id);
- conda_build_args+=("--build-id-pat=''");
- conda_build_args+=(--no-include-recipe);
- conda_build_args+=(--no-anaconda-upload);
-
- GPU_ENABLED=true
- [ "${USE_CUDA}" = "OFF" ] && GPU_ENABLED=false
-
- UPLOAD_BUILD=true
- [ "${UPLOAD_ENABLED:-}" = "OFF" ] && UPLOAD_BUILD=false
-
- conda_build_args+=(--variants "{gpu_enabled:${GPU_ENABLED},python:${python_version}}");
-
- rm -rf /tmp/conda-build/cunumeric;
- mkdir -p /tmp/conda-build/cunumeric;
-
- # Synthesize new cunumeric conda-build build.sh script
-
- cat < "${REPO_DIR}/conda/conda-build/conda_build_config.yaml"
-gpu_enabled:
- - "${GPU_ENABLED}"
-
-upload_build:
- - "${UPLOAD_BUILD}"
-
-python:
- - "${python_version}"
-
-numpy_version:
- - ">=1.22,<2"
-
-cmake_version:
- - ">=3.20.1,!=3.23.0"
-
-use_local_path:
- - "true"
-
-numpy:
- - 1.22
-
-package_version:
- - "$(git -C "${REPO_DIR}" describe --abbrev=0 --tags | sed 's/[a-zA-Z]//g' | cut -d '.' -f -2).00"
-EOF
-
- cat <<"EOF" > "${REPO_DIR}/conda/conda-build/build.sh"
-# Install cunumeric C++ libs
-tar -C "$PREFIX" --exclude="*.a" --strip-components=1 -xvf /tmp/out/cunumeric-*-Linux.tar.gz;
-
-# Install cunumeric Python wheel
-pip install --no-deps --root / --prefix "$PREFIX" /tmp/out/cunumeric-*.whl;
-EOF
-
- git -C "${REPO_DIR}" add .;
- git -C "${REPO_DIR}" commit --allow-empty --allow-empty-message -n -m "";
-
- # Build cuNumeric conda package
- set +ux
- eval "$(conda shell.bash hook)"
- conda deactivate
- conda create -n build
- conda activate build
- set -ux
- conda install boa
-
- CUDA=${CUDA_VERSION} \
- conda mambabuild ${conda_build_args[@]} "${REPO_DIR}/conda/conda-build";
-
- git -C "${REPO_DIR}" reset --hard HEAD~1;
-
- cp /tmp/conda-build/cunumeric/linux-64/cunumeric-*.tar.bz2 /tmp/out/;
-
- { set +x; } 2>/dev/null;
-}
-
-(build_cunumeric_conda_package "$@");
diff --git a/continuous_integration/scripts/build-cunumeric-cpp b/continuous_integration/scripts/build-cunumeric-cpp
deleted file mode 100755
index e608ec385d..0000000000
--- a/continuous_integration/scripts/build-cunumeric-cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env bash
-
-build_cunumeric_cpp() {
- set -xeuo pipefail;
-
- # Build + package cuNumeric C++ libs
- local cmake_args=(${CMAKE_ARGS:-});
- cmake_args+=(-DBUILD_SHARED_LIBS=ON);
- cmake_args+=(-DBUILD_MARCH=${BUILD_MARCH});
- cmake_args+=(-DCMAKE_BUILD_TYPE=Release);
- cmake_args+=(-DCMAKE_CUDA_ARCHITECTURES=RAPIDS);
- cmake_args+=(-DCMAKE_BUILD_PARALLEL_LEVEL=${JOBS:-$(nproc --ignore=1)});
- cmake_args+=(${@});
-
- cmake -S "${REPO_DIR}" -B "${REPO_DIR}/build" ${cmake_args[@]} -GNinja;
-
- sccache --show-stats;
-
- time cmake --build "${REPO_DIR}/build" --verbose --parallel ${JOBS:-$(nproc --ignore=1)};
-
- sccache --show-stats;
-
- (
- mkdir -p /tmp/out;
- cd "${REPO_DIR}/build";
- cpack -G TGZ;
- cp ./*-Linux.tar.gz /tmp/out/;
- );
-
- { set +x; } 2>/dev/null;
-}
-
-(build_cunumeric_cpp "$@");
diff --git a/continuous_integration/scripts/build-cunumeric-wheel b/continuous_integration/scripts/build-cunumeric-wheel
deleted file mode 100755
index 93ae353118..0000000000
--- a/continuous_integration/scripts/build-cunumeric-wheel
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-
-build_cunumeric_wheel() {
- set -xeuo pipefail;
-
- mkdir -p /tmp/out;
-
- local pip_args=(-vv);
- pip_args+=(--wheel-dir /tmp/out);
-
- if type conda 2>&1 >/dev/null; then
- pip_args+=(--no-deps);
- pip_args+=(--no-build-isolation);
- fi
-
- local cmake_args=(${CMAKE_ARGS:-});
- cmake_args+=("-DFIND_CUNUMERIC_CPP=ON");
-
- pwd
- echo $REPO_DIR
- ls -lahR $REPO_DIR
-
- cmake_args+=("-Dcunumeric_ROOT=$REPO_DIR/build");
-
- # Build + package cuNumeric Python wheel
- CMAKE_ARGS="${cmake_args[@]}" \
- pip wheel ${pip_args[@]} "${REPO_DIR}";
-
- { set +x; } 2>/dev/null;
-}
-
-(build_cunumeric_wheel "$@");
diff --git a/continuous_integration/scripts/build_wheel_linux.bash b/continuous_integration/scripts/build_wheel_linux.bash
new file mode 100755
index 0000000000..fdd14e668e
--- /dev/null
+++ b/continuous_integration/scripts/build_wheel_linux.bash
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+
+set -euo pipefail
+
+echo "Building a wheel..."
+
+pwd
+
+ls -lah
+
+ls -lh wheel
+
+# Configure and enable sccache for the build.
+source legate-configure-sccache
+export CMAKE_BUILD_PARALLEL_LEVEL=${PARALLEL_LEVEL}
+
+if [[ "${CI:-false}" == "true" ]]; then
+ echo "Installing extra system packages"
+ dnf install -y gcc-toolset-11-libatomic-devel
+ # Enable gcc-toolset-11 environment
+ source /opt/rh/gcc-toolset-11/enable
+ # Verify compiler version
+ gcc --version
+ g++ --version
+fi
+
+echo "PATH: ${PATH}"
+
+if [[ "${CUPYNUMERIC_DIR:-}" == "" ]]; then
+ # If we are running in an action then GITHUB_WORKSPACE is set.
+ if [[ "${GITHUB_WORKSPACE:-}" == "" ]]; then
+ script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+ CUPYNUMERIC_DIR="$(realpath "${script_dir}"/../../)"
+ else
+ # Simple path within GitHub actions workflows.
+ CUPYNUMERIC_DIR="${GITHUB_WORKSPACE}"
+ fi
+ export CUPYNUMERIC_DIR
+fi
+package_name="cupynumeric"
+package_dir="${CUPYNUMERIC_DIR}/scripts/build/python/cupynumeric"
+
+# This is all very hackish and needs to be fixed up.
+echo "Installing build requirements"
+python -m pip install -v --prefer-binary -r continuous_integration/requirements-build.txt
+
+# Install the legate wheel that was downloaded.
+pip install wheel/*.whl
+
+sitepkgs=$(python -c 'import site; print(site.getsitepackages()[0], end="")')
+# Add in the symbolic links for cuTensor so that CMake can find it (hack)
+ln -fs "${sitepkgs}"/cutensor/lib/libcutensor.so.2 "${sitepkgs}"/cutensor/lib/libcutensor.so
+ln -fs "${sitepkgs}"/cutensor/lib/libcutensorMg.so.2 "${sitepkgs}"/cutensor/lib/libcutensorMg.so
+
+# TODO(cryos): https://github.com/nv-legate/cupynumeric.internal/issues/666
+# This is a very hackish way to generate the version for now.
+scm_version=$(python -m setuptools_scm -c "${CUPYNUMERIC_DIR}"/scripts/build/python/cupynumeric/pyproject.toml)
+export SETUPTOOLS_SCM_PRETEND_VERSION="${scm_version}"
+echo "Building wheels with version '${scm_version}'"
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
+# The cupynumeric build system defaults to -march=native, which is not going to work
+# for packages we want to reuse! Set some reasonable defaults for the wheels.
+ARCH=$(uname -m)
+echo "Building on architecture: ${ARCH}"
+if [[ "$ARCH" == "aarch64" ]]; then
+ BUILD_MARCH=armv8-a
+else
+ BUILD_MARCH=haswell
+fi
+
+echo "Building ${package_name}"
+# TODO(cryos): https://github.com/nv-legate/legate.internal/issues/1894
+# Improve the use of CMAKE_PREFIX_PATH to find legate and cutensor once
+# scikit-build supports it.
+CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${sitepkgs}/legate;${sitepkgs}/cutensor"
+export CMAKE_ARGS
+SKBUILD_CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES:STRING=all-major;-DBUILD_SHARED_LIBS:BOOL=ON;-DBUILD_MARCH=${BUILD_MARCH}"
+export SKBUILD_CMAKE_ARGS
+echo "SKBUILD_CMAKE_ARGS='${SKBUILD_CMAKE_ARGS}'"
+
+# TODO: Remove this hackish removal of scikit-build files needed as conda
+# uses scikit-build and wheels are using scikit-build-core. Migrate conda to
+# be consistent with legate and wheels. If not deleted we get inconsistent
+# metadata failure during the pip wheel build.
+mv "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py.bak
+echo "Removed scikit-build _version.py file"
+ls -lah
+
+echo "Building wheel..."
+cd "${package_dir}"
+
+sccache --zero-stats
+
+python -m pip wheel \
+ -w "${CUPYNUMERIC_DIR}"/dist \
+ -v \
+ --no-deps \
+ --disable-pip-version-check \
+ .
+
+sccache --show-adv-stats
+
+echo "Show dist contents"
+pwd
+ls -lh "${CUPYNUMERIC_DIR}"/dist
+
+echo "Repairing the wheel"
+mkdir -p "${CUPYNUMERIC_DIR}"/final-dist
+python -m auditwheel repair \
+ --exclude libnvJitLink.so* \
+ --exclude libcuda.so* \
+ --exclude liblegate.so* \
+ --exclude libcublas.so* \
+ --exclude libcublasLt.so* \
+ --exclude libnccl.so* \
+ --exclude libcusparse.so* \
+ --exclude libcutensor.so* \
+ --exclude libcufft.so* \
+ --exclude libcusolver.so* \
+ --exclude liblegion-legate.so* \
+ --exclude librealm-legate.so* \
+ -w "${CUPYNUMERIC_DIR}"/final-dist \
+ "${CUPYNUMERIC_DIR}"/dist/*.whl
+
+echo "Wheel has been repaired. Contents:"
+ls -lh "${CUPYNUMERIC_DIR}"/final-dist
+
+echo "Restoring scikit-build _version.py file"
+mv "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py.bak "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py
diff --git a/continuous_integration/scripts/conda-dnld-utils b/continuous_integration/scripts/conda-dnld-utils
new file mode 100755
index 0000000000..ddf1c6bbe5
--- /dev/null
+++ b/continuous_integration/scripts/conda-dnld-utils
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+set -x
+
+generate_legate_version() {
+ legate_json_version="$(jq -r '.packages.legate.version' ${REPO_DIR}/cmake/versions.json)";
+ legate_SHA="$(jq -r '.packages.legate.git_tag' ${REPO_DIR}/cmake/versions.json)";
+ legate_hash="g${legate_SHA:0:8}"
+ export LEGATE_VERSION="${legate_json_version}*"
+ export LEGATE_BUILDSTR="*${legate_hash}*"
+ echo "LEGATE_VERSION=${LEGATE_VERSION} : LEGATE_BUILDSTR=${LEGATE_BUILDSTR}"
+}
+
+verify_legate_version() {
+ legate-mamba-retry search legate=${LEGATE_VERSION} --channel https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL}
+ if [ $? -ne 0 ]; then
+ echo "Error: conda search failed for legate." >&2; exit 1
+ fi
+}
+
+setup_conda_channel() {
+ if ! command -v jq &> /dev/null; then
+ echo "Installing jq"
+ apt-get update -q
+ apt-get -q install -y jq
+ fi
+ # strict channel ordering is required for prioritizing packages from artifacts
+ conda config --set channel_priority strict
+ legate_conda_label="$(jq -r '.packages.legate.anaconda_label' ${REPO_DIR}/cmake/versions.json)";
+ export CONDA_CHANNEL="legate"
+ export CONDA_LABEL="${legate_conda_label}"
+ echo "CONDA_CHANNEL=${CONDA_CHANNEL} : CONDA_LABEL=${CONDA_LABEL}"
+}
diff --git a/continuous_integration/scripts/make-conda-env b/continuous_integration/scripts/make-conda-env
index 597d9ee613..eacc3c5891 100755
--- a/continuous_integration/scripts/make-conda-env
+++ b/continuous_integration/scripts/make-conda-env
@@ -4,39 +4,36 @@ set -x
. conda-utils
-make_ci_env() {
- set -xeuo pipefail
- yaml_file=$(find "${ARTIFACTS_DIR}" -name "environment*.yaml" | head -n 1)
+make_release_env() {
+ legate-mamba-retry create -q -y -n "${CONDA_ENV}" -c conda-forge boa
+}
- sed -i '$ d' ${yaml_file}
- echo " - legate" >> "${yaml_file}"
- sed -i "/channels:/!b;:a;n;/^- /ba;i\- ${ARTIFACTS_DIR}/conda-build/legate" ${yaml_file}
- [ "${USE_CUDA}" = "ON" ] &&
- echo " - libcublas-dev" >> "${yaml_file}" &&
- echo " - libcufft-dev" >> "${yaml_file}" &&
- echo " - libcurand-dev" >> "${yaml_file}" &&
- echo " - libcusolver-dev" >> "${yaml_file}";
+make_docs_env() {
+ set -xeuo pipefail
- echo "YAML file..."
- cat "${yaml_file}"
+ export DEBIAN_FRONTEND=non-interactive
+ export CONDA_ENV=legate
- mkdir -p /tmp/out;
+ # Run package updates and install packages
+ apt-get update
+ apt-get install -y numactl make
- cp "${yaml_file}" /tmp/out
+ legate-mamba-retry create -yn "${CONDA_ENV}" pandoc doxygen
- mamba env create -n legate -f "$yaml_file"
-}
+ . conda-utils;
+ activate_conda_env;
-make_release_env() {
- mamba create -q -y -n "${CONDA_ENV}" -c conda-forge boa
+ # mamba install -y pandoc doxygen
+ pip install ipython jinja2 "markdown<3.4.0" myst-parser nbsphinx sphinx-copybutton "sphinx>=8" nvidia-sphinx-theme cffi
}
make_conda_env() {
set -xeuo pipefail
case "$1" in
- ci) make_ci_env;;
- release) make_release_env;;
+ ci) make_release_env;;
+ nightly) make_release_env;;
+ docs) make_docs_env;;
*) return 1;;
esac
diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test
index 60bf105959..0bdb65d914 100755
--- a/continuous_integration/scripts/test
+++ b/continuous_integration/scripts/test
@@ -5,35 +5,48 @@ set -x
setup_env() {
set -xeuo pipefail
+ . conda-dnld-utils
+ setup_conda_channel;
export DEBIAN_FRONTEND=non-interactive
# Run package updates and install packages
apt-get update
apt-get install -y numactl make
- mamba create -yn legate -c "${ARTIFACTS_DIR}/conda-build/legate" -c "${ARTIFACTS_DIR}/conda-build/cunumeric" -c conda-forge legate cunumeric
+ legate-mamba-retry search --override-channels -c "${ARTIFACTS_DIR}/conda-build/cupynumeric" --info cupynumeric
+
+ # This requires strict channel priority to work (prioritize local channel)
+ legate-mamba-retry create -y -n legate -c "${ARTIFACTS_DIR}/conda-build/cupynumeric" -c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL} -c legate/label/ucc140 -c conda-forge legate cupynumeric
}
setup_test_env() {
- mamba install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-mock "pytest" types-docutils pynvml psutil
+ legate-mamba-retry install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-mock "pytest" types-docutils pynvml psutil
pip install tifffile
}
setup_docs_env() {
- mamba install -y pandoc doxygen
- pip install ipython jinja2 "markdown<3.4.0" "pydata-sphinx-theme>=0.13" myst-parser nbsphinx sphinx-copybutton "sphinx>=4.4.0"
+ legate-mamba-retry install -y pandoc doxygen
+ pip install ipython jinja2 "markdown<3.4.0" myst-parser nbsphinx sphinx-copybutton "sphinx>=8" nvidia-sphinx-theme cffi
}
setup_mypy_env() {
- mamba install -y "mypy>=0.961" jinja2 nbsphinx sphinx-copybutton "sphinx>=4.4.0" types-docutils
+ legate-mamba-retry install -y "mypy>=0.961" jinja2 nbsphinx sphinx-copybutton "sphinx>=4.4.0" types-docutils
}
setup_unit_env() {
- mamba install -y pytest pytest-mock mock
+ legate-mamba-retry install -y pytest pytest-mock mock cffi
+}
+
+run_legate_issue() {
+ if command -v "legate-issue" &> /dev/null; then
+ legate-issue
+ else
+ echo "WARNING: legate-issue not found."
+ fi
}
-test_cunumeric() {
+test_cupynumeric() {
set -xeo pipefail
. conda-utils;
@@ -51,27 +64,42 @@ test_cunumeric() {
echo "Executing tests..."
shift;
setup_test_env;
+ run_legate_issue;
./test.py -vv --timeout 300 "$@"
;;
"mypy")
echo "Installing and executing mypy..."
shift;
setup_mypy_env;
- mypy cunumeric
+ run_legate_issue;
+ mypy cupynumeric
;;
"docs")
echo "Building docs..."
shift;
setup_docs_env;
- cd docs/cunumeric
+ run_legate_issue;
+ cd docs/cupynumeric
make clean html
+ # ls -lah .
+ echo Copying artifacts
+ cd build/html
+ cp -r . "${OUTPUT_ARTIFACTS_DIR}"
;;
"unit")
echo "Running Unit tests..."
shift;
setup_unit_env;
+ run_legate_issue;
LEGATE_AUTO_CONFIG=0 pytest tests/unit
;;
+ "cpp")
+ echo "Running CPP tests..."
+ shift;
+ run_legate_issue;
+ export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/legate/deps:${LD_LIBRARY_PATH:-}
+ REALM_BACKTRACE=1 LEGATE_TEST=1 LEGATE_LOG_MAPPING=1 ${CONDA_PREFIX}/bin/cpp_tests
+ ;;
*)
echo "Invalid command: $1"
return 1
@@ -79,4 +107,4 @@ test_cunumeric() {
esac
}
-(test_cunumeric "$@");
+(test_cupynumeric "$@");
diff --git a/continuous_integration/scripts/test_wheel_linux.bash b/continuous_integration/scripts/test_wheel_linux.bash
new file mode 100755
index 0000000000..4414d96003
--- /dev/null
+++ b/continuous_integration/scripts/test_wheel_linux.bash
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+
+set -euo pipefail
+
+echo "Are my wheels there???"
+
+ls -lh
+
+ls -lh wheel
+ls -lh final-dist
+
+# Install legate first and then cupynumeric.
+pip install wheel/*.whl final-dist/*.whl
+
+echo "Let's explore the wheels and see if they are installed correctly."
+sitepkgs=$(python -c 'import site; print(site.getsitepackages()[0], end="")')
+echo "=== cupynumeric ==="
+ls -lh "${sitepkgs}/cupynumeric"
+echo "=== legate ==="
+ls -lh "${sitepkgs}/legate"
+
+echo "Lamest of proof of life tests for legate"
+export LEGATE_SHOW_CONFIG=1
+export LEGATE_CONFIG="--fbmem 1024"
+export LEGION_DEFAULT_ARGS="-ll:show_rsrv"
+
+# Attempt to run the tests...
+mv cupynumeric cupynumeric-moved
+pip install pytest pynvml psutil scipy
+
+echo "Attempt to run an example"
+legate examples/gemm.py
+
+echo "Example done, attempt to import cupynumeric"
+python -c 'import cupynumeric as np'
+echo "Maybe that worked"
+
+echo "Running the CPU tests"
+python test.py
+echo "Done"
+
+echo "Running the GPU tests"
+python test.py --use cuda
+echo "Done"
diff --git a/continuous_integration/scripts/tools/legate-configure-sccache b/continuous_integration/scripts/tools/legate-configure-sccache
new file mode 100755
index 0000000000..bd7a5e0be5
--- /dev/null
+++ b/continuous_integration/scripts/tools/legate-configure-sccache
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# A utility script that configures sccache environment variables
+
+export CMAKE_CUDA_COMPILER_LAUNCHER=sccache
+export CMAKE_CXX_COMPILER_LAUNCHER=sccache
+export CMAKE_C_COMPILER_LAUNCHER=sccache
+export RUSTC_WRAPPER=sccache
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all --ignore=2)}
+export SCCACHE_BUCKET=rapids-sccache-east
+export SCCACHE_IDLE_TIMEOUT=32768
+export SCCACHE_REGION=us-east-2
+export SCCACHE_S3_KEY_PREFIX=legate-cunumeric-dev
+export SCCACHE_S3_NO_CREDENTIALS=false
+export SCCACHE_S3_USE_SSL=true
+
+if [[ "${CI:-false}" == "false" ]]; then
+ # Configure sccache for read-only mode since no credentials
+ # are available in local builds.
+ export SCCACHE_S3_NO_CREDENTIALS=true
+fi
diff --git a/continuous_integration/scripts/tools/legate-gh-download-artifact b/continuous_integration/scripts/tools/legate-gh-download-artifact
new file mode 100755
index 0000000000..5accd59edd
--- /dev/null
+++ b/continuous_integration/scripts/tools/legate-gh-download-artifact
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# A utility script adapted from https://github.com/rapidsai/gha-tools/blob/main/tools/rapids-download-from-github
+# Given a git SHA, artifact name and output path grab the artifact from the run.
+
+set -euo pipefail
+
+# Default values for the environment variables.
+LEGATE_REPO_NAME=${LEGATE_REPO_NAME:-"nv-legate/legate.internal"}
+
+# Check if the script was called with exactly 1 argument
+if [[ ${#} -ne 3 ]]; then
+ echo "Error: This script requires exactly 3 arguments (the git SHA, the artifact name, and the output path)."
+ echo "You provided ${#} arguments."
+ echo "Usage: ${0} git-sha artifact-name output-path"
+ exit 1
+fi
+
+# Populate our variables from the arguments.
+run_id=$(legate-gh-run-id "${1}")
+artifact_name="${2}"
+output_path="${3}"
+
+echo "Downloading and decompressing artifact ${artifact_name} from run ${run_id} to ${output_path}"
+
+gh run download "${run_id}" \
+ --repo "${LEGATE_REPO_NAME}" \
+ --name "${artifact_name}" \
+ --dir "${output_path}"
+
+echo -n "${output_path}"
diff --git a/continuous_integration/scripts/tools/legate-gh-run-id b/continuous_integration/scripts/tools/legate-gh-run-id
new file mode 100755
index 0000000000..339a674296
--- /dev/null
+++ b/continuous_integration/scripts/tools/legate-gh-run-id
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# A utility script adapted from https://github.com/rapidsai/gha-tools/blob/main/tools/rapids-github-run-id
+# This gets the GitHub run ID for the specified workflow and commit SHA.
+
+set -euo pipefail
+
+# Default values for the environment variables.
+LEGATE_WORKFLOW_NAME=${LEGATE_WORKFLOW_NAME:-"pr"}
+LEGATE_REF_NAME=${LEGATE_REF_NAME:-"main"}
+LEGATE_REPO_NAME=${LEGATE_REPO_NAME:-"nv-legate/legate.internal"}
+
+# Check if the script was called with exactly 1 argument
+if [[ ${#} -ne 1 ]]; then
+ echo "Error: This script requires exactly 1 argument (the git SHA). You provided ${#}"
+ echo "Usage: ${0} git-sha"
+ exit 1
+fi
+
+gh_run_id=$(gh run list \
+ --repo "${LEGATE_REPO_NAME}" \
+ --branch "${LEGATE_REF_NAME}" \
+ --workflow "${LEGATE_WORKFLOW_NAME}" \
+ --commit "${1}" \
+ --json databaseId --jq '.[0] | .databaseId')
+
+echo -n "${gh_run_id}"
diff --git a/cunumeric/config.py b/cunumeric/config.py
deleted file mode 100644
index 310ca9f416..0000000000
--- a/cunumeric/config.py
+++ /dev/null
@@ -1,824 +0,0 @@
-# Copyright 2024 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from __future__ import annotations
-
-import os
-import platform
-from abc import abstractmethod
-from ctypes import CDLL, RTLD_GLOBAL
-from enum import IntEnum, unique
-from typing import TYPE_CHECKING, Any, cast
-
-import cffi # type: ignore
-import numpy as np
-
-if TYPE_CHECKING:
- import numpy.typing as npt
-
-
-class _ReductionOpIds:
- argmax_redop_id: int
- argmin_redop_id: int
-
-
-class _CunumericSharedLib:
- CUNUMERIC_ADVANCED_INDEXING: int
- CUNUMERIC_ARANGE: int
- CUNUMERIC_ARGWHERE: int
- CUNUMERIC_BATCHED_CHOLESKY: int
- CUNUMERIC_BINARY_OP: int
- CUNUMERIC_BINARY_RED: int
- CUNUMERIC_BINCOUNT: int
- CUNUMERIC_BINOP_ADD: int
- CUNUMERIC_BINOP_ARCTAN2: int
- CUNUMERIC_BINOP_BITWISE_AND: int
- CUNUMERIC_BINOP_BITWISE_OR: int
- CUNUMERIC_BINOP_BITWISE_XOR: int
- CUNUMERIC_BINOP_COPYSIGN: int
- CUNUMERIC_BINOP_DIVIDE: int
- CUNUMERIC_BINOP_EQUAL: int
- CUNUMERIC_BINOP_FLOAT_POWER: int
- CUNUMERIC_BINOP_FLOOR_DIVIDE: int
- CUNUMERIC_BINOP_FMOD: int
- CUNUMERIC_BINOP_GCD: int
- CUNUMERIC_BINOP_GREATER: int
- CUNUMERIC_BINOP_GREATER_EQUAL: int
- CUNUMERIC_BINOP_HYPOT: int
- CUNUMERIC_BINOP_ISCLOSE: int
- CUNUMERIC_BINOP_LCM: int
- CUNUMERIC_BINOP_LDEXP: int
- CUNUMERIC_BINOP_LEFT_SHIFT: int
- CUNUMERIC_BINOP_LESS: int
- CUNUMERIC_BINOP_LESS_EQUAL: int
- CUNUMERIC_BINOP_LOGADDEXP2: int
- CUNUMERIC_BINOP_LOGADDEXP: int
- CUNUMERIC_BINOP_LOGICAL_AND: int
- CUNUMERIC_BINOP_LOGICAL_OR: int
- CUNUMERIC_BINOP_LOGICAL_XOR: int
- CUNUMERIC_BINOP_MAXIMUM: int
- CUNUMERIC_BINOP_MINIMUM: int
- CUNUMERIC_BINOP_MOD: int
- CUNUMERIC_BINOP_MULTIPLY: int
- CUNUMERIC_BINOP_NEXTAFTER: int
- CUNUMERIC_BINOP_NOT_EQUAL: int
- CUNUMERIC_BINOP_POWER: int
- CUNUMERIC_BINOP_RIGHT_SHIFT: int
- CUNUMERIC_BINOP_SUBTRACT: int
- CUNUMERIC_BITGENERATOR: int
- CUNUMERIC_BITGENOP_DISTRIBUTION: int
- CUNUMERIC_BITGENTYPE_DEFAULT: int
- CUNUMERIC_BITGENTYPE_XORWOW: int
- CUNUMERIC_BITGENTYPE_MRG32K3A: int
- CUNUMERIC_BITGENTYPE_MTGP32: int
- CUNUMERIC_BITGENTYPE_MT19937: int
- CUNUMERIC_BITGENTYPE_PHILOX4_32_10: int
- CUNUMERIC_BITGENDIST_INTEGERS_16: int
- CUNUMERIC_BITGENDIST_INTEGERS_32: int
- CUNUMERIC_BITGENDIST_INTEGERS_64: int
- CUNUMERIC_BITGENDIST_UNIFORM_32: int
- CUNUMERIC_BITGENDIST_UNIFORM_64: int
- CUNUMERIC_BITGENDIST_LOGNORMAL_32: int
- CUNUMERIC_BITGENDIST_LOGNORMAL_64: int
- CUNUMERIC_BITGENDIST_NORMAL_32: int
- CUNUMERIC_BITGENDIST_NORMAL_64: int
- CUNUMERIC_BITGENDIST_POISSON: int
- CUNUMERIC_BITGENDIST_EXPONENTIAL_32: int
- CUNUMERIC_BITGENDIST_EXPONENTIAL_64: int
- CUNUMERIC_BITGENDIST_GUMBEL_32: int
- CUNUMERIC_BITGENDIST_GUMBEL_64: int
- CUNUMERIC_BITGENDIST_LAPLACE_32: int
- CUNUMERIC_BITGENDIST_LAPLACE_64: int
- CUNUMERIC_BITGENDIST_LOGISTIC_32: int
- CUNUMERIC_BITGENDIST_LOGISTIC_64: int
- CUNUMERIC_BITGENDIST_PARETO_32: int
- CUNUMERIC_BITGENDIST_PARETO_64: int
- CUNUMERIC_BITGENDIST_POWER_32: int
- CUNUMERIC_BITGENDIST_POWER_64: int
- CUNUMERIC_BITGENDIST_RAYLEIGH_32: int
- CUNUMERIC_BITGENDIST_RAYLEIGH_64: int
- CUNUMERIC_BITGENDIST_CAUCHY_32: int
- CUNUMERIC_BITGENDIST_CAUCHY_64: int
- CUNUMERIC_BITGENDIST_TRIANGULAR_32: int
- CUNUMERIC_BITGENDIST_TRIANGULAR_64: int
- CUNUMERIC_BITGENDIST_WEIBULL_32: int
- CUNUMERIC_BITGENDIST_WEIBULL_64: int
- CUNUMERIC_BITGENDIST_BYTES: int
- CUNUMERIC_BITGENDIST_BETA_32: int
- CUNUMERIC_BITGENDIST_BETA_64: int
- CUNUMERIC_BITGENDIST_F_32: int
- CUNUMERIC_BITGENDIST_F_64: int
- CUNUMERIC_BITGENDIST_LOGSERIES: int
- CUNUMERIC_BITGENDIST_NONCENTRAL_F_32: int
- CUNUMERIC_BITGENDIST_NONCENTRAL_F_64: int
- CUNUMERIC_BITGENDIST_CHISQUARE_32: int
- CUNUMERIC_BITGENDIST_CHISQUARE_64: int
- CUNUMERIC_BITGENDIST_GAMMA_32: int
- CUNUMERIC_BITGENDIST_GAMMA_64: int
- CUNUMERIC_BITGENDIST_STANDARD_T_32: int
- CUNUMERIC_BITGENDIST_STANDARD_T_64: int
- CUNUMERIC_BITGENDIST_HYPERGEOMETRIC: int
- CUNUMERIC_BITGENDIST_VONMISES_32: int
- CUNUMERIC_BITGENDIST_VONMISES_64: int
- CUNUMERIC_BITGENDIST_ZIPF: int
- CUNUMERIC_BITGENDIST_GEOMETRIC: int
- CUNUMERIC_BITGENDIST_WALD_32: int
- CUNUMERIC_BITGENDIST_WALD_64: int
- CUNUMERIC_BITGENDIST_BINOMIAL: int
- CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL: int
- CUNUMERIC_BITGENOP_CREATE: int
- CUNUMERIC_BITGENOP_DESTROY: int
- CUNUMERIC_BITGENOP_RAND_RAW: int
- CUNUMERIC_BITORDER_BIG: int
- CUNUMERIC_BITORDER_LITTLE: int
- CUNUMERIC_CHOOSE: int
- CUNUMERIC_CONTRACT: int
- CUNUMERIC_CONVERT: int
- CUNUMERIC_CONVERT_NAN_NOOP: int
- CUNUMERIC_CONVERT_NAN_PROD: int
- CUNUMERIC_CONVERT_NAN_SUM: int
- CUNUMERIC_CONVOLVE: int
- CUNUMERIC_DIAG: int
- CUNUMERIC_DOT: int
- CUNUMERIC_EYE: int
- CUNUMERIC_FFT: int
- CUNUMERIC_FFT_C2C: int
- CUNUMERIC_FFT_C2R: int
- CUNUMERIC_FFT_D2Z: int
- CUNUMERIC_FFT_FORWARD: int
- CUNUMERIC_FFT_INVERSE: int
- CUNUMERIC_FFT_R2C: int
- CUNUMERIC_FFT_Z2D: int
- CUNUMERIC_FFT_Z2Z: int
- CUNUMERIC_FILL: int
- CUNUMERIC_FLIP: int
- CUNUMERIC_GEMM: int
- CUNUMERIC_HISTOGRAM: int
- CUNUMERIC_LOAD_CUDALIBS: int
- CUNUMERIC_MATMUL: int
- CUNUMERIC_MATVECMUL: int
- CUNUMERIC_MAX_MAPPERS: int
- CUNUMERIC_MAX_REDOPS: int
- CUNUMERIC_MAX_TASKS: int
- CUNUMERIC_MP_POTRF: int
- CUNUMERIC_MP_SOLVE: int
- CUNUMERIC_NONZERO: int
- CUNUMERIC_PACKBITS: int
- CUNUMERIC_POTRF: int
- CUNUMERIC_PUTMASK: int
- CUNUMERIC_QR: int
- CUNUMERIC_RAND: int
- CUNUMERIC_READ: int
- CUNUMERIC_RED_ALL: int
- CUNUMERIC_RED_ANY: int
- CUNUMERIC_RED_ARGMAX: int
- CUNUMERIC_RED_ARGMIN: int
- CUNUMERIC_RED_CONTAINS: int
- CUNUMERIC_RED_COUNT_NONZERO: int
- CUNUMERIC_RED_MAX: int
- CUNUMERIC_RED_MIN: int
- CUNUMERIC_RED_NANARGMAX: int
- CUNUMERIC_RED_NANARGMIN: int
- CUNUMERIC_RED_NANMAX: int
- CUNUMERIC_RED_NANMIN: int
- CUNUMERIC_RED_NANPROD: int
- CUNUMERIC_RED_NANSUM: int
- CUNUMERIC_RED_PROD: int
- CUNUMERIC_RED_SUM: int
- CUNUMERIC_RED_SUM_SQUARES: int
- CUNUMERIC_RED_VARIANCE: int
- CUNUMERIC_REPEAT: int
- CUNUMERIC_SCALAR_UNARY_RED: int
- CUNUMERIC_SCAN_GLOBAL: int
- CUNUMERIC_SCAN_LOCAL: int
- CUNUMERIC_SCAN_PROD: int
- CUNUMERIC_SCAN_SUM: int
- CUNUMERIC_SEARCHSORTED: int
- CUNUMERIC_SELECT: int
- CUNUMERIC_SOLVE: int
- CUNUMERIC_SORT: int
- CUNUMERIC_SVD: int
- CUNUMERIC_SYRK: int
- CUNUMERIC_TILE: int
- CUNUMERIC_TRANSPOSE_COPY_2D: int
- CUNUMERIC_TRILU: int
- CUNUMERIC_TRSM: int
- CUNUMERIC_UNARY_OP: int
- CUNUMERIC_UNARY_RED: int
- CUNUMERIC_UNIQUE: int
- CUNUMERIC_UNIQUE_REDUCE: int
- CUNUMERIC_UNLOAD_CUDALIBS: int
- CUNUMERIC_UNPACKBITS: int
- CUNUMERIC_UOP_ABSOLUTE: int
- CUNUMERIC_UOP_ANGLE: int
- CUNUMERIC_UOP_ARCCOS: int
- CUNUMERIC_UOP_ARCCOSH: int
- CUNUMERIC_UOP_ARCSIN: int
- CUNUMERIC_UOP_ARCSINH: int
- CUNUMERIC_UOP_ARCTAN: int
- CUNUMERIC_UOP_ARCTANH: int
- CUNUMERIC_UOP_CBRT: int
- CUNUMERIC_UOP_CEIL: int
- CUNUMERIC_UOP_CLIP: int
- CUNUMERIC_UOP_CONJ: int
- CUNUMERIC_UOP_COPY: int
- CUNUMERIC_UOP_COS: int
- CUNUMERIC_UOP_COSH: int
- CUNUMERIC_UOP_DEG2RAD: int
- CUNUMERIC_UOP_EXP2: int
- CUNUMERIC_UOP_EXP: int
- CUNUMERIC_UOP_EXPM1: int
- CUNUMERIC_UOP_FLOOR: int
- CUNUMERIC_UOP_FREXP: int
- CUNUMERIC_UOP_GETARG: int
- CUNUMERIC_UOP_IMAG: int
- CUNUMERIC_UOP_INVERT: int
- CUNUMERIC_UOP_ISFINITE: int
- CUNUMERIC_UOP_ISINF: int
- CUNUMERIC_UOP_ISNAN: int
- CUNUMERIC_UOP_LOG10: int
- CUNUMERIC_UOP_LOG1P: int
- CUNUMERIC_UOP_LOG2: int
- CUNUMERIC_UOP_LOG: int
- CUNUMERIC_UOP_LOGICAL_NOT: int
- CUNUMERIC_UOP_MODF: int
- CUNUMERIC_UOP_NEGATIVE: int
- CUNUMERIC_UOP_POSITIVE: int
- CUNUMERIC_UOP_RAD2DEG: int
- CUNUMERIC_UOP_REAL: int
- CUNUMERIC_UOP_RECIPROCAL: int
- CUNUMERIC_UOP_RINT: int
- CUNUMERIC_UOP_ROUND: int
- CUNUMERIC_UOP_SIGN: int
- CUNUMERIC_UOP_SIGNBIT: int
- CUNUMERIC_UOP_SIN: int
- CUNUMERIC_UOP_SINH: int
- CUNUMERIC_UOP_SQRT: int
- CUNUMERIC_UOP_SQUARE: int
- CUNUMERIC_UOP_TAN: int
- CUNUMERIC_UOP_TANH: int
- CUNUMERIC_UOP_TRUNC: int
- CUNUMERIC_WHERE: int
- CUNUMERIC_WINDOW: int
- CUNUMERIC_WINDOW_BARLETT: int
- CUNUMERIC_WINDOW_BLACKMAN: int
- CUNUMERIC_WINDOW_HAMMING: int
- CUNUMERIC_WINDOW_HANNING: int
- CUNUMERIC_WINDOW_KAISER: int
- CUNUMERIC_WRAP: int
- CUNUMERIC_WRITE: int
- CUNUMERIC_ZIP: int
-
- @abstractmethod
- def cunumeric_has_cusolvermp(self) -> bool:
- ...
-
- @abstractmethod
- def cunumeric_max_eager_volume(self) -> int:
- ...
-
- @abstractmethod
- def cunumeric_register_reduction_ops(self, code: int) -> _ReductionOpIds:
- ...
-
-
-def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any:
- # Use an already-opened library handle, which cffi will convert to a
- # regular FFI object (using the definitions previously added using
- # ffi.cdef), but will not automatically dlclose() on collection.
- lib = CDLL(lib_path, mode=RTLD_GLOBAL)
- return ffi.dlopen(ffi.cast("void *", lib._handle))
-
-
-# Load the cuNumeric library first so we have a shard object that
-# we can use to initialize all these configuration enumerations
-class CuNumericLib:
- def __init__(self, name: str) -> None:
- self.name = name
-
- shared_lib_path = self.get_shared_library()
- assert shared_lib_path is not None
- header = self.get_c_header()
- ffi = cffi.FFI()
- if header is not None:
- ffi.cdef(header)
- # Don't use ffi.dlopen(), because that will call dlclose()
- # automatically when the object gets collected, thus removing
- # symbols that may be needed when destroying C++ objects later
- # (e.g. vtable entries, which will be queried for virtual
- # destructors), causing errors at shutdown.
- shared_lib = dlopen_no_autoclose(ffi, shared_lib_path)
- self.shared_object = cast(_CunumericSharedLib, shared_lib)
-
- def register(self) -> None:
- from legate.core import get_legate_runtime
-
- # We need to make sure that the runtime is started
- get_legate_runtime()
-
- callback = getattr(
- self.shared_object, "cunumeric_perform_registration"
- )
- callback()
-
- def get_shared_library(self) -> str:
- from .install_info import libpath
-
- return os.path.join(
- libpath, "libcunumeric" + self.get_library_extension()
- )
-
- def get_c_header(self) -> str:
- from .install_info import header
-
- return header
-
- @staticmethod
- def get_library_extension() -> str:
- os_name = platform.system()
- if os_name == "Linux":
- return ".so"
- elif os_name == "Darwin":
- return ".dylib"
- raise RuntimeError(f"unknown platform {os_name!r}")
-
-
-CUNUMERIC_LIB_NAME = "cunumeric"
-cunumeric_lib = CuNumericLib(CUNUMERIC_LIB_NAME)
-cunumeric_lib.register()
-_cunumeric = cunumeric_lib.shared_object
-
-
-# Match these to CuNumericOpCode in cunumeric_c.h
-@unique
-class CuNumericOpCode(IntEnum):
- ADVANCED_INDEXING = _cunumeric.CUNUMERIC_ADVANCED_INDEXING
- ARANGE = _cunumeric.CUNUMERIC_ARANGE
- ARGWHERE = _cunumeric.CUNUMERIC_ARGWHERE
- BATCHED_CHOLESKY = _cunumeric.CUNUMERIC_BATCHED_CHOLESKY
- BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP
- BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED
- BINCOUNT = _cunumeric.CUNUMERIC_BINCOUNT
- BITGENERATOR = _cunumeric.CUNUMERIC_BITGENERATOR
- CHOOSE = _cunumeric.CUNUMERIC_CHOOSE
- CONTRACT = _cunumeric.CUNUMERIC_CONTRACT
- CONVERT = _cunumeric.CUNUMERIC_CONVERT
- CONVOLVE = _cunumeric.CUNUMERIC_CONVOLVE
- DIAG = _cunumeric.CUNUMERIC_DIAG
- DOT = _cunumeric.CUNUMERIC_DOT
- EYE = _cunumeric.CUNUMERIC_EYE
- FFT = _cunumeric.CUNUMERIC_FFT
- FILL = _cunumeric.CUNUMERIC_FILL
- FLIP = _cunumeric.CUNUMERIC_FLIP
- GEMM = _cunumeric.CUNUMERIC_GEMM
- HISTOGRAM = _cunumeric.CUNUMERIC_HISTOGRAM
- LOAD_CUDALIBS = _cunumeric.CUNUMERIC_LOAD_CUDALIBS
- MATMUL = _cunumeric.CUNUMERIC_MATMUL
- MATVECMUL = _cunumeric.CUNUMERIC_MATVECMUL
- MP_POTRF = _cunumeric.CUNUMERIC_MP_POTRF
- MP_SOLVE = _cunumeric.CUNUMERIC_MP_SOLVE
- NONZERO = _cunumeric.CUNUMERIC_NONZERO
- PACKBITS = _cunumeric.CUNUMERIC_PACKBITS
- POTRF = _cunumeric.CUNUMERIC_POTRF
- PUTMASK = _cunumeric.CUNUMERIC_PUTMASK
- QR = _cunumeric.CUNUMERIC_QR
- RAND = _cunumeric.CUNUMERIC_RAND
- READ = _cunumeric.CUNUMERIC_READ
- REPEAT = _cunumeric.CUNUMERIC_REPEAT
- SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED
- SCAN_GLOBAL = _cunumeric.CUNUMERIC_SCAN_GLOBAL
- SCAN_LOCAL = _cunumeric.CUNUMERIC_SCAN_LOCAL
- SEARCHSORTED = _cunumeric.CUNUMERIC_SEARCHSORTED
- SELECT = _cunumeric.CUNUMERIC_SELECT
- SOLVE = _cunumeric.CUNUMERIC_SOLVE
- SORT = _cunumeric.CUNUMERIC_SORT
- SVD = _cunumeric.CUNUMERIC_SVD
- SYRK = _cunumeric.CUNUMERIC_SYRK
- TILE = _cunumeric.CUNUMERIC_TILE
- TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D
- TRILU = _cunumeric.CUNUMERIC_TRILU
- TRSM = _cunumeric.CUNUMERIC_TRSM
- UNARY_OP = _cunumeric.CUNUMERIC_UNARY_OP
- UNARY_RED = _cunumeric.CUNUMERIC_UNARY_RED
- UNIQUE = _cunumeric.CUNUMERIC_UNIQUE
- UNIQUE_REDUCE = _cunumeric.CUNUMERIC_UNIQUE_REDUCE
- UNLOAD_CUDALIBS = _cunumeric.CUNUMERIC_UNLOAD_CUDALIBS
- UNPACKBITS = _cunumeric.CUNUMERIC_UNPACKBITS
- WHERE = _cunumeric.CUNUMERIC_WHERE
- WINDOW = _cunumeric.CUNUMERIC_WINDOW
- WRAP = _cunumeric.CUNUMERIC_WRAP
- WRITE = _cunumeric.CUNUMERIC_WRITE
- ZIP = _cunumeric.CUNUMERIC_ZIP
-
-
-# Match these to CuNumericUnaryOpCode in cunumeric_c.h
-@unique
-class UnaryOpCode(IntEnum):
- ABSOLUTE = _cunumeric.CUNUMERIC_UOP_ABSOLUTE
- ANGLE = _cunumeric.CUNUMERIC_UOP_ANGLE
- ARCCOS = _cunumeric.CUNUMERIC_UOP_ARCCOS
- ARCCOSH = _cunumeric.CUNUMERIC_UOP_ARCCOSH
- ARCSIN = _cunumeric.CUNUMERIC_UOP_ARCSIN
- ARCSINH = _cunumeric.CUNUMERIC_UOP_ARCSINH
- ARCTAN = _cunumeric.CUNUMERIC_UOP_ARCTAN
- ARCTANH = _cunumeric.CUNUMERIC_UOP_ARCTANH
- CBRT = _cunumeric.CUNUMERIC_UOP_CBRT
- CEIL = _cunumeric.CUNUMERIC_UOP_CEIL
- CLIP = _cunumeric.CUNUMERIC_UOP_CLIP
- CONJ = _cunumeric.CUNUMERIC_UOP_CONJ
- COPY = _cunumeric.CUNUMERIC_UOP_COPY
- COS = _cunumeric.CUNUMERIC_UOP_COS
- COSH = _cunumeric.CUNUMERIC_UOP_COSH
- DEG2RAD = _cunumeric.CUNUMERIC_UOP_DEG2RAD
- EXP = _cunumeric.CUNUMERIC_UOP_EXP
- EXP2 = _cunumeric.CUNUMERIC_UOP_EXP2
- EXPM1 = _cunumeric.CUNUMERIC_UOP_EXPM1
- FLOOR = _cunumeric.CUNUMERIC_UOP_FLOOR
- FREXP = _cunumeric.CUNUMERIC_UOP_FREXP
- GETARG = _cunumeric.CUNUMERIC_UOP_GETARG
- IMAG = _cunumeric.CUNUMERIC_UOP_IMAG
- INVERT = _cunumeric.CUNUMERIC_UOP_INVERT
- ISFINITE = _cunumeric.CUNUMERIC_UOP_ISFINITE
- ISINF = _cunumeric.CUNUMERIC_UOP_ISINF
- ISNAN = _cunumeric.CUNUMERIC_UOP_ISNAN
- LOG = _cunumeric.CUNUMERIC_UOP_LOG
- LOG10 = _cunumeric.CUNUMERIC_UOP_LOG10
- LOG1P = _cunumeric.CUNUMERIC_UOP_LOG1P
- LOG2 = _cunumeric.CUNUMERIC_UOP_LOG2
- LOGICAL_NOT = _cunumeric.CUNUMERIC_UOP_LOGICAL_NOT
- MODF = _cunumeric.CUNUMERIC_UOP_MODF
- NEGATIVE = _cunumeric.CUNUMERIC_UOP_NEGATIVE
- POSITIVE = _cunumeric.CUNUMERIC_UOP_POSITIVE
- RAD2DEG = _cunumeric.CUNUMERIC_UOP_RAD2DEG
- REAL = _cunumeric.CUNUMERIC_UOP_REAL
- RECIPROCAL = _cunumeric.CUNUMERIC_UOP_RECIPROCAL
- RINT = _cunumeric.CUNUMERIC_UOP_RINT
- ROUND = _cunumeric.CUNUMERIC_UOP_ROUND
- SIGN = _cunumeric.CUNUMERIC_UOP_SIGN
- SIGNBIT = _cunumeric.CUNUMERIC_UOP_SIGNBIT
- SIN = _cunumeric.CUNUMERIC_UOP_SIN
- SINH = _cunumeric.CUNUMERIC_UOP_SINH
- SQRT = _cunumeric.CUNUMERIC_UOP_SQRT
- SQUARE = _cunumeric.CUNUMERIC_UOP_SQUARE
- TAN = _cunumeric.CUNUMERIC_UOP_TAN
- TANH = _cunumeric.CUNUMERIC_UOP_TANH
- TRUNC = _cunumeric.CUNUMERIC_UOP_TRUNC
-
-
-# Match these to CuNumericUnaryRedCode in cunumeric_c.h
-@unique
-class UnaryRedCode(IntEnum):
- ALL = _cunumeric.CUNUMERIC_RED_ALL
- ANY = _cunumeric.CUNUMERIC_RED_ANY
- ARGMAX = _cunumeric.CUNUMERIC_RED_ARGMAX
- ARGMIN = _cunumeric.CUNUMERIC_RED_ARGMIN
- CONTAINS = _cunumeric.CUNUMERIC_RED_CONTAINS
- COUNT_NONZERO = _cunumeric.CUNUMERIC_RED_COUNT_NONZERO
- MAX = _cunumeric.CUNUMERIC_RED_MAX
- MIN = _cunumeric.CUNUMERIC_RED_MIN
- NANARGMAX = _cunumeric.CUNUMERIC_RED_NANARGMAX
- NANARGMIN = _cunumeric.CUNUMERIC_RED_NANARGMIN
- NANMAX = _cunumeric.CUNUMERIC_RED_NANMAX
- NANMIN = _cunumeric.CUNUMERIC_RED_NANMIN
- NANPROD = _cunumeric.CUNUMERIC_RED_NANPROD
- NANSUM = _cunumeric.CUNUMERIC_RED_NANSUM
- PROD = _cunumeric.CUNUMERIC_RED_PROD
- SUM = _cunumeric.CUNUMERIC_RED_SUM
- SUM_SQUARES = _cunumeric.CUNUMERIC_RED_SUM_SQUARES
- VARIANCE = _cunumeric.CUNUMERIC_RED_VARIANCE
-
-
-# Match these to CuNumericBinaryOpCode in cunumeric_c.h
-@unique
-class BinaryOpCode(IntEnum):
- ADD = _cunumeric.CUNUMERIC_BINOP_ADD
- ARCTAN2 = _cunumeric.CUNUMERIC_BINOP_ARCTAN2
- BITWISE_AND = _cunumeric.CUNUMERIC_BINOP_BITWISE_AND
- BITWISE_OR = _cunumeric.CUNUMERIC_BINOP_BITWISE_OR
- BITWISE_XOR = _cunumeric.CUNUMERIC_BINOP_BITWISE_XOR
- COPYSIGN = _cunumeric.CUNUMERIC_BINOP_COPYSIGN
- DIVIDE = _cunumeric.CUNUMERIC_BINOP_DIVIDE
- EQUAL = _cunumeric.CUNUMERIC_BINOP_EQUAL
- FLOAT_POWER = _cunumeric.CUNUMERIC_BINOP_FLOAT_POWER
- FLOOR_DIVIDE = _cunumeric.CUNUMERIC_BINOP_FLOOR_DIVIDE
- FMOD = _cunumeric.CUNUMERIC_BINOP_FMOD
- GCD = _cunumeric.CUNUMERIC_BINOP_GCD
- GREATER = _cunumeric.CUNUMERIC_BINOP_GREATER
- GREATER_EQUAL = _cunumeric.CUNUMERIC_BINOP_GREATER_EQUAL
- HYPOT = _cunumeric.CUNUMERIC_BINOP_HYPOT
- ISCLOSE = _cunumeric.CUNUMERIC_BINOP_ISCLOSE
- LCM = _cunumeric.CUNUMERIC_BINOP_LCM
- LDEXP = _cunumeric.CUNUMERIC_BINOP_LDEXP
- LEFT_SHIFT = _cunumeric.CUNUMERIC_BINOP_LEFT_SHIFT
- LESS = _cunumeric.CUNUMERIC_BINOP_LESS
- LESS_EQUAL = _cunumeric.CUNUMERIC_BINOP_LESS_EQUAL
- LOGADDEXP = _cunumeric.CUNUMERIC_BINOP_LOGADDEXP
- LOGADDEXP2 = _cunumeric.CUNUMERIC_BINOP_LOGADDEXP2
- LOGICAL_AND = _cunumeric.CUNUMERIC_BINOP_LOGICAL_AND
- LOGICAL_OR = _cunumeric.CUNUMERIC_BINOP_LOGICAL_OR
- LOGICAL_XOR = _cunumeric.CUNUMERIC_BINOP_LOGICAL_XOR
- MAXIMUM = _cunumeric.CUNUMERIC_BINOP_MAXIMUM
- MINIMUM = _cunumeric.CUNUMERIC_BINOP_MINIMUM
- MOD = _cunumeric.CUNUMERIC_BINOP_MOD
- MULTIPLY = _cunumeric.CUNUMERIC_BINOP_MULTIPLY
- NEXTAFTER = _cunumeric.CUNUMERIC_BINOP_NEXTAFTER
- NOT_EQUAL = _cunumeric.CUNUMERIC_BINOP_NOT_EQUAL
- POWER = _cunumeric.CUNUMERIC_BINOP_POWER
- RIGHT_SHIFT = _cunumeric.CUNUMERIC_BINOP_RIGHT_SHIFT
- SUBTRACT = _cunumeric.CUNUMERIC_BINOP_SUBTRACT
-
-
-@unique
-class WindowOpCode(IntEnum):
- BARLETT = _cunumeric.CUNUMERIC_WINDOW_BARLETT
- BLACKMAN = _cunumeric.CUNUMERIC_WINDOW_BLACKMAN
- HAMMING = _cunumeric.CUNUMERIC_WINDOW_HAMMING
- HANNING = _cunumeric.CUNUMERIC_WINDOW_HANNING
- KAISER = _cunumeric.CUNUMERIC_WINDOW_KAISER
-
-
-# Match these to RandGenCode in rand_util.h
-@unique
-class RandGenCode(IntEnum):
- UNIFORM = 1
- NORMAL = 2
- INTEGER = 3
-
-
-# Match these to CuNumericScanCode in cunumeric_c.h
-@unique
-class ScanCode(IntEnum):
- PROD = _cunumeric.CUNUMERIC_SCAN_PROD
- SUM = _cunumeric.CUNUMERIC_SCAN_SUM
-
-
-# Match these to CuNumericConvertCode in cunumeric_c.h
-@unique
-class ConvertCode(IntEnum):
- NOOP = _cunumeric.CUNUMERIC_CONVERT_NAN_NOOP
- PROD = _cunumeric.CUNUMERIC_CONVERT_NAN_PROD
- SUM = _cunumeric.CUNUMERIC_CONVERT_NAN_SUM
-
-
-# Match these to BitGeneratorOperation in cunumeric_c.h
-@unique
-class BitGeneratorOperation(IntEnum):
- CREATE = _cunumeric.CUNUMERIC_BITGENOP_CREATE
- DESTROY = _cunumeric.CUNUMERIC_BITGENOP_DESTROY
- RAND_RAW = _cunumeric.CUNUMERIC_BITGENOP_RAND_RAW
- DISTRIBUTION = _cunumeric.CUNUMERIC_BITGENOP_DISTRIBUTION
-
-
-# Match these to BitGeneratorType in cunumeric_c.h
-@unique
-class BitGeneratorType(IntEnum):
- DEFAULT = _cunumeric.CUNUMERIC_BITGENTYPE_DEFAULT
- XORWOW = _cunumeric.CUNUMERIC_BITGENTYPE_XORWOW
- MRG32K3A = _cunumeric.CUNUMERIC_BITGENTYPE_MRG32K3A
- MTGP32 = _cunumeric.CUNUMERIC_BITGENTYPE_MTGP32
- MT19937 = _cunumeric.CUNUMERIC_BITGENTYPE_MT19937
- PHILOX4_32_10 = _cunumeric.CUNUMERIC_BITGENTYPE_PHILOX4_32_10
-
-
-# Match these to BitGeneratorDistribution in cunumeric_c.h
-@unique
-class BitGeneratorDistribution(IntEnum):
- INTEGERS_16 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_16
- INTEGERS_32 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_32
- INTEGERS_64 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_64
- UNIFORM_32 = _cunumeric.CUNUMERIC_BITGENDIST_UNIFORM_32
- UNIFORM_64 = _cunumeric.CUNUMERIC_BITGENDIST_UNIFORM_64
- LOGNORMAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_LOGNORMAL_32
- LOGNORMAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_LOGNORMAL_64
- NORMAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_NORMAL_32
- NORMAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_NORMAL_64
- POISSON = _cunumeric.CUNUMERIC_BITGENDIST_POISSON
- EXPONENTIAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_EXPONENTIAL_32
- EXPONENTIAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_EXPONENTIAL_64
- GUMBEL_32 = _cunumeric.CUNUMERIC_BITGENDIST_GUMBEL_32
- GUMBEL_64 = _cunumeric.CUNUMERIC_BITGENDIST_GUMBEL_64
- LAPLACE_32 = _cunumeric.CUNUMERIC_BITGENDIST_LAPLACE_32
- LAPLACE_64 = _cunumeric.CUNUMERIC_BITGENDIST_LAPLACE_64
- LOGISTIC_32 = _cunumeric.CUNUMERIC_BITGENDIST_LOGISTIC_32
- LOGISTIC_64 = _cunumeric.CUNUMERIC_BITGENDIST_LOGISTIC_64
- PARETO_32 = _cunumeric.CUNUMERIC_BITGENDIST_PARETO_32
- PARETO_64 = _cunumeric.CUNUMERIC_BITGENDIST_PARETO_64
- POWER_32 = _cunumeric.CUNUMERIC_BITGENDIST_POWER_32
- POWER_64 = _cunumeric.CUNUMERIC_BITGENDIST_POWER_64
- RAYLEIGH_32 = _cunumeric.CUNUMERIC_BITGENDIST_RAYLEIGH_32
- RAYLEIGH_64 = _cunumeric.CUNUMERIC_BITGENDIST_RAYLEIGH_64
- CAUCHY_32 = _cunumeric.CUNUMERIC_BITGENDIST_CAUCHY_32
- CAUCHY_64 = _cunumeric.CUNUMERIC_BITGENDIST_CAUCHY_64
- TRIANGULAR_32 = _cunumeric.CUNUMERIC_BITGENDIST_TRIANGULAR_32
- TRIANGULAR_64 = _cunumeric.CUNUMERIC_BITGENDIST_TRIANGULAR_64
- WEIBULL_32 = _cunumeric.CUNUMERIC_BITGENDIST_WEIBULL_32
- WEIBULL_64 = _cunumeric.CUNUMERIC_BITGENDIST_WEIBULL_64
- BYTES = _cunumeric.CUNUMERIC_BITGENDIST_BYTES
- BETA_32 = _cunumeric.CUNUMERIC_BITGENDIST_BETA_32
- BETA_64 = _cunumeric.CUNUMERIC_BITGENDIST_BETA_64
- F_32 = _cunumeric.CUNUMERIC_BITGENDIST_F_32
- F_64 = _cunumeric.CUNUMERIC_BITGENDIST_F_64
- LOGSERIES = _cunumeric.CUNUMERIC_BITGENDIST_LOGSERIES
- NONCENTRAL_F_32 = _cunumeric.CUNUMERIC_BITGENDIST_NONCENTRAL_F_32
- NONCENTRAL_F_64 = _cunumeric.CUNUMERIC_BITGENDIST_NONCENTRAL_F_64
- CHISQUARE_32 = _cunumeric.CUNUMERIC_BITGENDIST_CHISQUARE_32
- CHISQUARE_64 = _cunumeric.CUNUMERIC_BITGENDIST_CHISQUARE_64
- GAMMA_32 = _cunumeric.CUNUMERIC_BITGENDIST_GAMMA_32
- GAMMA_64 = _cunumeric.CUNUMERIC_BITGENDIST_GAMMA_64
- STANDARD_T_32 = _cunumeric.CUNUMERIC_BITGENDIST_STANDARD_T_32
- STANDARD_T_64 = _cunumeric.CUNUMERIC_BITGENDIST_STANDARD_T_64
- HYPERGEOMETRIC = _cunumeric.CUNUMERIC_BITGENDIST_HYPERGEOMETRIC
- VONMISES_32 = _cunumeric.CUNUMERIC_BITGENDIST_VONMISES_32
- VONMISES_64 = _cunumeric.CUNUMERIC_BITGENDIST_VONMISES_64
- ZIPF = _cunumeric.CUNUMERIC_BITGENDIST_ZIPF
- GEOMETRIC = _cunumeric.CUNUMERIC_BITGENDIST_GEOMETRIC
- WALD_32 = _cunumeric.CUNUMERIC_BITGENDIST_WALD_32
- WALD_64 = _cunumeric.CUNUMERIC_BITGENDIST_WALD_64
- BINOMIAL = _cunumeric.CUNUMERIC_BITGENDIST_BINOMIAL
- NEGATIVE_BINOMIAL = _cunumeric.CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL
-
-
-@unique
-class TransferType(IntEnum):
- DONATE = 0
- MAKE_COPY = 1
- SHARE = 2
-
-
-# Match these to fftType in fft_util.h
-class FFTType:
- def __init__(
- self,
- name: str,
- type_id: int,
- input_dtype: npt.DTypeLike,
- output_dtype: npt.DTypeLike,
- single_precision: bool,
- complex_type: FFTType | None = None,
- ) -> None:
- self._name = name
- self._type_id = type_id
- self._complex_type = self if complex_type is None else complex_type
- self._input_dtype = input_dtype
- self._output_dtype = output_dtype
- self._single_precision = single_precision
-
- def __str__(self) -> str:
- return self._name
-
- def __repr__(self) -> str:
- return str(self)
-
- @property
- def type_id(self) -> int:
- return self._type_id
-
- @property
- def complex(self) -> FFTType:
- return self._complex_type
-
- @property
- def input_dtype(self) -> npt.DTypeLike:
- return self._input_dtype
-
- @property
- def output_dtype(self) -> npt.DTypeLike:
- return self._output_dtype
-
- @property
- def is_single_precision(self) -> bool:
- return self._single_precision
-
-
-FFT_C2C = FFTType(
- "C2C",
- _cunumeric.CUNUMERIC_FFT_C2C,
- np.complex64,
- np.complex64,
- True,
-)
-
-FFT_Z2Z = FFTType(
- "Z2Z",
- _cunumeric.CUNUMERIC_FFT_Z2Z,
- np.complex128,
- np.complex128,
- False,
-)
-
-FFT_R2C = FFTType(
- "R2C",
- _cunumeric.CUNUMERIC_FFT_R2C,
- np.float32,
- np.complex64,
- True,
- FFT_C2C,
-)
-
-FFT_C2R = FFTType(
- "C2R",
- _cunumeric.CUNUMERIC_FFT_C2R,
- np.complex64,
- np.float32,
- True,
- FFT_C2C,
-)
-
-FFT_D2Z = FFTType(
- "D2Z",
- _cunumeric.CUNUMERIC_FFT_D2Z,
- np.float64,
- np.complex128,
- False,
- FFT_Z2Z,
-)
-
-FFT_Z2D = FFTType(
- "Z2D",
- _cunumeric.CUNUMERIC_FFT_Z2D,
- np.complex128,
- np.float64,
- False,
- FFT_Z2Z,
-)
-
-
-class FFTCode:
- @staticmethod
- def real_to_complex_code(dtype: npt.DTypeLike) -> FFTType:
- if dtype == np.float64:
- return FFT_D2Z
- elif dtype == np.float32:
- return FFT_R2C
- else:
- raise TypeError(
- (
- "Data type for FFT not supported "
- "(supported types are float32 and float64)"
- )
- )
-
- @staticmethod
- def complex_to_real_code(dtype: npt.DTypeLike) -> FFTType:
- if dtype == np.complex128:
- return FFT_Z2D
- elif dtype == np.complex64:
- return FFT_C2R
- else:
- raise TypeError(
- (
- "Data type for FFT not supported "
- "(supported types are complex64 and complex128)"
- )
- )
-
-
-@unique
-class FFTDirection(IntEnum):
- FORWARD = _cunumeric.CUNUMERIC_FFT_FORWARD
- INVERSE = _cunumeric.CUNUMERIC_FFT_INVERSE
-
-
-# Match these to CuNumericBitorder in cunumeric_c.h
-@unique
-class Bitorder(IntEnum):
- BIG = _cunumeric.CUNUMERIC_BITORDER_BIG
- LITTLE = _cunumeric.CUNUMERIC_BITORDER_LITTLE
-
-
-@unique
-class FFTNormalization(IntEnum):
- FORWARD = 1
- INVERSE = 2
- ORTHOGONAL = 3
-
- @staticmethod
- def from_string(in_string: str) -> FFTNormalization | None:
- if in_string == "forward":
- return FFTNormalization.FORWARD
- elif in_string == "ortho":
- return FFTNormalization.ORTHOGONAL
- elif in_string == "backward" or in_string is None:
- return FFTNormalization.INVERSE
- else:
- return None
-
- @staticmethod
- def reverse(in_string: str | None) -> str:
- if in_string == "forward":
- return "backward"
- elif in_string == "backward" or in_string is None:
- return "forward"
- else:
- return in_string
diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake
deleted file mode 100644
index 5c324f1dc3..0000000000
--- a/cunumeric_cpp.cmake
+++ /dev/null
@@ -1,565 +0,0 @@
-#=============================================================================
-# Copyright 2024 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#=============================================================================
-
-##############################################################################
-# - User Options ------------------------------------------------------------
-
-option(BUILD_SHARED_LIBS "Build cuNumeric shared libraries" ON)
-option(cunumeric_EXCLUDE_TBLIS_FROM_ALL "Exclude tblis targets from cuNumeric's 'all' target" OFF)
-option(cunumeric_EXCLUDE_OPENBLAS_FROM_ALL "Exclude OpenBLAS targets from cuNumeric's 'all' target" OFF)
-option(cunumeric_EXCLUDE_LEGATE_FROM_ALL "Exclude legate targets from cuNumeric's 'all' target" OFF)
-
-##############################################################################
-# - Project definition -------------------------------------------------------
-
-# Write the version header
-rapids_cmake_write_version_file(include/cunumeric/version_config.hpp)
-
-# Needed to integrate with LLVM/clang tooling
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-##############################################################################
-# - Build Type ---------------------------------------------------------------
-
-# Set a default build type if none was specified
-rapids_cmake_build_type(Release)
-
-##############################################################################
-# - conda environment --------------------------------------------------------
-
-rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
-
-# We're building python extension libraries, which must always be installed
-# under lib/, even if the system normally uses lib64/. Rapids-cmake currently
-# doesn't realize this when we're going through scikit-build, see
-# https://github.com/rapidsai/rapids-cmake/issues/426
-if(TARGET conda_env)
- set(CMAKE_INSTALL_LIBDIR "lib")
-endif()
-
-##############################################################################
-# - Dependencies -------------------------------------------------------------
-
-# add third party dependencies using CPM
-rapids_cpm_init(OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/versions.json)
-
-rapids_find_package(OpenMP GLOBAL_TARGETS OpenMP::OpenMP_CXX)
-
-option(Legion_USE_CUDA "Use CUDA" ON)
-option(Legion_USE_OpenMP "Use OpenMP" ${OpenMP_FOUND})
-option(Legion_BOUNDS_CHECKS "Build cuNumeric with bounds checks (expensive)" OFF)
-
-# If legate has CUDA support, then including it in a project will automatically call
-# enable_language(CUDA). However, this does not play nice with the rapids-cmake CUDA utils
-# which support a wider range of values for CMAKE_CUDA_ARCHITECTURES than cmake does. You
-# end up with the following error:
-#
-# CMAKE_CUDA_ARCHITECTURES:
-#
-# RAPIDS
-#
-# is not one of the following:
-#
-# * a semicolon-separated list of integers, each optionally
-# followed by '-real' or '-virtual'
-# * a special value: all, all-major, native
-#
-set(cmake_cuda_arch_backup "${CMAKE_CUDA_ARCHITECTURES}")
-set(cmake_cuda_arch_cache_backup "$CACHE{CMAKE_CUDA_ARCHITECTURES}")
-if(("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "RAPIDS") OR ("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "NATIVE"))
- unset(CMAKE_CUDA_ARCHITECTURES)
- unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-endif()
-
-###
-# If we find legate already configured on the system, it will report
-# whether it was compiled with bounds checking (Legion_BOUNDS_CHECKS),
-# CUDA (Legion_USE_CUDA), and OpenMP (Legion_USE_OpenMP).
-#
-# We use the same variables as legate because we want to enable/disable
-# each of these features based on how legate was configured (it doesn't
-# make sense to build cuNumeric's CUDA bindings if legate wasn't built
-# with CUDA support).
-###
-include(cmake/thirdparty/get_legate.cmake)
-
-set(CMAKE_CUDA_ARCHITECTURES "${cmake_cuda_arch_cache_backup}" CACHE STRING "" FORCE)
-set(CMAKE_CUDA_ARCHITECTURES "${cmake_cuda_arch_backup}")
-unset(cmake_cuda_arch_backup)
-unset(cmake_cuda_arch_cache_backup)
-
-if(Legion_USE_CUDA)
- include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cuda_arch_helpers.cmake)
- # Needs to run before `rapids_cuda_init_architectures`
- set_cuda_arch_from_names()
- # Needs to run before `enable_language(CUDA)`
- rapids_cuda_init_architectures(cunumeric)
- enable_language(CUDA)
- # Since cunumeric only enables CUDA optionally we need to manually include
- # the file that rapids_cuda_init_architectures relies on `project` calling
- if(CMAKE_PROJECT_cunumeric_INCLUDE)
- include("${CMAKE_PROJECT_cunumeric_INCLUDE}")
- endif()
-
- # Must come after enable_language(CUDA)
- # Use `-isystem ` instead of `-isystem=`
- # because the former works with clangd intellisense
- set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-isystem ")
-
- rapids_find_package(
- CUDAToolkit REQUIRED
- BUILD_EXPORT_SET cunumeric-exports
- INSTALL_EXPORT_SET cunumeric-exports
- )
-
- include(cmake/thirdparty/get_nccl.cmake)
- include(cmake/thirdparty/get_cutensor.cmake)
-endif()
-
-include(cmake/thirdparty/get_openblas.cmake)
-
-include(cmake/thirdparty/get_tblis.cmake)
-
-##############################################################################
-# - cuNumeric ----------------------------------------------------------------
-
-set(cunumeric_SOURCES "")
-set(cunumeric_CXX_DEFS "")
-set(cunumeric_CUDA_DEFS "")
-set(cunumeric_CXX_OPTIONS "")
-set(cunumeric_CUDA_OPTIONS "")
-
-include(cmake/Modules/set_cpu_arch_flags.cmake)
-set_cpu_arch_flags(cunumeric_CXX_OPTIONS)
-
-# Add `src/cunumeric.mk` sources
-list(APPEND cunumeric_SOURCES
- src/cunumeric/ternary/where.cc
- src/cunumeric/scan/scan_global.cc
- src/cunumeric/scan/scan_local.cc
- src/cunumeric/binary/binary_op.cc
- src/cunumeric/binary/binary_op_util.cc
- src/cunumeric/binary/binary_red.cc
- src/cunumeric/bits/packbits.cc
- src/cunumeric/bits/unpackbits.cc
- src/cunumeric/unary/scalar_unary_red.cc
- src/cunumeric/unary/unary_op.cc
- src/cunumeric/unary/unary_red.cc
- src/cunumeric/unary/convert.cc
- src/cunumeric/nullary/arange.cc
- src/cunumeric/nullary/eye.cc
- src/cunumeric/nullary/fill.cc
- src/cunumeric/nullary/window.cc
- src/cunumeric/index/advanced_indexing.cc
- src/cunumeric/index/choose.cc
- src/cunumeric/index/putmask.cc
- src/cunumeric/index/repeat.cc
- src/cunumeric/index/select.cc
- src/cunumeric/index/wrap.cc
- src/cunumeric/index/zip.cc
- src/cunumeric/item/read.cc
- src/cunumeric/item/write.cc
- src/cunumeric/matrix/batched_cholesky.cc
- src/cunumeric/matrix/contract.cc
- src/cunumeric/matrix/diag.cc
- src/cunumeric/matrix/gemm.cc
- src/cunumeric/matrix/matmul.cc
- src/cunumeric/matrix/matvecmul.cc
- src/cunumeric/matrix/dot.cc
- src/cunumeric/matrix/potrf.cc
- src/cunumeric/matrix/qr.cc
- src/cunumeric/matrix/solve.cc
- src/cunumeric/matrix/svd.cc
- src/cunumeric/matrix/syrk.cc
- src/cunumeric/matrix/tile.cc
- src/cunumeric/matrix/transpose.cc
- src/cunumeric/matrix/trilu.cc
- src/cunumeric/matrix/trsm.cc
- src/cunumeric/matrix/util.cc
- src/cunumeric/random/bitgenerator.cc
- src/cunumeric/random/randutil/generator_host.cc
- src/cunumeric/random/randutil/generator_host_straightforward.cc
- src/cunumeric/random/randutil/generator_host_advanced.cc
- src/cunumeric/random/rand.cc
- src/cunumeric/search/argwhere.cc
- src/cunumeric/search/nonzero.cc
- src/cunumeric/set/unique.cc
- src/cunumeric/set/unique_reduce.cc
- src/cunumeric/stat/bincount.cc
- src/cunumeric/convolution/convolve.cc
- src/cunumeric/transform/flip.cc
- src/cunumeric/utilities/repartition.cc
- src/cunumeric/arg_redop_register.cc
- src/cunumeric/mapper.cc
- src/cunumeric/ndarray.cc
- src/cunumeric/operators.cc
- src/cunumeric/runtime.cc
- src/cunumeric/cephes/chbevl.cc
- src/cunumeric/cephes/i0.cc
- src/cunumeric/stat/histogram.cc
-)
-
-if(Legion_USE_OpenMP)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/ternary/where_omp.cc
- src/cunumeric/scan/scan_global_omp.cc
- src/cunumeric/scan/scan_local_omp.cc
- src/cunumeric/binary/binary_op_omp.cc
- src/cunumeric/binary/binary_red_omp.cc
- src/cunumeric/bits/packbits_omp.cc
- src/cunumeric/bits/unpackbits_omp.cc
- src/cunumeric/unary/unary_op_omp.cc
- src/cunumeric/unary/scalar_unary_red_omp.cc
- src/cunumeric/unary/unary_red_omp.cc
- src/cunumeric/unary/convert_omp.cc
- src/cunumeric/nullary/arange_omp.cc
- src/cunumeric/nullary/eye_omp.cc
- src/cunumeric/nullary/fill_omp.cc
- src/cunumeric/nullary/window_omp.cc
- src/cunumeric/index/advanced_indexing_omp.cc
- src/cunumeric/index/choose_omp.cc
- src/cunumeric/index/putmask_omp.cc
- src/cunumeric/index/repeat_omp.cc
- src/cunumeric/index/select_omp.cc
- src/cunumeric/index/wrap_omp.cc
- src/cunumeric/index/zip_omp.cc
- src/cunumeric/matrix/batched_cholesky_omp.cc
- src/cunumeric/matrix/contract_omp.cc
- src/cunumeric/matrix/diag_omp.cc
- src/cunumeric/matrix/gemm_omp.cc
- src/cunumeric/matrix/matmul_omp.cc
- src/cunumeric/matrix/matvecmul_omp.cc
- src/cunumeric/matrix/dot_omp.cc
- src/cunumeric/matrix/potrf_omp.cc
- src/cunumeric/matrix/qr_omp.cc
- src/cunumeric/matrix/solve_omp.cc
- src/cunumeric/matrix/svd_omp.cc
- src/cunumeric/matrix/syrk_omp.cc
- src/cunumeric/matrix/tile_omp.cc
- src/cunumeric/matrix/transpose_omp.cc
- src/cunumeric/matrix/trilu_omp.cc
- src/cunumeric/matrix/trsm_omp.cc
- src/cunumeric/random/rand_omp.cc
- src/cunumeric/search/argwhere_omp.cc
- src/cunumeric/search/nonzero_omp.cc
- src/cunumeric/set/unique_omp.cc
- src/cunumeric/set/unique_reduce_omp.cc
- src/cunumeric/stat/bincount_omp.cc
- src/cunumeric/convolution/convolve_omp.cc
- src/cunumeric/transform/flip_omp.cc
- src/cunumeric/stat/histogram_omp.cc
- )
-endif()
-
-if(Legion_USE_CUDA)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/ternary/where.cu
- src/cunumeric/scan/scan_global.cu
- src/cunumeric/scan/scan_local.cu
- src/cunumeric/binary/binary_op.cu
- src/cunumeric/binary/binary_red.cu
- src/cunumeric/bits/packbits.cu
- src/cunumeric/bits/unpackbits.cu
- src/cunumeric/unary/scalar_unary_red.cu
- src/cunumeric/unary/unary_red.cu
- src/cunumeric/unary/unary_op.cu
- src/cunumeric/unary/convert.cu
- src/cunumeric/nullary/arange.cu
- src/cunumeric/nullary/eye.cu
- src/cunumeric/nullary/fill.cu
- src/cunumeric/nullary/window.cu
- src/cunumeric/index/advanced_indexing.cu
- src/cunumeric/index/choose.cu
- src/cunumeric/index/putmask.cu
- src/cunumeric/index/repeat.cu
- src/cunumeric/index/select.cu
- src/cunumeric/index/wrap.cu
- src/cunumeric/index/zip.cu
- src/cunumeric/item/read.cu
- src/cunumeric/item/write.cu
- src/cunumeric/matrix/batched_cholesky.cu
- src/cunumeric/matrix/contract.cu
- src/cunumeric/matrix/diag.cu
- src/cunumeric/matrix/gemm.cu
- src/cunumeric/matrix/matmul.cu
- src/cunumeric/matrix/matvecmul.cu
- src/cunumeric/matrix/dot.cu
- src/cunumeric/matrix/potrf.cu
- src/cunumeric/matrix/qr.cu
- src/cunumeric/matrix/solve.cu
- src/cunumeric/matrix/svd.cu
- src/cunumeric/matrix/syrk.cu
- src/cunumeric/matrix/tile.cu
- src/cunumeric/matrix/transpose.cu
- src/cunumeric/matrix/trilu.cu
- src/cunumeric/matrix/trsm.cu
- src/cunumeric/random/rand.cu
- src/cunumeric/search/argwhere.cu
- src/cunumeric/search/nonzero.cu
- src/cunumeric/set/unique.cu
- src/cunumeric/stat/bincount.cu
- src/cunumeric/convolution/convolve.cu
- src/cunumeric/fft/fft.cu
- src/cunumeric/transform/flip.cu
- src/cunumeric/utilities/repartition.cu
- src/cunumeric/arg_redop_register.cu
- src/cunumeric/cudalibs.cu
- src/cunumeric/stat/histogram.cu
- )
-endif()
-
-# Add `src/cunumeric/sort/sort.mk` sources
-list(APPEND cunumeric_SOURCES
- src/cunumeric/sort/sort.cc
- src/cunumeric/sort/searchsorted.cc
-)
-
-if(Legion_USE_OpenMP)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/sort/sort_omp.cc
- src/cunumeric/sort/searchsorted_omp.cc
- )
-endif()
-
-if(Legion_USE_CUDA)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/sort/sort.cu
- src/cunumeric/sort/searchsorted.cu
- src/cunumeric/sort/cub_sort_bool.cu
- src/cunumeric/sort/cub_sort_int8.cu
- src/cunumeric/sort/cub_sort_int16.cu
- src/cunumeric/sort/cub_sort_int32.cu
- src/cunumeric/sort/cub_sort_int64.cu
- src/cunumeric/sort/cub_sort_uint8.cu
- src/cunumeric/sort/cub_sort_uint16.cu
- src/cunumeric/sort/cub_sort_uint32.cu
- src/cunumeric/sort/cub_sort_uint64.cu
- src/cunumeric/sort/cub_sort_half.cu
- src/cunumeric/sort/cub_sort_float.cu
- src/cunumeric/sort/cub_sort_double.cu
- src/cunumeric/sort/thrust_sort_bool.cu
- src/cunumeric/sort/thrust_sort_int8.cu
- src/cunumeric/sort/thrust_sort_int16.cu
- src/cunumeric/sort/thrust_sort_int32.cu
- src/cunumeric/sort/thrust_sort_int64.cu
- src/cunumeric/sort/thrust_sort_uint8.cu
- src/cunumeric/sort/thrust_sort_uint16.cu
- src/cunumeric/sort/thrust_sort_uint32.cu
- src/cunumeric/sort/thrust_sort_uint64.cu
- src/cunumeric/sort/thrust_sort_half.cu
- src/cunumeric/sort/thrust_sort_float.cu
- src/cunumeric/sort/thrust_sort_double.cu
- src/cunumeric/sort/thrust_sort_complex64.cu
- src/cunumeric/sort/thrust_sort_complex128.cu
- )
-endif()
-
-# Add `src/cunumeric/random/random.mk` sources
-if(Legion_USE_CUDA)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/random/bitgenerator.cu
- src/cunumeric/random/randutil/generator_device.cu
- src/cunumeric/random/randutil/generator_device_straightforward.cu
- src/cunumeric/random/randutil/generator_device_advanced.cu
-)
-endif()
-
-# add sources for cusolverMp
-if(Legion_USE_CUDA AND CUSOLVERMP_DIR)
- list(APPEND cunumeric_SOURCES
- src/cunumeric/matrix/mp_potrf.cu
- src/cunumeric/matrix/mp_solve.cu
- )
-endif()
-
-list(APPEND cunumeric_SOURCES
- # This must always be the last file!
- # It guarantees we do our registration callback
- # only after all task variants are recorded
- src/cunumeric/cunumeric.cc
-)
-
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
- list(APPEND cunumeric_CXX_DEFS DEBUG_CUNUMERIC)
- list(APPEND cunumeric_CUDA_DEFS DEBUG_CUNUMERIC)
-endif()
-
-if(Legion_BOUNDS_CHECKS)
- list(APPEND cunumeric_CXX_DEFS BOUNDS_CHECKS)
- list(APPEND cunumeric_CUDA_DEFS BOUNDS_CHECKS)
-endif()
-
-list(APPEND cunumeric_CUDA_OPTIONS -Xfatbin=-compress-all)
-list(APPEND cunumeric_CUDA_OPTIONS --expt-extended-lambda)
-list(APPEND cunumeric_CUDA_OPTIONS --expt-relaxed-constexpr)
-list(APPEND cunumeric_CXX_OPTIONS -Wno-deprecated-declarations)
-list(APPEND cunumeric_CUDA_OPTIONS -Wno-deprecated-declarations)
-
-add_library(cunumeric ${cunumeric_SOURCES})
-add_library(cunumeric::cunumeric ALIAS cunumeric)
-
-if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
- set(platform_rpath_origin "\$ORIGIN")
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
- set(platform_rpath_origin "@loader_path")
-endif ()
-
-set_target_properties(cunumeric
- PROPERTIES BUILD_RPATH "${platform_rpath_origin}"
- INSTALL_RPATH "${platform_rpath_origin}"
- CXX_STANDARD 17
- CXX_STANDARD_REQUIRED ON
- POSITION_INDEPENDENT_CODE ON
- INTERFACE_POSITION_INDEPENDENT_CODE ON
- CUDA_STANDARD 17
- CUDA_STANDARD_REQUIRED ON
- LIBRARY_OUTPUT_DIRECTORY lib)
-
-target_link_libraries(cunumeric
- PUBLIC legate::legate
- $
- PRIVATE BLAS::BLAS
- tblis::tblis
- # Add Conda library and include paths
- $
- $
- $
- $
- $
- $)
-
-if(NOT Legion_USE_CUDA AND cunumeric_cuRAND_INCLUDE_DIR)
- list(APPEND cunumeric_CXX_DEFS CUNUMERIC_CURAND_FOR_CPU_BUILD)
- target_include_directories(cunumeric PRIVATE ${cunumeric_cuRAND_INCLUDE_DIR})
-endif()
-
-if(Legion_USE_CUDA AND CUSOLVERMP_DIR)
- message(VERBOSE "cunumeric: CUSOLVERMP_DIR ${CUSOLVERMP_DIR}")
- list(APPEND cunumeric_CXX_DEFS CUNUMERIC_USE_CUSOLVERMP)
- list(APPEND cunumeric_CUDA_DEFS CUNUMERIC_USE_CUSOLVERMP)
- target_include_directories(cunumeric PRIVATE ${CUSOLVERMP_DIR}/include)
- target_link_libraries(cunumeric PRIVATE ${CUSOLVERMP_DIR}/lib/libcusolverMp.so)
-endif()
-
-target_compile_options(cunumeric
- PRIVATE "$<$:${cunumeric_CXX_OPTIONS}>"
- "$<$:${cunumeric_CUDA_OPTIONS}>")
-
-target_compile_definitions(cunumeric
- PUBLIC "$<$:${cunumeric_CXX_DEFS}>"
- "$<$:${cunumeric_CUDA_DEFS}>")
-
-target_include_directories(cunumeric
- PUBLIC
- $
- INTERFACE
- $
-)
-
-if(Legion_USE_CUDA)
- file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
-[=[
-SECTIONS
-{
-.nvFatBinSegment : { *(.nvFatBinSegment) }
-.nv_fatbin : { *(.nv_fatbin) }
-}
-]=])
-
- # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
- target_link_options(cunumeric PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
-endif()
-
-##############################################################################
-# - install targets-----------------------------------------------------------
-
-include(CPack)
-include(GNUInstallDirs)
-rapids_cmake_install_lib_dir(lib_dir)
-
-install(TARGETS cunumeric
- DESTINATION ${lib_dir}
- EXPORT cunumeric-exports)
-
-install(
- FILES src/cunumeric.h
- ${CMAKE_CURRENT_BINARY_DIR}/include/cunumeric/version_config.hpp
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cunumeric)
-
-install(
- FILES src/cunumeric/cunumeric_c.h
- src/cunumeric/ndarray.h
- src/cunumeric/ndarray.inl
- src/cunumeric/operators.h
- src/cunumeric/operators.inl
- src/cunumeric/runtime.h
- src/cunumeric/slice.h
- src/cunumeric/typedefs.h
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cunumeric/cunumeric)
-
-if(cunumeric_INSTALL_TBLIS)
- install(DIRECTORY ${tblis_BINARY_DIR}/lib/ DESTINATION ${lib_dir})
- install(DIRECTORY ${tblis_BINARY_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-endif()
-
-##############################################################################
-# - install export -----------------------------------------------------------
-
-set(doc_string
- [=[
-Provide targets for cuNumeric, an aspiring drop-in replacement for NumPy at scale.
-
-Imported Targets:
- - cunumeric::cunumeric
-
-]=])
-
-string(JOIN "\n" code_string
- "set(Legion_USE_CUDA ${Legion_USE_CUDA})"
- "set(Legion_USE_OpenMP ${Legion_USE_OpenMP})"
- "set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS})"
-)
-
-if(DEFINED Legion_USE_Python)
- string(APPEND code_string "\nset(Legion_USE_Python ${Legion_USE_Python})")
-endif()
-
-if(DEFINED Legion_NETWORKS)
- string(APPEND code_string "\nset(Legion_NETWORKS ${Legion_NETWORKS})")
-endif()
-
-rapids_export(
- INSTALL cunumeric
- EXPORT_SET cunumeric-exports
- GLOBAL_TARGETS cunumeric
- NAMESPACE cunumeric::
- DOCUMENTATION doc_string
- FINAL_CODE_BLOCK code_string)
-
-# build export targets
-rapids_export(
- BUILD cunumeric
- EXPORT_SET cunumeric-exports
- GLOBAL_TARGETS cunumeric
- NAMESPACE cunumeric::
- DOCUMENTATION doc_string
- FINAL_CODE_BLOCK code_string)
-
-if(cunumeric_BUILD_TESTS)
- include(CTest)
-
- add_subdirectory(tests/cpp)
-endif()
diff --git a/cunumeric/__init__.py b/cupynumeric/__init__.py
similarity index 66%
rename from cunumeric/__init__.py
rename to cupynumeric/__init__.py
index 4f0458cf2a..082217fb12 100644
--- a/cunumeric/__init__.py
+++ b/cupynumeric/__init__.py
@@ -14,7 +14,7 @@
#
"""
-cuNumeric
+cuPyNumeric
=====
Provides a distributed task-parallel implementation of the Numpy interface
@@ -31,7 +31,7 @@
from ._array.util import maybe_convert_to_np_ndarray
from ._module import *
from ._ufunc import *
-from ._utils.array import is_supported_dtype
+from ._utils.array import is_supported_dtype, local_task_array
from ._utils.coverage import clone_module
clone_module(_np, globals(), maybe_convert_to_np_ndarray)
@@ -40,6 +40,22 @@
del clone_module
del _np
-from . import _version
-__version__ = _version.get_versions()["version"] # type: ignore [no-untyped-call]
+def _fixup_version() -> str:
+ import os
+
+ if (v := os.environ.get("CUPYNUMERIC_USE_VERSION")) is not None:
+ return v
+
+ from . import _version
+
+ if hasattr(_version, "get_versions"):
+ return _version.get_versions()["version"] # type: ignore [no-untyped-call]
+ if hasattr(_version, "__version__"):
+ return _version.__version__
+
+ raise RuntimeError("Failed to determine version")
+
+
+__version__ = _fixup_version()
+del _fixup_version
diff --git a/cunumeric/_array/__init__.py b/cupynumeric/_array/__init__.py
similarity index 100%
rename from cunumeric/_array/__init__.py
rename to cupynumeric/_array/__init__.py
diff --git a/cunumeric/_array/array.py b/cupynumeric/_array/array.py
similarity index 93%
rename from cunumeric/_array/array.py
rename to cupynumeric/_array/array.py
index 5a6406057b..a985f3cbe4 100644
--- a/cunumeric/_array/array.py
+++ b/cupynumeric/_array/array.py
@@ -17,7 +17,7 @@
import operator
import warnings
from functools import reduce
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
import legate.core.types as ty
import numpy as np
@@ -52,7 +52,7 @@
add_boilerplate,
broadcast_where,
check_writeable,
- convert_to_cunumeric_ndarray,
+ convert_to_cupynumeric_ndarray,
maybe_convert_to_np_ndarray,
sanitize_shape,
tuple_pop,
@@ -106,9 +106,10 @@ def __init__(
order: OrderType | None = None,
thunk: NumPyThunk | None = None,
inputs: Any | None = None,
+ force_thunk: Literal["deferred"] | Literal["eager"] | None = None,
writeable: bool = True,
) -> None:
- # `inputs` being a cuNumeric ndarray is definitely a bug
+ # `inputs` being a cuPyNumeric ndarray is definitely a bug
assert not isinstance(inputs, ndarray)
if thunk is None:
assert shape is not None
@@ -138,7 +139,7 @@ def __init__(
]
core_dtype = to_core_type(dtype)
self._thunk = runtime.create_empty_thunk(
- sanitized_shape, core_dtype, inputs
+ sanitized_shape, core_dtype, inputs, force_thunk
)
else:
self._thunk = thunk
@@ -161,7 +162,7 @@ def __legate_data_interface__(self) -> dict[str, Any]:
array = LogicalArray.from_store(deferred_thunk.base)
self._legate_data = dict()
self._legate_data["version"] = 1
- field = Field("cuNumeric Array", dtype)
+ field = Field("cuPyNumeric Array", dtype)
self._legate_data["data"] = {field: array}
return self._legate_data
@@ -186,7 +187,7 @@ def __legate_data_interface__(self) -> dict[str, Any]:
def __array_function__(
self, func: Any, types: Any, args: tuple[Any], kwargs: dict[str, Any]
) -> Any:
- import cunumeric as cn
+ import cupynumeric as cn
what = func.__name__
@@ -197,19 +198,19 @@ def __array_function__(
return NotImplemented
# We are wrapping all NumPy modules, so we can expect to find every
- # NumPy API call in cuNumeric, even if just an "unimplemented" stub.
+ # NumPy API call in cuPyNumeric, even if just an "unimplemented" stub.
module = reduce(getattr, func.__module__.split(".")[1:], cn)
cn_func = getattr(module, func.__name__)
- # We can't immediately forward to the corresponding cuNumeric
+ # We can't immediately forward to the corresponding cuPyNumeric
# entrypoint. Say that we reached this point because the user code
- # invoked `np.foo(x, bar=True)` where `x` is a `cunumeric.ndarray`. If
- # our implementation of `foo` is not complete, and cannot handle
+ # invoked `np.foo(x, bar=True)` where `x` is a `cupynumeric.ndarray`.
+ # If our implementation of `foo` is not complete, and cannot handle
# `bar=True`, then forwarding this call to `cn.foo` would fail. This
# goes against the semantics of `__array_function__`, which shouldn't
# fail if the custom implementation cannot handle the provided
# arguments. Conversely, if the user calls `cn.foo(x, bar=True)`
- # directly, that means they requested the cuNumeric implementation
+ # directly, that means they requested the cuPyNumeric implementation
# specifically, and the `NotImplementedError` should not be hidden.
if is_implemented(cn_func):
try:
@@ -265,6 +266,12 @@ def __array_ufunc__(
except NotImplementedError:
what = f"the requested combination of arguments to {what}"
+ # special case for @ matmul
+ if what == "matmul.__call__":
+ from .._module import matmul
+
+ return matmul(*inputs, **kwargs)
+
# We cannot handle this ufunc call, so we will fall back to NumPy.
warnings.warn(
FALLBACK_WARNING.format(what=what),
@@ -285,7 +292,7 @@ def T(self) -> ndarray:
See Also
--------
- cunumeric.transpose
+ cupynumeric.transpose
ndarray.transpose
"""
@@ -297,8 +304,8 @@ def base(self) -> npt.NDArray[Any] | None:
Base object if memory is from some other object.
"""
raise NotImplementedError(
- "cunumeric.ndarray doesn't keep track of the array view hierarchy "
- "yet"
+ "cupynumeric.ndarray doesn't keep track of the array view "
+ "hierarchy yet"
)
@property
@@ -313,6 +320,17 @@ def data(self) -> memoryview:
"""
return self.__array__().data
+ def __buffer__(self, flags: int, /) -> memoryview:
+ """
+ Python buffer object pointing to the start of the array's data.
+
+ Notes
+ -----
+ This causes the entire (potentially distributed) array to be collected
+ into one memory.
+ """
+ return self.__array__().__buffer__(flags) # type: ignore
+
@property
def dtype(self) -> np.dtype[Any]:
"""
@@ -332,9 +350,9 @@ def flags(self) -> Any:
"""
Information about the memory layout of the array.
- These flags don't reflect the properties of the cuNumeric array, but
- rather the NumPy array that will be produced if the cuNumeric array is
- materialized on a single node.
+ These flags don't reflect the properties of the cuPyNumeric array, but
+ rather the NumPy array that will be produced if the cuPyNumeric array
+ is materialized on a single node.
Attributes
----------
@@ -416,7 +434,7 @@ def flat(self) -> np.flatiter[npt.NDArray[Any]]:
flatten : Return a copy of the array collapsed into one dimension.
Availability
- --------
+ ------------
Single CPU
"""
@@ -734,7 +752,7 @@ def __divmod__(self, rhs: Any) -> ndarray:
"""
raise NotImplementedError(
- "cunumeric.ndarray doesn't support __divmod__ yet"
+ "cupynumeric.ndarray doesn't support __divmod__ yet"
)
def __eq__(self, rhs: object) -> ndarray: # type: ignore [override]
@@ -787,7 +805,7 @@ def __ge__(self, rhs: Any) -> ndarray:
# __getattribute__
def _convert_key(self, key: Any, first: bool = True) -> Any:
- # Convert any arrays stored in a key to a cuNumeric array
+ # Convert any arrays stored in a key to a cuPyNumeric array
if isinstance(key, slice):
key = slice(
operator.index(key.start) if key.start is not None else None,
@@ -804,9 +822,9 @@ def _convert_key(self, key: Any, first: bool = True) -> Any:
elif isinstance(key, tuple) and first:
return tuple(self._convert_key(k, first=False) for k in key)
else:
- # Otherwise convert it to a cuNumeric array, check types
+ # Otherwise convert it to a cuPyNumeric array, check types
# and get the thunk
- key = convert_to_cunumeric_ndarray(key)
+ key = convert_to_cupynumeric_ndarray(key)
if key.dtype != bool and not np.issubdtype(key.dtype, np.integer):
raise TypeError("index arrays should be int or bool type")
if key.dtype != bool:
@@ -837,7 +855,7 @@ def __gt__(self, rhs: Any) -> ndarray:
return _ufunc.greater(self, rhs)
def __hash__(self) -> int:
- raise TypeError("unhashable type: cunumeric.ndarray")
+ raise TypeError("unhashable type: cupynumeric.ndarray")
def __iadd__(self, rhs: Any) -> ndarray:
"""a.__iadd__(value, /)
@@ -1154,11 +1172,11 @@ def nonzero(self) -> tuple[ndarray, ...]:
Return the indices of the elements that are non-zero.
- Refer to :func:`cunumeric.nonzero` for full documentation.
+ Refer to :func:`cupynumeric.nonzero` for full documentation.
See Also
--------
- cunumeric.nonzero : equivalent function
+ cupynumeric.nonzero : equivalent function
Availability
--------
@@ -1254,7 +1272,7 @@ def __rdivmod__(self, lhs: Any) -> ndarray:
"""
raise NotImplementedError(
- "cunumeric.ndarray doesn't support __rdivmod__ yet"
+ "cupynumeric.ndarray doesn't support __rdivmod__ yet"
)
def __reduce__(self, *args: Any, **kwargs: Any) -> str | tuple[str, ...]:
@@ -1505,11 +1523,11 @@ def all(
Returns True if all elements evaluate to True.
- Refer to :func:`cunumeric.all` for full documentation.
+ Refer to :func:`cupynumeric.all` for full documentation.
See Also
--------
- cunumeric.all : equivalent function
+ cupynumeric.all : equivalent function
Availability
--------
@@ -1540,11 +1558,11 @@ def any(
Returns True if any of the elements of `a` evaluate to True.
- Refer to :func:`cunumeric.any` for full documentation.
+ Refer to :func:`cupynumeric.any` for full documentation.
See Also
--------
- cunumeric.any : equivalent function
+ cupynumeric.any : equivalent function
Availability
--------
@@ -1573,11 +1591,11 @@ def argmax(
Return indices of the maximum values along the given axis.
- Refer to :func:`cunumeric.argmax` for full documentation.
+ Refer to :func:`cupynumeric.argmax` for full documentation.
See Also
--------
- cunumeric.argmax : equivalent function
+ cupynumeric.argmax : equivalent function
Availability
--------
@@ -1608,11 +1626,11 @@ def argmin(
Return indices of the minimum values along the given axis.
- Refer to :func:`cunumeric.argmin` for detailed documentation.
+ Refer to :func:`cupynumeric.argmin` for detailed documentation.
See Also
--------
- cunumeric.argmin : equivalent function
+ cupynumeric.argmin : equivalent function
Availability
--------
@@ -1741,11 +1759,11 @@ def take(
Take elements from an array along an axis.
- Refer to :func:`cunumeric.take` for full documentation.
+ Refer to :func:`cupynumeric.take` for full documentation.
See Also
--------
- cunumeric.take : equivalent function
+ cupynumeric.take : equivalent function
Availability
--------
@@ -1755,7 +1773,7 @@ def take(
if not np.isscalar(indices):
# if indices is a tuple or list, bring sub-tuples to the same shape
# and concatenate them
- indices = convert_to_cunumeric_ndarray(indices)
+ indices = convert_to_cupynumeric_ndarray(indices)
if axis is None:
self = self.ravel()
@@ -1821,11 +1839,11 @@ def choose(
Use an index array to construct a new array from a set of choices.
- Refer to :func:`cunumeric.choose` for full documentation.
+ Refer to :func:`cupynumeric.choose` for full documentation.
See Also
--------
- cunumeric.choose : equivalent function
+ cupynumeric.choose : equivalent function
Availability
--------
@@ -1843,12 +1861,12 @@ def choose(
dtypes = [ch.dtype for ch in choices]
ch_dtype = np.result_type(*dtypes)
choices = tuple(
- convert_to_cunumeric_ndarray(choices[i]).astype(ch_dtype)
+ convert_to_cupynumeric_ndarray(choices[i]).astype(ch_dtype)
for i in range(n)
)
else:
- choices = convert_to_cunumeric_ndarray(choices)
+ choices = convert_to_cupynumeric_ndarray(choices)
n = choices.shape[0]
ch_dtype = choices.dtype
choices = tuple(choices[i, ...] for i in range(n))
@@ -1922,11 +1940,11 @@ def compress(
Return selected slices of an array along given axis.
- Refer to :func:`cunumeric.compress` for full documentation.
+ Refer to :func:`cupynumeric.compress` for full documentation.
See Also
--------
- cunumeric.compress : equivalent function
+ cupynumeric.compress : equivalent function
Availability
--------
@@ -1985,11 +2003,11 @@ def clip(
One of max or min must be given.
- Refer to :func:`cunumeric.clip` for full documentation.
+ Refer to :func:`cupynumeric.clip` for full documentation.
See Also
--------
- cunumeric.clip : equivalent function
+ cupynumeric.clip : equivalent function
Availability
--------
@@ -2005,7 +2023,7 @@ def clip(
)
if args[0].size != 1 or args[1].size != 1:
runtime.warn(
- "cuNumeric has not implemented clip with array-like "
+ "cuPyNumeric has not implemented clip with array-like "
"arguments and is falling back to canonical numpy. You "
"may notice significantly decreased performance for this "
"function call.",
@@ -2015,7 +2033,7 @@ def clip(
self.__array__().clip(args[0], args[1], out=out.__array__())
return out
else:
- return convert_to_cunumeric_ndarray(
+ return convert_to_cupynumeric_ndarray(
self.__array__().clip(args[0], args[1])
)
core_dtype = to_core_type(self.dtype)
@@ -2034,7 +2052,7 @@ def round(
Return a with each element rounded to the given number of decimals.
- Refer to :func:`cunumeric.round` for full documentation.
+ Refer to :func:`cupynumeric.round` for full documentation.
Availability
--------
@@ -2054,11 +2072,11 @@ def conj(self) -> ndarray:
Complex-conjugate all elements.
- Refer to :func:`cunumeric.conjugate` for full documentation.
+ Refer to :func:`cupynumeric.conjugate` for full documentation.
See Also
--------
- cunumeric.conjugate : equivalent function
+ cupynumeric.conjugate : equivalent function
Availability
--------
@@ -2076,11 +2094,11 @@ def conjugate(self) -> ndarray:
Return the complex conjugate, element-wise.
- Refer to :func:`cunumeric.conjugate` for full documentation.
+ Refer to :func:`cupynumeric.conjugate` for full documentation.
See Also
--------
- cunumeric.conjugate : equivalent function
+ cupynumeric.conjugate : equivalent function
Availability
--------
@@ -2099,7 +2117,7 @@ def copy(self, order: OrderType = "C") -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- # We don't care about dimension order in cuNumeric
+ # We don't care about dimension order in cuPyNumeric
return self.__copy__()
@add_boilerplate()
@@ -2274,9 +2292,7 @@ def _diag_helper(
res_dtype = (
dtype
if dtype is not None
- else out.dtype
- if out is not None
- else a.dtype
+ else out.dtype if out is not None else a.dtype
)
a = a._maybe_convert(res_dtype, (a,))
if out is not None and out.shape != out_shape:
@@ -2306,11 +2322,11 @@ def diagonal(
Return specified diagonals.
- Refer to :func:`cunumeric.diagonal` for full documentation.
+ Refer to :func:`cupynumeric.diagonal` for full documentation.
See Also
--------
- cunumeric.diagonal : equivalent function
+ cupynumeric.diagonal : equivalent function
Availability
--------
@@ -2332,11 +2348,11 @@ def put(
"""
Replaces specified elements of the array with given values.
- Refer to :func:`cunumeric.put` for full documentation.
+ Refer to :func:`cupynumeric.put` for full documentation.
See Also
--------
- cunumeric.put : equivalent function
+ cupynumeric.put : equivalent function
Availability
--------
@@ -2395,11 +2411,11 @@ def trace(
Return the sum along diagonals of the array.
- Refer to :func:`cunumeric.trace` for full documentation.
+ Refer to :func:`cupynumeric.trace` for full documentation.
See Also
--------
- cunumeric.trace : equivalent function
+ cupynumeric.trace : equivalent function
Availability
--------
@@ -2436,11 +2452,11 @@ def dot(self, rhs: ndarray, out: ndarray | None = None) -> ndarray:
Return the dot product of this array with ``rhs``.
- Refer to :func:`cunumeric.dot` for full documentation.
+ Refer to :func:`cupynumeric.dot` for full documentation.
See Also
--------
- cunumeric.dot : equivalent function
+ cupynumeric.dot : equivalent function
Availability
--------
@@ -2469,7 +2485,7 @@ def dump(self, file: str | Path) -> None:
Dump a pickle of the array to the specified file.
- The array can be read back with pickle.load or cunumeric.load.
+ The array can be read back with pickle.load or cupynumeric.load.
Parameters
----------
@@ -2538,7 +2554,7 @@ def fft(
Return the ``kind`` ``direction`` FFT of this array
with normalization ``norm``.
- Common entrypoint for FFT functionality in cunumeric.fft module.
+ Common entrypoint for FFT functionality in cupynumeric.fft module.
Notes
-----
@@ -2546,7 +2562,7 @@ def fft(
See Also
--------
- cunumeric.fft : FFT functions for different ``kind`` and
+ cupynumeric.fft : FFT functions for different ``kind`` and
``direction`` arguments
Availability
@@ -2693,7 +2709,7 @@ def flatten(self, order: OrderType = "C") -> ndarray:
def getfield(self, dtype: np.dtype[Any], offset: int = 0) -> None:
raise NotImplementedError(
- "cuNumeric does not currently support type reinterpretation "
+ "cuPyNumeric does not currently support type reinterpretation "
"for ndarray.getfield"
)
@@ -2815,11 +2831,11 @@ def max(
Return the maximum along a given axis.
- Refer to :func:`cunumeric.amax` for full documentation.
+ Refer to :func:`cupynumeric.amax` for full documentation.
See Also
--------
- cunumeric.amax : equivalent function
+ cupynumeric.amax : equivalent function
Availability
--------
@@ -2906,11 +2922,11 @@ def mean(
Returns the average of the array elements along given axis.
- Refer to :func:`cunumeric.mean` for full documentation.
+ Refer to :func:`cupynumeric.mean` for full documentation.
See Also
--------
- cunumeric.mean : equivalent function
+ cupynumeric.mean : equivalent function
Availability
--------
@@ -2919,7 +2935,7 @@ def mean(
"""
if axis is not None and not isinstance(axis, int):
raise NotImplementedError(
- "cunumeric.mean only supports int types for "
+ "cupynumeric.mean only supports int types for "
"`axis` currently"
)
@@ -2994,11 +3010,11 @@ def var(
Returns the variance of the array elements along given axis.
- Refer to :func:`cunumeric.var` for full documentation.
+ Refer to :func:`cupynumeric.var` for full documentation.
See Also
--------
- cunumeric.var : equivalent function
+ cupynumeric.var : equivalent function
Availability
--------
@@ -3007,7 +3023,7 @@ def var(
"""
if axis is not None and not isinstance(axis, int):
raise NotImplementedError(
- "cunumeric.var only supports int types for `axis` currently"
+ "cupynumeric.var only supports int types for `axis` currently"
)
# this could be computed as a single pass through the array
@@ -3017,7 +3033,7 @@ def var(
# directly as <(x-mu)^2>, which then requires two passes through the
# data to first compute the mean and then compute the variance
# see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- # TODO(https://github.com/nv-legate/cunumeric/issues/590)
+ # TODO(https://github.com/nv-legate/cupynumeric/issues/590)
dtype = self._summation_dtype(dtype)
# calculate the mean, but keep the dimensions so that the
@@ -3044,7 +3060,7 @@ def var(
args=(Scalar(mu.__array__(), to_core_type(self.dtype)),),
)
else:
- # TODO(https://github.com/nv-legate/cunumeric/issues/591)
+ # TODO(https://github.com/nv-legate/cupynumeric/issues/591)
# there isn't really support for generic binary reductions
# right now all of the current binary reductions are boolean
# reductions like allclose. To implement this a single pass would
@@ -3088,11 +3104,11 @@ def min(
Return the minimum along a given axis.
- Refer to :func:`cunumeric.amin` for full documentation.
+ Refer to :func:`cupynumeric.amin` for full documentation.
See Also
--------
- cunumeric.amin : equivalent function
+ cupynumeric.amin : equivalent function
Availability
--------
@@ -3121,11 +3137,11 @@ def partition(
Partition of an array in-place.
- Refer to :func:`cunumeric.partition` for full documentation.
+ Refer to :func:`cupynumeric.partition` for full documentation.
See Also
--------
- cunumeric.partition : equivalent function
+ cupynumeric.partition : equivalent function
Availability
--------
@@ -3149,11 +3165,11 @@ def argpartition(
Returns the indices that would partition this array.
- Refer to :func:`cunumeric.argpartition` for full documentation.
+ Refer to :func:`cupynumeric.argpartition` for full documentation.
See Also
--------
- cunumeric.argpartition : equivalent function
+ cupynumeric.argpartition : equivalent function
Availability
--------
@@ -3186,11 +3202,11 @@ def prod(
Return the product of the array elements over the given axis
- Refer to :func:`cunumeric.prod` for full documentation.
+ Refer to :func:`cupynumeric.prod` for full documentation.
See Also
--------
- cunumeric.prod : equivalent function
+ cupynumeric.prod : equivalent function
Availability
--------
@@ -3213,11 +3229,11 @@ def ravel(self, order: OrderType = "C") -> ndarray:
Return a flattened array.
- Refer to :func:`cunumeric.ravel` for full documentation.
+ Refer to :func:`cupynumeric.ravel` for full documentation.
See Also
--------
- cunumeric.ravel : equivalent function
+ cupynumeric.ravel : equivalent function
ndarray.flat : a flat iterator on the array.
Availability
@@ -3232,11 +3248,11 @@ def reshape(self, *args: Any, order: OrderType = "C") -> ndarray:
Returns an array containing the same data with a new shape.
- Refer to :func:`cunumeric.reshape` for full documentation.
+ Refer to :func:`cupynumeric.reshape` for full documentation.
See Also
--------
- cunumeric.reshape : equivalent function
+ cupynumeric.reshape : equivalent function
Availability
@@ -3307,7 +3323,7 @@ def setfield(
self, val: Any, dtype: npt.DTypeLike, offset: int = 0
) -> None:
raise NotImplementedError(
- "cuNumeric does not currently support type reinterpretation "
+ "cuPyNumeric does not currently support type reinterpretation "
"for ndarray.setfield"
)
@@ -3415,7 +3431,7 @@ def searchsorted(
raise ValueError("Dimension mismatch: self must be a 1D array")
# this is needed in case v is a scalar
- v_ndarray = convert_to_cunumeric_ndarray(v)
+ v_ndarray = convert_to_cupynumeric_ndarray(v)
a = self
# in case we have different dtypes we ned to find a common type
@@ -3459,11 +3475,11 @@ def sort(
Sort an array in-place.
- Refer to :func:`cunumeric.sort` for full documentation.
+ Refer to :func:`cupynumeric.sort` for full documentation.
See Also
--------
- cunumeric.sort : equivalent function
+ cupynumeric.sort : equivalent function
Availability
--------
@@ -3483,11 +3499,11 @@ def argsort(
Returns the indices that would sort this array.
- Refer to :func:`cunumeric.argsort` for full documentation.
+ Refer to :func:`cupynumeric.argsort` for full documentation.
See Also
--------
- cunumeric.argsort : equivalent function
+ cupynumeric.argsort : equivalent function
Availability
--------
@@ -3505,11 +3521,11 @@ def squeeze(self, axis: Any = None) -> ndarray:
Remove axes of length one from `a`.
- Refer to :func:`cunumeric.squeeze` for full documentation.
+ Refer to :func:`cupynumeric.squeeze` for full documentation.
See Also
--------
- cunumeric.squeeze : equivalent function
+ cupynumeric.squeeze : equivalent function
Availability
--------
@@ -3546,11 +3562,11 @@ def sum(
Return the sum of the array elements over the given axis.
- Refer to :func:`cunumeric.sum` for full documentation.
+ Refer to :func:`cupynumeric.sum` for full documentation.
See Also
--------
- cunumeric.sum : equivalent function
+ cupynumeric.sum : equivalent function
Availability
--------
@@ -3601,11 +3617,11 @@ def swapaxes(self, axis1: Any, axis2: Any) -> ndarray:
Return a view of the array with `axis1` and `axis2` interchanged.
- Refer to :func:`cunumeric.swapaxes` for full documentation.
+ Refer to :func:`cupynumeric.swapaxes` for full documentation.
See Also
--------
- cunumeric.swapaxes : equivalent function
+ cupynumeric.swapaxes : equivalent function
Availability
--------
@@ -3703,7 +3719,7 @@ def tolist(self) -> Any:
Return a copy of the array data as a (nested) Python list.
Data items are converted to the nearest compatible builtin Python
- type, via the `~cunumeric.ndarray.item` function.
+ type, via the `~cupynumeric.ndarray.item` function.
If ``a.ndim`` is 0, then since the depth of the nested list is 0, it
will not be a list at all, but a simple Python scalar.
@@ -3720,7 +3736,7 @@ def tolist(self) -> Any:
Notes
-----
- The array may be recreated via ``a = cunumeric.array(a.tolist())``,
+ The array may be recreated via ``a = cupynumeric.array(a.tolist())``,
although this may sometimes lose precision.
Availability
@@ -3856,7 +3872,7 @@ def view(
Notes
-----
- cuNumeric does not currently support type reinterpretation, or
+ cuPyNumeric does not currently support type reinterpretation, or
conversion to ndarray sub-classes; use :func:`ndarray.__array__()` to
convert to `numpy.ndarray`.
@@ -3870,11 +3886,11 @@ def view(
"""
if dtype is not None and dtype != self.dtype:
raise NotImplementedError(
- "cuNumeric does not currently support type reinterpretation"
+ "cuPyNumeric does not currently support type reinterpretation"
)
if type is not None:
raise NotImplementedError(
- "cuNumeric does not currently support conversion to ndarray "
+ "cuPyNumeric does not currently support conversion to ndarray "
"sub-classes; use __array__() to convert to numpy.ndarray"
)
return ndarray(
@@ -3889,11 +3905,11 @@ def unique(self) -> ndarray:
Find the unique elements of an array.
- Refer to :func:`cunumeric.unique` for full documentation.
+ Refer to :func:`cupynumeric.unique` for full documentation.
See Also
--------
- cunumeric.unique : equivalent function
+ cupynumeric.unique : equivalent function
Availability
--------
@@ -3939,12 +3955,12 @@ def stencil_hint(
high_offsets: tuple[int, ...],
) -> None:
"""
- Inform cuNumeric that this array will be used in a stencil computation
- in the following code.
+ Inform cuPyNumeric that this array will be used in a stencil
+ computation in the following code.
- This allows cuNumeric to allocate space for the "ghost" elements ahead
- of time, rather than discover the full extent of accesses incrementally,
- and thus avoid intermediate copies.
+ This allows cuPyNumeric to allocate space for the "ghost" elements
+ ahead of time, rather than discovering the full extent of accesses
+ incrementally, and thus avoid intermediate copies.
For example, let's say we have a 1-D array A of size 10 and we want to
partition A across two GPUs. By default, A would be partitioned equally
@@ -3953,8 +3969,8 @@ def stencil_hint(
`B = A[:9] + A[1:]`. The runtime would now need to adjust the
partitioning such that GPU0 has elements 0-5 and GPU1 has elements 4-9
inclusive. Since the original instance on GPU0 does not cover index 5,
- cuNumeric needs to allocate a full new instance that covers 0-5, leading
- to an extra copy. In this case, if the code calls
+ cuPyNumeric needs to allocate a full new instance that covers 0-5,
+ leading to an extra copy. In this case, if the code calls
`A.stencil_hint([1], [1])` to pre-allocate instances that contain the
extra elements before it uses A, the extra copies can be avoided.
diff --git a/cunumeric/_array/flags.py b/cupynumeric/_array/flags.py
similarity index 91%
rename from cunumeric/_array/flags.py
rename to cupynumeric/_array/flags.py
index 0ed9c81e31..d58a5480ab 100644
--- a/cunumeric/_array/flags.py
+++ b/cupynumeric/_array/flags.py
@@ -24,8 +24,8 @@ class flagsobj:
"""
Information about the memory layout of the array.
- These flags don't reflect the properties of the cuNumeric array, but
- rather the NumPy array that will be produced if the cuNumeric array is
+ These flags don't reflect the properties of the cuPyNumeric array, but
+ rather the NumPy array that will be produced if the cuPyNumeric array is
materialized on a single node.
"""
@@ -78,5 +78,5 @@ def __setitem__(self, key: str, value: Any) -> None:
def _check_writeable(self, value: Any) -> None:
if value and not self._array._writeable:
raise ValueError(
- "non-writeable cunumeric arrays cannot be made writeable"
+ "non-writeable cupynumeric arrays cannot be made writeable"
)
diff --git a/cunumeric/_array/thunk.py b/cupynumeric/_array/thunk.py
similarity index 100%
rename from cunumeric/_array/thunk.py
rename to cupynumeric/_array/thunk.py
diff --git a/cunumeric/_array/util.py b/cupynumeric/_array/util.py
similarity index 90%
rename from cunumeric/_array/util.py
rename to cupynumeric/_array/util.py
index 6dc3f68a0e..e0096db857 100644
--- a/cunumeric/_array/util.py
+++ b/cupynumeric/_array/util.py
@@ -47,11 +47,11 @@ def add_boilerplate(
*array_params: str,
) -> Callable[[Callable[P, R]], Callable[P, R]]:
"""
- Adds required boilerplate to the wrapped cunumeric.ndarray or module-level
- function.
+ Adds required boilerplate to the wrapped cupynumeric.ndarray or
+ module-level function.
Every time the wrapped function is called, this wrapper will convert all
- specified array-like parameters to cuNumeric ndarrays. Additionally, any
+ specified array-like parameters to cuPyNumeric ndarrays. Additionally, any
"out" or "where" arguments will also always be automatically converted.
"""
to_convert = set(array_params)
@@ -86,11 +86,11 @@ def wrapper(*args: Any, **kwargs: Any) -> R:
for idx, arg in enumerate(args):
if idx in indices and arg is not None:
if idx == out_idx:
- arg = convert_to_cunumeric_ndarray(arg, share=True)
+ arg = convert_to_cupynumeric_ndarray(arg, share=True)
if not arg.flags.writeable:
raise ValueError("out is not writeable")
else:
- arg = convert_to_cunumeric_ndarray(arg)
+ arg = convert_to_cupynumeric_ndarray(arg)
converted_args.append(arg)
args = tuple(converted_args)
@@ -99,11 +99,13 @@ def wrapper(*args: Any, **kwargs: Any) -> R:
for k, v in kwargs.items():
if k in to_convert and v is not None:
if k == "out":
- kwargs[k] = convert_to_cunumeric_ndarray(v, share=True)
+ kwargs[k] = convert_to_cupynumeric_ndarray(
+ v, share=True
+ )
if not kwargs[k].flags.writeable:
raise ValueError("out is not writeable")
else:
- kwargs[k] = convert_to_cunumeric_ndarray(v)
+ kwargs[k] = convert_to_cupynumeric_ndarray(v)
return func(*args, **kwargs)
@@ -120,7 +122,7 @@ def broadcast_where(where: ndarray | None, shape: NdShape) -> ndarray | None:
return where
-def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray:
+def convert_to_cupynumeric_ndarray(obj: Any, share: bool = False) -> ndarray:
from .array import ndarray
# If this is an instance of one of our ndarrays then we're done
@@ -136,7 +138,7 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray:
def maybe_convert_to_np_ndarray(obj: Any) -> Any:
"""
- Converts cuNumeric arrays into NumPy arrays, otherwise has no effect.
+ Converts cuPyNumeric arrays into NumPy arrays, otherwise has no effect.
"""
from ..ma import MaskedArray
from .array import ndarray
diff --git a/cunumeric/_module/__init__.py b/cupynumeric/_module/__init__.py
similarity index 96%
rename from cunumeric/_module/__init__.py
rename to cupynumeric/_module/__init__.py
index 86a4105bb0..e96566d914 100644
--- a/cunumeric/_module/__init__.py
+++ b/cupynumeric/_module/__init__.py
@@ -140,7 +140,7 @@
def test(*args: Any, **kw: Any) -> None:
warn(
- "cuNumeric cannot execute numpy.test() due to reliance "
+ "cuPyNumeric cannot execute numpy.test() due to reliance "
"on Numpy internals. For information about running the "
- "cuNumeric test suite, see: https://docs.nvidia.com/cunumeric/latest/developer/index.html"
+ "cuPyNumeric test suite, see: https://docs.nvidia.com/cupynumeric/latest/developer/index.html"
)
diff --git a/cunumeric/_module/_unary_red_utils.py b/cupynumeric/_module/_unary_red_utils.py
similarity index 100%
rename from cunumeric/_module/_unary_red_utils.py
rename to cupynumeric/_module/_unary_red_utils.py
diff --git a/cunumeric/_module/array_basic.py b/cupynumeric/_module/array_basic.py
similarity index 100%
rename from cunumeric/_module/array_basic.py
rename to cupynumeric/_module/array_basic.py
diff --git a/cunumeric/_module/array_dimension.py b/cupynumeric/_module/array_dimension.py
similarity index 96%
rename from cunumeric/_module/array_dimension.py
rename to cupynumeric/_module/array_dimension.py
index b75bf45404..01629b2cb9 100644
--- a/cunumeric/_module/array_dimension.py
+++ b/cupynumeric/_module/array_dimension.py
@@ -19,7 +19,7 @@
import numpy as np
from .._array.array import ndarray
-from .._array.util import add_boilerplate, convert_to_cunumeric_ndarray
+from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray
from .._utils import is_np2
from .creation_data import array
@@ -45,7 +45,7 @@ def _reshape_recur(ndim: int, arr: ndarray) -> tuple[int, ...]:
def _atleast_nd(ndim: int, arys: Sequence[ndarray]) -> list[ndarray] | ndarray:
- inputs = list(convert_to_cunumeric_ndarray(arr) for arr in arys)
+ inputs = list(convert_to_cupynumeric_ndarray(arr) for arr in arys)
# 'reshape' change the shape of arrays
# only when arr.shape != _reshape_recur(ndim,arr)
result = list(arr.reshape(_reshape_recur(ndim, arr)) for arr in inputs)
@@ -251,7 +251,7 @@ def broadcast_to(
The shape of the desired array.
A single integer i is interpreted as (i,).
subok : bool, optional
- This option is ignored by cuNumeric.
+ This option is ignored by cuPyNumeric.
Returns
-------
@@ -298,7 +298,7 @@ def broadcast_arrays(*args: Any, subok: bool = False) -> list[ndarray]:
The arrays to broadcast.
subok : bool, optional
- This option is ignored by cuNumeric
+ This option is ignored by cuPyNumeric
Returns
-------
@@ -314,7 +314,7 @@ def broadcast_arrays(*args: Any, subok: bool = False) -> list[ndarray]:
Multiple GPUs, Multiple CPUs
"""
- arrs = [convert_to_cunumeric_ndarray(arr) for arr in args]
+ arrs = [convert_to_cupynumeric_ndarray(arr) for arr in args]
return _broadcast_arrays(arrs, subok=subok)
@@ -337,7 +337,7 @@ class broadcast:
"""
def __init__(self, *arrays: Any) -> None:
- arrs = [convert_to_cunumeric_ndarray(arr) for arr in arrays]
+ arrs = [convert_to_cupynumeric_ndarray(arr) for arr in arrays]
broadcasted = _broadcast_arrays(arrs)
self._iters = tuple(arr.flat for arr in broadcasted)
self._index = 0
diff --git a/cunumeric/_module/array_joining.py b/cupynumeric/_module/array_joining.py
similarity index 96%
rename from cunumeric/_module/array_joining.py
rename to cupynumeric/_module/array_joining.py
index 13956a7aad..fbdf2adda4 100644
--- a/cunumeric/_module/array_joining.py
+++ b/cupynumeric/_module/array_joining.py
@@ -20,7 +20,7 @@
import numpy as np
from .._array.array import ndarray
-from .._array.util import convert_to_cunumeric_ndarray
+from .._array.util import convert_to_cupynumeric_ndarray
from .._utils import is_np2
from .array_dimension import _atleast_nd
@@ -82,7 +82,7 @@ def check_list_depth(arr: Any, prefix: NdShape = (0,)) -> int:
"List depths are mismatched. First element was at depth "
f"{first_depth}, but there is an element at"
f" depth {other_depth}, "
- f"arrays{convert_to_array_form(prefix+(idx+1,))}"
+ f"arrays{convert_to_array_form(prefix + (idx + 1,))}"
)
return depths[0] + 1
@@ -121,7 +121,7 @@ def check_shape_dtype_without_axis(
if len(inputs) == 0:
raise ValueError("need at least one array to concatenate")
- inputs = list(convert_to_cunumeric_ndarray(inp) for inp in inputs)
+ inputs = list(convert_to_cupynumeric_ndarray(inp) for inp in inputs)
ndim = inputs[0].ndim
shape = inputs[0].shape
@@ -184,7 +184,7 @@ def _block_collect_slices(
# flatten lists of slices into a single list
slices = list(chain(*updated_slices))
else:
- arrays = list(convert_to_cunumeric_ndarray(inp) for inp in arr)
+ arrays = list(convert_to_cupynumeric_ndarray(inp) for inp in arr)
common_shape = arrays[0].shape
if len(arr) > 1:
arrays, common_info = check_shape_dtype_without_axis(
@@ -248,7 +248,7 @@ def _concatenate(
shape=out_shape, dtype=common_info.dtype, inputs=inputs
)
else:
- out = convert_to_cunumeric_ndarray(out)
+ out = convert_to_cupynumeric_ndarray(out)
if not isinstance(out, ndarray):
raise TypeError("out should be ndarray")
elif list(out.shape) != out_shape:
@@ -295,8 +295,8 @@ def append(arr: ndarray, values: ndarray, axis: int | None = None) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- # Check to see if we can build a new tuple of cuNumeric arrays
- inputs = list(convert_to_cunumeric_ndarray(inp) for inp in [arr, values])
+ # Check to see if we can build a new tuple of cuPyNumeric arrays
+ inputs = list(convert_to_cupynumeric_ndarray(inp) for inp in [arr, values])
return concatenate(inputs, axis)
@@ -427,14 +427,14 @@ def concatenate(
inputs = list(inp.ravel() for inp in reshaped)
axis = 0
- # Check to see if we can build a new tuple of cuNumeric arrays
- cunumeric_inputs, common_info = check_shape_dtype_without_axis(
+ # Check to see if we can build a new tuple of cuPyNumeric arrays
+ cupynumeric_inputs, common_info = check_shape_dtype_without_axis(
inputs, concatenate.__name__, dtype, casting
)
- check_shape_with_axis(cunumeric_inputs, concatenate.__name__, axis)
+ check_shape_with_axis(cupynumeric_inputs, concatenate.__name__, axis)
return _concatenate(
- cunumeric_inputs,
+ cupynumeric_inputs,
common_info,
axis,
out,
diff --git a/cunumeric/_module/array_rearrange.py b/cupynumeric/_module/array_rearrange.py
similarity index 97%
rename from cunumeric/_module/array_rearrange.py
rename to cupynumeric/_module/array_rearrange.py
index ea30e08746..7f27075835 100644
--- a/cunumeric/_module/array_rearrange.py
+++ b/cupynumeric/_module/array_rearrange.py
@@ -68,7 +68,7 @@ def flip(m: ndarray, axis: NdShapeLike | None = None) -> ndarray:
Notes
-----
- cuNumeric implementation doesn't return a view, it returns a new array
+ cuPyNumeric implementation doesn't return a view, it returns a new array
"""
return m.flip(axis=axis)
@@ -101,7 +101,7 @@ def flipud(m: ndarray) -> ndarray:
Notes
-----
- cuNumeric implementation doesn't return a view, it returns a new array
+ cuPyNumeric implementation doesn't return a view, it returns a new array
"""
if m.ndim < 1:
raise ValueError("Input must be >= 1-d.")
@@ -137,7 +137,7 @@ def fliplr(m: ndarray) -> ndarray:
Notes
-----
- cuNumeric implementation doesn't return a view, it returns a new array
+ cuPyNumeric implementation doesn't return a view, it returns a new array
"""
if m.ndim < 2:
raise ValueError("Input must be >= 2-d.")
diff --git a/cunumeric/_module/array_shape.py b/cupynumeric/_module/array_shape.py
similarity index 100%
rename from cunumeric/_module/array_shape.py
rename to cupynumeric/_module/array_shape.py
diff --git a/cunumeric/_module/array_splitting.py b/cupynumeric/_module/array_splitting.py
similarity index 98%
rename from cunumeric/_module/array_splitting.py
rename to cupynumeric/_module/array_splitting.py
index dd4a9e2b1d..4462ee5e69 100644
--- a/cunumeric/_module/array_splitting.py
+++ b/cupynumeric/_module/array_splitting.py
@@ -19,7 +19,7 @@
import numpy as np
from .._array.array import ndarray
-from .._array.util import convert_to_cunumeric_ndarray
+from .._array.util import convert_to_cupynumeric_ndarray
if TYPE_CHECKING:
import numpy.typing as npt
@@ -99,7 +99,7 @@ def array_split(
--------
Multiple GPUs, Multiple CPUs
"""
- array = convert_to_cunumeric_ndarray(a)
+ array = convert_to_cupynumeric_ndarray(a)
split_pts = []
if axis >= array.ndim:
raise ValueError(
diff --git a/cunumeric/_module/array_tiling.py b/cupynumeric/_module/array_tiling.py
similarity index 97%
rename from cunumeric/_module/array_tiling.py
rename to cupynumeric/_module/array_tiling.py
index 72e5287bc2..6dca2939d6 100644
--- a/cunumeric/_module/array_tiling.py
+++ b/cupynumeric/_module/array_tiling.py
@@ -19,7 +19,7 @@
import numpy as np
from .._array.array import ndarray
-from .._array.util import add_boilerplate, convert_to_cunumeric_ndarray
+from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray
from .._utils import is_np2
from ..runtime import runtime
from .creation_shape import full
@@ -183,9 +183,9 @@ def repeat(a: ndarray, repeats: Any, axis: int | None = None) -> ndarray:
)
# array is an array
- array = convert_to_cunumeric_ndarray(a)
+ array = convert_to_cupynumeric_ndarray(a)
if np.ndim(repeats) == 1:
- repeats = convert_to_cunumeric_ndarray(repeats)
+ repeats = convert_to_cupynumeric_ndarray(repeats)
# if no axes specified, flatten array
if axis is None:
diff --git a/cunumeric/_module/array_transpose.py b/cupynumeric/_module/array_transpose.py
similarity index 100%
rename from cunumeric/_module/array_transpose.py
rename to cupynumeric/_module/array_transpose.py
diff --git a/cunumeric/_module/binary_bit_packing.py b/cupynumeric/_module/binary_bit_packing.py
similarity index 100%
rename from cunumeric/_module/binary_bit_packing.py
rename to cupynumeric/_module/binary_bit_packing.py
diff --git a/cunumeric/_module/creation_data.py b/cupynumeric/_module/creation_data.py
similarity index 100%
rename from cunumeric/_module/creation_data.py
rename to cupynumeric/_module/creation_data.py
diff --git a/cunumeric/_module/creation_matrices.py b/cupynumeric/_module/creation_matrices.py
similarity index 98%
rename from cunumeric/_module/creation_matrices.py
rename to cupynumeric/_module/creation_matrices.py
index 7b97ef488f..540276c532 100644
--- a/cunumeric/_module/creation_matrices.py
+++ b/cupynumeric/_module/creation_matrices.py
@@ -30,7 +30,7 @@ def diag(v: ndarray, k: int = 0) -> ndarray:
Extract a diagonal or construct a diagonal array.
- See the more detailed documentation for ``cunumeric.diagonal`` if you use
+ See the more detailed documentation for ``cupynumeric.diagonal`` if you use
this function to extract a diagonal and wish to write to the resulting
array; whether it returns a copy or a view depends on what version of numpy
you are using.
diff --git a/cunumeric/_module/creation_ranges.py b/cupynumeric/_module/creation_ranges.py
similarity index 98%
rename from cunumeric/_module/creation_ranges.py
rename to cupynumeric/_module/creation_ranges.py
index ca72f401e4..dc09d8ad09 100644
--- a/cunumeric/_module/creation_ranges.py
+++ b/cupynumeric/_module/creation_ranges.py
@@ -15,7 +15,8 @@
from __future__ import annotations
import math
-from typing import TYPE_CHECKING, Any
+from types import EllipsisType
+from typing import TYPE_CHECKING
import numpy as np
@@ -49,7 +50,7 @@ def arange(
`range` function, but returns an ndarray rather than a list.
When using a non-integer step, such as 0.1, the results will often not
- be consistent. It is better to use `cunumeric.linspace` for these cases.
+ be consistent. It is better to use `cupynumeric.linspace` for these cases.
Parameters
----------
@@ -180,7 +181,7 @@ def linspace(
delta = stop - start
y = arange(0, num, dtype=dt)
- out: tuple[Any, ...] # EllipsisType not even in typing_extensions yet
+ out: tuple[int | EllipsisType | slice, ...]
# Reshape these arrays into dimensions that allow them to broadcast
if delta.ndim > 0:
diff --git a/cunumeric/_module/creation_shape.py b/cupynumeric/_module/creation_shape.py
similarity index 94%
rename from cunumeric/_module/creation_shape.py
rename to cupynumeric/_module/creation_shape.py
index b208bc57bd..d14aa7298d 100644
--- a/cunumeric/_module/creation_shape.py
+++ b/cupynumeric/_module/creation_shape.py
@@ -38,7 +38,8 @@ def empty(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray:
shape : int or tuple[int]
Shape of the empty array.
dtype : data-type, optional
- Desired output data-type for the array. Default is `cunumeric.float64`.
+ Desired output data-type for the array. Default is
+ ``cupynumeric.float64``.
Returns
-------
@@ -189,7 +190,7 @@ def ones(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray:
shape : int or tuple[int]
Shape of the new array.
dtype : data-type, optional
- The desired data-type for the array. Default is `cunumeric.float64`.
+ The desired data-type for the array. Default is `cupynumeric.float64`.
Returns
-------
@@ -256,7 +257,7 @@ def zeros(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray:
shape : int or tuple[int]
Shape of the new array.
dtype : data-type, optional
- The desired data-type for the array. Default is `cunumeric.float64`.
+ The desired data-type for the array. Default is `cupynumeric.float64`.
Returns
-------
@@ -331,7 +332,7 @@ def full(
Fill value.
dtype : data-type, optional
The desired data-type for the array The default, None, means
- `cunumeric.array(fill_value).dtype`.
+ `cupynumeric.array(fill_value).dtype`.
Returns
-------
@@ -351,6 +352,8 @@ def full(
else:
dtype = np.dtype(dtype)
val = np.array(value, dtype=dtype)
+ if np.dtype(dtype).itemsize == 1 and value > 255:
+ raise OverflowError(f"Value {value} out of bounds for {dtype}")
result = empty(shape, dtype=val.dtype)
result._thunk.fill(val)
return result
@@ -395,6 +398,8 @@ def full_like(
dtype = np.dtype(dtype)
else:
dtype = a.dtype
+ if np.dtype(dtype).itemsize == 1 and value > 255:
+ raise OverflowError(f"Value {value} out of bounds for {dtype}")
result = empty_like(a, dtype=dtype, shape=shape)
val = np.array(value).astype(dtype)
result._thunk.fill(val)
diff --git a/cunumeric/_module/indexing.py b/cupynumeric/_module/indexing.py
similarity index 95%
rename from cunumeric/_module/indexing.py
rename to cupynumeric/_module/indexing.py
index 30f4c1633b..3af4622565 100644
--- a/cunumeric/_module/indexing.py
+++ b/cupynumeric/_module/indexing.py
@@ -22,7 +22,7 @@
from .._array.util import (
add_boilerplate,
check_writeable,
- convert_to_cunumeric_ndarray,
+ convert_to_cupynumeric_ndarray,
)
from .._utils import is_np2
from .._utils.array import calculate_volume
@@ -195,7 +195,7 @@ def mask_indices(
Assume `mask_func` is a function that, for a square array a of size
``(n, n)`` with a possible offset argument `k`, when called as
``mask_func(a, k)`` returns a new array with zeros in certain locations
- (functions like :func:`cunumeric.triu` or :func:`cunumeric.tril`
+ (functions like :func:`cupynumeric.triu` or :func:`cupynumeric.tril`
do precisely this). Then this function returns the indices where
the non-zero values would be located.
@@ -205,12 +205,12 @@ def mask_indices(
The returned indices will be valid to access arrays of shape (n, n).
mask_func : callable
A function whose call signature is similar to that of
- :func:`cunumeric.triu`, :func:`cunumeric.tril`.
+ :func:`cupynumeric.triu`, :func:`cupynumeric.tril`.
That is, ``mask_func(x, k)`` returns a boolean array, shaped like `x`.
`k` is an optional argument to the function.
k : scalar
An optional argument which is passed through to `mask_func`. Functions
- like :func:`cunumeric.triu`, :func:`cunumeric,tril`
+ like :func:`cupynumeric.triu`, :func:`cupynumeric.tril`
take a second argument that is interpreted as an offset.
Returns
@@ -225,10 +225,10 @@ def mask_indices(
Notes
-----
- WARNING: `mask_indices` expects `mask_function` to call cuNumeric functions
- for good performance. In case non-cuNumeric functions are called by
- `mask_function`, cuNumeric will have to materialize all data on the host
- which might result in running out of system memory.
+ WARNING: ``mask_indices`` expects ``mask_function`` to call cuPyNumeric
+ functions for good performance. In case non-cuPyNumeric functions are
+ called by ``mask_function``, cuPyNumeric will have to materialize all data
+ on the host which might result in running out of system memory.
Availability
--------
@@ -238,7 +238,7 @@ def mask_indices(
a = ones((n, n), dtype=bool)
if not is_implemented(mask_func):
runtime.warn(
- "Calling non-cuNumeric functions in mask_func can result in bad "
+ "Calling non-cuPyNumeric functions in mask_func can result in bad "
"performance",
category=UserWarning,
)
@@ -389,7 +389,7 @@ def tril_indices(
The row dimension of the arrays for which the returned
indices will be valid.
k : int, optional
- Diagonal offset (see :func:`cunumeric.tril` for details).
+ Diagonal offset (see :func:`cupynumeric.tril` for details).
m : int, optional
The column dimension of the arrays for which the returned
indices will be valid.
@@ -422,7 +422,7 @@ def tril_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]:
"""
Return the indices for the lower-triangle of arr.
- See :func:`cunumeric.tril_indices` for full details.
+ See :func:`cupynumeric.tril_indices` for full details.
Parameters
----------
@@ -430,7 +430,7 @@ def tril_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]:
The indices will be valid for arrays whose dimensions are
the same as arr.
k : int, optional
- Diagonal offset (see :func:`cunumeric.tril` for details).
+ Diagonal offset (see :func:`cupynumeric.tril` for details).
Returns
-------
@@ -468,7 +468,7 @@ def triu_indices(
The size of the arrays for which the returned indices will
be valid.
k : int, optional
- Diagonal offset (see :func:`cunumeric.triu` for details).
+ Diagonal offset (see :func:`cupynumeric.triu` for details).
m : int, optional
The column dimension of the arrays for which the returned
arrays will be valid.
@@ -501,7 +501,7 @@ def triu_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]:
"""
Return the indices for the upper-triangle of arr.
- See :func:`cunumeric.triu_indices` for full details.
+ See :func:`cupynumeric.triu_indices` for full details.
Parameters
----------
@@ -509,7 +509,7 @@ def triu_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]:
The indices will be valid for arrays whose dimensions are
the same as arr.
k : int, optional
- Diagonal offset (see :func:`cunumeric.triu` for details).
+ Diagonal offset (see :func:`cupynumeric.triu` for details).
Returns
-------
@@ -674,7 +674,7 @@ def take_along_axis(a: ndarray, indices: ndarray, axis: int | None) -> ndarray:
latter. These slices can be different lengths.
Functions returning an index along an axis, like
- :func:`cunumeric.argsort` and :func:`cunumeric.argpartition`,
+ :func:`cupynumeric.argsort` and :func:`cupynumeric.argpartition`,
produce suitable indices for this function.
Parameters
@@ -688,7 +688,7 @@ def take_along_axis(a: ndarray, indices: ndarray, axis: int | None) -> ndarray:
axis : int
The axis to take 1d slices along. If axis is None, the input array is
treated as if it had first been flattened to 1d, for consistency with
- :func:`cunumeric.sort` and :func:`cunumeric.argsort`.
+ :func:`cupynumeric.sort` and :func:`cupynumeric.argsort`.
Returns
-------
@@ -738,9 +738,9 @@ def put_along_axis(
the index and data arrays, and uses the former to place values into the
latter. These slices can be different lengths.
- Functions returning an index along an axis, like :func:`cunumeric.argsort`
- and :func:`cunumeric.argpartition`, produce suitable indices for
- this function.
+ Functions returning an index along an axis, like
+ :func:`cupynumeric.argsort` and :func:`cupynumeric.argpartition`, produce
+ suitable indices for this function.
Parameters
----------
@@ -924,14 +924,14 @@ def select(
if len(condlist) == 0:
raise ValueError("select with an empty condition list is not possible")
- condlist_ = tuple(convert_to_cunumeric_ndarray(c) for c in condlist)
+ condlist_ = tuple(convert_to_cupynumeric_ndarray(c) for c in condlist)
for i, c in enumerate(condlist_):
if c.dtype != bool:
raise TypeError(
f"invalid entry {i} in condlist: should be boolean ndarray"
)
- choicelist_ = tuple(convert_to_cunumeric_ndarray(c) for c in choicelist)
+ choicelist_ = tuple(convert_to_cupynumeric_ndarray(c) for c in choicelist)
common_type = np.result_type(*choicelist_, default)
args = condlist_ + choicelist_
choicelist_ = tuple(
@@ -1065,7 +1065,7 @@ def diagonal(
Notes
-----
- Unlike NumPy's, the cuNumeric implementation always returns a copy
+ Unlike NumPy's, the cuPyNumeric implementation always returns a copy
See Also
--------
diff --git a/cunumeric/_module/io_numpy.py b/cupynumeric/_module/io_numpy.py
similarity index 96%
rename from cunumeric/_module/io_numpy.py
rename to cupynumeric/_module/io_numpy.py
index 67ea13c051..42d4ebdf53 100644
--- a/cunumeric/_module/io_numpy.py
+++ b/cupynumeric/_module/io_numpy.py
@@ -61,7 +61,7 @@ def load(
Notes
-----
- cuNumeric does not currently support ``.npz`` and pickled files.
+ cuPyNumeric does not currently support ``.npz`` and pickled files.
Availability
--------
diff --git a/cunumeric/_module/linalg_mvp.py b/cupynumeric/_module/linalg_mvp.py
similarity index 97%
rename from cunumeric/_module/linalg_mvp.py
rename to cupynumeric/_module/linalg_mvp.py
index dd764c04ec..8650b1b00c 100644
--- a/cunumeric/_module/linalg_mvp.py
+++ b/cupynumeric/_module/linalg_mvp.py
@@ -25,7 +25,7 @@
from .._array.array import ndarray
from .._array.util import (
add_boilerplate,
- convert_to_cunumeric_ndarray,
+ convert_to_cupynumeric_ndarray,
find_common_type,
)
from .._ufunc.math import multiply
@@ -72,7 +72,7 @@ def inner(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray:
Notes
-----
- The cuNumeric implementation is a little more liberal than NumPy in terms
+ The cuPyNumeric implementation is a little more liberal than NumPy in terms
of allowed broadcasting, e.g. ``inner(ones((1,)), ones((4,)))`` is allowed.
See Also
@@ -109,7 +109,7 @@ def dot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray:
but using ``a @ b`` is preferred.
- If either `a` or `b` is 0-D (scalar), it is equivalent to
- :func:`multiply` and using ``cunumeric.multiply(a, b)`` or ``a * b`` is
+ :func:`multiply` and using ``cupynumeric.multiply(a, b)`` or ``a * b`` is
preferred.
- If `a` is an N-D array and `b` is a 1-D array, it is a sum product over
@@ -139,7 +139,7 @@ def dot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray:
Notes
-----
- The cuNumeric implementation is a little more liberal than NumPy in terms
+ The cuPyNumeric implementation is a little more liberal than NumPy in terms
of allowed broadcasting, e.g. ``dot(ones((3,1)), ones((4,5)))`` is allowed.
Except for the inner-product case, only floating-point types are supported.
@@ -227,7 +227,7 @@ def matmul(
(9, 5, 7, 3)
>>> # n is 7, k is 4, m is 3
- The cuNumeric implementation is a little more liberal than NumPy in terms
+ The cuPyNumeric implementation is a little more liberal than NumPy in terms
of allowed broadcasting, e.g. ``matmul(ones((3,1)), ones((4,5)))`` is
allowed.
@@ -290,7 +290,7 @@ def vdot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray:
Notes
-----
- The cuNumeric implementation is a little more liberal than NumPy in terms
+ The cuPyNumeric implementation is a little more liberal than NumPy in terms
of allowed broadcasting, e.g. ``vdot(ones((1,)), ones((4,)))`` is allowed.
See Also
@@ -389,7 +389,7 @@ def tensordot(
Notes
-----
- The cuNumeric implementation is a little more liberal than NumPy in terms
+ The cuPyNumeric implementation is a little more liberal than NumPy in terms
of allowed broadcasting, e.g. ``tensordot(ones((3,1)), ones((1,4)))`` is
allowed.
@@ -710,8 +710,9 @@ def einsum(
optimize : ``{False, True, 'greedy', 'optimal'}``, optional
Controls if intermediate optimization should occur. If False then
arrays will be contracted in input order, one at a time. True (the
- default) will use the 'greedy' algorithm. See ``cunumeric.einsum_path``
- for more information on the available optimization algorithms.
+ default) will use the 'greedy' algorithm. See
+ ``cupynumeric.einsum_path`` for more information on the available
+ optimization algorithms.
Returns
-------
@@ -730,10 +731,10 @@ def einsum(
--------
Multiple GPUs, Multiple CPUs
"""
- operands_list = [convert_to_cunumeric_ndarray(op) for op in operands]
+ operands_list = [convert_to_cupynumeric_ndarray(op) for op in operands]
if out is not None:
- out = convert_to_cunumeric_ndarray(out, share=True)
+ out = convert_to_cupynumeric_ndarray(out, share=True)
if optimize is True:
optimize = "greedy"
@@ -841,7 +842,7 @@ def einsum_path(
--------
Multiple GPUs, Multiple CPUs
"""
- computed_operands = [convert_to_cunumeric_ndarray(op) for op in operands]
+ computed_operands = [convert_to_cupynumeric_ndarray(op) for op in operands]
memory_limit = _builtin_max(op.size for op in computed_operands)
if isinstance(optimize, tuple):
if len(optimize) != 2:
diff --git a/cunumeric/_module/logic_array_contents.py b/cupynumeric/_module/logic_array_contents.py
similarity index 92%
rename from cunumeric/_module/logic_array_contents.py
rename to cupynumeric/_module/logic_array_contents.py
index a1fe574b98..e5bb9bd9ee 100644
--- a/cunumeric/_module/logic_array_contents.py
+++ b/cupynumeric/_module/logic_array_contents.py
@@ -16,7 +16,7 @@
from typing import TYPE_CHECKING
-from .._array.util import convert_to_cunumeric_ndarray
+from .._array.util import convert_to_cupynumeric_ndarray
from .._ufunc.comparison import logical_and
from .._ufunc.floating import isinf, signbit
@@ -61,9 +61,9 @@ def isneginf(x: ndarray, out: ndarray | None = None) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- x = convert_to_cunumeric_ndarray(x)
+ x = convert_to_cupynumeric_ndarray(x)
if out is not None:
- out = convert_to_cunumeric_ndarray(out, share=True)
+ out = convert_to_cupynumeric_ndarray(out, share=True)
rhs1 = isinf(x)
rhs2 = signbit(x)
return logical_and(rhs1, rhs2, out=out)
@@ -106,9 +106,9 @@ def isposinf(x: ndarray, out: ndarray | None = None) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- x = convert_to_cunumeric_ndarray(x)
+ x = convert_to_cupynumeric_ndarray(x)
if out is not None:
- out = convert_to_cunumeric_ndarray(out, share=True)
+ out = convert_to_cupynumeric_ndarray(out, share=True)
rhs1 = isinf(x)
rhs2 = ~signbit(x)
return logical_and(rhs1, rhs2, out=out)
diff --git a/cunumeric/_module/logic_array_type.py b/cupynumeric/_module/logic_array_type.py
similarity index 93%
rename from cunumeric/_module/logic_array_type.py
rename to cupynumeric/_module/logic_array_type.py
index 2c8553078c..1e39754a7b 100644
--- a/cunumeric/_module/logic_array_type.py
+++ b/cupynumeric/_module/logic_array_type.py
@@ -19,7 +19,7 @@
import numpy as np
from .._array.array import ndarray
-from .._array.util import convert_to_cunumeric_ndarray
+from .._array.util import convert_to_cupynumeric_ndarray
from .creation_shape import full
if TYPE_CHECKING:
@@ -53,7 +53,7 @@ def iscomplex(x: ndarray | npt.NDArray[Any]) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- x = convert_to_cunumeric_ndarray(x)
+ x = convert_to_cupynumeric_ndarray(x)
if x.dtype.kind != "c":
return full(x.shape, False, dtype=bool)
else:
@@ -121,7 +121,7 @@ def isreal(x: ndarray | npt.NDArray[Any]) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
- x = convert_to_cunumeric_ndarray(x)
+ x = convert_to_cupynumeric_ndarray(x)
if x.dtype.kind != "c":
return full(x.shape, True, dtype=bool)
else:
@@ -179,7 +179,7 @@ def isscalar(x: ndarray | npt.NDArray[Any]) -> bool:
Notes
-----
- This function falls back to NumPy for all object types but cuNumeric's
+ This function falls back to NumPy for all object types but cuPyNumeric's
ndarray, which always returns `False`.
Availability
@@ -187,9 +187,9 @@ def isscalar(x: ndarray | npt.NDArray[Any]) -> bool:
Multiple GPUs, Multiple CPUs
"""
- # Since the input can be any value, we can't just convert it to cunumeric
- # ndarray. Instead we check if the input is cunumeric ndarray and, if not,
- # fall back to Numpy
+ # Since the input can be any value, we can't just convert it to cupynumeric
+ # ndarray. Instead we check if the input is cupynumeric ndarray and, if
+ # not, fall back to Numpy
if isinstance(x, ndarray):
return False
else:
diff --git a/cunumeric/_module/logic_comparison.py b/cupynumeric/_module/logic_comparison.py
similarity index 95%
rename from cunumeric/_module/logic_comparison.py
rename to cupynumeric/_module/logic_comparison.py
index dad4782027..46c6410a4a 100644
--- a/cunumeric/_module/logic_comparison.py
+++ b/cupynumeric/_module/logic_comparison.py
@@ -84,7 +84,7 @@ def allclose(
"""
if equal_nan:
raise NotImplementedError(
- "cuNumeric does not support `equal_nan` yet for allclose"
+ "cuPyNumeric does not support `equal_nan` yet for allclose"
)
args = (Scalar(rtol, ty.float64), Scalar(atol, ty.float64))
return perform_binary_reduction(
@@ -145,7 +145,7 @@ def isclose(
"""
if equal_nan:
raise NotImplementedError(
- "cuNumeric does not support `equal_nan` yet for isclose"
+ "cuPyNumeric does not support `equal_nan` yet for isclose"
)
out_shape = np.broadcast_shapes(a.shape, b.shape)
@@ -191,7 +191,7 @@ def array_equal(
"""
if equal_nan:
raise NotImplementedError(
- "cuNumeric does not support `equal_nan` yet for `array_equal`"
+ "cuPyNumeric does not support `equal_nan` yet for `array_equal`"
)
if a1.shape != a2.shape:
diff --git a/cunumeric/_module/logic_truth.py b/cupynumeric/_module/logic_truth.py
similarity index 100%
rename from cunumeric/_module/logic_truth.py
rename to cupynumeric/_module/logic_truth.py
diff --git a/cunumeric/_module/math_complex.py b/cupynumeric/_module/math_complex.py
similarity index 98%
rename from cunumeric/_module/math_complex.py
rename to cupynumeric/_module/math_complex.py
index 3d05580ad2..29f3787f75 100644
--- a/cunumeric/_module/math_complex.py
+++ b/cupynumeric/_module/math_complex.py
@@ -20,7 +20,6 @@
from .._array.thunk import perform_unary_op
from .._array.util import add_boilerplate
-from .._utils.array import to_core_type
from ..config import UnaryOpCode
if TYPE_CHECKING:
diff --git a/cunumeric/_module/math_extrema.py b/cupynumeric/_module/math_extrema.py
similarity index 93%
rename from cunumeric/_module/math_extrema.py
rename to cupynumeric/_module/math_extrema.py
index ad805c00f6..0b576684d7 100644
--- a/cunumeric/_module/math_extrema.py
+++ b/cupynumeric/_module/math_extrema.py
@@ -66,10 +66,11 @@ def amax(
initial : scalar, optional
The minimum value of an output element. Must be present to allow
- computation on empty slice. See `~cunumeric.ufunc.reduce` for details.
+ computation on empty slice. See `~cupynumeric.ufunc.reduce` for
+ details.
where : array_like[bool], optional
- Elements to compare for the maximum. See `~cunumeric.ufunc.reduce`
+ Elements to compare for the maximum. See `~cupynumeric.ufunc.reduce`
for details.
Returns
@@ -142,10 +143,11 @@ def amin(
initial : scalar, optional
The maximum value of an output element. Must be present to allow
- computation on empty slice. See `~cunumeric.ufunc.reduce` for details.
+ computation on empty slice. See `~cupynumeric.ufunc.reduce` for
+ details.
where : array_like[bool], optional
- Elements to compare for the minimum. See `~cunumeric.ufunc.reduce`
+ Elements to compare for the minimum. See `~cupynumeric.ufunc.reduce`
for details.
Returns
diff --git a/cunumeric/_module/math_misc.py b/cupynumeric/_module/math_misc.py
similarity index 80%
rename from cunumeric/_module/math_misc.py
rename to cupynumeric/_module/math_misc.py
index 251d92eae1..a91e3faccc 100644
--- a/cunumeric/_module/math_misc.py
+++ b/cupynumeric/_module/math_misc.py
@@ -18,15 +18,21 @@
from .._array.array import ndarray
from .._array.util import add_boilerplate
+from ..config import ConvolveMethod
if TYPE_CHECKING:
import numpy.typing as npt
- from ..types import ConvolveMode
+ from ..types import ConvolveMethod as ConvolveMethodType, ConvolveMode
@add_boilerplate("a", "v")
-def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
+def convolve(
+ a: ndarray,
+ v: ndarray,
+ mode: ConvolveMode = "full",
+ method: ConvolveMethodType = "auto",
+) -> ndarray:
"""
Returns the discrete, linear convolution of two ndarrays.
@@ -52,6 +58,19 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
The output consists only of those elements that do not
rely on the zero-padding. In 'valid' mode, either `a` or `v`
must be at least as large as the other in every dimension.
+ method : ``{'auto', 'direct', 'fft'}``, optional
+ A string indicating which method to use to calculate the convolution.
+
+ 'auto':
+ Automatically chooses direct or Fourier method based on an estimate of
+ which is faster (default)
+
+ 'direct':
+ The convolution is determined directly from sums, the definition of
+ convolution
+
+ 'fft':
+ The Fourier Transform is used to perform the convolution
Returns
-------
@@ -66,7 +85,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
-----
The current implementation only supports the 'same' mode.
- Unlike `numpy.convolve`, `cunumeric.convolve` supports N-dimensional
+ Unlike `numpy.convolve`, `cupynumeric.convolve` supports N-dimensional
inputs, but it follows NumPy's behavior for 1-D inputs.
Availability
@@ -74,7 +93,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
Multiple GPUs, Multiple CPUs
"""
if mode != "same":
- raise NotImplementedError("Need to implement other convolution modes")
+ raise NotImplementedError("Only support mode='same'")
if a.ndim != v.ndim:
raise RuntimeError("Arrays should have the same dimensions")
@@ -84,6 +103,11 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
if a.ndim == 1 and a.size < v.size:
v, a = a, v
+ if not hasattr(ConvolveMethod, method.upper()):
+ raise ValueError(
+ "Acceptable method flags are 'auto', 'direct', or 'fft'."
+ )
+
if a.dtype != v.dtype:
v = v.astype(a.dtype)
out = ndarray(
@@ -91,7 +115,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray:
dtype=a.dtype,
inputs=(a, v),
)
- out._thunk.convolve(a._thunk, v._thunk, mode)
+ out._thunk.convolve(a._thunk, v._thunk, mode, method)
return out
diff --git a/cunumeric/_module/math_rounding.py b/cupynumeric/_module/math_rounding.py
similarity index 100%
rename from cunumeric/_module/math_rounding.py
rename to cupynumeric/_module/math_rounding.py
diff --git a/cunumeric/_module/math_sum_prod_diff.py b/cupynumeric/_module/math_sum_prod_diff.py
similarity index 93%
rename from cunumeric/_module/math_sum_prod_diff.py
rename to cupynumeric/_module/math_sum_prod_diff.py
index 8a18f57bf7..6027fb1c1a 100644
--- a/cunumeric/_module/math_sum_prod_diff.py
+++ b/cupynumeric/_module/math_sum_prod_diff.py
@@ -26,7 +26,7 @@
from .._ufunc.math import add, multiply, subtract
from .._utils import is_np2
from ..config import ScanCode, UnaryRedCode
-from ..settings import settings as cunumeric_settings
+from ..settings import settings as cupynumeric_settings
from ._unary_red_utils import get_non_nan_unary_red_code
from .array_dimension import broadcast_to
from .array_joining import concatenate
@@ -93,12 +93,12 @@ def prod(
sub-class' method does not implement `keepdims` any
exceptions will be raised.
initial : scalar, optional
- The starting value for this product. See `~cunumeric.ufunc.reduce` for
- details.
+ The starting value for this product. See `~cupynumeric.ufunc.reduce`
+ for details.
where : array_like[bool], optional
- Elements to include in the product. See `~cunumeric.ufunc.reduce` for
- details.
+ Elements to include in the product. See `~cupynumeric.ufunc.reduce`
+ for details.
Returns
-------
@@ -177,10 +177,11 @@ def sum(
sub-class' method does not implement `keepdims` any
exceptions will be raised.
initial : scalar, optional
- Starting value for the sum. See `~cunumeric.ufunc.reduce` for details.
+ Starting value for the sum. See `~cupynumeric.ufunc.reduce` for
+ details.
where : array_like[bool], optional
- Elements to include in the sum. See `~cunumeric.ufunc.reduce` for
+ Elements to include in the sum. See `~cupynumeric.ufunc.reduce` for
details.
Returns
@@ -253,13 +254,14 @@ def cumprod(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- with floating point and complex types. For example, when boundary values
- such as inf occur they may not propagate as expected. Consider the float32
- array ``[3e+37, 1, 100, 0.01]``. NumPy's cumprod will return a result of
- ``[3e+37, 3e+37, inf, inf]``. However, cuNumeric might internally partition
- the array such that partition 0 has ``[3e+37, 1]`` and partition 1 has
- ``[100, 0.01]``, returning the result ``[3e+37, 3e+37, inf, 3e+37]``.
+ cuPyNumeric's parallel implementation may yield different results from
+ NumPy with floating point and complex types. For example, when boundary
+ values such as inf occur they may not propagate as expected. Consider the
+ float32 array ``[3e+37, 1, 100, 0.01]``. NumPy's cumprod will return a
+ result of ``[3e+37, 3e+37, inf, inf]``. However, cuPyNumeric might
+ internally partition the array such that partition 0 has ``[3e+37, 1]``
+ and partition 1 has ``[100, 0.01]``, returning the result
+ ``[3e+37, 3e+37, inf, 3e+37]``.
Availability
--------
@@ -318,10 +320,10 @@ def cumsum(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- with floating point and complex types. For example, when boundary values
- such as inf occur they may not propagate as expected. For more explanation
- check cunumeric.cumprod.
+ cuPyNumeric's parallel implementation may yield different results from
+ NumPy with floating point and complex types. For example, when boundary
+ values such as inf occur they may not propagate as expected. For more
+ explanation check cupynumeric.cumprod.
Availability
--------
@@ -379,10 +381,10 @@ def nancumprod(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- with floating point and complex types. For example, when boundary values
- such as inf occur they may not propagate as expected. For more explanation
- check cunumeric.cumprod.
+ CuPyNumeric's parallel implementation may yield different results from
+ NumPy with floating point and complex types. For example, when boundary
+ values such as inf occur they may not propagate as expected. For more
+ explanation check cupynumeric.cumprod.
Availability
--------
@@ -440,10 +442,10 @@ def nancumsum(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- with floating point and complex types. For example, when boundary values
- such as inf occur they may not propagate as expected. For more explanation
- check cunumeric.cumprod.
+ cuPyNumeric's parallel implementation may yield different results from
+ NumPy with floating point and complex types. For example, when boundary
+ values such as inf occur they may not propagate as expected. For more
+ explanation check cupynumeric.cumprod.
Availability
--------
@@ -465,7 +467,7 @@ def nanargmax(
"""
Return the indices of the maximum values in the specified axis ignoring
NaNs. For empty arrays, ValueError is raised. For all-NaN slices,
- ValueError is raised only when CUNUMERIC_NUMPY_COMPATIBILITY
+ ValueError is raised only when CUPYNUMERIC_NUMPY_COMPATIBILITY
environment variable is set, otherwise identity is returned.
Warning: results cannot be trusted if a slice contains only NaNs
@@ -504,7 +506,7 @@ def nanargmax(
if a.size == 0:
raise ValueError("attempt to get nanargmax of an empty sequence")
- if cunumeric_settings.numpy_compat() and a.dtype.kind == "f":
+ if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f":
if any(all(isnan(a), axis=axis)):
raise ValueError("Array/Slice contains only NaNs")
@@ -533,7 +535,7 @@ def nanargmin(
"""
Return the indices of the minimum values in the specified axis ignoring
NaNs. For empty arrays, ValueError is raised. For all-NaN slices,
- ValueError is raised only when CUNUMERIC_NUMPY_COMPATIBILITY
+ ValueError is raised only when CUPYNUMERIC_NUMPY_COMPATIBILITY
environment variable is set, otherwise identity is returned.
Warning: results cannot be trusted if a slice contains only NaNs
@@ -572,7 +574,7 @@ def nanargmin(
if a.size == 0:
raise ValueError("attempt to get nanargmin of an empty sequence")
- if cunumeric_settings.numpy_compat() and a.dtype.kind == "f":
+ if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f":
if any(all(isnan(a), axis=axis)):
raise ValueError("Array/Slice contains only NaNs")
@@ -602,7 +604,7 @@ def nanmin(
"""
Return minimum of an array or minimum along an axis, ignoring any
NaNs. When all-NaN slices are encountered, a NaN is returned
- for that slice only when CUNUMERIC_NUMPY_COMPATIBILITY environment
+ for that slice only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment
variable is set, otherwise identity is returned.
Empty slices will raise a ValueError
@@ -633,10 +635,11 @@ def nanmin(
initial : scalar, optional
The maximum value of an output element. Must be present to allow
- computation on empty slice. See `~cunumeric.ufunc.reduce` for details.
+ computation on empty slice. See `~cupynumeric.ufunc.reduce` for
+ details.
where : array_like[bool], optional
- Elements to compare for the minimum. See `~cunumeric.ufunc.reduce`
+ Elements to compare for the minimum. See `~cupynumeric.ufunc.reduce`
for details.
Returns
@@ -648,7 +651,7 @@ def nanmin(
Notes
-----
- CuNumeric's implementation will not raise a Runtime Warning for
+ cuPyNumeric's implementation will not raise a Runtime Warning for
slices with all-NaNs
See Also
@@ -675,7 +678,7 @@ def nanmin(
where=where,
)
- if cunumeric_settings.numpy_compat() and a.dtype.kind == "f":
+ if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f":
all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where)
putmask(out_array, all_nan, np.nan) # type: ignore
@@ -694,7 +697,7 @@ def nanmax(
"""
Return the maximum of an array or maximum along an axis, ignoring any
NaNs. When all-NaN slices are encountered, a NaN is returned
- for that slice only when CUNUMERIC_NUMPY_COMPATIBILITY environment
+ for that slice only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment
variable is set, otherwise identity is returned.
Empty slices will raise a ValueError
@@ -728,10 +731,11 @@ def nanmax(
initial : scalar, optional
The minimum value of an output element. Must be present to allow
- computation on empty slice. See `~cunumeric.ufunc.reduce` for details.
+ computation on empty slice. See `~cupynumeric.ufunc.reduce` for
+ details.
where : array_like[bool], optional
- Elements to compare for the maximum. See `~cunumeric.ufunc.reduce`
+ Elements to compare for the maximum. See `~cupynumeric.ufunc.reduce`
for details.
Returns
@@ -743,7 +747,7 @@ def nanmax(
Notes
-----
- CuNumeric's implementation will not raise a Runtime Warning for
+ cuPyNumeric's implementation will not raise a Runtime Warning for
slices with all-NaNs
See Also
@@ -770,7 +774,7 @@ def nanmax(
where=where,
)
- if cunumeric_settings.numpy_compat() and a.dtype.kind == "f":
+ if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f":
all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where)
putmask(out_array, all_nan, np.nan) # type: ignore
@@ -825,11 +829,11 @@ def nanprod(
sub-class' method does not implement `keepdims` any
exceptions will be raised.
initial : scalar, optional
- The starting value for this product. See `~cunumeric.ufunc.reduce` for
- details.
+ The starting value for this product. See `~cupynumeric.ufunc.reduce`
+ for details.
where : array_like[bool], optional
- Elements to include in the product. See `~cunumeric.ufunc.reduce` for
- details.
+ Elements to include in the product. See `~cupynumeric.ufunc.reduce`
+ for details.
Returns
-------
@@ -924,11 +928,11 @@ def nansum(
the result will broadcast correctly against the input array.
initial : scalar, optional
- Starting value for the sum. See `~cunumeric.ufunc.reduce` for
+ Starting value for the sum. See `~cupynumeric.ufunc.reduce` for
details.
where : array_like[bool], optional
- Elements to include in the sum. See `~cunumeric.ufunc.reduce` for
+ Elements to include in the sum. See `~cupynumeric.ufunc.reduce` for
details.
Returns
diff --git a/cunumeric/_module/sets_making.py b/cupynumeric/_module/sets_making.py
similarity index 100%
rename from cunumeric/_module/sets_making.py
rename to cupynumeric/_module/sets_making.py
diff --git a/cunumeric/_module/ssc_counting.py b/cupynumeric/_module/ssc_counting.py
similarity index 100%
rename from cunumeric/_module/ssc_counting.py
rename to cupynumeric/_module/ssc_counting.py
diff --git a/cunumeric/_module/ssc_searching.py b/cupynumeric/_module/ssc_searching.py
similarity index 96%
rename from cunumeric/_module/ssc_searching.py
rename to cupynumeric/_module/ssc_searching.py
index de8319ca6b..bcdba11cee 100644
--- a/cunumeric/_module/ssc_searching.py
+++ b/cupynumeric/_module/ssc_searching.py
@@ -107,8 +107,8 @@ def argmax(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- when the array contains NaN(s).
+ cuPyNumeric's parallel implementation may yield different results from
+ NumPy when the array contains NaN(s).
Availability
--------
@@ -156,8 +156,8 @@ def argmin(
Notes
-----
- CuNumeric's parallel implementation may yield different results from NumPy
- when the array contains NaN(s).
+ cuPyNumeric's parallel implementation may yield different results from
+ NumPy when the array contains NaN(s).
Availability
--------
@@ -197,8 +197,9 @@ def flatnonzero(a: ndarray) -> ndarray:
@overload
-def where(a: npt.ArrayLike | ndarray, x: None, y: None) -> tuple[ndarray, ...]:
- ...
+def where(
+ a: npt.ArrayLike | ndarray, x: None, y: None
+) -> tuple[ndarray, ...]: ...
@overload
@@ -206,8 +207,7 @@ def where(
a: npt.ArrayLike | ndarray,
x: npt.ArrayLike | ndarray,
y: npt.ArrayLike | ndarray,
-) -> ndarray:
- ...
+) -> ndarray: ...
@add_boilerplate("a", "x", "y") # type: ignore [misc]
diff --git a/cunumeric/_module/ssc_sorting.py b/cupynumeric/_module/ssc_sorting.py
similarity index 98%
rename from cunumeric/_module/ssc_sorting.py
rename to cupynumeric/_module/ssc_sorting.py
index 1ee86e0d02..4f32d0194f 100644
--- a/cunumeric/_module/ssc_sorting.py
+++ b/cupynumeric/_module/ssc_sorting.py
@@ -219,7 +219,7 @@ def argpartition(
Notes
-----
- The current implementation falls back to `cunumeric.argsort`.
+ The current implementation falls back to `cupynumeric.argsort`.
See Also
--------
@@ -274,7 +274,7 @@ def partition(
Notes
-----
- The current implementation falls back to `cunumeric.sort`.
+ The current implementation falls back to `cupynumeric.sort`.
See Also
--------
diff --git a/cunumeric/_module/stats_avgs_vars.py b/cupynumeric/_module/stats_avgs_vars.py
similarity index 100%
rename from cunumeric/_module/stats_avgs_vars.py
rename to cupynumeric/_module/stats_avgs_vars.py
diff --git a/cunumeric/_module/stats_correlating.py b/cupynumeric/_module/stats_correlating.py
similarity index 100%
rename from cunumeric/_module/stats_correlating.py
rename to cupynumeric/_module/stats_correlating.py
diff --git a/cunumeric/_module/stats_histograms.py b/cupynumeric/_module/stats_histograms.py
similarity index 99%
rename from cunumeric/_module/stats_histograms.py
rename to cupynumeric/_module/stats_histograms.py
index d6397760f2..05ab4e9289 100644
--- a/cunumeric/_module/stats_histograms.py
+++ b/cupynumeric/_module/stats_histograms.py
@@ -64,7 +64,7 @@ def bincount(
-------
out : ndarray[int]
The result of binning the input array.
- The length of `out` is equal to ``cunumeric.amax(x)+1``.
+ The length of `out` is equal to ``cupynumeric.amax(x)+1``.
Raises
------
diff --git a/cunumeric/_module/stats_order.py b/cupynumeric/_module/stats_order.py
similarity index 99%
rename from cunumeric/_module/stats_order.py
rename to cupynumeric/_module/stats_order.py
index 7c70424761..7d7564a3df 100644
--- a/cunumeric/_module/stats_order.py
+++ b/cupynumeric/_module/stats_order.py
@@ -720,7 +720,7 @@ def nanquantile_impl(
assert qs_all[qindex].shape == remaining_shape
# TODO(aschaffer): Vectorize this operation, see
- # github.com/nv-legate/cunumeric/pull/1121#discussion_r1484731763
+ # github.com/nv-legate/cupynumeric/pull/1121#discussion_r1484731763
gamma = None
for aindex, n in np.ndenumerate(non_nan_counts):
# TODO (2024-08): `n` should be an integral type, but wasn't:
diff --git a/cunumeric/_module/window.py b/cupynumeric/_module/window.py
similarity index 100%
rename from cunumeric/_module/window.py
rename to cupynumeric/_module/window.py
diff --git a/cunumeric/_sphinxext/__init__.py b/cupynumeric/_sphinxext/__init__.py
similarity index 100%
rename from cunumeric/_sphinxext/__init__.py
rename to cupynumeric/_sphinxext/__init__.py
diff --git a/cunumeric/_sphinxext/_comparison_config.py b/cupynumeric/_sphinxext/_comparison_config.py
similarity index 95%
rename from cunumeric/_sphinxext/_comparison_config.py
rename to cupynumeric/_sphinxext/_comparison_config.py
index 3623a61a0c..911e487973 100644
--- a/cunumeric/_sphinxext/_comparison_config.py
+++ b/cupynumeric/_sphinxext/_comparison_config.py
@@ -83,12 +83,11 @@ class SectionConfig:
UFUNCS = (numpy.ufunc,)
NUMPY_CONFIGS = [
- SectionConfig("Module-Level", None, types=FUNCTIONS),
- SectionConfig("Ufuncs", None, types=UFUNCS),
- SectionConfig("Multi-Dimensional Array", "ndarray", types=METHODS),
- SectionConfig("Linear Algebra", "linalg", types=FUNCTIONS),
- SectionConfig("Discrete Fourier Transform", "fft", types=FUNCTIONS),
- SectionConfig("Random Sampling", "random", types=FUNCTIONS),
+ SectionConfig("Module-Level", None),
+ SectionConfig("Multi-Dimensional Array", "ndarray"),
+ SectionConfig("Linear Algebra", "linalg"),
+ SectionConfig("Discrete Fourier Transform", "fft"),
+ SectionConfig("Random Sampling", "random"),
]
CONVOLVE = ("convolve", "correlate")
diff --git a/cunumeric/_sphinxext/_comparison_util.py b/cupynumeric/_sphinxext/_comparison_util.py
similarity index 72%
rename from cunumeric/_sphinxext/_comparison_util.py
rename to cupynumeric/_sphinxext/_comparison_util.py
index ddd9bab2b4..a7168cee47 100644
--- a/cunumeric/_sphinxext/_comparison_util.py
+++ b/cupynumeric/_sphinxext/_comparison_util.py
@@ -16,16 +16,16 @@
from dataclasses import dataclass
from types import ModuleType
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, Type
+from typing import TYPE_CHECKING, Any, Iterable, Iterator
-from .._utils.coverage import is_implemented, is_multi, is_single
+from .._utils.coverage import is_implemented, is_multi, is_single, is_wrapped
from ._comparison_config import MISSING_NP_REFS, SKIP
if TYPE_CHECKING:
from ._comparison_config import SectionConfig
YES = "\u2713"
-NO = "\u274C"
+NO = "\u274c"
@dataclass(frozen=True)
@@ -66,24 +66,38 @@ def _lgref(name: str, obj: Any, implemented: bool) -> str:
if isinstance(obj, ModuleType):
full_name = f"{obj.__name__}.{name}"
else:
- full_name = f"cunumeric.{obj.__name__}.{name}"
+ full_name = f"cupynumeric.{obj.__name__}.{name}"
role = "meth" if "ndarray" in full_name else "obj"
return f":{role}:`{full_name}`"
-def filter_names(
+def filter_wrapped_names(
obj: Any,
- types: tuple[Type[Any], ...] | None = None,
+ *,
skip: Iterable[str] = (),
) -> Iterator[str]:
names = (n for n in dir(obj)) # every name in the module or class
+ names = (
+ n for n in names if is_wrapped(getattr(obj, n))
+ ) # that is wrapped
+ names = (n for n in names if n not in skip) # except the ones we skip
+ names = (n for n in names if not n.startswith("_")) # or any private names
+ return names
+
+
+def filter_type_names(
+ obj: Any,
+ *,
+ skip: Iterable[str] = (),
+) -> Iterator[str]:
+ names = (n for n in dir(obj)) # every name in the module or class
+ names = (
+ n for n in names if isinstance(getattr(obj, n), type)
+ ) # that is a type (class, dtype, etc)
names = (n for n in names if n not in skip) # except the ones we skip
names = (n for n in names if not n.startswith("_")) # or any private names
- if types:
- # optionally filtered by type
- names = (n for n in names if isinstance(getattr(obj, n), types))
return names
@@ -109,12 +123,12 @@ def get_item(name: str, np_obj: Any, lg_obj: Any) -> ItemDetail:
def get_namespaces(attr: str | None) -> tuple[Any, Any]:
import numpy
- import cunumeric
+ import cupynumeric
if attr is None:
- return numpy, cunumeric
+ return numpy, cupynumeric
- return getattr(numpy, attr), getattr(cunumeric, attr)
+ return getattr(numpy, attr), getattr(cupynumeric, attr)
def generate_section(config: SectionConfig) -> SectionDetail:
@@ -123,9 +137,14 @@ def generate_section(config: SectionConfig) -> SectionDetail:
names: Iterable[str]
if config.names:
- names = config.names
+ names = set(config.names)
else:
- names = filter_names(np_obj, config.types, skip=SKIP)
+ wrapped_names = filter_wrapped_names(lg_obj, skip=SKIP)
+ type_names = filter_type_names(lg_obj, skip=SKIP)
+ names = set(wrapped_names) | set(type_names)
+
+ # we can omit anything that isn't in np namespace to begin with
+ names = {n for n in names if n in dir(np_obj)}
items = [get_item(name, np_obj, lg_obj) for name in names]
diff --git a/cunumeric/_sphinxext/_cunumeric_directive.py b/cupynumeric/_sphinxext/_cupynumeric_directive.py
similarity index 96%
rename from cunumeric/_sphinxext/_cunumeric_directive.py
rename to cupynumeric/_sphinxext/_cupynumeric_directive.py
index 62b7c9672d..593d25b241 100644
--- a/cunumeric/_sphinxext/_cunumeric_directive.py
+++ b/cupynumeric/_sphinxext/_cupynumeric_directive.py
@@ -20,7 +20,7 @@
from sphinx.util.nodes import nested_parse_with_titles
-class CunumericDirective(SphinxDirective):
+class CupynumericDirective(SphinxDirective):
def parse(self, rst_text: str, annotation: str) -> list[nodes.Node]:
result = StringList()
for line in rst_text.split("\n"):
diff --git a/cunumeric/_sphinxext/_templates.py b/cupynumeric/_sphinxext/_templates.py
similarity index 100%
rename from cunumeric/_sphinxext/_templates.py
rename to cupynumeric/_sphinxext/_templates.py
diff --git a/cunumeric/_sphinxext/_templates/comparison_table.rst b/cupynumeric/_sphinxext/_templates/comparison_table.rst
similarity index 69%
rename from cunumeric/_sphinxext/_templates/comparison_table.rst
rename to cupynumeric/_sphinxext/_templates/comparison_table.rst
index 3a4211100d..55d1d583f3 100644
--- a/cunumeric/_sphinxext/_templates/comparison_table.rst
+++ b/cupynumeric/_sphinxext/_templates/comparison_table.rst
@@ -3,13 +3,13 @@
{{ section.title }}
{{ "~" * section.title|length }}
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
.. autosummary::
:toctree: generated/
.. csv-table::
- :header: NumPy, cunumeric, single-GPU/CPU, multi-GPU/CPU
+ :header: NumPy, cupynumeric, single-GPU/CPU, multi-GPU/CPU
{% for item in section.items -%}
{{ item.np_ref }}, {{ item.lg_ref }}, {{ item.single }}, {{ item.multi }}
@@ -19,6 +19,6 @@
Number of NumPy functions: {{ section.np_count }}
-Number of functions covered by cunumeric: {{ section.lg_count }}
+Number of functions covered by cupynumeric: {{ section.lg_count }}
{% endfor %}
\ No newline at end of file
diff --git a/cunumeric/_sphinxext/comparison_table.py b/cupynumeric/_sphinxext/comparison_table.py
similarity index 94%
rename from cunumeric/_sphinxext/comparison_table.py
rename to cupynumeric/_sphinxext/comparison_table.py
index a00d4bca7a..baa62a53d8 100644
--- a/cunumeric/_sphinxext/comparison_table.py
+++ b/cupynumeric/_sphinxext/comparison_table.py
@@ -22,13 +22,13 @@
from . import PARALLEL_SAFE, SphinxParallelSpec
from ._comparison_config import GROUPED_CONFIGS, NUMPY_CONFIGS
from ._comparison_util import generate_section
-from ._cunumeric_directive import CunumericDirective
+from ._cupynumeric_directive import CupynumericDirective
from ._templates import COMPARISON_TABLE
log = getLogger(__name__)
-class ComparisonTable(CunumericDirective):
+class ComparisonTable(CupynumericDirective):
has_content = False
required_arguments = 0
optional_arguments = 1
diff --git a/cunumeric/_sphinxext/implemented_index.py b/cupynumeric/_sphinxext/implemented_index.py
similarity index 90%
rename from cunumeric/_sphinxext/implemented_index.py
rename to cupynumeric/_sphinxext/implemented_index.py
index 175e12d693..f0e9598bc7 100644
--- a/cunumeric/_sphinxext/implemented_index.py
+++ b/cupynumeric/_sphinxext/implemented_index.py
@@ -20,11 +20,11 @@
from sphinx.application import Sphinx
from sphinx.util.logging import getLogger
-import cunumeric as cn
+import cupynumeric as cn
from .._utils.coverage import is_implemented
from . import PARALLEL_SAFE, SphinxParallelSpec
-from ._cunumeric_directive import CunumericDirective
+from ._cupynumeric_directive import CupynumericDirective
log = getLogger(__name__)
@@ -45,7 +45,7 @@ def _filter(x: Any) -> bool:
)
-class ImplementedIndex(CunumericDirective):
+class ImplementedIndex(CupynumericDirective):
has_content = False
required_arguments = 0
optional_arguments = 0
@@ -59,7 +59,7 @@ def run(self) -> list[nodes.Node]:
if _filter(x)
]
refs += [
- f"* :obj:`cunumeric.ndarray.{x.__name__}`"
+ f"* :obj:`cupynumeric.ndarray.{x.__name__}`"
for x in cn.ndarray.__dict__.values()
if _filter(x)
]
diff --git a/cunumeric/_sphinxext/missing_refs.py b/cupynumeric/_sphinxext/missing_refs.py
similarity index 70%
rename from cunumeric/_sphinxext/missing_refs.py
rename to cupynumeric/_sphinxext/missing_refs.py
index bd55cb5d41..99938b80dd 100644
--- a/cunumeric/_sphinxext/missing_refs.py
+++ b/cupynumeric/_sphinxext/missing_refs.py
@@ -28,25 +28,25 @@
log = getLogger(__name__)
SKIP = (
- "cunumeric.cast",
- "cunumeric.ndarray.__array_function__",
- "cunumeric.ndarray.__array_ufunc__",
- "cunumeric.ndarray.__format__",
- "cunumeric.ndarray.__hash__",
- "cunumeric.ndarray.__iter__",
- "cunumeric.ndarray.__radd__",
- "cunumeric.ndarray.__rand__",
- "cunumeric.ndarray.__rdivmod__",
- "cunumeric.ndarray.__reduce_ex__",
- "cunumeric.ndarray.__rfloordiv__",
- "cunumeric.ndarray.__rmod__",
- "cunumeric.ndarray.__rmul__",
- "cunumeric.ndarray.__ror__",
- "cunumeric.ndarray.__rpow__",
- "cunumeric.ndarray.__rsub__",
- "cunumeric.ndarray.__rtruediv__",
- "cunumeric.ndarray.__rxor__",
- "cunumeric.ndarray.__sizeof__",
+ "cupynumeric.cast",
+ "cupynumeric.ndarray.__array_function__",
+ "cupynumeric.ndarray.__array_ufunc__",
+ "cupynumeric.ndarray.__format__",
+ "cupynumeric.ndarray.__hash__",
+ "cupynumeric.ndarray.__iter__",
+ "cupynumeric.ndarray.__radd__",
+ "cupynumeric.ndarray.__rand__",
+ "cupynumeric.ndarray.__rdivmod__",
+ "cupynumeric.ndarray.__reduce_ex__",
+ "cupynumeric.ndarray.__rfloordiv__",
+ "cupynumeric.ndarray.__rmod__",
+ "cupynumeric.ndarray.__rmul__",
+ "cupynumeric.ndarray.__ror__",
+ "cupynumeric.ndarray.__rpow__",
+ "cupynumeric.ndarray.__rsub__",
+ "cupynumeric.ndarray.__rtruediv__",
+ "cupynumeric.ndarray.__rxor__",
+ "cupynumeric.ndarray.__sizeof__",
)
MISSING: list[tuple[str, str]] = []
@@ -62,7 +62,7 @@ def run(self, **kwargs: Any) -> None:
def _check_target(self, node: Any) -> None:
target = node["reftarget"]
- if not target.startswith("cunumeric.") or target in SKIP:
+ if not target.startswith("cupynumeric.") or target in SKIP:
return
domain = self.env.domains[node["refdomain"]]
@@ -85,7 +85,7 @@ def _check_target(self, node: Any) -> None:
if uri is None:
loc = get_node_location(node)
log.warning(
- f"Cunumeric reference missing a target: {loc}: {target}",
+ f"cuPyNumeric reference missing a target: {loc}: {target}",
type="ref",
)
diff --git a/cunumeric/_sphinxext/ufunc_formatter.py b/cupynumeric/_sphinxext/ufunc_formatter.py
similarity index 97%
rename from cunumeric/_sphinxext/ufunc_formatter.py
rename to cupynumeric/_sphinxext/ufunc_formatter.py
index 05cac694e6..6f574d7541 100644
--- a/cunumeric/_sphinxext/ufunc_formatter.py
+++ b/cupynumeric/_sphinxext/ufunc_formatter.py
@@ -19,7 +19,7 @@
from sphinx.application import Sphinx
from sphinx.ext.autodoc import FunctionDocumenter
-from cunumeric import ufunc
+from cupynumeric import ufunc
from . import PARALLEL_SAFE, SphinxParallelSpec
diff --git a/cunumeric/_thunk/__init__.py b/cupynumeric/_thunk/__init__.py
similarity index 100%
rename from cunumeric/_thunk/__init__.py
rename to cupynumeric/_thunk/__init__.py
diff --git a/cunumeric/_thunk/_sort.py b/cupynumeric/_thunk/_sort.py
similarity index 98%
rename from cunumeric/_thunk/_sort.py
rename to cupynumeric/_thunk/_sort.py
index b97a8eba0b..82ab738479 100644
--- a/cunumeric/_thunk/_sort.py
+++ b/cupynumeric/_thunk/_sort.py
@@ -19,7 +19,7 @@
from legate.core import get_legate_runtime, types as ty
from .._utils import is_np2
-from ..config import CuNumericOpCode
+from ..config import CuPyNumericOpCode
from ..runtime import runtime
if is_np2:
@@ -92,7 +92,7 @@ def sort_task(
) -> None:
legate_runtime = get_legate_runtime()
task = legate_runtime.create_auto_task(
- output.library, CuNumericOpCode.SORT
+ output.library, CuPyNumericOpCode.SORT
)
uses_unbound_output = runtime.num_procs > 1 and input.ndim == 1
diff --git a/cunumeric/_thunk/deferred.py b/cupynumeric/_thunk/deferred.py
similarity index 96%
rename from cunumeric/_thunk/deferred.py
rename to cupynumeric/_thunk/deferred.py
index 0a0ae7fcbd..58349cd57d 100644
--- a/cunumeric/_thunk/deferred.py
+++ b/cupynumeric/_thunk/deferred.py
@@ -62,12 +62,14 @@
BitGeneratorOperation,
Bitorder,
ConvertCode,
- CuNumericOpCode,
+ ConvolveMethod,
+ CuPyNumericOpCode,
RandGenCode,
UnaryOpCode,
UnaryRedCode,
)
from ..linalg._cholesky import cholesky_deferred
+from ..linalg._eigen import eig_deferred
from ..linalg._qr import qr_deferred
from ..linalg._solve import solve_deferred
from ..linalg._svd import svd_deferred
@@ -87,6 +89,7 @@
from ..config import BitGeneratorType, FFTDirection, FFTType, WindowOpCode
from ..types import (
BitOrder,
+ ConvolveMethod as ConvolveMethodType,
ConvolveMode,
NdShape,
OrderType,
@@ -140,9 +143,11 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]:
def wrapper(*args: Any, **kwargs: Any) -> R:
# Convert relevant arguments to DeferredArrays
args = tuple(
- runtime.to_deferred_array(arg, read_only=True)
- if idx in indices and arg is not None
- else arg
+ (
+ runtime.to_deferred_array(arg, read_only=True)
+ if idx in indices and arg is not None
+ else arg
+ )
for (idx, arg) in enumerate(args)
)
for k, v in kwargs.items():
@@ -429,7 +434,7 @@ def _zip_indices(
# call ZIP function to combine index arrays into a singe array
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.ZIP
+ self.library, CuPyNumericOpCode.ZIP
)
task.throws_exception(IndexError)
p_out = task.add_output(output_arr.base)
@@ -646,7 +651,7 @@ def _advanced_indexing_with_boolean_array(
task = legate_runtime.create_auto_task(
self.library,
- CuNumericOpCode.ADVANCED_INDEXING,
+ CuPyNumericOpCode.ADVANCED_INDEXING,
)
task.add_output(out.base)
p_rhs = task.add_input(rhs.base)
@@ -931,7 +936,7 @@ def get_item(self, key: Any) -> NumPyThunk:
)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.READ
+ self.library, CuPyNumericOpCode.READ
)
task.add_input(input.base)
task.add_output(result.base) # type: ignore
@@ -1002,7 +1007,7 @@ def set_item(self, key: Any, rhs: Any) -> None:
assert rhs.size == 1
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.WRITE
+ self.library, CuPyNumericOpCode.WRITE
)
# Since we pass the view with write discard privilege,
# we should make sure that the mapper either creates a fresh
@@ -1015,7 +1020,7 @@ def set_item(self, key: Any, rhs: Any) -> None:
# In Python, any inplace update of form arr[key] op= value
# goes through three steps: 1) __getitem__ fetching the object
# for the key, 2) __iop__ for the update, and 3) __setitem__
- # to set the result back. In cuNumeric, the object we
+ # to set the result back. In cuPyNumeric, the object we
# return in step (1) is actually a subview to the array arr
# through which we make updates in place, so after step (2) is
# done, the effect of inplace update is already reflected
@@ -1040,7 +1045,7 @@ def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk:
if order != "C":
# If we don't have a transform then we need to make a copy
runtime.warn(
- "cuNumeric has not implemented reshape using Fortran-like "
+ "cuPyNumeric has not implemented reshape using Fortran-like "
"index order and is falling back to canonical numpy. You may "
"notice significantly decreased performance for this "
"function call.",
@@ -1269,7 +1274,7 @@ def convert(
if warn:
runtime.warn(
- "cuNumeric performing implicit type conversion from "
+ "cuPyNumeric performing implicit type conversion from "
+ str(rhs_array.dtype)
+ " to "
+ str(lhs_array.dtype),
@@ -1280,7 +1285,7 @@ def convert(
rhs = rhs_array.base
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.CONVERT
+ self.library, CuPyNumericOpCode.CONVERT
)
p_lhs = task.add_output(lhs)
p_rhs = task.add_input(rhs)
@@ -1291,9 +1296,18 @@ def convert(
task.execute()
@auto_convert("input", "filter")
- def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None:
+ def convolve(
+ self,
+ input: Any,
+ filter: Any,
+ mode: ConvolveMode,
+ method: ConvolveMethodType,
+ ) -> None:
+ if method != "auto" and runtime.num_gpus == 0:
+ runtime.warn(f"the method {method} is ignored on CPUs")
+
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.CONVOLVE
+ self.library, CuPyNumericOpCode.CONVOLVE
)
offsets = tuple((ext + 1) // 2 for ext in filter.shape)
@@ -1304,6 +1318,7 @@ def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None:
p_halo = task.declare_partition()
task.add_input(input.base, p_halo)
task.add_scalar_arg(input.shape, (ty.int64,))
+ task.add_scalar_arg(getattr(ConvolveMethod, method.upper()), ty.int32)
task.add_constraint(align(p_out, p_in))
task.add_constraint(bloat(p_out, p_halo, offsets, offsets))
@@ -1333,7 +1348,7 @@ def fft(
output = lhs.base
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.FFT
+ self.library, CuPyNumericOpCode.FFT
)
p_output = task.add_output(output)
@@ -1363,7 +1378,7 @@ def fft(
task.execute()
- # Fill the cuNumeric array with the value in the numpy array
+ # Fill the cuPyNumeric array with the value in the numpy array
def _fill(self, value: LogicalStore | Scalar) -> None:
assert self.base is not None
@@ -1379,7 +1394,7 @@ def _fill(self, value: LogicalStore | Scalar) -> None:
# If this is a fill for an arg value, make sure to pass
# the value dtype so that we get it packed correctly
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.FILL
+ self.library, CuPyNumericOpCode.FILL
)
task.add_output(self.base)
task.add_input(value)
@@ -1508,7 +1523,7 @@ def contract(
if blas_op == BlasOperation.VV:
# Vector dot product
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.DOT
+ self.library, CuPyNumericOpCode.DOT
)
task.add_reduction(lhs, ReductionOpKind.ADD)
p_rhs1 = task.add_input(rhs1)
@@ -1533,7 +1548,7 @@ def contract(
lhs = lhs.promote(1, n)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.MATVECMUL
+ self.library, CuPyNumericOpCode.MATVECMUL
)
p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD)
p_rhs1 = task.add_input(rhs1)
@@ -1577,7 +1592,7 @@ def rounding_divide(
# TODO: better heuristics
def choose_2d_color_shape(
- shape: tuple[int, int]
+ shape: tuple[int, int],
) -> tuple[int, int]:
# 1M elements, we should probably even go larger
MIN_MATRIX_SIZE = 1 << 20
@@ -1603,6 +1618,10 @@ def choose_2d_color_shape(
def choose_batchsize(
tilesize: tuple[int, int], k: int, itemsize: int
) -> int:
+ # don't batch in case we only have 1 proc
+ if runtime.num_procs == 1:
+ return k
+
# default corresponds to 128MB (to store A and B tile)
from ..settings import settings
@@ -1642,7 +1661,7 @@ def run_matmul_for_batch(
i: int,
) -> None:
manual_task = legate_runtime.create_manual_task(
- self.library, CuNumericOpCode.MATMUL, color_shape
+ self.library, CuPyNumericOpCode.MATMUL, color_shape
)
manual_task.add_output(tiled_lhs)
@@ -1714,7 +1733,7 @@ def add_mode(
# Prepare the launch
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.CONTRACT
+ self.library, CuPyNumericOpCode.CONTRACT
)
p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD)
p_rhs1 = task.add_input(rhs1)
@@ -1740,7 +1759,7 @@ def choose(self, rhs: Any, *args: Any) -> None:
ch_tuple = tuple(c._broadcast(tuple(out_arr.shape)) for c in ch_def)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.CHOOSE
+ self.library, CuPyNumericOpCode.CHOOSE
)
p_out = task.add_output(out_arr)
p_ind = task.add_input(index)
@@ -1764,7 +1783,7 @@ def select(
)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.SELECT
+ self.library, CuPyNumericOpCode.SELECT
)
out_arr = self.base
task.add_output(out_arr)
@@ -1829,7 +1848,7 @@ def _diag_helper(
diag = diag.promote(0, matrix.shape[0])
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.DIAG
+ self.library, CuPyNumericOpCode.DIAG
)
if extract:
@@ -1883,7 +1902,7 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
shape = self_tmp.shape
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.WRAP
+ self.library, CuPyNumericOpCode.WRAP
)
p_indirect = task.add_output(indirect.base)
task.add_scalar_arg(shape, (ty.int64,))
@@ -1910,7 +1929,7 @@ def putmask(self, mask: Any, values: Any) -> None:
else:
values_new = values.base
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.PUTMASK
+ self.library, CuPyNumericOpCode.PUTMASK
)
p_self = task.add_input(self.base)
p_mask = task.add_input(mask.base)
@@ -1935,7 +1954,7 @@ def eye(self, k: int) -> None:
# tells the runtime that it can throw away the previous contents of the
# entire region.
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.EYE
+ self.library, CuPyNumericOpCode.EYE
)
task.add_input(self.base)
task.add_output(self.base)
@@ -1952,7 +1971,7 @@ def arange(self, start: float, stop: float, step: float) -> None:
return
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.ARANGE
+ self.library, CuPyNumericOpCode.ARANGE
)
task.add_output(self.base)
task.add_scalar_arg(start, self.base.type)
@@ -1972,7 +1991,7 @@ def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None:
return
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.TILE
+ self.library, CuPyNumericOpCode.TILE
)
task.add_output(self.base)
@@ -1996,7 +2015,7 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None:
rhs = rhs._broadcast(lhs.shape)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.TRILU
+ self.library, CuPyNumericOpCode.TRILU
)
p_lhs = task.add_output(lhs)
@@ -2013,7 +2032,7 @@ def repeat(
self, repeats: Any, axis: int, scalar_repeats: bool
) -> DeferredArray:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.REPEAT
+ self.library, CuPyNumericOpCode.REPEAT
)
if scalar_repeats:
out_shape = tuple(
@@ -2068,7 +2087,7 @@ def flip(self, rhs: Any, axes: int | tuple[int, ...] | None) -> None:
axes = normalize_axis_tuple(axes, self.ndim)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.FLIP
+ self.library, CuPyNumericOpCode.FLIP
)
p_out = task.add_output(output)
p_in = task.add_input(input)
@@ -2095,7 +2114,7 @@ def bincount(self, rhs: Any, weights: NumPyThunk | None = None) -> None:
dst_array.fill(np.array(0, dst_array.dtype))
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.BINCOUNT
+ self.library, CuPyNumericOpCode.BINCOUNT
)
p_dst = task.add_reduction(dst_array.base, ReductionOpKind.ADD)
p_src = task.add_input(src_array.base)
@@ -2113,7 +2132,7 @@ def nonzero(self) -> tuple[NumPyThunk, ...]:
)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.NONZERO
+ self.library, CuPyNumericOpCode.NONZERO
)
p_self = task.add_input(self.base)
@@ -2134,7 +2153,7 @@ def bitgenerator_random_raw(
flags: int,
) -> None:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.BITGENERATOR
+ self.library, CuPyNumericOpCode.BITGENERATOR
)
task.add_output(self.base)
@@ -2162,7 +2181,7 @@ def bitgenerator_distribution(
doubleparams: tuple[float, ...],
) -> None:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.BITGENERATOR
+ self.library, CuPyNumericOpCode.BITGENERATOR
)
task.add_output(self.base)
@@ -3124,7 +3143,7 @@ def bitgenerator_negative_binomial(
def random(self, gen_code: Any, args: tuple[Scalar, ...] = ()) -> None:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.RAND
+ self.library, CuPyNumericOpCode.RAND
)
task.add_output(self.base)
@@ -3170,7 +3189,7 @@ def unary_op(
with Annotation({"OpCode": op.name}):
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.UNARY_OP
+ self.library, CuPyNumericOpCode.UNARY_OP
)
p_lhs = task.add_output(lhs)
p_rhs = task.add_input(rhs)
@@ -3242,7 +3261,7 @@ def unary_reduction(
with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}):
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.SCALAR_UNARY_RED
+ self.library, CuPyNumericOpCode.SCALAR_UNARY_RED
)
task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op])
@@ -3288,7 +3307,7 @@ def unary_reduction(
with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}):
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.UNARY_RED
+ self.library, CuPyNumericOpCode.UNARY_RED
)
p_rhs = task.add_input(rhs_array.base)
@@ -3345,7 +3364,7 @@ def binary_op(
with Annotation({"OpCode": op_code.name}):
# Populate the Legate launcher
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.BINARY_OP
+ self.library, CuPyNumericOpCode.BINARY_OP
)
p_lhs = task.add_output(lhs)
p_rhs1 = task.add_input(rhs1)
@@ -3369,13 +3388,14 @@ def binary_reduction(
args: tuple[Scalar, ...],
) -> None:
lhs = self.base
- rhs1 = src1.base
- rhs2 = src2.base
assert lhs.has_scalar_storage
if broadcast is not None:
- rhs1 = rhs1._broadcast(broadcast)
- rhs2 = rhs2._broadcast(broadcast)
+ rhs1 = src1._broadcast(broadcast)
+ rhs2 = src2._broadcast(broadcast)
+ else:
+ rhs1 = src1.base
+ rhs2 = src2.base
# Populate the Legate launcher
if op == BinaryOpCode.NOT_EQUAL:
@@ -3385,7 +3405,7 @@ def binary_reduction(
redop = ReductionOpKind.MUL
self.fill(np.array(True))
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.BINARY_RED
+ self.library, CuPyNumericOpCode.BINARY_RED
)
task.add_reduction(lhs, redop)
p_rhs1 = task.add_input(rhs1)
@@ -3407,7 +3427,7 @@ def where(self, src1: Any, src2: Any, src3: Any) -> None:
# Populate the Legate launcher
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.WHERE
+ self.library, CuPyNumericOpCode.WHERE
)
p_lhs = task.add_output(lhs)
p_rhs1 = task.add_input(rhs1)
@@ -3424,7 +3444,7 @@ def argwhere(self) -> NumPyThunk:
result = runtime.create_unbound_thunk(ty.int64, ndim=2)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.ARGWHERE
+ self.library, CuPyNumericOpCode.ARGWHERE
)
task.add_output(result.base)
@@ -3446,8 +3466,16 @@ def compute_strides(shape: NdShape) -> tuple[int, ...]:
return result
@auto_convert("src")
- def cholesky(self, src: Any, no_tril: bool = False) -> None:
- cholesky_deferred(self, src, no_tril)
+ def cholesky(self, src: Any) -> None:
+ cholesky_deferred(self, src)
+
+ @auto_convert("ew", "ev")
+ def eig(self, ew: Any, ev: Any) -> None:
+ eig_deferred(self, ew, ev)
+
+ @auto_convert("ew")
+ def eigvals(self, ew: Any) -> None:
+ eig_deferred(self, ew)
@auto_convert("q", "r")
def qr(self, q: Any, r: Any) -> None:
@@ -3489,7 +3517,7 @@ def scan(
output = input
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.SCAN_LOCAL
+ self.library, CuPyNumericOpCode.SCAN_LOCAL
)
p_out = task.add_output(output.base)
p_in = task.add_input(input.base)
@@ -3505,7 +3533,7 @@ def scan(
# NOTE: Each node will do a sum up to its index, alternatively could
# do one centralized scan and broadcast (slightly less redundant work)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.SCAN_GLOBAL
+ self.library, CuPyNumericOpCode.SCAN_GLOBAL
)
task.add_input(output.base)
p_temp = task.add_input(temp.base)
@@ -3526,7 +3554,7 @@ def unique(self) -> NumPyThunk:
result = runtime.create_unbound_thunk(self.base.type)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.UNIQUE
+ self.library, CuPyNumericOpCode.UNIQUE
)
task.add_output(result.base)
@@ -3539,7 +3567,7 @@ def unique(self) -> NumPyThunk:
if runtime.num_gpus == 0 and runtime.num_procs > 1:
result.base = legate_runtime.tree_reduce(
- self.library, CuNumericOpCode.UNIQUE_REDUCE, result.base
+ self.library, CuPyNumericOpCode.UNIQUE_REDUCE, result.base
)
return result
@@ -3547,7 +3575,7 @@ def unique(self) -> NumPyThunk:
@auto_convert("rhs", "v")
def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.SEARCHSORTED
+ self.library, CuPyNumericOpCode.SEARCHSORTED
)
is_left = side == "left"
@@ -3587,7 +3615,7 @@ def sort(
if order is not None:
raise NotImplementedError(
- "cuNumeric does not support sorting with 'order' as "
+ "cuPyNumeric does not support sorting with 'order' as "
"ndarray only supports numeric values"
)
if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim):
@@ -3607,7 +3635,7 @@ def partition(
) -> None:
if order is not None:
raise NotImplementedError(
- "cuNumeric does not support partitioning with 'order' as "
+ "cuPyNumeric does not support partitioning with 'order' as "
"ndarray only supports numeric values"
)
if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim):
@@ -3618,7 +3646,7 @@ def partition(
def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None:
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.WINDOW
+ self.library, CuPyNumericOpCode.WINDOW
)
task.add_output(self.base)
task.add_scalar_arg(op_code, ty.int32)
@@ -3631,7 +3659,7 @@ def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None:
def packbits(self, src: Any, axis: int | None, bitorder: BitOrder) -> None:
bitorder_code = getattr(Bitorder, bitorder.upper())
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.PACKBITS
+ self.library, CuPyNumericOpCode.PACKBITS
)
p_out = task.declare_partition()
p_in = task.declare_partition()
@@ -3649,7 +3677,7 @@ def unpackbits(
) -> None:
bitorder_code = getattr(Bitorder, bitorder.upper())
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.UNPACKBITS
+ self.library, CuPyNumericOpCode.UNPACKBITS
)
p_out = task.declare_partition()
p_in = task.declare_partition()
@@ -3682,7 +3710,7 @@ def _wrap(self, src: Any, new_len: int) -> None:
)
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.WRAP
+ self.library, CuPyNumericOpCode.WRAP
)
task.add_output(indirect.base)
task.add_scalar_arg(src.shape, (ty.int64,))
@@ -3710,7 +3738,7 @@ def histogram(self, src: Any, bins: Any, weights: Any) -> None:
dst_array.fill(np.array(0, dst_array.dtype))
task = legate_runtime.create_auto_task(
- self.library, CuNumericOpCode.HISTOGRAM
+ self.library, CuPyNumericOpCode.HISTOGRAM
)
p_dst = task.add_reduction(dst_array.base, ReductionOpKind.ADD)
p_src = task.add_input(src_array.base)
diff --git a/cunumeric/_thunk/eager.py b/cupynumeric/_thunk/eager.py
similarity index 95%
rename from cunumeric/_thunk/eager.py
rename to cupynumeric/_thunk/eager.py
index 868fb97bf9..4eb86df694 100644
--- a/cunumeric/_thunk/eager.py
+++ b/cupynumeric/_thunk/eager.py
@@ -45,6 +45,7 @@
from ..config import BitGeneratorType, FFTType
from ..types import (
BitOrder,
+ ConvolveMethod,
ConvolveMode,
NdShape,
OrderType,
@@ -336,17 +337,30 @@ def conj(self) -> NumPyThunk:
return EagerArray(self.array.conj())
- def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None:
+ def convolve(
+ self,
+ input: Any,
+ filter: Any,
+ mode: ConvolveMode,
+ method: ConvolveMethod,
+ ) -> None:
self.check_eager_args(input, filter)
if self.deferred is not None:
- self.deferred.convolve(input, filter, mode)
+ self.deferred.convolve(input, filter, mode, method)
else:
if self.ndim == 1:
+ if method != "auto":
+ runtime.warn(
+ f"the method {method} is ignored "
+ "for the 1D convolution"
+ )
self.array[:] = np.convolve(input.array, filter.array, mode)
else:
from scipy.signal import convolve # type: ignore [import]
- self.array[...] = convolve(input.array, filter.array, mode)
+ self.array[...] = convolve(
+ input.array, filter.array, mode, method
+ )
def fft(
self,
@@ -1453,17 +1467,21 @@ def unary_op(
func(
rhs.array,
out=self.array,
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where
+ if not isinstance(where, EagerArray)
+ else where.array
+ ),
)
else:
func(
rhs.array,
out=(self.array, *(out.array for out in multiout)),
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where
+ if not isinstance(where, EagerArray)
+ else where.array
+ ),
)
elif op == UnaryOpCode.CLIP:
np.clip(
@@ -1535,9 +1553,9 @@ def unary_reduction(
out=self.array,
axis=orig_axis,
keepdims=keepdims,
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where if not isinstance(where, EagerArray) else where.array
+ ),
**kws,
)
elif op == UnaryRedCode.SUM_SQUARES:
@@ -1546,9 +1564,9 @@ def unary_reduction(
squared,
out=self.array,
axis=orig_axis,
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where if not isinstance(where, EagerArray) else where.array
+ ),
keepdims=keepdims,
)
elif op == UnaryRedCode.VARIANCE:
@@ -1558,9 +1576,9 @@ def unary_reduction(
np.sum(
squares,
axis=orig_axis,
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where if not isinstance(where, EagerArray) else where.array
+ ),
keepdims=keepdims,
out=self.array,
)
@@ -1605,9 +1623,9 @@ def binary_op(
rhs1.array,
rhs2.array,
out=self.array,
- where=where
- if not isinstance(where, EagerArray)
- else where.array,
+ where=(
+ where if not isinstance(where, EagerArray) else where.array
+ ),
)
def binary_reduction(
@@ -1661,10 +1679,10 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None:
else:
self.array[:] = np.triu(rhs.array, k)
- def cholesky(self, src: Any, no_tril: bool) -> None:
+ def cholesky(self, src: Any) -> None:
self.check_eager_args(src)
if self.deferred is not None:
- self.deferred.cholesky(src, no_tril)
+ self.deferred.cholesky(src)
else:
try:
result = np.linalg.cholesky(src.array)
@@ -1672,10 +1690,40 @@ def cholesky(self, src: Any, no_tril: bool) -> None:
from ..linalg import LinAlgError
raise LinAlgError(e) from e
- if no_tril:
- result = np.triu(result.T.conj(), k=1) + result
+
self.array[:] = result
+ def eig(self, ew: Any, ev: Any) -> None:
+ self.check_eager_args(ew, ev)
+ if self.deferred is not None and (
+ runtime.num_gpus == 0 or runtime.cusolver_has_geev()
+ ):
+ self.deferred.eig(ew, ev)
+ else:
+ try:
+ result_ew, result_ev = np.linalg.eig(self.array)
+ except np.linalg.LinAlgError as e:
+ from ..linalg import LinAlgError
+
+ raise LinAlgError(e) from e
+ ew.array[:] = result_ew
+ ev.array[:] = result_ev
+
+ def eigvals(self, ew: Any) -> None:
+ self.check_eager_args(ew)
+ if self.deferred is not None and (
+ runtime.num_gpus == 0 or runtime.cusolver_has_geev()
+ ):
+ self.deferred.eigvals(ew)
+ else:
+ try:
+ result_ew = np.linalg.eigvals(self.array)
+ except np.linalg.LinAlgError as e:
+ from ..linalg import LinAlgError
+
+ raise LinAlgError(e) from e
+ ew.array[:] = result_ew
+
def qr(self, q: Any, r: Any) -> None:
self.check_eager_args(q, r)
if self.deferred is not None:
diff --git a/cunumeric/_thunk/thunk.py b/cupynumeric/_thunk/thunk.py
similarity index 78%
rename from cunumeric/_thunk/thunk.py
rename to cupynumeric/_thunk/thunk.py
index 5dbe09264c..06619d7dc1 100644
--- a/cunumeric/_thunk/thunk.py
+++ b/cupynumeric/_thunk/thunk.py
@@ -36,6 +36,7 @@
)
from ..types import (
BitOrder,
+ ConvolveMethod,
ConvolveMode,
NdShape,
OrderType,
@@ -48,7 +49,7 @@
class NumPyThunk(ABC):
"""This is the base class for NumPy computations. It has methods
for all the kinds of computations and operations that can be done
- on cuNumeric ndarrays.
+ on cuPyNumeric ndarrays.
:meta private:
"""
@@ -73,28 +74,28 @@ def size(self) -> int:
# Abstract methods
@abstractproperty
- def shape(self) -> NdShape:
- ...
+ def shape(self) -> NdShape: ...
@abstractmethod
- def __numpy_array__(self) -> npt.NDArray[Any]:
- ...
+ def __numpy_array__(self) -> npt.NDArray[Any]: ...
@abstractmethod
- def imag(self) -> NumPyThunk:
- ...
+ def imag(self) -> NumPyThunk: ...
@abstractmethod
- def real(self) -> NumPyThunk:
- ...
+ def real(self) -> NumPyThunk: ...
@abstractmethod
- def conj(self) -> NumPyThunk:
- ...
+ def conj(self) -> NumPyThunk: ...
@abstractmethod
- def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None:
- ...
+ def convolve(
+ self,
+ input: Any,
+ filter: Any,
+ mode: ConvolveMode,
+ method: ConvolveMethod,
+ ) -> None: ...
@abstractmethod
def fft(
@@ -103,43 +104,34 @@ def fft(
axes: Sequence[int],
kind: FFTType,
direction: FFTDirection,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def copy(self, rhs: Any, deep: bool) -> None:
- ...
+ def copy(self, rhs: Any, deep: bool) -> None: ...
@abstractmethod
def repeat(
self, repeats: Any, axis: int, scalar_repeats: bool
- ) -> NumPyThunk:
- ...
+ ) -> NumPyThunk: ...
@property
@abstractmethod
- def scalar(self) -> bool:
- ...
+ def scalar(self) -> bool: ...
@abstractmethod
- def get_item(self, key: Any) -> NumPyThunk:
- ...
+ def get_item(self, key: Any) -> NumPyThunk: ...
@abstractmethod
- def set_item(self, key: Any, value: Any) -> None:
- ...
+ def set_item(self, key: Any, value: Any) -> None: ...
@abstractmethod
- def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk:
- ...
+ def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk: ...
@abstractmethod
- def squeeze(self, axis: int | tuple[int, ...] | None) -> NumPyThunk:
- ...
+ def squeeze(self, axis: int | tuple[int, ...] | None) -> NumPyThunk: ...
@abstractmethod
- def swapaxes(self, axis1: int, axis2: int) -> NumPyThunk:
- ...
+ def swapaxes(self, axis1: int, axis2: int) -> NumPyThunk: ...
@abstractmethod
def convert(
@@ -148,20 +140,16 @@ def convert(
warn: bool = True,
nan_op: ConvertCode = ConvertCode.NOOP,
temporary: bool = False,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def fill(self, value: Any) -> None:
- ...
+ def fill(self, value: Any) -> None: ...
@abstractmethod
- def transpose(self, axes: tuple[int, ...] | list[int]) -> NumPyThunk:
- ...
+ def transpose(self, axes: tuple[int, ...] | list[int]) -> NumPyThunk: ...
@abstractmethod
- def flip(self, rhs: Any, axes: int | tuple[int, ...] | None) -> None:
- ...
+ def flip(self, rhs: Any, axes: int | tuple[int, ...] | None) -> None: ...
@abstractmethod
def contract(
@@ -172,12 +160,10 @@ def contract(
rhs2_thunk: Any,
rhs2_modes: list[str],
mode2extent: dict[str, int],
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def choose(self, rhs: Any, *args: Any) -> None:
- ...
+ def choose(self, rhs: Any, *args: Any) -> None: ...
@abstractmethod
def select(
@@ -185,46 +171,38 @@ def select(
condlist: Iterable[Any],
choicelist: Iterable[Any],
default: npt.NDArray[Any],
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def _diag_helper(
self, rhs: Any, offset: int, naxes: int, extract: bool, trace: bool
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
- ...
+ def put(self, indices: Any, values: Any, check_bounds: bool) -> None: ...
@abstractmethod
- def putmask(self, mask: Any, values: Any) -> None:
- ...
+ def putmask(self, mask: Any, values: Any) -> None: ...
@abstractmethod
- def eye(self, k: int) -> None:
- ...
+ def eye(self, k: int) -> None: ...
@abstractmethod
- def arange(self, start: float, stop: float, step: float) -> None:
- ...
+ def arange(self, start: float, stop: float, step: float) -> None: ...
@abstractmethod
- def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None:
- ...
+ def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None: ...
@abstractmethod
- def trilu(self, rhs: Any, k: int, lower: bool) -> None:
- ...
+ def trilu(self, rhs: Any, k: int, lower: bool) -> None: ...
@abstractmethod
- def bincount(self, rhs: Any, weights: NumPyThunk | None = None) -> None:
- ...
+ def bincount(
+ self, rhs: Any, weights: NumPyThunk | None = None
+ ) -> None: ...
@abstractmethod
- def nonzero(self) -> tuple[NumPyThunk, ...]:
- ...
+ def nonzero(self) -> tuple[NumPyThunk, ...]: ...
@abstractmethod
def bitgenerator_random_raw(
@@ -233,8 +211,7 @@ def bitgenerator_random_raw(
generatorType: BitGeneratorType,
seed: int | None,
flags: int,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_integers(
@@ -245,8 +222,7 @@ def bitgenerator_integers(
flags: int,
low: int,
high: int,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_uniform(
@@ -257,8 +233,7 @@ def bitgenerator_uniform(
flags: int,
low: float,
high: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_lognormal(
@@ -269,8 +244,7 @@ def bitgenerator_lognormal(
flags: int,
mean: float,
sigma: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_normal(
@@ -281,8 +255,7 @@ def bitgenerator_normal(
flags: int,
mean: float,
sigma: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_poisson(
@@ -292,8 +265,7 @@ def bitgenerator_poisson(
seed: int | None,
flags: int,
lam: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_exponential(
@@ -303,8 +275,7 @@ def bitgenerator_exponential(
seed: int | None,
flags: int,
scale: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_gumbel(
@@ -315,8 +286,7 @@ def bitgenerator_gumbel(
flags: int,
mu: float,
beta: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_laplace(
@@ -327,8 +297,7 @@ def bitgenerator_laplace(
flags: int,
mu: float,
beta: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_logistic(
@@ -339,8 +308,7 @@ def bitgenerator_logistic(
flags: int,
mu: float,
beta: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_pareto(
@@ -350,8 +318,7 @@ def bitgenerator_pareto(
seed: int | None,
flags: int,
alpha: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_power(
@@ -361,8 +328,7 @@ def bitgenerator_power(
seed: int | None,
flags: int,
alpha: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_rayleigh(
@@ -372,8 +338,7 @@ def bitgenerator_rayleigh(
seed: int | None,
flags: int,
sigma: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_cauchy(
@@ -384,8 +349,7 @@ def bitgenerator_cauchy(
flags: int,
x0: float,
gamma: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_triangular(
@@ -397,8 +361,7 @@ def bitgenerator_triangular(
a: float,
b: float,
c: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_weibull(
@@ -409,8 +372,7 @@ def bitgenerator_weibull(
flags: int,
lam: float,
k: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_bytes(
@@ -419,8 +381,7 @@ def bitgenerator_bytes(
generatorType: BitGeneratorType,
seed: int | None,
flags: int,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_beta(
@@ -431,8 +392,7 @@ def bitgenerator_beta(
flags: int,
a: float,
b: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_f(
@@ -443,8 +403,7 @@ def bitgenerator_f(
flags: int,
dfnum: float,
dfden: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_logseries(
@@ -454,8 +413,7 @@ def bitgenerator_logseries(
seed: int | None,
flags: int,
p: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_noncentral_f(
@@ -467,8 +425,7 @@ def bitgenerator_noncentral_f(
dfnum: float,
dfden: float,
nonc: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_chisquare(
@@ -479,8 +436,7 @@ def bitgenerator_chisquare(
flags: int,
df: float,
nonc: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_gamma(
@@ -491,8 +447,7 @@ def bitgenerator_gamma(
flags: int,
k: float,
theta: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_standard_t(
@@ -502,8 +457,7 @@ def bitgenerator_standard_t(
seed: int | None,
flags: int,
df: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_hypergeometric(
@@ -515,8 +469,7 @@ def bitgenerator_hypergeometric(
ngood: int,
nbad: int,
nsample: int,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_vonmises(
@@ -527,8 +480,7 @@ def bitgenerator_vonmises(
flags: int,
mu: float,
kappa: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_zipf(
@@ -538,8 +490,7 @@ def bitgenerator_zipf(
seed: int | None,
flags: int,
alpha: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_geometric(
@@ -549,8 +500,7 @@ def bitgenerator_geometric(
seed: int | None,
flags: int,
p: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_wald(
@@ -561,8 +511,7 @@ def bitgenerator_wald(
flags: int,
mean: float,
scale: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_binomial(
@@ -573,8 +522,7 @@ def bitgenerator_binomial(
flags: int,
ntrials: int,
p: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def bitgenerator_negative_binomial(
@@ -585,12 +533,10 @@ def bitgenerator_negative_binomial(
flags: int,
ntrials: int,
p: float,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def random_uniform(self) -> None:
- ...
+ def random_uniform(self) -> None: ...
@abstractmethod
def partition(
@@ -601,24 +547,22 @@ def partition(
axis: int | None = -1,
kind: SelectKind = "introselect",
order: str | list[str] | None = None,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def random_normal(self) -> None:
- ...
+ def random_normal(self) -> None: ...
@abstractmethod
def random_integer(
self,
low: int | npt.NDArray[Any],
high: int | npt.NDArray[Any],
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
- ...
+ def searchsorted(
+ self, rhs: Any, v: Any, side: SortSide = "left"
+ ) -> None: ...
@abstractmethod
def sort(
@@ -628,8 +572,7 @@ def sort(
axis: int | None = -1,
kind: SortType = "quicksort",
order: str | list[str] | None = None,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def unary_op(
@@ -639,8 +582,7 @@ def unary_op(
where: Any,
args: tuple[Scalar, ...] = (),
multiout: Any | None = None,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def unary_reduction(
@@ -653,14 +595,12 @@ def unary_reduction(
keepdims: bool,
args: tuple[Scalar, ...],
initial: Any,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def isclose(
self, rhs1: Any, rhs2: Any, rtol: float, atol: float, equal_nan: bool
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def binary_op(
@@ -670,8 +610,7 @@ def binary_op(
rhs2: Any,
where: Any,
args: tuple[Scalar, ...],
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
def binary_reduction(
@@ -681,36 +620,34 @@ def binary_reduction(
rhs2: Any,
broadcast: NdShape | None,
args: tuple[Scalar, ...],
- ) -> None:
- ...
+ ) -> None: ...
+
+ @abstractmethod
+ def broadcast_to(self, shape: NdShape) -> NumPyThunk: ...
@abstractmethod
- def broadcast_to(self, shape: NdShape) -> NumPyThunk:
- ...
+ def argwhere(self) -> NumPyThunk: ...
@abstractmethod
- def argwhere(self) -> NumPyThunk:
- ...
+ def where(self, rhs1: Any, rhs2: Any, rhs3: Any) -> None: ...
@abstractmethod
- def where(self, rhs1: Any, rhs2: Any, rhs3: Any) -> None:
- ...
+ def cholesky(self, src: Any) -> None: ...
@abstractmethod
- def cholesky(self, src: Any, no_tril: bool) -> None:
- ...
+ def eig(self, ew: Any, ev: Any) -> None: ...
@abstractmethod
- def qr(self, q: Any, r: Any) -> None:
- ...
+ def eigvals(self, ew: Any) -> None: ...
@abstractmethod
- def solve(self, a: Any, b: Any) -> None:
- ...
+ def qr(self, q: Any, r: Any) -> None: ...
@abstractmethod
- def svd(self, u: Any, s: Any, vh: Any) -> None:
- ...
+ def solve(self, a: Any, b: Any) -> None: ...
+
+ @abstractmethod
+ def svd(self, u: Any, s: Any, vh: Any) -> None: ...
@abstractmethod
def scan(
@@ -720,39 +657,35 @@ def scan(
axis: int,
dtype: npt.DTypeLike | None,
nan_to_identity: bool,
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def unique(self) -> NumPyThunk:
- ...
+ def unique(self) -> NumPyThunk: ...
@abstractmethod
- def create_window(self, op_code: WindowOpCode, M: Any, *args: Any) -> None:
- ...
+ def create_window(
+ self, op_code: WindowOpCode, M: Any, *args: Any
+ ) -> None: ...
@abstractmethod
- def packbits(self, src: Any, axis: int | None, bitorder: BitOrder) -> None:
- ...
+ def packbits(
+ self, src: Any, axis: int | None, bitorder: BitOrder
+ ) -> None: ...
@abstractmethod
def unpackbits(
self, src: Any, axis: int | None, bitorder: BitOrder
- ) -> None:
- ...
+ ) -> None: ...
@abstractmethod
- def _wrap(self, src: Any, new_len: int) -> None:
- ...
+ def _wrap(self, src: Any, new_len: int) -> None: ...
@abstractmethod
- def histogram(self, src: Any, bins: Any, weights: Any) -> None:
- ...
+ def histogram(self, src: Any, bins: Any, weights: Any) -> None: ...
@abstractmethod
def stencil_hint(
self,
low_offsets: tuple[int, ...],
high_offsets: tuple[int, ...],
- ) -> None:
- ...
+ ) -> None: ...
diff --git a/cunumeric/_ufunc/__init__.py b/cupynumeric/_ufunc/__init__.py
similarity index 100%
rename from cunumeric/_ufunc/__init__.py
rename to cupynumeric/_ufunc/__init__.py
diff --git a/cunumeric/_ufunc/bit_twiddling.py b/cupynumeric/_ufunc/bit_twiddling.py
similarity index 100%
rename from cunumeric/_ufunc/bit_twiddling.py
rename to cupynumeric/_ufunc/bit_twiddling.py
diff --git a/cunumeric/_ufunc/comparison.py b/cupynumeric/_ufunc/comparison.py
similarity index 97%
rename from cunumeric/_ufunc/comparison.py
rename to cupynumeric/_ufunc/comparison.py
index 089aa7f0fe..148854fad0 100644
--- a/cunumeric/_ufunc/comparison.py
+++ b/cupynumeric/_ufunc/comparison.py
@@ -18,7 +18,7 @@
import numpy as np
-from .._array.util import convert_to_cunumeric_ndarray
+from .._array.util import convert_to_cupynumeric_ndarray
from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode
from .ufunc import (
all_dtypes,
@@ -74,7 +74,7 @@ def _post_resolution_check(
if truthiness is not None:
# Replace with an always-true/always-false operation
- arr_x = convert_to_cunumeric_ndarray(
+ arr_x = convert_to_cupynumeric_ndarray(
np.array(iinfo.min, dtype=arr_x.dtype)
)
op_code = (
@@ -98,7 +98,7 @@ def _post_resolution_check(
if truthiness is not None:
# Replace with an always-true/always-false operation
- arr_y = convert_to_cunumeric_ndarray(
+ arr_y = convert_to_cupynumeric_ndarray(
np.array(iinfo.min, dtype=arr_y.dtype)
)
op_code = (
diff --git a/cunumeric/_ufunc/floating.py b/cupynumeric/_ufunc/floating.py
similarity index 100%
rename from cunumeric/_ufunc/floating.py
rename to cupynumeric/_ufunc/floating.py
diff --git a/cunumeric/_ufunc/math.py b/cupynumeric/_ufunc/math.py
similarity index 100%
rename from cunumeric/_ufunc/math.py
rename to cupynumeric/_ufunc/math.py
diff --git a/cunumeric/_ufunc/trigonometric.py b/cupynumeric/_ufunc/trigonometric.py
similarity index 100%
rename from cunumeric/_ufunc/trigonometric.py
rename to cupynumeric/_ufunc/trigonometric.py
diff --git a/cunumeric/_ufunc/ufunc.py b/cupynumeric/_ufunc/ufunc.py
similarity index 97%
rename from cunumeric/_ufunc/ufunc.py
rename to cupynumeric/_ufunc/ufunc.py
index 74b4f8badf..6eb42a3221 100644
--- a/cunumeric/_ufunc/ufunc.py
+++ b/cupynumeric/_ufunc/ufunc.py
@@ -19,11 +19,13 @@
import numpy as np
from legate.core.utils import OrderedSet
+from cupynumeric._utils import is_np2_1
+
from .._array.thunk import perform_unary_reduction
from .._array.util import (
add_boilerplate,
check_writeable,
- convert_to_cunumeric_ndarray,
+ convert_to_cupynumeric_ndarray,
)
from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode
from ..types import NdShape
@@ -79,7 +81,7 @@
numpy.{}
Availability
---------
+------------
Multiple GPUs, Multiple CPUs
"""
@@ -117,7 +119,7 @@
numpy.{}
Availability
---------
+------------
Multiple GPUs, Multiple CPUs
"""
@@ -155,7 +157,7 @@
numpy.{}
Availability
---------
+------------
Multiple GPUs, Multiple CPUs
"""
@@ -322,7 +324,7 @@ def _maybe_cast_output(out: ndarray | None, result: ndarray) -> ndarray:
return out
@staticmethod
- def _maybe_convert_output_to_cunumeric_ndarray(
+ def _maybe_convert_output_to_cupynumeric_ndarray(
out: ndarray | npt.NDArray[Any] | None,
) -> ndarray | None:
from .._array.array import ndarray
@@ -332,7 +334,7 @@ def _maybe_convert_output_to_cunumeric_ndarray(
if isinstance(out, ndarray):
return out
if isinstance(out, np.ndarray):
- return convert_to_cunumeric_ndarray(out, share=True)
+ return convert_to_cupynumeric_ndarray(out, share=True)
raise TypeError("return arrays must be of ArrayType")
def _prepare_operands(
@@ -354,7 +356,7 @@ def _prepare_operands(
)
inputs = tuple(
- convert_to_cunumeric_ndarray(arr) for arr in args[: self.nin]
+ convert_to_cupynumeric_ndarray(arr) for arr in args[: self.nin]
)
if len(args) > self.nin:
@@ -374,7 +376,7 @@ def _prepare_operands(
computed_out = out
outputs = tuple(
- self._maybe_convert_output_to_cunumeric_ndarray(arr)
+ self._maybe_convert_output_to_cupynumeric_ndarray(arr)
for arr in computed_out
)
@@ -486,6 +488,14 @@ def __call__(
precision_fixed = True
x = self._maybe_cast_input(x, dtype, casting)
+ if (
+ self._name in {"ceil", "floor", "trunc"}
+ and is_np2_1
+ and np.issubdtype(x.dtype, np.integer)
+ ):
+ result = x
+ return self._maybe_cast_output(out, result)
+
# Resolve the dtype to use for the computation and cast the input
# if necessary. If the dtype is already fixed by the caller,
# the dtype must be one of the dtypes supported by this operation.
@@ -666,9 +676,11 @@ def _resolve_dtype(
else:
to_dtypes = tuple(arr.dtype for arr in arrs)
key = tuple(
- arr.dtype.char
- if type(orig) not in (int, float, complex)
- else type(orig)
+ (
+ arr.dtype.char
+ if type(orig) not in (int, float, complex)
+ else type(orig)
+ )
for orig, arr in zip(orig_args, arrs)
)
# When all inputs are scalars, cannot use weak logic below.
diff --git a/cunumeric/_utils/__init__.py b/cupynumeric/_utils/__init__.py
similarity index 92%
rename from cunumeric/_utils/__init__.py
rename to cupynumeric/_utils/__init__.py
index 626ef7aae5..d292c29016 100644
--- a/cunumeric/_utils/__init__.py
+++ b/cupynumeric/_utils/__init__.py
@@ -17,3 +17,4 @@
import numpy as np
is_np2 = np.lib.NumpyVersion(np.__version__) >= "2.0.0b1"
+is_np2_1 = np.lib.NumpyVersion(np.__version__) >= "2.1.0b1"
diff --git a/cunumeric/_utils/array.py b/cupynumeric/_utils/array.py
similarity index 71%
rename from cunumeric/_utils/array.py
rename to cupynumeric/_utils/array.py
index 6e35735d30..5ad037e39b 100644
--- a/cunumeric/_utils/array.py
+++ b/cupynumeric/_utils/array.py
@@ -15,13 +15,17 @@
from __future__ import annotations
from functools import reduce
-from typing import Any
+from typing import TYPE_CHECKING, Any
import legate.core.types as ty
import numpy as np
+from legate.core import PhysicalArray, StoreTarget
from ..types import NdShape
+if TYPE_CHECKING:
+ from legate.core import PhysicalStore
+
SUPPORTED_DTYPES = {
np.dtype(bool): ty.bool_,
np.dtype(np.int8): ty.int8,
@@ -42,7 +46,7 @@
def is_supported_dtype(dtype: str | np.dtype[Any]) -> bool:
"""
- Whether a NumPy dtype is supported by cuNumeric
+ Whether a NumPy dtype is supported by cuPyNumeric
Parameters
----------
@@ -60,7 +64,7 @@ def is_supported_dtype(dtype: str | np.dtype[Any]) -> bool:
def to_core_type(dtype: str | np.dtype[Any]) -> ty.Type:
core_dtype = SUPPORTED_DTYPES.get(np.dtype(dtype))
if core_dtype is None:
- raise TypeError(f"cuNumeric does not support dtype={dtype}")
+ raise TypeError(f"cuPyNumeric does not support dtype={dtype}")
return core_dtype
@@ -111,3 +115,32 @@ def min_identity(
return True
else:
raise ValueError(f"Unsupported dtype: {ty}")
+
+
+def local_task_array(obj: PhysicalArray | PhysicalStore) -> Any:
+ """
+ Generate an appropriate local-memory ndarray object, that is backed by the
+ portion of a Legate array or store that was passed to a task.
+
+ Parameters
+ ----------
+ obj : PhysicalArray | PhysicalStore
+ A Legate physical array or store to adapt.
+
+ Returns
+ -------
+ arr : cupy.ndarray or np.ndarray
+ If the array or store is located on GPU, then this function will return
+ a CuPy array. Otherwise, a NumPy array is returned.
+
+ """
+ store = obj.data() if isinstance(obj, PhysicalArray) else obj
+
+ if store.target in {StoreTarget.FBMEM, StoreTarget.ZCMEM}:
+ # cupy is only a dependency for GPU packages -- but we should
+ # only hit this import in case the store is located on a GPU
+ import cupy # type: ignore [import-untyped,import-not-found]
+
+ return cupy.asarray(store)
+ else:
+ return np.asarray(store)
diff --git a/cunumeric/_utils/coverage.py b/cupynumeric/_utils/coverage.py
similarity index 88%
rename from cunumeric/_utils/coverage.py
rename to cupynumeric/_utils/coverage.py
index 3b87bb89f6..0a05f82360 100644
--- a/cunumeric/_utils/coverage.py
+++ b/cupynumeric/_utils/coverage.py
@@ -17,13 +17,7 @@
import warnings
from dataclasses import dataclass
from functools import WRAPPER_ASSIGNMENTS, wraps
-from types import (
- BuiltinFunctionType,
- FunctionType,
- MethodDescriptorType,
- MethodType,
- ModuleType,
-)
+from types import BuiltinFunctionType, ModuleType
from typing import Any, Callable, Container, Iterable, Mapping, Protocol, cast
from legate.core import track_provenance
@@ -37,7 +31,7 @@
__all__ = ("clone_module", "clone_class")
FALLBACK_WARNING = (
- "cuNumeric has not implemented {what} "
+ "cuPyNumeric has not implemented {what} "
+ "and is falling back to canonical NumPy. "
+ "You may notice significantly decreased performance "
+ "for this function call."
@@ -63,8 +57,7 @@ def filter_namespace(
class AnyCallable(Protocol):
- def __call__(self, *args: Any, **kwargs: Any) -> Any:
- ...
+ def __call__(self, *args: Any, **kwargs: Any) -> Any: ...
@dataclass(frozen=True)
@@ -75,7 +68,7 @@ class CuWrapperMetadata:
class CuWrapped(AnyCallable, Protocol):
- _cunumeric: CuWrapperMetadata
+ _cupynumeric_metadata: CuWrapperMetadata
__wrapped__: AnyCallable
__name__: str
__qualname__: str
@@ -122,7 +115,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
multi = "Multiple GPUs" in (getattr(func, "__doc__", None) or "")
single = "Single GPU" in (getattr(func, "__doc__", None) or "") or multi
- wrapper._cunumeric = CuWrapperMetadata(
+ wrapper._cupynumeric_metadata = CuWrapperMetadata(
implemented=True, single=single, multi=multi
)
@@ -147,7 +140,7 @@ def unimplemented(
# all array-like arguments to `numpy.ndarray` through `__array__()` (taking
# some care to skip the `__array_function__` dispatch logic, to avoid
# infinite loops). However, it appears that this behavior is inconsistent
- # in NumPy, so we will instead convert any `cunumeric.ndarray`s manually
+ # in NumPy, so we will instead convert any `cupynumeric.ndarray`s manually
# before calling into NumPy.
wrapper: CuWrapped
@@ -185,13 +178,13 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
return func(*args, **kwargs)
wrapper.__doc__ = f"""
- cuNumeric has not implemented this function, and will fall back to NumPy.
+ cuPyNumeric has not implemented this function, and will fall back to NumPy.
See Also
--------
{name}
"""
- wrapper._cunumeric = CuWrapperMetadata(implemented=False)
+ wrapper._cupynumeric_metadata = CuWrapperMetadata(implemented=False)
return wrapper
@@ -248,7 +241,7 @@ def clone_module(
# Only need to wrap things that are in the origin module to begin with
if attr not in origin_module.__dict__:
continue
- if isinstance(value, (FunctionType, lgufunc)) or (
+ if should_wrap(value) or (
include_builtin_function_type
and isinstance(value, BuiltinFunctionType)
):
@@ -279,7 +272,7 @@ def clone_module(
from numpy import ufunc as npufunc
for attr, value in missing.items():
- if isinstance(value, (FunctionType, npufunc)) or (
+ if should_wrap(value) or (
include_builtin_function_type
and isinstance(value, BuiltinFunctionType)
):
@@ -306,7 +299,19 @@ def clone_module(
def should_wrap(obj: object) -> bool:
- return isinstance(obj, (FunctionType, MethodType, MethodDescriptorType))
+ from numpy import ufunc as npufunc
+
+ from .._ufunc.ufunc import ufunc as lgufunc
+
+ # Custom callables, e.g. cython functions used in np2, do not inherit
+ # anything, so we check callable() instead (and include the __get__/__set__
+ # checks to filter out classes). OTOH ufuncs need to be checked specially
+ # because they do not have __get__.
+ return (
+ callable(obj)
+ and hasattr(obj, "__get__")
+ and not hasattr(obj, "__set__")
+ ) or isinstance(obj, (lgufunc, npufunc))
def clone_class(
@@ -363,13 +368,17 @@ def _clone_class(cls: type) -> type:
return _clone_class
+def is_wrapped(obj: Any) -> bool:
+ return hasattr(obj, "_cupynumeric_metadata")
+
+
def is_implemented(obj: Any) -> bool:
- return hasattr(obj, "_cunumeric") and obj._cunumeric.implemented
+ return is_wrapped(obj) and obj._cupynumeric_metadata.implemented
def is_single(obj: Any) -> bool:
- return hasattr(obj, "_cunumeric") and obj._cunumeric.single
+ return is_wrapped(obj) and obj._cupynumeric_metadata.single
def is_multi(obj: Any) -> bool:
- return hasattr(obj, "_cunumeric") and obj._cunumeric.multi
+ return is_wrapped(obj) and obj._cupynumeric_metadata.multi
diff --git a/cunumeric/_utils/linalg.py b/cupynumeric/_utils/linalg.py
similarity index 100%
rename from cunumeric/_utils/linalg.py
rename to cupynumeric/_utils/linalg.py
diff --git a/cunumeric/_utils/stack.py b/cupynumeric/_utils/stack.py
similarity index 91%
rename from cunumeric/_utils/stack.py
rename to cupynumeric/_utils/stack.py
index 470cf77750..f5e714a3c6 100644
--- a/cunumeric/_utils/stack.py
+++ b/cupynumeric/_utils/stack.py
@@ -21,7 +21,7 @@
def find_last_user_stacklevel() -> int:
stacklevel = 1
for frame, _ in traceback.walk_stack(None):
- if not frame.f_globals["__name__"].startswith("cunumeric"):
+ if not frame.f_globals["__name__"].startswith("cupynumeric"):
break
stacklevel += 1
return stacklevel
@@ -36,7 +36,7 @@ def find_last_user_frames(top_only: bool = True) -> str:
if "__name__" not in last.f_globals:
continue
name = last.f_globals["__name__"]
- if not any(name.startswith(pkg) for pkg in ("cunumeric", "legate")):
+ if not any(name.startswith(pkg) for pkg in ("cupynumeric", "legate")):
break
if top_only:
diff --git a/cunumeric/_utils/structure.py b/cupynumeric/_utils/structure.py
similarity index 100%
rename from cunumeric/_utils/structure.py
rename to cupynumeric/_utils/structure.py
diff --git a/cunumeric/_version.py b/cupynumeric/_version.py
similarity index 99%
rename from cunumeric/_version.py
rename to cupynumeric/_version.py
index 7c006fdc15..9d05050897 100644
--- a/cunumeric/_version.py
+++ b/cupynumeric/_version.py
@@ -43,8 +43,8 @@ def get_config():
cfg.VCS = "git"
cfg.style = "pep440"
cfg.tag_prefix = "v"
- cfg.parentdir_prefix = "cunumeric-"
- cfg.versionfile_source = "cunumeric/_version.py"
+ cfg.parentdir_prefix = "cupynumeric-"
+ cfg.versionfile_source = "cupynumeric/_version.py"
cfg.verbose = False
return cfg
diff --git a/cupynumeric/config.py b/cupynumeric/config.py
new file mode 100644
index 0000000000..c7a351d8f5
--- /dev/null
+++ b/cupynumeric/config.py
@@ -0,0 +1,842 @@
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import annotations
+
+import os
+import platform
+from abc import abstractmethod
+from ctypes import CDLL, RTLD_GLOBAL
+from enum import IntEnum, unique
+from typing import TYPE_CHECKING, Any, cast
+
+import cffi # type: ignore
+import numpy as np
+
+if TYPE_CHECKING:
+ import numpy.typing as npt
+
+
+class _ReductionOpIds:
+ argmax_redop_id: int
+ argmin_redop_id: int
+
+
+class _CupynumericSharedLib:
+ CUPYNUMERIC_ADVANCED_INDEXING: int
+ CUPYNUMERIC_ARANGE: int
+ CUPYNUMERIC_ARGWHERE: int
+ CUPYNUMERIC_BATCHED_CHOLESKY: int
+ CUPYNUMERIC_BINARY_OP: int
+ CUPYNUMERIC_BINARY_RED: int
+ CUPYNUMERIC_BINCOUNT: int
+ CUPYNUMERIC_BINOP_ADD: int
+ CUPYNUMERIC_BINOP_ARCTAN2: int
+ CUPYNUMERIC_BINOP_BITWISE_AND: int
+ CUPYNUMERIC_BINOP_BITWISE_OR: int
+ CUPYNUMERIC_BINOP_BITWISE_XOR: int
+ CUPYNUMERIC_BINOP_COPYSIGN: int
+ CUPYNUMERIC_BINOP_DIVIDE: int
+ CUPYNUMERIC_BINOP_EQUAL: int
+ CUPYNUMERIC_BINOP_FLOAT_POWER: int
+ CUPYNUMERIC_BINOP_FLOOR_DIVIDE: int
+ CUPYNUMERIC_BINOP_FMOD: int
+ CUPYNUMERIC_BINOP_GCD: int
+ CUPYNUMERIC_BINOP_GREATER: int
+ CUPYNUMERIC_BINOP_GREATER_EQUAL: int
+ CUPYNUMERIC_BINOP_HYPOT: int
+ CUPYNUMERIC_BINOP_ISCLOSE: int
+ CUPYNUMERIC_BINOP_LCM: int
+ CUPYNUMERIC_BINOP_LDEXP: int
+ CUPYNUMERIC_BINOP_LEFT_SHIFT: int
+ CUPYNUMERIC_BINOP_LESS: int
+ CUPYNUMERIC_BINOP_LESS_EQUAL: int
+ CUPYNUMERIC_BINOP_LOGADDEXP2: int
+ CUPYNUMERIC_BINOP_LOGADDEXP: int
+ CUPYNUMERIC_BINOP_LOGICAL_AND: int
+ CUPYNUMERIC_BINOP_LOGICAL_OR: int
+ CUPYNUMERIC_BINOP_LOGICAL_XOR: int
+ CUPYNUMERIC_BINOP_MAXIMUM: int
+ CUPYNUMERIC_BINOP_MINIMUM: int
+ CUPYNUMERIC_BINOP_MOD: int
+ CUPYNUMERIC_BINOP_MULTIPLY: int
+ CUPYNUMERIC_BINOP_NEXTAFTER: int
+ CUPYNUMERIC_BINOP_NOT_EQUAL: int
+ CUPYNUMERIC_BINOP_POWER: int
+ CUPYNUMERIC_BINOP_RIGHT_SHIFT: int
+ CUPYNUMERIC_BINOP_SUBTRACT: int
+ CUPYNUMERIC_BITGENERATOR: int
+ CUPYNUMERIC_BITGENOP_DISTRIBUTION: int
+ CUPYNUMERIC_BITGENTYPE_DEFAULT: int
+ CUPYNUMERIC_BITGENTYPE_XORWOW: int
+ CUPYNUMERIC_BITGENTYPE_MRG32K3A: int
+ CUPYNUMERIC_BITGENTYPE_MTGP32: int
+ CUPYNUMERIC_BITGENTYPE_MT19937: int
+ CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10: int
+ CUPYNUMERIC_BITGENDIST_INTEGERS_16: int
+ CUPYNUMERIC_BITGENDIST_INTEGERS_32: int
+ CUPYNUMERIC_BITGENDIST_INTEGERS_64: int
+ CUPYNUMERIC_BITGENDIST_UNIFORM_32: int
+ CUPYNUMERIC_BITGENDIST_UNIFORM_64: int
+ CUPYNUMERIC_BITGENDIST_LOGNORMAL_32: int
+ CUPYNUMERIC_BITGENDIST_LOGNORMAL_64: int
+ CUPYNUMERIC_BITGENDIST_NORMAL_32: int
+ CUPYNUMERIC_BITGENDIST_NORMAL_64: int
+ CUPYNUMERIC_BITGENDIST_POISSON: int
+ CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32: int
+ CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64: int
+ CUPYNUMERIC_BITGENDIST_GUMBEL_32: int
+ CUPYNUMERIC_BITGENDIST_GUMBEL_64: int
+ CUPYNUMERIC_BITGENDIST_LAPLACE_32: int
+ CUPYNUMERIC_BITGENDIST_LAPLACE_64: int
+ CUPYNUMERIC_BITGENDIST_LOGISTIC_32: int
+ CUPYNUMERIC_BITGENDIST_LOGISTIC_64: int
+ CUPYNUMERIC_BITGENDIST_PARETO_32: int
+ CUPYNUMERIC_BITGENDIST_PARETO_64: int
+ CUPYNUMERIC_BITGENDIST_POWER_32: int
+ CUPYNUMERIC_BITGENDIST_POWER_64: int
+ CUPYNUMERIC_BITGENDIST_RAYLEIGH_32: int
+ CUPYNUMERIC_BITGENDIST_RAYLEIGH_64: int
+ CUPYNUMERIC_BITGENDIST_CAUCHY_32: int
+ CUPYNUMERIC_BITGENDIST_CAUCHY_64: int
+ CUPYNUMERIC_BITGENDIST_TRIANGULAR_32: int
+ CUPYNUMERIC_BITGENDIST_TRIANGULAR_64: int
+ CUPYNUMERIC_BITGENDIST_WEIBULL_32: int
+ CUPYNUMERIC_BITGENDIST_WEIBULL_64: int
+ CUPYNUMERIC_BITGENDIST_BYTES: int
+ CUPYNUMERIC_BITGENDIST_BETA_32: int
+ CUPYNUMERIC_BITGENDIST_BETA_64: int
+ CUPYNUMERIC_BITGENDIST_F_32: int
+ CUPYNUMERIC_BITGENDIST_F_64: int
+ CUPYNUMERIC_BITGENDIST_LOGSERIES: int
+ CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32: int
+ CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64: int
+ CUPYNUMERIC_BITGENDIST_CHISQUARE_32: int
+ CUPYNUMERIC_BITGENDIST_CHISQUARE_64: int
+ CUPYNUMERIC_BITGENDIST_GAMMA_32: int
+ CUPYNUMERIC_BITGENDIST_GAMMA_64: int
+ CUPYNUMERIC_BITGENDIST_STANDARD_T_32: int
+ CUPYNUMERIC_BITGENDIST_STANDARD_T_64: int
+ CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC: int
+ CUPYNUMERIC_BITGENDIST_VONMISES_32: int
+ CUPYNUMERIC_BITGENDIST_VONMISES_64: int
+ CUPYNUMERIC_BITGENDIST_ZIPF: int
+ CUPYNUMERIC_BITGENDIST_GEOMETRIC: int
+ CUPYNUMERIC_BITGENDIST_WALD_32: int
+ CUPYNUMERIC_BITGENDIST_WALD_64: int
+ CUPYNUMERIC_BITGENDIST_BINOMIAL: int
+ CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL: int
+ CUPYNUMERIC_BITGENOP_CREATE: int
+ CUPYNUMERIC_BITGENOP_DESTROY: int
+ CUPYNUMERIC_BITGENOP_RAND_RAW: int
+ CUPYNUMERIC_BITORDER_BIG: int
+ CUPYNUMERIC_BITORDER_LITTLE: int
+ CUPYNUMERIC_CHOOSE: int
+ CUPYNUMERIC_CONTRACT: int
+ CUPYNUMERIC_CONVERT: int
+ CUPYNUMERIC_CONVERT_NAN_NOOP: int
+ CUPYNUMERIC_CONVERT_NAN_PROD: int
+ CUPYNUMERIC_CONVERT_NAN_SUM: int
+ CUPYNUMERIC_CONVOLVE: int
+ CUPYNUMERIC_CONVOLVE_AUTO: int
+ CUPYNUMERIC_CONVOLVE_DIRECT: int
+ CUPYNUMERIC_CONVOLVE_FFT: int
+ CUPYNUMERIC_DIAG: int
+ CUPYNUMERIC_DOT: int
+ CUPYNUMERIC_EYE: int
+ CUPYNUMERIC_FFT: int
+ CUPYNUMERIC_FFT_C2C: int
+ CUPYNUMERIC_FFT_C2R: int
+ CUPYNUMERIC_FFT_D2Z: int
+ CUPYNUMERIC_FFT_FORWARD: int
+ CUPYNUMERIC_FFT_INVERSE: int
+ CUPYNUMERIC_FFT_R2C: int
+ CUPYNUMERIC_FFT_Z2D: int
+ CUPYNUMERIC_FFT_Z2Z: int
+ CUPYNUMERIC_FILL: int
+ CUPYNUMERIC_FLIP: int
+ CUPYNUMERIC_GEEV: int
+ CUPYNUMERIC_GEMM: int
+ CUPYNUMERIC_HISTOGRAM: int
+ CUPYNUMERIC_LOAD_CUDALIBS: int
+ CUPYNUMERIC_MATMUL: int
+ CUPYNUMERIC_MATVECMUL: int
+ CUPYNUMERIC_MAX_MAPPERS: int
+ CUPYNUMERIC_MAX_REDOPS: int
+ CUPYNUMERIC_MAX_TASKS: int
+ CUPYNUMERIC_MP_POTRF: int
+ CUPYNUMERIC_MP_SOLVE: int
+ CUPYNUMERIC_NONZERO: int
+ CUPYNUMERIC_PACKBITS: int
+ CUPYNUMERIC_POTRF: int
+ CUPYNUMERIC_PUTMASK: int
+ CUPYNUMERIC_QR: int
+ CUPYNUMERIC_RAND: int
+ CUPYNUMERIC_READ: int
+ CUPYNUMERIC_RED_ALL: int
+ CUPYNUMERIC_RED_ANY: int
+ CUPYNUMERIC_RED_ARGMAX: int
+ CUPYNUMERIC_RED_ARGMIN: int
+ CUPYNUMERIC_RED_CONTAINS: int
+ CUPYNUMERIC_RED_COUNT_NONZERO: int
+ CUPYNUMERIC_RED_MAX: int
+ CUPYNUMERIC_RED_MIN: int
+ CUPYNUMERIC_RED_NANARGMAX: int
+ CUPYNUMERIC_RED_NANARGMIN: int
+ CUPYNUMERIC_RED_NANMAX: int
+ CUPYNUMERIC_RED_NANMIN: int
+ CUPYNUMERIC_RED_NANPROD: int
+ CUPYNUMERIC_RED_NANSUM: int
+ CUPYNUMERIC_RED_PROD: int
+ CUPYNUMERIC_RED_SUM: int
+ CUPYNUMERIC_RED_SUM_SQUARES: int
+ CUPYNUMERIC_RED_VARIANCE: int
+ CUPYNUMERIC_REPEAT: int
+ CUPYNUMERIC_SCALAR_UNARY_RED: int
+ CUPYNUMERIC_SCAN_GLOBAL: int
+ CUPYNUMERIC_SCAN_LOCAL: int
+ CUPYNUMERIC_SCAN_PROD: int
+ CUPYNUMERIC_SCAN_SUM: int
+ CUPYNUMERIC_SEARCHSORTED: int
+ CUPYNUMERIC_SELECT: int
+ CUPYNUMERIC_SOLVE: int
+ CUPYNUMERIC_SORT: int
+ CUPYNUMERIC_SVD: int
+ CUPYNUMERIC_SYRK: int
+ CUPYNUMERIC_TILE: int
+ CUPYNUMERIC_TRANSPOSE_COPY_2D: int
+ CUPYNUMERIC_TRILU: int
+ CUPYNUMERIC_TRSM: int
+ CUPYNUMERIC_UNARY_OP: int
+ CUPYNUMERIC_UNARY_RED: int
+ CUPYNUMERIC_UNIQUE: int
+ CUPYNUMERIC_UNIQUE_REDUCE: int
+ CUPYNUMERIC_UNLOAD_CUDALIBS: int
+ CUPYNUMERIC_UNPACKBITS: int
+ CUPYNUMERIC_UOP_ABSOLUTE: int
+ CUPYNUMERIC_UOP_ANGLE: int
+ CUPYNUMERIC_UOP_ARCCOS: int
+ CUPYNUMERIC_UOP_ARCCOSH: int
+ CUPYNUMERIC_UOP_ARCSIN: int
+ CUPYNUMERIC_UOP_ARCSINH: int
+ CUPYNUMERIC_UOP_ARCTAN: int
+ CUPYNUMERIC_UOP_ARCTANH: int
+ CUPYNUMERIC_UOP_CBRT: int
+ CUPYNUMERIC_UOP_CEIL: int
+ CUPYNUMERIC_UOP_CLIP: int
+ CUPYNUMERIC_UOP_CONJ: int
+ CUPYNUMERIC_UOP_COPY: int
+ CUPYNUMERIC_UOP_COS: int
+ CUPYNUMERIC_UOP_COSH: int
+ CUPYNUMERIC_UOP_DEG2RAD: int
+ CUPYNUMERIC_UOP_EXP2: int
+ CUPYNUMERIC_UOP_EXP: int
+ CUPYNUMERIC_UOP_EXPM1: int
+ CUPYNUMERIC_UOP_FLOOR: int
+ CUPYNUMERIC_UOP_FREXP: int
+ CUPYNUMERIC_UOP_GETARG: int
+ CUPYNUMERIC_UOP_IMAG: int
+ CUPYNUMERIC_UOP_INVERT: int
+ CUPYNUMERIC_UOP_ISFINITE: int
+ CUPYNUMERIC_UOP_ISINF: int
+ CUPYNUMERIC_UOP_ISNAN: int
+ CUPYNUMERIC_UOP_LOG10: int
+ CUPYNUMERIC_UOP_LOG1P: int
+ CUPYNUMERIC_UOP_LOG2: int
+ CUPYNUMERIC_UOP_LOG: int
+ CUPYNUMERIC_UOP_LOGICAL_NOT: int
+ CUPYNUMERIC_UOP_MODF: int
+ CUPYNUMERIC_UOP_NEGATIVE: int
+ CUPYNUMERIC_UOP_POSITIVE: int
+ CUPYNUMERIC_UOP_RAD2DEG: int
+ CUPYNUMERIC_UOP_REAL: int
+ CUPYNUMERIC_UOP_RECIPROCAL: int
+ CUPYNUMERIC_UOP_RINT: int
+ CUPYNUMERIC_UOP_ROUND: int
+ CUPYNUMERIC_UOP_SIGN: int
+ CUPYNUMERIC_UOP_SIGNBIT: int
+ CUPYNUMERIC_UOP_SIN: int
+ CUPYNUMERIC_UOP_SINH: int
+ CUPYNUMERIC_UOP_SQRT: int
+ CUPYNUMERIC_UOP_SQUARE: int
+ CUPYNUMERIC_UOP_TAN: int
+ CUPYNUMERIC_UOP_TANH: int
+ CUPYNUMERIC_UOP_TRUNC: int
+ CUPYNUMERIC_WHERE: int
+ CUPYNUMERIC_WINDOW: int
+ CUPYNUMERIC_WINDOW_BARLETT: int
+ CUPYNUMERIC_WINDOW_BLACKMAN: int
+ CUPYNUMERIC_WINDOW_HAMMING: int
+ CUPYNUMERIC_WINDOW_HANNING: int
+ CUPYNUMERIC_WINDOW_KAISER: int
+ CUPYNUMERIC_WRAP: int
+ CUPYNUMERIC_WRITE: int
+ CUPYNUMERIC_ZIP: int
+
+ @abstractmethod
+ def cupynumeric_has_cusolvermp(self) -> bool: ...
+
+ @abstractmethod
+ def cupynumeric_cusolver_has_geev(self) -> bool: ...
+
+ @abstractmethod
+ def cupynumeric_max_eager_volume(self) -> int: ...
+
+ @abstractmethod
+ def cupynumeric_register_reduction_ops(
+ self, code: int
+ ) -> _ReductionOpIds: ...
+
+
+def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any:
+ # Use an already-opened library handle, which cffi will convert to a
+ # regular FFI object (using the definitions previously added using
+ # ffi.cdef), but will not automatically dlclose() on collection.
+ lib = CDLL(lib_path, mode=RTLD_GLOBAL)
+ return ffi.dlopen(ffi.cast("void *", lib._handle))
+
+
+# Load the cuPyNumeric library first so we have a shared object that
+# we can use to initialize all these configuration enumerations
+class CuPyNumericLib:
+ def __init__(self, name: str) -> None:
+ self.name = name
+
+ shared_lib_path = self.get_shared_library()
+ assert shared_lib_path is not None
+ header = self.get_c_header()
+ ffi = cffi.FFI()
+ if header is not None:
+ ffi.cdef(header)
+ # Don't use ffi.dlopen(), because that will call dlclose()
+ # automatically when the object gets collected, thus removing
+ # symbols that may be needed when destroying C++ objects later
+ # (e.g. vtable entries, which will be queried for virtual
+ # destructors), causing errors at shutdown.
+ shared_lib = dlopen_no_autoclose(ffi, shared_lib_path)
+ self.shared_object = cast(_CupynumericSharedLib, shared_lib)
+
+ def register(self) -> None:
+ from legate.core import get_legate_runtime
+
+ # We need to make sure that the runtime is started
+ get_legate_runtime()
+
+ callback = getattr(
+ self.shared_object, "cupynumeric_perform_registration"
+ )
+ callback()
+
+ def get_shared_library(self) -> str:
+ from .install_info import libpath
+
+ return os.path.join(
+ libpath, "libcupynumeric" + self.get_library_extension()
+ )
+
+ def get_c_header(self) -> str:
+ from .install_info import header
+
+ return header
+
+ @staticmethod
+ def get_library_extension() -> str:
+ os_name = platform.system()
+ if os_name == "Linux":
+ return ".so"
+ elif os_name == "Darwin":
+ return ".dylib"
+ raise RuntimeError(f"unknown platform {os_name!r}")
+
+
+CUPYNUMERIC_LIB_NAME = "cupynumeric"
+cupynumeric_lib = CuPyNumericLib(CUPYNUMERIC_LIB_NAME)
+cupynumeric_lib.register()
+_cupynumeric = cupynumeric_lib.shared_object
+
+
+# Match these to CuPyNumericOpCode in cupynumeric_c.h
+@unique
+class CuPyNumericOpCode(IntEnum):
+ ADVANCED_INDEXING = _cupynumeric.CUPYNUMERIC_ADVANCED_INDEXING
+ ARANGE = _cupynumeric.CUPYNUMERIC_ARANGE
+ ARGWHERE = _cupynumeric.CUPYNUMERIC_ARGWHERE
+ BATCHED_CHOLESKY = _cupynumeric.CUPYNUMERIC_BATCHED_CHOLESKY
+ BINARY_OP = _cupynumeric.CUPYNUMERIC_BINARY_OP
+ BINARY_RED = _cupynumeric.CUPYNUMERIC_BINARY_RED
+ BINCOUNT = _cupynumeric.CUPYNUMERIC_BINCOUNT
+ BITGENERATOR = _cupynumeric.CUPYNUMERIC_BITGENERATOR
+ CHOOSE = _cupynumeric.CUPYNUMERIC_CHOOSE
+ CONTRACT = _cupynumeric.CUPYNUMERIC_CONTRACT
+ CONVERT = _cupynumeric.CUPYNUMERIC_CONVERT
+ CONVOLVE = _cupynumeric.CUPYNUMERIC_CONVOLVE
+ DIAG = _cupynumeric.CUPYNUMERIC_DIAG
+ DOT = _cupynumeric.CUPYNUMERIC_DOT
+ EYE = _cupynumeric.CUPYNUMERIC_EYE
+ FFT = _cupynumeric.CUPYNUMERIC_FFT
+ FILL = _cupynumeric.CUPYNUMERIC_FILL
+ FLIP = _cupynumeric.CUPYNUMERIC_FLIP
+ GEEV = _cupynumeric.CUPYNUMERIC_GEEV
+ GEMM = _cupynumeric.CUPYNUMERIC_GEMM
+ HISTOGRAM = _cupynumeric.CUPYNUMERIC_HISTOGRAM
+ LOAD_CUDALIBS = _cupynumeric.CUPYNUMERIC_LOAD_CUDALIBS
+ MATMUL = _cupynumeric.CUPYNUMERIC_MATMUL
+ MATVECMUL = _cupynumeric.CUPYNUMERIC_MATVECMUL
+ MP_POTRF = _cupynumeric.CUPYNUMERIC_MP_POTRF
+ MP_SOLVE = _cupynumeric.CUPYNUMERIC_MP_SOLVE
+ NONZERO = _cupynumeric.CUPYNUMERIC_NONZERO
+ PACKBITS = _cupynumeric.CUPYNUMERIC_PACKBITS
+ POTRF = _cupynumeric.CUPYNUMERIC_POTRF
+ PUTMASK = _cupynumeric.CUPYNUMERIC_PUTMASK
+ QR = _cupynumeric.CUPYNUMERIC_QR
+ RAND = _cupynumeric.CUPYNUMERIC_RAND
+ READ = _cupynumeric.CUPYNUMERIC_READ
+ REPEAT = _cupynumeric.CUPYNUMERIC_REPEAT
+ SCALAR_UNARY_RED = _cupynumeric.CUPYNUMERIC_SCALAR_UNARY_RED
+ SCAN_GLOBAL = _cupynumeric.CUPYNUMERIC_SCAN_GLOBAL
+ SCAN_LOCAL = _cupynumeric.CUPYNUMERIC_SCAN_LOCAL
+ SEARCHSORTED = _cupynumeric.CUPYNUMERIC_SEARCHSORTED
+ SELECT = _cupynumeric.CUPYNUMERIC_SELECT
+ SOLVE = _cupynumeric.CUPYNUMERIC_SOLVE
+ SORT = _cupynumeric.CUPYNUMERIC_SORT
+ SVD = _cupynumeric.CUPYNUMERIC_SVD
+ SYRK = _cupynumeric.CUPYNUMERIC_SYRK
+ TILE = _cupynumeric.CUPYNUMERIC_TILE
+ TRANSPOSE_COPY_2D = _cupynumeric.CUPYNUMERIC_TRANSPOSE_COPY_2D
+ TRILU = _cupynumeric.CUPYNUMERIC_TRILU
+ TRSM = _cupynumeric.CUPYNUMERIC_TRSM
+ UNARY_OP = _cupynumeric.CUPYNUMERIC_UNARY_OP
+ UNARY_RED = _cupynumeric.CUPYNUMERIC_UNARY_RED
+ UNIQUE = _cupynumeric.CUPYNUMERIC_UNIQUE
+ UNIQUE_REDUCE = _cupynumeric.CUPYNUMERIC_UNIQUE_REDUCE
+ UNLOAD_CUDALIBS = _cupynumeric.CUPYNUMERIC_UNLOAD_CUDALIBS
+ UNPACKBITS = _cupynumeric.CUPYNUMERIC_UNPACKBITS
+ WHERE = _cupynumeric.CUPYNUMERIC_WHERE
+ WINDOW = _cupynumeric.CUPYNUMERIC_WINDOW
+ WRAP = _cupynumeric.CUPYNUMERIC_WRAP
+ WRITE = _cupynumeric.CUPYNUMERIC_WRITE
+ ZIP = _cupynumeric.CUPYNUMERIC_ZIP
+
+
+# Match these to CuPyNumericUnaryOpCode in cupynumeric_c.h
+@unique
+class UnaryOpCode(IntEnum):
+ ABSOLUTE = _cupynumeric.CUPYNUMERIC_UOP_ABSOLUTE
+ ANGLE = _cupynumeric.CUPYNUMERIC_UOP_ANGLE
+ ARCCOS = _cupynumeric.CUPYNUMERIC_UOP_ARCCOS
+ ARCCOSH = _cupynumeric.CUPYNUMERIC_UOP_ARCCOSH
+ ARCSIN = _cupynumeric.CUPYNUMERIC_UOP_ARCSIN
+ ARCSINH = _cupynumeric.CUPYNUMERIC_UOP_ARCSINH
+ ARCTAN = _cupynumeric.CUPYNUMERIC_UOP_ARCTAN
+ ARCTANH = _cupynumeric.CUPYNUMERIC_UOP_ARCTANH
+ CBRT = _cupynumeric.CUPYNUMERIC_UOP_CBRT
+ CEIL = _cupynumeric.CUPYNUMERIC_UOP_CEIL
+ CLIP = _cupynumeric.CUPYNUMERIC_UOP_CLIP
+ CONJ = _cupynumeric.CUPYNUMERIC_UOP_CONJ
+ COPY = _cupynumeric.CUPYNUMERIC_UOP_COPY
+ COS = _cupynumeric.CUPYNUMERIC_UOP_COS
+ COSH = _cupynumeric.CUPYNUMERIC_UOP_COSH
+ DEG2RAD = _cupynumeric.CUPYNUMERIC_UOP_DEG2RAD
+ EXP = _cupynumeric.CUPYNUMERIC_UOP_EXP
+ EXP2 = _cupynumeric.CUPYNUMERIC_UOP_EXP2
+ EXPM1 = _cupynumeric.CUPYNUMERIC_UOP_EXPM1
+ FLOOR = _cupynumeric.CUPYNUMERIC_UOP_FLOOR
+ FREXP = _cupynumeric.CUPYNUMERIC_UOP_FREXP
+ GETARG = _cupynumeric.CUPYNUMERIC_UOP_GETARG
+ IMAG = _cupynumeric.CUPYNUMERIC_UOP_IMAG
+ INVERT = _cupynumeric.CUPYNUMERIC_UOP_INVERT
+ ISFINITE = _cupynumeric.CUPYNUMERIC_UOP_ISFINITE
+ ISINF = _cupynumeric.CUPYNUMERIC_UOP_ISINF
+ ISNAN = _cupynumeric.CUPYNUMERIC_UOP_ISNAN
+ LOG = _cupynumeric.CUPYNUMERIC_UOP_LOG
+ LOG10 = _cupynumeric.CUPYNUMERIC_UOP_LOG10
+ LOG1P = _cupynumeric.CUPYNUMERIC_UOP_LOG1P
+ LOG2 = _cupynumeric.CUPYNUMERIC_UOP_LOG2
+ LOGICAL_NOT = _cupynumeric.CUPYNUMERIC_UOP_LOGICAL_NOT
+ MODF = _cupynumeric.CUPYNUMERIC_UOP_MODF
+ NEGATIVE = _cupynumeric.CUPYNUMERIC_UOP_NEGATIVE
+ POSITIVE = _cupynumeric.CUPYNUMERIC_UOP_POSITIVE
+ RAD2DEG = _cupynumeric.CUPYNUMERIC_UOP_RAD2DEG
+ REAL = _cupynumeric.CUPYNUMERIC_UOP_REAL
+ RECIPROCAL = _cupynumeric.CUPYNUMERIC_UOP_RECIPROCAL
+ RINT = _cupynumeric.CUPYNUMERIC_UOP_RINT
+ ROUND = _cupynumeric.CUPYNUMERIC_UOP_ROUND
+ SIGN = _cupynumeric.CUPYNUMERIC_UOP_SIGN
+ SIGNBIT = _cupynumeric.CUPYNUMERIC_UOP_SIGNBIT
+ SIN = _cupynumeric.CUPYNUMERIC_UOP_SIN
+ SINH = _cupynumeric.CUPYNUMERIC_UOP_SINH
+ SQRT = _cupynumeric.CUPYNUMERIC_UOP_SQRT
+ SQUARE = _cupynumeric.CUPYNUMERIC_UOP_SQUARE
+ TAN = _cupynumeric.CUPYNUMERIC_UOP_TAN
+ TANH = _cupynumeric.CUPYNUMERIC_UOP_TANH
+ TRUNC = _cupynumeric.CUPYNUMERIC_UOP_TRUNC
+
+
+# Match these to CuPyNumericUnaryRedCode in cupynumeric_c.h
+@unique
+class UnaryRedCode(IntEnum):
+ ALL = _cupynumeric.CUPYNUMERIC_RED_ALL
+ ANY = _cupynumeric.CUPYNUMERIC_RED_ANY
+ ARGMAX = _cupynumeric.CUPYNUMERIC_RED_ARGMAX
+ ARGMIN = _cupynumeric.CUPYNUMERIC_RED_ARGMIN
+ CONTAINS = _cupynumeric.CUPYNUMERIC_RED_CONTAINS
+ COUNT_NONZERO = _cupynumeric.CUPYNUMERIC_RED_COUNT_NONZERO
+ MAX = _cupynumeric.CUPYNUMERIC_RED_MAX
+ MIN = _cupynumeric.CUPYNUMERIC_RED_MIN
+ NANARGMAX = _cupynumeric.CUPYNUMERIC_RED_NANARGMAX
+ NANARGMIN = _cupynumeric.CUPYNUMERIC_RED_NANARGMIN
+ NANMAX = _cupynumeric.CUPYNUMERIC_RED_NANMAX
+ NANMIN = _cupynumeric.CUPYNUMERIC_RED_NANMIN
+ NANPROD = _cupynumeric.CUPYNUMERIC_RED_NANPROD
+ NANSUM = _cupynumeric.CUPYNUMERIC_RED_NANSUM
+ PROD = _cupynumeric.CUPYNUMERIC_RED_PROD
+ SUM = _cupynumeric.CUPYNUMERIC_RED_SUM
+ SUM_SQUARES = _cupynumeric.CUPYNUMERIC_RED_SUM_SQUARES
+ VARIANCE = _cupynumeric.CUPYNUMERIC_RED_VARIANCE
+
+
+# Match these to CuPyNumericBinaryOpCode in cupynumeric_c.h
+@unique
+class BinaryOpCode(IntEnum):
+ ADD = _cupynumeric.CUPYNUMERIC_BINOP_ADD
+ ARCTAN2 = _cupynumeric.CUPYNUMERIC_BINOP_ARCTAN2
+ BITWISE_AND = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_AND
+ BITWISE_OR = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_OR
+ BITWISE_XOR = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_XOR
+ COPYSIGN = _cupynumeric.CUPYNUMERIC_BINOP_COPYSIGN
+ DIVIDE = _cupynumeric.CUPYNUMERIC_BINOP_DIVIDE
+ EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_EQUAL
+ FLOAT_POWER = _cupynumeric.CUPYNUMERIC_BINOP_FLOAT_POWER
+ FLOOR_DIVIDE = _cupynumeric.CUPYNUMERIC_BINOP_FLOOR_DIVIDE
+ FMOD = _cupynumeric.CUPYNUMERIC_BINOP_FMOD
+ GCD = _cupynumeric.CUPYNUMERIC_BINOP_GCD
+ GREATER = _cupynumeric.CUPYNUMERIC_BINOP_GREATER
+ GREATER_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_GREATER_EQUAL
+ HYPOT = _cupynumeric.CUPYNUMERIC_BINOP_HYPOT
+ ISCLOSE = _cupynumeric.CUPYNUMERIC_BINOP_ISCLOSE
+ LCM = _cupynumeric.CUPYNUMERIC_BINOP_LCM
+ LDEXP = _cupynumeric.CUPYNUMERIC_BINOP_LDEXP
+ LEFT_SHIFT = _cupynumeric.CUPYNUMERIC_BINOP_LEFT_SHIFT
+ LESS = _cupynumeric.CUPYNUMERIC_BINOP_LESS
+ LESS_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_LESS_EQUAL
+ LOGADDEXP = _cupynumeric.CUPYNUMERIC_BINOP_LOGADDEXP
+ LOGADDEXP2 = _cupynumeric.CUPYNUMERIC_BINOP_LOGADDEXP2
+ LOGICAL_AND = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_AND
+ LOGICAL_OR = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_OR
+ LOGICAL_XOR = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_XOR
+ MAXIMUM = _cupynumeric.CUPYNUMERIC_BINOP_MAXIMUM
+ MINIMUM = _cupynumeric.CUPYNUMERIC_BINOP_MINIMUM
+ MOD = _cupynumeric.CUPYNUMERIC_BINOP_MOD
+ MULTIPLY = _cupynumeric.CUPYNUMERIC_BINOP_MULTIPLY
+ NEXTAFTER = _cupynumeric.CUPYNUMERIC_BINOP_NEXTAFTER
+ NOT_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_NOT_EQUAL
+ POWER = _cupynumeric.CUPYNUMERIC_BINOP_POWER
+ RIGHT_SHIFT = _cupynumeric.CUPYNUMERIC_BINOP_RIGHT_SHIFT
+ SUBTRACT = _cupynumeric.CUPYNUMERIC_BINOP_SUBTRACT
+
+
+@unique
+class WindowOpCode(IntEnum):
+ BARLETT = _cupynumeric.CUPYNUMERIC_WINDOW_BARLETT
+ BLACKMAN = _cupynumeric.CUPYNUMERIC_WINDOW_BLACKMAN
+ HAMMING = _cupynumeric.CUPYNUMERIC_WINDOW_HAMMING
+ HANNING = _cupynumeric.CUPYNUMERIC_WINDOW_HANNING
+ KAISER = _cupynumeric.CUPYNUMERIC_WINDOW_KAISER
+
+
+# Match these to RandGenCode in rand_util.h
+@unique
+class RandGenCode(IntEnum):
+ UNIFORM = 1
+ NORMAL = 2
+ INTEGER = 3
+
+
+# Match these to CuPyNumericScanCode in cupynumeric_c.h
+@unique
+class ScanCode(IntEnum):
+ PROD = _cupynumeric.CUPYNUMERIC_SCAN_PROD
+ SUM = _cupynumeric.CUPYNUMERIC_SCAN_SUM
+
+
+# Match these to CuPyNumericConvertCode in cupynumeric_c.h
+@unique
+class ConvertCode(IntEnum):
+ NOOP = _cupynumeric.CUPYNUMERIC_CONVERT_NAN_NOOP
+ PROD = _cupynumeric.CUPYNUMERIC_CONVERT_NAN_PROD
+ SUM = _cupynumeric.CUPYNUMERIC_CONVERT_NAN_SUM
+
+
+# Match these to BitGeneratorOperation in cupynumeric_c.h
+@unique
+class BitGeneratorOperation(IntEnum):
+ CREATE = _cupynumeric.CUPYNUMERIC_BITGENOP_CREATE
+ DESTROY = _cupynumeric.CUPYNUMERIC_BITGENOP_DESTROY
+ RAND_RAW = _cupynumeric.CUPYNUMERIC_BITGENOP_RAND_RAW
+ DISTRIBUTION = _cupynumeric.CUPYNUMERIC_BITGENOP_DISTRIBUTION
+
+
+# Match these to BitGeneratorType in cupynumeric_c.h
+@unique
+class BitGeneratorType(IntEnum):
+ DEFAULT = _cupynumeric.CUPYNUMERIC_BITGENTYPE_DEFAULT
+ XORWOW = _cupynumeric.CUPYNUMERIC_BITGENTYPE_XORWOW
+ MRG32K3A = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MRG32K3A
+ MTGP32 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MTGP32
+ MT19937 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MT19937
+ PHILOX4_32_10 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10
+
+
+# Match these to BitGeneratorDistribution in cupynumeric_c.h
+@unique
+class BitGeneratorDistribution(IntEnum):
+ INTEGERS_16 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_16
+ INTEGERS_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_32
+ INTEGERS_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_64
+ UNIFORM_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_UNIFORM_32
+ UNIFORM_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_UNIFORM_64
+ LOGNORMAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGNORMAL_32
+ LOGNORMAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGNORMAL_64
+ NORMAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NORMAL_32
+ NORMAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NORMAL_64
+ POISSON = _cupynumeric.CUPYNUMERIC_BITGENDIST_POISSON
+ EXPONENTIAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32
+ EXPONENTIAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64
+ GUMBEL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GUMBEL_32
+ GUMBEL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GUMBEL_64
+ LAPLACE_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LAPLACE_32
+ LAPLACE_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LAPLACE_64
+ LOGISTIC_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGISTIC_32
+ LOGISTIC_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGISTIC_64
+ PARETO_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_PARETO_32
+ PARETO_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_PARETO_64
+ POWER_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_POWER_32
+ POWER_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_POWER_64
+ RAYLEIGH_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_RAYLEIGH_32
+ RAYLEIGH_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_RAYLEIGH_64
+ CAUCHY_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CAUCHY_32
+ CAUCHY_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CAUCHY_64
+ TRIANGULAR_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_TRIANGULAR_32
+ TRIANGULAR_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_TRIANGULAR_64
+ WEIBULL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WEIBULL_32
+ WEIBULL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WEIBULL_64
+ BYTES = _cupynumeric.CUPYNUMERIC_BITGENDIST_BYTES
+ BETA_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_BETA_32
+ BETA_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_BETA_64
+ F_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_F_32
+ F_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_F_64
+ LOGSERIES = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGSERIES
+ NONCENTRAL_F_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32
+ NONCENTRAL_F_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64
+ CHISQUARE_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CHISQUARE_32
+ CHISQUARE_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CHISQUARE_64
+ GAMMA_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GAMMA_32
+ GAMMA_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GAMMA_64
+ STANDARD_T_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_STANDARD_T_32
+ STANDARD_T_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_STANDARD_T_64
+ HYPERGEOMETRIC = _cupynumeric.CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC
+ VONMISES_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_VONMISES_32
+ VONMISES_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_VONMISES_64
+ ZIPF = _cupynumeric.CUPYNUMERIC_BITGENDIST_ZIPF
+ GEOMETRIC = _cupynumeric.CUPYNUMERIC_BITGENDIST_GEOMETRIC
+ WALD_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WALD_32
+ WALD_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WALD_64
+ BINOMIAL = _cupynumeric.CUPYNUMERIC_BITGENDIST_BINOMIAL
+ NEGATIVE_BINOMIAL = _cupynumeric.CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL
+
+
+# Match these to CuPyNumericConvolveMethod in cupynumeric_c.h
+@unique
+class ConvolveMethod(IntEnum):
+ AUTO = _cupynumeric.CUPYNUMERIC_CONVOLVE_AUTO
+ DIRECT = _cupynumeric.CUPYNUMERIC_CONVOLVE_DIRECT
+ FFT = _cupynumeric.CUPYNUMERIC_CONVOLVE_FFT
+
+
+@unique
+class TransferType(IntEnum):
+ DONATE = 0
+ MAKE_COPY = 1
+ SHARE = 2
+
+
+# Match these to fftType in fft_util.h
+class FFTType:
+ def __init__(
+ self,
+ name: str,
+ type_id: int,
+ input_dtype: npt.DTypeLike,
+ output_dtype: npt.DTypeLike,
+ single_precision: bool,
+ complex_type: FFTType | None = None,
+ ) -> None:
+ self._name = name
+ self._type_id = type_id
+ self._complex_type = self if complex_type is None else complex_type
+ self._input_dtype = input_dtype
+ self._output_dtype = output_dtype
+ self._single_precision = single_precision
+
+ def __str__(self) -> str:
+ return self._name
+
+ def __repr__(self) -> str:
+ return str(self)
+
+ @property
+ def type_id(self) -> int:
+ return self._type_id
+
+ @property
+ def complex(self) -> FFTType:
+ return self._complex_type
+
+ @property
+ def input_dtype(self) -> npt.DTypeLike:
+ return self._input_dtype
+
+ @property
+ def output_dtype(self) -> npt.DTypeLike:
+ return self._output_dtype
+
+ @property
+ def is_single_precision(self) -> bool:
+ return self._single_precision
+
+
+FFT_C2C = FFTType(
+ "C2C",
+ _cupynumeric.CUPYNUMERIC_FFT_C2C,
+ np.complex64,
+ np.complex64,
+ True,
+)
+
+FFT_Z2Z = FFTType(
+ "Z2Z",
+ _cupynumeric.CUPYNUMERIC_FFT_Z2Z,
+ np.complex128,
+ np.complex128,
+ False,
+)
+
+FFT_R2C = FFTType(
+ "R2C",
+ _cupynumeric.CUPYNUMERIC_FFT_R2C,
+ np.float32,
+ np.complex64,
+ True,
+ FFT_C2C,
+)
+
+FFT_C2R = FFTType(
+ "C2R",
+ _cupynumeric.CUPYNUMERIC_FFT_C2R,
+ np.complex64,
+ np.float32,
+ True,
+ FFT_C2C,
+)
+
+FFT_D2Z = FFTType(
+ "D2Z",
+ _cupynumeric.CUPYNUMERIC_FFT_D2Z,
+ np.float64,
+ np.complex128,
+ False,
+ FFT_Z2Z,
+)
+
+FFT_Z2D = FFTType(
+ "Z2D",
+ _cupynumeric.CUPYNUMERIC_FFT_Z2D,
+ np.complex128,
+ np.float64,
+ False,
+ FFT_Z2Z,
+)
+
+
+class FFTCode:
+ @staticmethod
+ def real_to_complex_code(dtype: npt.DTypeLike) -> FFTType:
+ if dtype == np.float64:
+ return FFT_D2Z
+ elif dtype == np.float32:
+ return FFT_R2C
+ else:
+ raise TypeError(
+ (
+ "Data type for FFT not supported "
+ "(supported types are float32 and float64)"
+ )
+ )
+
+ @staticmethod
+ def complex_to_real_code(dtype: npt.DTypeLike) -> FFTType:
+ if dtype == np.complex128:
+ return FFT_Z2D
+ elif dtype == np.complex64:
+ return FFT_C2R
+ else:
+ raise TypeError(
+ (
+ "Data type for FFT not supported "
+ "(supported types are complex64 and complex128)"
+ )
+ )
+
+
+@unique
+class FFTDirection(IntEnum):
+ FORWARD = _cupynumeric.CUPYNUMERIC_FFT_FORWARD
+ INVERSE = _cupynumeric.CUPYNUMERIC_FFT_INVERSE
+
+
+# Match these to CuPyNumericBitorder in cupynumeric_c.h
+@unique
+class Bitorder(IntEnum):
+ BIG = _cupynumeric.CUPYNUMERIC_BITORDER_BIG
+ LITTLE = _cupynumeric.CUPYNUMERIC_BITORDER_LITTLE
+
+
+@unique
+class FFTNormalization(IntEnum):
+ FORWARD = 1
+ INVERSE = 2
+ ORTHOGONAL = 3
+
+ @staticmethod
+ def from_string(in_string: str) -> FFTNormalization | None:
+ if in_string == "forward":
+ return FFTNormalization.FORWARD
+ elif in_string == "ortho":
+ return FFTNormalization.ORTHOGONAL
+ elif in_string == "backward" or in_string is None:
+ return FFTNormalization.INVERSE
+ else:
+ raise ValueError(
+ f'Invalid norm value {in_string}; should be "backward",'
+ '"ortho" or "forward".'
+ )
+
+ @staticmethod
+ def reverse(in_string: str | None) -> str:
+ if in_string == "forward":
+ return "backward"
+ elif in_string == "backward" or in_string is None:
+ return "forward"
+ else:
+ return in_string
diff --git a/cunumeric/fft/__init__.py b/cupynumeric/fft/__init__.py
similarity index 100%
rename from cunumeric/fft/__init__.py
rename to cupynumeric/fft/__init__.py
diff --git a/cunumeric/fft/fft.py b/cupynumeric/fft/fft.py
similarity index 99%
rename from cunumeric/fft/fft.py
rename to cupynumeric/fft/fft.py
index ad6b7caafd..7576f3dd40 100644
--- a/cunumeric/fft/fft.py
+++ b/cupynumeric/fft/fft.py
@@ -20,7 +20,6 @@
from .._array.util import add_boilerplate
from .._module.array_rearrange import roll
-from .._module.creation_data import asarray
from ..config import FFT_C2C, FFT_Z2Z, FFTCode, FFTDirection, FFTNormalization
if TYPE_CHECKING:
@@ -105,7 +104,7 @@ def fft(
numpy.fft.fft
Availability
- --------
+ ------------
Multiple GPUs
"""
s = (n,) if n is not None else None
diff --git a/cunumeric/install_info.py.in b/cupynumeric/install_info.py.in
similarity index 66%
rename from cunumeric/install_info.py.in
rename to cupynumeric/install_info.py.in
index cc683b2252..9175f52a37 100644
--- a/cunumeric/install_info.py.in
+++ b/cupynumeric/install_info.py.in
@@ -30,17 +30,22 @@ def get_libpath():
"Windows": ".dll"
}[platform.system()]
- def find_libcunumeric(libdir):
- if exists(join(libdir, f"libcunumeric{so_ext}")):
+ def find_libcupynumeric(libdir):
+ if exists(join(libdir, f"libcupynumeric{so_ext}")):
return libdir
return None
- return (
- find_libcunumeric(join(cn_path, "build", "lib")) or
- find_libcunumeric(join(dirname(dirname(dirname(cn_path))), "lib")) or
- find_libcunumeric(join(dirname(dirname(sys.executable)), "lib")) or
- ""
- )
+ for libdir in ("lib", "lib64"):
+ if ret := find_libcupynumeric(join(cn_path, "build", libdir)):
+ return ret
+ if ret := find_libcupynumeric(join(cn_path, "cupynumeric", libdir)):
+ return ret
+ if ret := find_libcupynumeric(join(dirname(dirname(dirname(cn_path))), libdir)):
+ return ret
+ if ret := find_libcupynumeric(join(dirname(dirname(sys.executable)), libdir)):
+ return ret
+
+ return ""
libpath: str = get_libpath()
diff --git a/cunumeric/linalg/__init__.py b/cupynumeric/linalg/__init__.py
similarity index 100%
rename from cunumeric/linalg/__init__.py
rename to cupynumeric/linalg/__init__.py
diff --git a/cunumeric/linalg/_cholesky.py b/cupynumeric/linalg/_cholesky.py
similarity index 85%
rename from cunumeric/linalg/_cholesky.py
rename to cupynumeric/linalg/_cholesky.py
index 3775951dcd..a99ae68117 100644
--- a/cunumeric/linalg/_cholesky.py
+++ b/cupynumeric/linalg/_cholesky.py
@@ -25,7 +25,7 @@
)
from legate.settings import settings
-from ..config import CuNumericOpCode
+from ..config import CuPyNumericOpCode
from ..runtime import runtime
from ._exception import LinAlgError
@@ -42,7 +42,7 @@ def transpose_copy_single(
library: Library, input: LogicalStore, output: LogicalStore
) -> None:
task = legate_runtime.create_auto_task(
- library, CuNumericOpCode.TRANSPOSE_COPY_2D
+ library, CuPyNumericOpCode.TRANSPOSE_COPY_2D
)
p_out = task.add_output(output)
p_in = task.add_input(input)
@@ -63,7 +63,7 @@ def transpose_copy(
) -> None:
task = legate_runtime.create_manual_task(
library,
- CuNumericOpCode.TRANSPOSE_COPY_2D,
+ CuPyNumericOpCode.TRANSPOSE_COPY_2D,
launch_domain,
)
task.add_output(p_output)
@@ -75,7 +75,7 @@ def transpose_copy(
def potrf_single(library: Library, output: LogicalStore) -> None:
- task = legate_runtime.create_auto_task(library, CuNumericOpCode.POTRF)
+ task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.POTRF)
task.throws_exception(LinAlgError)
task.add_output(output)
task.add_input(output)
@@ -89,7 +89,7 @@ def mp_potrf(
input: LogicalStore,
output: LogicalStore,
) -> None:
- task = legate_runtime.create_auto_task(library, CuNumericOpCode.MP_POTRF)
+ task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.MP_POTRF)
task.throws_exception(LinAlgError)
task.add_input(input)
task.add_output(output)
@@ -103,7 +103,7 @@ def mp_potrf(
def potrf(library: Library, p_output: LogicalStorePartition, i: int) -> None:
task = legate_runtime.create_manual_task(
- library, CuNumericOpCode.POTRF, (i + 1, i + 1), lower_bounds=(i, i)
+ library, CuPyNumericOpCode.POTRF, (i + 1, i + 1), lower_bounds=(i, i)
)
task.throws_exception(LinAlgError)
task.add_output(p_output)
@@ -121,7 +121,7 @@ def trsm(
lhs = p_output
task = legate_runtime.create_manual_task(
- library, CuNumericOpCode.TRSM, (hi, i + 1), lower_bounds=(lo, i)
+ library, CuPyNumericOpCode.TRSM, (hi, i + 1), lower_bounds=(lo, i)
)
task.add_output(lhs)
task.add_input(rhs)
@@ -136,7 +136,7 @@ def syrk(
lhs = p_output
task = legate_runtime.create_manual_task(
- library, CuNumericOpCode.SYRK, (k + 1, k + 1), lower_bounds=(k, k)
+ library, CuPyNumericOpCode.SYRK, (k + 1, k + 1), lower_bounds=(k, k)
)
task.add_output(lhs)
task.add_input(rhs)
@@ -160,7 +160,7 @@ def gemm(
rhs1 = p_output
task = legate_runtime.create_manual_task(
- library, CuNumericOpCode.GEMM, (hi, k + 1), lower_bounds=(lo, k)
+ library, CuPyNumericOpCode.GEMM, (hi, k + 1), lower_bounds=(lo, k)
)
task.add_output(lhs)
task.add_input(rhs1, (dimension(0), constant(i)))
@@ -169,19 +169,16 @@ def gemm(
task.execute()
-MIN_CHOLESKY_TILE_SIZE = 2048
-MIN_CHOLESKY_MATRIX_SIZE = 8192
+MIN_CHOLESKY_TILE_SIZE = 2 if settings.test() else 2048
+MIN_CHOLESKY_MATRIX_SIZE = 4 if settings.test() else 8192
# TODO: We need a better cost model
def choose_color_shape(
runtime: Runtime, shape: tuple[int, ...]
) -> tuple[int, ...]:
- if settings.test():
- num_tiles = runtime.num_procs * 2
- return (num_tiles, num_tiles)
-
extent = shape[0]
+
# If there's only one processor or the matrix is too small,
# don't even bother to partition it at all
if runtime.num_procs == 1 or extent <= MIN_CHOLESKY_MATRIX_SIZE:
@@ -201,7 +198,7 @@ def choose_color_shape(
def tril_single(library: Library, output: LogicalStore) -> None:
- task = legate_runtime.create_auto_task(library, CuNumericOpCode.TRILU)
+ task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.TRILU)
task.add_output(output)
task.add_input(output)
task.add_scalar_arg(True, ty.bool_)
@@ -214,7 +211,7 @@ def tril_single(library: Library, output: LogicalStore) -> None:
def tril(library: Library, p_output: LogicalStorePartition, n: int) -> None:
task = legate_runtime.create_manual_task(
- library, CuNumericOpCode.TRILU, (n, n)
+ library, CuPyNumericOpCode.TRILU, (n, n)
)
task.add_output(p_output)
@@ -242,7 +239,7 @@ def _batched_cholesky(
# Just use a fixed cutoff to provide some sensible warning.
# TODO: find a better way to inform the user dims are too big
task = legate_runtime.create_auto_task(
- library, CuNumericOpCode.BATCHED_CHOLESKY
+ library, CuPyNumericOpCode.BATCHED_CHOLESKY
)
task.add_input(input.base)
task.add_output(output.base)
@@ -254,16 +251,9 @@ def _batched_cholesky(
task.execute()
-def cholesky_deferred(
- output: DeferredArray, input: DeferredArray, no_tril: bool
-) -> None:
+def cholesky_deferred(output: DeferredArray, input: DeferredArray) -> None:
library = runtime.library
if len(input.base.shape) > 2:
- if no_tril:
- raise NotImplementedError(
- "batched cholesky expects to only "
- "produce the lower triangular matrix"
- )
size = input.base.shape[-1]
# Choose 32768 as dimension cutoff for warning
# so that for float64 anything larger than
@@ -280,8 +270,7 @@ def cholesky_deferred(
if runtime.num_procs == 1:
transpose_copy_single(library, input.base, output.base)
potrf_single(library, output.base)
- if not no_tril:
- tril_single(library, output.base)
+ tril_single(library, output.base)
return
shape = tuple(output.base.shape)
@@ -295,8 +284,7 @@ def cholesky_deferred(
library, shape[0], MIN_CHOLESKY_TILE_SIZE, input.base, output.base
)
- if not no_tril:
- tril_single(library, output.base)
+ tril_single(library, output.base)
else:
initial_color_shape = choose_color_shape(runtime, shape)
tile_shape = _rounding_divide(shape, initial_color_shape)
@@ -314,5 +302,4 @@ def cholesky_deferred(
syrk(library, p_output, k, i)
gemm(library, p_output, k, i, k + 1, n)
- if not no_tril:
- tril(library, p_output, n)
+ tril(library, p_output, n)
diff --git a/cupynumeric/linalg/_eigen.py b/cupynumeric/linalg/_eigen.py
new file mode 100644
index 0000000000..cb6b0dd057
--- /dev/null
+++ b/cupynumeric/linalg/_eigen.py
@@ -0,0 +1,87 @@
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+from legate.core import dimension, get_legate_runtime
+
+from cupynumeric.config import CuPyNumericOpCode
+
+from ..runtime import runtime
+from ._exception import LinAlgError
+
+if TYPE_CHECKING:
+ from .._thunk.deferred import DeferredArray
+
+
+def eig_deferred(
+ a: DeferredArray, ew: DeferredArray, ev: Optional[DeferredArray] = None
+) -> None:
+ library = a.library
+
+ m = a.shape[-1]
+
+ if m == 0:
+ raise ValueError("Input shape dimension 0 not allowed!")
+
+ def choose_nd_color_shape(shape: tuple[int, ...]) -> tuple[int, ...]:
+ # start with 1D and re-balance by powers of 2
+ # (don't worry about other primes)
+ color_shape = [1 for i in shape]
+ if len(shape) > 2:
+ color_shape[0] = runtime.num_procs
+
+ done = False
+ while not done and color_shape[0] % 2 == 0:
+ # find max idx
+ # if large enough --> switch
+ weight_per_dim = list(
+ map(lambda x, y: x / y, list(shape), color_shape)
+ )[:-2]
+
+ max_weight = max(weight_per_dim)
+ idx = weight_per_dim.index(max_weight)
+
+ if weight_per_dim[idx] > 2 * weight_per_dim[0]:
+ color_shape[0] = color_shape[0] // 2
+ color_shape[idx] = color_shape[idx] * 2
+ else:
+ done = True
+
+ return tuple(color_shape)
+
+ # coloring via num_procs to get utilization
+ initial_color_shape = choose_nd_color_shape(a.shape)
+ tilesize = tuple(
+ map(lambda x, y: (x + y - 1) // y, a.shape, initial_color_shape)
+ )
+ color_shape = tuple(map(lambda x, y: (x + y - 1) // y, a.shape, tilesize))
+
+ # partition defined by local batchsize
+ tiled_a = a.base.partition_by_tiling(tilesize)
+ tiled_ew = ew.base.partition_by_tiling(tilesize[:-1])
+
+ task = get_legate_runtime().create_manual_task(
+ library, CuPyNumericOpCode.GEEV, color_shape
+ )
+ task.throws_exception(LinAlgError)
+ partition = tuple(dimension(i) for i in range(len(color_shape)))
+ task.add_input(tiled_a, partition)
+ task.add_output(tiled_ew, partition[:-1])
+ if ev is not None:
+ tiled_ev = ev.base.partition_by_tiling(tilesize)
+ task.add_output(tiled_ev, partition)
+ task.execute()
diff --git a/cunumeric/linalg/_exception.py b/cupynumeric/linalg/_exception.py
similarity index 100%
rename from cunumeric/linalg/_exception.py
rename to cupynumeric/linalg/_exception.py
diff --git a/cunumeric/linalg/_qr.py b/cupynumeric/linalg/_qr.py
similarity index 91%
rename from cunumeric/linalg/_qr.py
rename to cupynumeric/linalg/_qr.py
index aa2c38e1cb..4b20d5fe62 100644
--- a/cunumeric/linalg/_qr.py
+++ b/cupynumeric/linalg/_qr.py
@@ -18,7 +18,7 @@
from legate.core import get_legate_runtime
-from cunumeric.config import CuNumericOpCode
+from cupynumeric.config import CuPyNumericOpCode
from ._exception import LinAlgError
@@ -31,7 +31,7 @@
def qr_single(
library: Library, a: LogicalStore, q: LogicalStore, r: LogicalStore
) -> None:
- task = get_legate_runtime().create_auto_task(library, CuNumericOpCode.QR)
+ task = get_legate_runtime().create_auto_task(library, CuPyNumericOpCode.QR)
task.throws_exception(LinAlgError)
task.add_input(a)
task.add_output(q)
diff --git a/cunumeric/linalg/_solve.py b/cupynumeric/linalg/_solve.py
similarity index 95%
rename from cunumeric/linalg/_solve.py
rename to cupynumeric/linalg/_solve.py
index 7681444ac3..325fe301de 100644
--- a/cunumeric/linalg/_solve.py
+++ b/cupynumeric/linalg/_solve.py
@@ -19,7 +19,7 @@
import legate.core.types as ty
from legate.core import broadcast, get_legate_runtime
-from ..config import CuNumericOpCode
+from ..config import CuPyNumericOpCode
from ..runtime import runtime
from ._cholesky import transpose_copy_single
from ._exception import LinAlgError
@@ -32,7 +32,7 @@
def solve_single(library: Library, a: LogicalStore, b: LogicalStore) -> None:
task = get_legate_runtime().create_auto_task(
- library, CuNumericOpCode.SOLVE
+ library, CuPyNumericOpCode.SOLVE
)
task.throws_exception(LinAlgError)
p_a = task.add_input(a)
@@ -60,7 +60,7 @@ def mp_solve(
output: LogicalStore,
) -> None:
task = get_legate_runtime().create_auto_task(
- library, CuNumericOpCode.MP_SOLVE
+ library, CuPyNumericOpCode.MP_SOLVE
)
task.throws_exception(LinAlgError)
task.add_input(a)
diff --git a/cunumeric/linalg/_svd.py b/cupynumeric/linalg/_svd.py
similarity index 90%
rename from cunumeric/linalg/_svd.py
rename to cupynumeric/linalg/_svd.py
index 9579f06849..a9be94924d 100644
--- a/cunumeric/linalg/_svd.py
+++ b/cupynumeric/linalg/_svd.py
@@ -18,7 +18,7 @@
from legate.core import get_legate_runtime
-from cunumeric.config import CuNumericOpCode
+from cupynumeric.config import CuPyNumericOpCode
from ._exception import LinAlgError
@@ -35,7 +35,9 @@ def svd_single(
s: LogicalStore,
vh: LogicalStore,
) -> None:
- task = get_legate_runtime().create_auto_task(library, CuNumericOpCode.SVD)
+ task = get_legate_runtime().create_auto_task(
+ library, CuPyNumericOpCode.SVD
+ )
task.throws_exception(LinAlgError)
task.add_input(a)
task.add_output(u)
diff --git a/cunumeric/linalg/linalg.py b/cupynumeric/linalg/linalg.py
similarity index 62%
rename from cunumeric/linalg/linalg.py
rename to cupynumeric/linalg/linalg.py
index 31f64eca0a..39b04adc5c 100644
--- a/cunumeric/linalg/linalg.py
+++ b/cupynumeric/linalg/linalg.py
@@ -14,11 +14,12 @@
#
from __future__ import annotations
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
import numpy as np
from .._utils import is_np2
+from ..runtime import runtime
if is_np2:
from numpy.lib.array_utils import normalize_axis_index # type: ignore
@@ -31,8 +32,11 @@
normalize_axis_tuple,
)
-from .._array.util import add_boilerplate, convert_to_cunumeric_ndarray
+from legate.core import get_machine
+
+from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray
from .._module import dot, empty_like, eye, matmul, ndarray
+from .._module.creation_shape import zeros, zeros_like
from .._ufunc.math import add, sqrt as _sqrt
from ._exception import LinAlgError
@@ -89,6 +93,108 @@ def cholesky(a: ndarray) -> ndarray:
return _thunk_cholesky(a)
+@add_boilerplate("a")
+def eig(a: ndarray) -> tuple[ndarray, ...]:
+ """
+ Compute the eigenvalues and right eigenvectors of a square array.
+
+ Parameters
+ ----------
+ a : (..., M, M) array_like
+ Matrices for which the eigenvalues and right eigenvectors will be
+ computed, at least dimension 2.
+
+ Returns
+ -------
+ eigenvalues : (…, M) array_like
+ The eigenvalues, each repeated according to its multiplicity.
+ eigenvectors : (…, M, M) array
+ The normalized (unit “length”) eigenvectors, such that the column
+ eigenvectors[:,i] is the eigenvector corresponding to the eigenvalue
+ eigenvalues[i].
+
+ Raises
+ ------
+ LinAlgError
+ If the eigenvalue computation does not converge.
+
+ Notes
+ -----
+ Unlike NumPy, cuPyNumeric always returns complex-dtype results, even if the
+ imaginary part is zero.
+
+ Multi-GPU/CPU usage is limited to data parallel matrix-wise batching.
+
+ See Also
+ --------
+ numpy.linalg.eig
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
+ """
+ shape = a.shape
+ if len(shape) < 2:
+ raise LinAlgError(
+ f"{len(shape)}-dimensional array given. "
+ "Array must be at least two-dimensional"
+ )
+ if shape[-2] != shape[-1]:
+ raise LinAlgError("Last 2 dimensions of the array must be square")
+ if np.dtype("e") == a.dtype:
+ raise TypeError("array type float16 is unsupported in linalg")
+ return _thunk_eig(a)
+
+
+@add_boilerplate("a")
+def eigvals(a: ndarray) -> ndarray:
+ """
+ Compute the eigenvalues of a square array.
+
+ Parameters
+ ----------
+ a : (..., M, M) array_like
+ Matrices for which the eigenvalues will be computed, at least
+ dimension 2.
+
+ Returns
+ -------
+ w : (…, M) array_like
+ The eigenvalues, each repeated according to its multiplicity.
+
+ Raises
+ ------
+ LinAlgError
+ If the eigenvalue computation does not converge.
+
+ Notes
+ -----
+ Unlike NumPy, cuPyNumeric always returns complex-dtype results, even if the
+ imaginary part is zero.
+
+ Multi-GPU/CPU usage is limited to data parallel matrix-wise batching.
+
+ See Also
+ --------
+ numpy.linalg.eigvals
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
+ """
+ shape = a.shape
+ if len(shape) < 2:
+ raise LinAlgError(
+ f"{len(shape)}-dimensional array given. "
+ "Array must be at least two-dimensional"
+ )
+ if shape[-2] != shape[-1]:
+ raise LinAlgError("Last 2 dimensions of the array must be square")
+ if np.dtype("e") == a.dtype:
+ raise TypeError("array type float16 is unsupported in linalg")
+ return _thunk_eigvals(a)
+
+
@add_boilerplate("a")
def qr(a: ndarray) -> tuple[ndarray, ...]:
"""
@@ -134,7 +240,7 @@ def qr(a: ndarray) -> tuple[ndarray, ...]:
)
if len(shape) > 2:
raise NotImplementedError(
- "cuNumeric does not yet support stacked 2d arrays"
+ "cuPyNumeric does not yet support stacked 2d arrays"
)
if np.dtype("e") == a.dtype:
raise TypeError("array type float16 is unsupported in linalg")
@@ -194,7 +300,7 @@ def solve(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray:
raise TypeError("array type float16 is unsupported in linalg")
if a.ndim > 2 or b.ndim > 2:
raise NotImplementedError(
- "cuNumeric does not yet support stacked 2d arrays"
+ "cuPyNumeric does not yet support stacked 2d arrays"
)
if a.shape[-2] != a.shape[-1]:
raise LinAlgError("Last 2 dimensions of the array must be square")
@@ -246,8 +352,7 @@ def svd(a: ndarray, full_matrices: bool = True) -> tuple[ndarray, ...]:
Notes
-----
- Currently does not support the parameters 'full_matrices', 'compute_uv',
- and 'hermitian'.
+ Currently does not support the parameters 'compute_uv' and 'hermitian'.
See Also
--------
@@ -265,10 +370,10 @@ def svd(a: ndarray, full_matrices: bool = True) -> tuple[ndarray, ...]:
)
if len(shape) > 2:
raise NotImplementedError(
- "cuNumeric does not yet support stacked 2d arrays"
+ "cuPyNumeric does not yet support stacked 2d arrays"
)
if shape[0] < shape[1]:
- raise NotImplementedError("cuNumeric only supports M >= N")
+ raise NotImplementedError("cuPyNumeric only supports M >= N")
if np.dtype("e") == a.dtype:
raise TypeError("array type float16 is unsupported in linalg")
return _thunk_svd(a, full_matrices)
@@ -323,7 +428,7 @@ def matrix_power(a: ndarray, n: int) -> ndarray:
# Invert if necessary
if n < 0:
- # TODO: Add this once cunumeric.inv is implemented
+ # TODO: Add this once cupynumeric.inv is implemented
# a = inv(a)
# n = abs(n)
raise NotImplementedError("Negative exponent in matrix_power")
@@ -385,9 +490,9 @@ def multi_dot(
--------
Multiple GPUs, Multiple CPUs
"""
- arrays = [convert_to_cunumeric_ndarray(x) for x in arrays]
+ arrays = [convert_to_cupynumeric_ndarray(x) for x in arrays]
if out is not None:
- out = convert_to_cunumeric_ndarray(out, share=True)
+ out = convert_to_cupynumeric_ndarray(out, share=True)
n = len(arrays)
# optimization only makes sense for len(arrays) > 2
@@ -700,7 +805,7 @@ def norm(
raise ValueError("Improper number of dimensions to norm")
-def _thunk_cholesky(a: ndarray, no_tril: bool = False) -> ndarray:
+def _thunk_cholesky(a: ndarray) -> ndarray:
"""Cholesky decomposition.
Return the Cholesky decomposition, `L * L.H`, of the square matrix `a`,
@@ -744,10 +849,84 @@ def _thunk_cholesky(a: ndarray, no_tril: bool = False) -> ndarray:
dtype=input.dtype,
inputs=(input,),
)
- output._thunk.cholesky(input._thunk, no_tril=no_tril)
+ output._thunk.cholesky(input._thunk)
return output
+def _thunk_eig(a: ndarray) -> tuple[ndarray, ...]:
+ if a.dtype.kind not in ("f", "c"):
+ a = a.astype("float64")
+
+ if a.dtype == np.float32:
+ complex_dtype = np.dtype(np.complex64)
+ elif a.dtype == np.float64:
+ complex_dtype = np.dtype(np.complex128) # type: ignore
+ elif a.dtype.kind in ("c"):
+ complex_dtype = a.dtype
+ else:
+ raise TypeError("Eig input not supported (missing a conversion?)")
+
+ if runtime.num_gpus > 0 and not runtime.cusolver_has_geev():
+ a = ndarray(a.shape, a.dtype, thunk=runtime.to_eager_array(a._thunk))
+ out_ew = ndarray(
+ shape=a.shape[:-1],
+ dtype=complex_dtype,
+ force_thunk="eager",
+ )
+ out_ev = ndarray(
+ shape=a.shape,
+ dtype=complex_dtype,
+ force_thunk="eager",
+ )
+ else:
+ out_ew = ndarray(
+ shape=a.shape[:-1],
+ dtype=complex_dtype,
+ inputs=(a,),
+ )
+ out_ev = ndarray(
+ shape=a.shape,
+ dtype=complex_dtype,
+ inputs=(a,),
+ )
+
+ if a.shape[-1] > 0:
+ a._thunk.eig(out_ew._thunk, out_ev._thunk)
+ return out_ew, out_ev
+
+
+def _thunk_eigvals(a: ndarray) -> ndarray:
+ if a.dtype.kind not in ("f", "c"):
+ a = a.astype("float64")
+
+ if a.dtype == np.float32:
+ complex_dtype = np.dtype(np.complex64)
+ elif a.dtype == np.float64:
+ complex_dtype = np.dtype(np.complex128) # type: ignore
+ elif a.dtype.kind in ("c"):
+ complex_dtype = a.dtype
+ else:
+ raise TypeError("Eigvals input not supported (missing a conversion?)")
+
+ if runtime.num_gpus > 0 and not runtime.cusolver_has_geev():
+ a = ndarray(a.shape, a.dtype, thunk=runtime.to_eager_array(a._thunk))
+ out_ew = ndarray(
+ shape=a.shape[:-1],
+ dtype=complex_dtype,
+ force_thunk="eager",
+ )
+ else:
+ out_ew = ndarray(
+ shape=a.shape[:-1],
+ dtype=complex_dtype,
+ inputs=(a,),
+ )
+
+ if a.shape[-1] > 0:
+ a._thunk.eigvals(out_ew._thunk)
+ return out_ew
+
+
def _thunk_qr(a: ndarray) -> tuple[ndarray, ...]:
if a.dtype.kind not in ("f", "c"):
a = a.astype("float64")
@@ -833,3 +1012,369 @@ def _thunk_svd(a: ndarray, full_matrices: bool) -> tuple[ndarray, ...]:
a._thunk.svd(out_u._thunk, out_s._thunk, out_vh._thunk)
return out_u, out_s, out_vh
+
+
+# helper function to construct rational Pade
+# numerator / denominator for expm(A):
+#
+def make_uv(A: ndarray, b: Any, m: int) -> tuple[ndarray, ndarray]:
+ # 1 + floor(m/2):
+ #
+ k = 1 + m // 2
+ n = A.shape[0]
+
+ U = zeros((n, n), dtype=A.dtype)
+ V = zeros((n, n), dtype=A.dtype)
+
+ # U := A * ∑_{j=0, k} b_{2j+1} * A^{2j};
+ # V := ∑_{j=0, k} b_{2j} * A^{2j};
+ #
+ A2 = matmul(A, A)
+ A2k = eye(n, dtype=A.dtype)
+ for j in range(k):
+ U = U + b[2 * j + 1] * A2k
+ V = V + b[2 * j] * A2k
+ A2k = matmul(A2k, A2)
+
+ U = matmul(A, U)
+
+ return (U, V)
+
+
+class ExpmConstants:
+ """
+ Aggregates all the necessary expm(A) constants.
+ """
+
+ # Pade `b` coefficient generators
+ # for both numerator `p(x)` and
+ # denominator `q(x)` coefficients
+ #
+ # dictionary key := `m`, degree of
+ # both `p(x)` and `q(x)` for
+ # diagonal Pade implementation;
+ #
+ b_coeff = {
+ 3: np.array([120, 60, 12, 1], dtype=np.float64),
+ 5: np.array([30240, 15120, 3360, 420, 30, 1], dtype=np.float64),
+ 7: np.array(
+ [17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1],
+ dtype=np.float64,
+ ),
+ 9: np.array(
+ [
+ 17643225600,
+ 8821612800,
+ 2075673600,
+ 302702400,
+ 30270240,
+ 2162160,
+ 110880,
+ 3960,
+ 90,
+ 1,
+ ],
+ dtype=np.float64,
+ ),
+ 13: np.array(
+ [
+ 64764752532480000,
+ 32382376266240000,
+ 7771770303897600,
+ 1187353796428800,
+ 129060195264000,
+ 10559470521600,
+ 670442572800,
+ 33522128640,
+ 1323241920,
+ 40840800,
+ 960960,
+ 16380,
+ 182,
+ 1,
+ ],
+ dtype=np.float64,
+ ),
+ }
+
+ # Pade error control: absolute error tolerance
+ # parameter `theta`, also degree `m` dependent:
+ #
+ theta = {
+ 3: 1.5e-2,
+ 5: 2.5e-1,
+ 7: 9.5e-1,
+ 9: 2.1,
+ 13: 5.4,
+ }
+
+ # Taylor-18 coefficients
+ #
+ a01 = 0
+ a11 = -0.10036558103014462001
+ a21 = -0.00802924648241156960
+ a31 = -0.00089213849804572995
+
+ b01 = 0
+ b11 = 0.39784974949964507614
+ b21 = 1.36783778460411719922
+ b31 = 0.49828962252538267755
+ b61 = -0.00063789819459472330
+ b02 = -10.9676396052962062593
+ b12 = 1.68015813878906197182
+ b22 = 0.05717798464788655127
+ b32 = -0.00698210122488052084
+ b62 = 0.00003349750170860705
+ b03 = -0.09043168323908105619
+ b13 = -0.06764045190713819075
+ b23 = 0.06759613017704596460
+ b33 = 0.02955525704293155274
+ b63 = -0.00001391802575160607
+ b04 = 0
+ b14 = 0
+ b24 = -0.09233646193671185927
+ b34 = -0.01693649390020817171
+ b64 = -0.00001400867981820361
+
+ # Taylor-18 error control (squaring and scaling decision):
+ #
+ theta_m = 1.09
+
+
+def expm_impl(a: ndarray, output: ndarray) -> tuple[int, int]:
+ """
+ Implements Pade rational aproximant of
+ Algorithm 10.20, p.246-247 in
+ "Functions of Matrices - Theory and Computation",
+ Nicholas J. Higham, SIAM 2008.
+ """
+
+ lst_keys = list(ExpmConstants.theta.keys())
+
+ # maximum polynomial degree for [p(x)/q(x)]:
+ max_deg = lst_keys[-1]
+
+ # L1 norm of matrix input:
+ l1_norm_a = norm(a, 1)
+
+ # loop decides which Pade degree, `m`, to
+ # use, starting with the lowest degree
+ # up to the one before last degree;
+ #
+ # if neither satisfies the theta tolerance
+ # then exit the loop and proceed by using
+ # m=max_deg degree + scaling (to achieve
+ # desired tolerance);
+ #
+ requires_scaling = True
+ s = 0
+ a_scaled = a
+
+ for m in lst_keys[0:-1]:
+ tol_m = ExpmConstants.theta[m]
+ b_arr = ExpmConstants.b_coeff[m]
+ if l1_norm_a <= tol_m:
+ requires_scaling = False
+ break
+
+ # at this point scaling + squaring with [max_deg/max_deg]
+ # Pade rational approximation is done;
+ #
+ # using [max_deg/max_deg] Pade with scaling A/(2^s)
+ # until || A / (2^s) ||_1 <= tol_13;
+ # i.e., s = ceil(log_2(|| A / (2^s) ||_1)):
+ #
+ if requires_scaling:
+ m = max_deg
+ tol_m = ExpmConstants.theta[m]
+ b_arr = ExpmConstants.b_coeff[m]
+
+ s = np.maximum(1, int(np.ceil(np.log2(l1_norm_a / tol_m))))
+ #
+ # scale `a` by sfactor = 1.0/2^s = 2^(-s):
+ #
+ sfactor = np.power(2.0, s)
+ #
+ # A' <- A / sfactor
+ #
+ a_scaled = a / sfactor
+
+ # evaluate U, V matrices, via Eq. 10.33 of [1]
+ # k = 1 + floor(m/2):
+ # U := A * ∑_{j=0, k} b_{2j+1} * A^{2j};
+ # V := ∑_{j=0, k} b_{2j} * A^{2j};
+ #
+ (U, V) = make_uv(a_scaled, b_arr, m)
+ A = V - U
+ B = V + U
+
+ # independently solve for each column:
+ # TODO: can more parallelism be harvested here?
+ # at the very least avoid oversolving by
+ # doing LU / QR factorization once, followed
+ # by `n` backward-forward substitutions;
+ #
+ output[:] = solve(A, B)
+
+ # if scaling by 1/2^s was done then
+ # squaring s times is necessary:
+ #
+ if requires_scaling:
+ for j in range(s):
+ output[:] = matmul(output, output)
+
+ return (m, s)
+
+
+def expm_expl(a: ndarray, output: ndarray) -> tuple[int, int]:
+ """
+ Implements Taylor expansion, algorithm T_18
+ in "Computing the Matrix Exponential with an
+ Optimized Taylor Polynomial Approximation",
+ Philipp Bader et. al.,
+ which minimizes the number of matrix products
+ for given number of terms in the expansion.
+ """
+
+ tol_m = ExpmConstants.theta_m # may vary w/ degree, m, in future impls.
+
+ # L1 norm of matrix input:
+ l1_norm_a = norm(a, 1)
+
+ requires_scaling = l1_norm_a > tol_m
+
+ s = 0
+ A = a
+ m = 18
+
+ if requires_scaling:
+ s = np.maximum(1, int(np.ceil(np.log2(l1_norm_a / tol_m))))
+ #
+ # scale `a` by sfactor = 1.0/2^s = 2^(-s):
+ #
+ sfactor = np.power(2.0, s)
+ #
+ # A' <- A / sfactor
+ #
+ A = a / sfactor
+
+ EYE = eye(A.shape[0], dtype=A.dtype)
+ A2 = matmul(A, A)
+ A3 = matmul(A2, A)
+ A6 = matmul(A3, A3)
+ B1 = (
+ ExpmConstants.a11 * A + ExpmConstants.a21 * A2 + ExpmConstants.a31 * A3
+ )
+ B2 = (
+ ExpmConstants.b11 * A
+ + ExpmConstants.b21 * A2
+ + ExpmConstants.b31 * A3
+ + ExpmConstants.b61 * A6
+ )
+ B3 = (
+ ExpmConstants.b02 * EYE
+ + ExpmConstants.b12 * A
+ + ExpmConstants.b22 * A2
+ + ExpmConstants.b32 * A3
+ + ExpmConstants.b62 * A6
+ )
+ B4 = (
+ ExpmConstants.b03 * EYE
+ + ExpmConstants.b13 * A
+ + ExpmConstants.b23 * A2
+ + ExpmConstants.b33 * A3
+ + ExpmConstants.b63 * A6
+ )
+ B5 = (
+ ExpmConstants.b24 * A2
+ + ExpmConstants.b34 * A3
+ + ExpmConstants.b64 * A6
+ )
+
+ A9 = B4 + matmul(B1, B5)
+ B39 = B3 + A9
+
+ output[:] = B2 + matmul(B39, A9)
+
+ # if scaling by 1/2^s was done then
+ # squaring s times is necessary:
+ #
+ if requires_scaling:
+ for j in range(s):
+ output[:] = matmul(output, output)
+
+ return (m, s)
+
+
+@add_boilerplate("a")
+def expm(a: ndarray, method: str = "pade") -> ndarray:
+ """
+ Matrix exponential.
+
+ Returns exp(A) for each (M x M) slice into a multi-dimensional
+ array, assumed to be of shape (..., M, M);
+
+ By default Pade (implicit) implementation is used.
+ However, explicit Taylor(deg = 18) implementation can be used,
+ by supplying `method="taylor"`.
+
+ Parameters
+ ----------
+ a : (..., M, M) array_like
+ Input matrix or multi-dimensional array of shape (..., M, M).
+
+ method : String method selector to use explicit ('taylor')
+ or implicit ('pade'); default = 'pade'.
+
+ Returns
+ -------
+ exp(A): matrix exponential of input, or a matrix exponential
+ for each slice in the input.
+
+ Notes
+ -----
+ Implicit Pade implementation is more stable but more computationally
+ intensive than explicit Taylor, which is less stable when matrix norm is
+ big enough. Also, Taylor can be slightly more performant for matrices of
+ small enough norms, but more memory consuming.
+
+ See Also
+ --------
+ scipy.linalg.expm
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
+ """
+
+ if a.ndim < 2 or a.shape[-1] != a.shape[-2] or a.size <= 1:
+ raise ValueError(f"Invalid input shape for expm: {a.shape}")
+
+ output = zeros_like(a)
+
+ m_info = get_machine()
+ num_PEs = m_info.count()
+
+ # run implicit (Pade) method by default:
+ #
+ if method == "pade":
+ expm_func = expm_impl
+ elif method == "taylor":
+ expm_func = expm_expl
+ else:
+ raise ValueError(f"Method {method} not supported.")
+
+ if num_PEs < 2:
+ for idx in np.ndindex(a.shape[:-2]):
+ mdeg, s = expm_func(a[idx], output[idx])
+ else:
+ for idx in np.ndindex(a.shape[:-2]):
+ flat_index = np.ravel_multi_index(idx, a.shape[:-2])
+
+ # assign work to multiple GPUs in round-robin way:
+ #
+ findx = int(flat_index)
+ with m_info[findx % num_PEs]:
+ mdeg, s = expm_func(a[idx], output[idx])
+
+ return output
diff --git a/cunumeric/ma/__init__.py b/cupynumeric/ma/__init__.py
similarity index 100%
rename from cunumeric/ma/__init__.py
rename to cupynumeric/ma/__init__.py
diff --git a/cunumeric/ma/_masked_array.py b/cupynumeric/ma/_masked_array.py
similarity index 100%
rename from cunumeric/ma/_masked_array.py
rename to cupynumeric/ma/_masked_array.py
diff --git a/cunumeric/patch.py b/cupynumeric/patch.py
similarity index 76%
rename from cunumeric/patch.py
rename to cupynumeric/patch.py
index 2cc72266e1..569499fc29 100644
--- a/cunumeric/patch.py
+++ b/cupynumeric/patch.py
@@ -12,13 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-""" This module may be imported in order to globably replace NumPy with
-CuNumeric.
+"""This module may be imported in order to globably replace NumPy with
+cuPyNumeric.
In order to function properly, this module must be imported early (ideally
at the very start of a script). The ``numpy`` module in ``sys.modules``
-will be replaced with ``cunumeric`` so that any subsequent use of the
-``numpy`` module will use ``cunumeric`` instead.
+will be replaced with ``cupynumeric`` so that any subsequent use of the
+``numpy`` module will use ``cupynumeric`` instead.
This module is primarily intended for quick demonstrations or proofs of
concept.
@@ -28,6 +28,6 @@
import sys
-import cunumeric
+import cupynumeric
-sys.modules["numpy"] = cunumeric
+sys.modules["numpy"] = cupynumeric
diff --git a/cunumeric/py.typed b/cupynumeric/py.typed
similarity index 100%
rename from cunumeric/py.typed
rename to cupynumeric/py.typed
diff --git a/cunumeric/random/__init__.py b/cupynumeric/random/__init__.py
similarity index 100%
rename from cunumeric/random/__init__.py
rename to cupynumeric/random/__init__.py
diff --git a/cunumeric/random/_bitgenerator.py b/cupynumeric/random/_bitgenerator.py
similarity index 99%
rename from cunumeric/random/_bitgenerator.py
rename to cupynumeric/random/_bitgenerator.py
index 2dbd41a29b..c4f62691b1 100644
--- a/cunumeric/random/_bitgenerator.py
+++ b/cupynumeric/random/_bitgenerator.py
@@ -53,7 +53,7 @@ def __init__(
numpy.random.BitGenerator
Availability
- --------
+ ------------
Multiple GPUs, Multiple CPUs
"""
if type(self) is BitGenerator:
@@ -68,8 +68,7 @@ def __init__(
)
@abstractproperty
- def generatorType(self) -> BitGeneratorType:
- ...
+ def generatorType(self) -> BitGeneratorType: ...
def __del__(self) -> None:
if self.handle != 0:
diff --git a/cunumeric/random/_generator.py b/cupynumeric/random/_generator.py
similarity index 98%
rename from cunumeric/random/_generator.py
rename to cupynumeric/random/_generator.py
index c84cce39de..4736bd8981 100644
--- a/cunumeric/random/_generator.py
+++ b/cupynumeric/random/_generator.py
@@ -43,8 +43,8 @@ def __init__(self, bit_generator: BitGenerator) -> None:
then an array with that shape is filled and returned.
- The function :func:`cunumeric.random.default_rng` will instantiate
- a `Generator` with cuNumeric's default `BitGenerator`.
+ The function :func:`cupynumeric.random.default_rng` will instantiate
+ a `Generator` with cuPyNumeric's default `BitGenerator`.
Parameters
----------
@@ -57,7 +57,7 @@ def __init__(self, bit_generator: BitGenerator) -> None:
default_rng : Recommended constructor for `Generator`.
Availability
- --------
+ ------------
Multiple GPUs, Multiple CPUs
"""
diff --git a/cunumeric/random/_random.py b/cupynumeric/random/_random.py
similarity index 99%
rename from cunumeric/random/_random.py
rename to cupynumeric/random/_random.py
index 8299da0608..6879e9053b 100644
--- a/cunumeric/random/_random.py
+++ b/cupynumeric/random/_random.py
@@ -1713,7 +1713,7 @@ def _random_state_fallback(obj: Any) -> Any:
# wrapped vanilla NumPy RandomState
if isinstance(obj, RandomState):
return obj._np_random_state
- # eagerly convert any cuNumeric ndarrays to NumPy
+ # eagerly convert any cuPyNumeric ndarrays to NumPy
if isinstance(obj, ndarray):
return obj.__array__()
return obj
diff --git a/cunumeric/runtime.py b/cupynumeric/runtime.py
similarity index 92%
rename from cunumeric/runtime.py
rename to cupynumeric/runtime.py
index 85cc7d9548..ca7a32bf43 100644
--- a/cunumeric/runtime.py
+++ b/cupynumeric/runtime.py
@@ -17,7 +17,7 @@
import math
import warnings
from functools import lru_cache, reduce
-from typing import TYPE_CHECKING, Any, Sequence, TypeGuard
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeGuard
import legate.core.types as ty
import numpy as np
@@ -28,12 +28,12 @@
from ._utils.stack import find_last_user_stacklevel
from .config import (
BitGeneratorOperation,
- CuNumericOpCode,
+ CuPyNumericOpCode,
TransferType,
- cunumeric_lib,
+ cupynumeric_lib,
)
-# We need to be careful about importing from other cunumeric modules here. The
+# We need to be careful about importing from other cupynumeric modules. The
# runtime is global and used in many places, but also depends on many of the
# other modules. Things like config and utils are OK, but imports for thunks,
# array types, etc. need to be deferred in order to avoid circular imports.
@@ -75,7 +75,7 @@ def cached_thunk_from_scalar(
class Runtime(object):
def __init__(self) -> None:
- self.library = legate_runtime.find_library(cunumeric_lib.name)
+ self.library = legate_runtime.find_library(cupynumeric_lib.name)
self.current_random_epoch = 0
self.current_random_bitgenid = 0
self.current_random_bitgen_zombies: tuple[Any, ...] = ()
@@ -83,14 +83,14 @@ def __init__(self) -> None:
self.api_calls: list[tuple[str, str, bool]] = []
max_eager_volume = (
- cunumeric_lib.shared_object.cunumeric_max_eager_volume()
+ cupynumeric_lib.shared_object.cupynumeric_max_eager_volume()
)
self.max_eager_volume = int(np.asarray(max_eager_volume))
- assert cunumeric_lib.shared_object is not None
- self.cunumeric_lib = cunumeric_lib.shared_object
+ assert cupynumeric_lib.shared_object is not None
+ self.cupynumeric_lib = cupynumeric_lib.shared_object
self.has_cusolvermp = (
- cunumeric_lib.shared_object.cunumeric_has_cusolvermp()
+ cupynumeric_lib.shared_object.cupynumeric_has_cusolvermp()
)
from .settings import settings
@@ -103,6 +103,13 @@ def __init__(self) -> None:
# Maps value types to struct types used in argmin/argmax
self._cached_argred_types: dict[ty.Type, ty.Type] = dict()
+ def cusolver_has_geev(self) -> bool:
+ if not hasattr(self, "cusolver_has_geev_"):
+ self.cusolver_has_geev_ = (
+ cupynumeric_lib.shared_object.cupynumeric_cusolver_has_geev()
+ )
+ return self.cusolver_has_geev_
+
@property
def num_procs(self) -> int:
return len(legate_runtime.machine)
@@ -122,7 +129,7 @@ def record_api_call(
def _load_cudalibs(self) -> None:
task = legate_runtime.create_manual_task(
self.library,
- CuNumericOpCode.LOAD_CUDALIBS,
+ CuPyNumericOpCode.LOAD_CUDALIBS,
[self.num_gpus],
)
task.execute()
@@ -134,7 +141,7 @@ def get_argred_type(self, value_dtype: ty.Type) -> ty.Type:
return cached
argred_dtype = ty.struct_type([ty.int64, value_dtype], True)
self._cached_argred_types[value_dtype] = argred_dtype
- ids = self.cunumeric_lib.cunumeric_register_reduction_ops(
+ ids = self.cupynumeric_lib.cupynumeric_register_reduction_ops(
value_dtype.code
)
argred_dtype.record_reduction_op(
@@ -150,10 +157,10 @@ def _report_coverage(self) -> None:
implemented = sum(int(impl) for (_, _, impl) in self.api_calls)
if total == 0:
- print("cuNumeric API coverage: 0/0")
+ print("cuPyNumeric API coverage: 0/0")
else:
print(
- f"cuNumeric API coverage: {implemented}/{total} "
+ f"cuPyNumeric API coverage: {implemented}/{total} "
f"({implemented / total * 100}%)"
)
@@ -199,7 +206,7 @@ def bitgenerator_create(
if forceCreate:
task = legate_runtime.create_manual_task(
self.library,
- CuNumericOpCode.BITGENERATOR,
+ CuPyNumericOpCode.BITGENERATOR,
(self.num_procs,),
)
self.bitgenerator_populate_task(
@@ -229,7 +236,7 @@ def bitgenerator_destroy(
legate_runtime.issue_execution_fence()
task = legate_runtime.create_manual_task(
self.library,
- CuNumericOpCode.BITGENERATOR,
+ CuPyNumericOpCode.BITGENERATOR,
(self.num_procs,),
)
self.bitgenerator_populate_task(
@@ -395,7 +402,9 @@ def find_or_create_array_thunk(
assert isinstance(array, np.ndarray)
if not is_supported_dtype(array.dtype):
- raise TypeError(f"cuNumeric does not support dtype={array.dtype}")
+ raise TypeError(
+ f"cuPyNumeric does not support dtype={array.dtype}"
+ )
# We have to be really careful here to handle the case of
# aliased numpy arrays that are passed in from the application
@@ -412,7 +421,7 @@ def find_or_create_array_thunk(
if key is None:
# This base array wasn't made with a view
raise NotImplementedError(
- "cuNumeric does not currently know "
+ "cuPyNumeric does not currently know "
+ "how to attach to array views that are not affine "
+ "transforms of their parent array."
)
@@ -471,10 +480,16 @@ def create_empty_thunk(
shape: NdShape,
dtype: ty.Type,
inputs: Sequence[NumPyThunk] | None = None,
+ force_thunk: Literal["deferred"] | Literal["eager"] | None = None,
) -> NumPyThunk:
from ._thunk.deferred import DeferredArray
- if self.is_eager_shape(shape) and self.are_all_eager_inputs(inputs):
+ assert inputs is None or force_thunk is None
+ if force_thunk == "eager" or (
+ force_thunk is None
+ and self.is_eager_shape(shape)
+ and self.are_all_eager_inputs(inputs)
+ ):
return self.create_eager_thunk(shape, dtype.to_numpy_dtype())
store = legate_runtime.create_store(
@@ -514,7 +529,7 @@ def is_eager_shape(self, shape: NdShape) -> bool:
from .settings import settings
- # CUNUMERIC_FORCE_THUNK == "eager"
+ # CUPYNUMERIC_FORCE_THUNK == "eager"
if settings.force_thunk() == "eager":
return True
diff --git a/cunumeric/settings.py b/cupynumeric/settings.py
similarity index 84%
rename from cunumeric/settings.py
rename to cupynumeric/settings.py
index 292699d260..d73eee2616 100644
--- a/cunumeric/settings.py
+++ b/cupynumeric/settings.py
@@ -25,21 +25,21 @@
__all__ = ("settings",)
-class CunumericRuntimeSettings(Settings):
+class CupynumericRuntimeSettings(Settings):
preload_cudalibs: PrioritizedSetting[bool] = PrioritizedSetting(
"preload_cudalibs",
- "CUNUMERIC_PRELOAD_CUDALIBS",
+ "CUPYNUMERIC_PRELOAD_CUDALIBS",
default=False,
convert=convert_bool,
help="""
Preload and initialize handles of all CUDA libraries (cuBLAS, cuSOLVER,
- etc.) used in cuNumeric.
+ etc.) used in cuPyNumeric.
""",
)
warn: PrioritizedSetting[bool] = PrioritizedSetting(
"warn",
- "CUNUMERIC_WARN",
+ "CUPYNUMERIC_WARN",
default=False,
convert=convert_bool,
help="""
@@ -49,27 +49,27 @@ class CunumericRuntimeSettings(Settings):
report_coverage: PrioritizedSetting[bool] = PrioritizedSetting(
"report_coverage",
- "CUNUMERIC_REPORT_COVERAGE",
+ "CUPYNUMERIC_REPORT_COVERAGE",
default=False,
convert=convert_bool,
help="""
- Print an overall percentage of cunumeric coverage.
+ Print an overall percentage of cupynumeric coverage.
""",
)
report_dump_callstack: PrioritizedSetting[bool] = PrioritizedSetting(
"report_dump_callstack",
- "CUNUMERIC_REPORT_DUMP_CALLSTACK",
+ "CUPYNUMERIC_REPORT_DUMP_CALLSTACK",
default=False,
convert=convert_bool,
help="""
- Print an overall percentage of cunumeric coverage with call stack info.
+ Print an overall percentage of cupynumeric coverage with a call stack.
""",
)
report_dump_csv: PrioritizedSetting[str | None] = PrioritizedSetting(
"report_dump_csv",
- "CUNUMERIC_REPORT_DUMP_CSV",
+ "CUPYNUMERIC_REPORT_DUMP_CSV",
default=None,
help="""
Save a coverage report to a specified CSV file.
@@ -78,11 +78,11 @@ class CunumericRuntimeSettings(Settings):
numpy_compat: PrioritizedSetting[bool] = PrioritizedSetting(
"numpy_compat",
- "CUNUMERIC_NUMPY_COMPATIBILITY",
+ "CUPYNUMERIC_NUMPY_COMPATIBILITY",
default=False,
convert=convert_bool,
help="""
- cuNumeric will issue additional tasks to match numpy's results
+ cuPyNumeric will issue additional tasks to match numpy's results
and behavior. This is currently used in the following
APIs: nanmin, nanmax, nanargmin, nanargmax
""",
@@ -90,7 +90,7 @@ class CunumericRuntimeSettings(Settings):
fast_math: EnvOnlySetting[int] = EnvOnlySetting(
"fast_math",
- "CUNUMERIC_FAST_MATH",
+ "CUPYNUMERIC_FAST_MATH",
default=False,
convert=convert_bool,
help="""
@@ -105,7 +105,7 @@ class CunumericRuntimeSettings(Settings):
min_gpu_chunk: EnvOnlySetting[int] = EnvOnlySetting(
"min_gpu_chunk",
- "CUNUMERIC_MIN_GPU_CHUNK",
+ "CUPYNUMERIC_MIN_GPU_CHUNK",
default=65536, # 1 << 16
test_default=2,
convert=convert_int,
@@ -121,7 +121,7 @@ class CunumericRuntimeSettings(Settings):
min_cpu_chunk: EnvOnlySetting[int] = EnvOnlySetting(
"min_cpu_chunk",
- "CUNUMERIC_MIN_CPU_CHUNK",
+ "CUPYNUMERIC_MIN_CPU_CHUNK",
default=1024, # 1 << 10
test_default=2,
convert=convert_int,
@@ -137,7 +137,7 @@ class CunumericRuntimeSettings(Settings):
min_omp_chunk: EnvOnlySetting[int] = EnvOnlySetting(
"min_omp_chunk",
- "CUNUMERIC_MIN_OMP_CHUNK",
+ "CUPYNUMERIC_MIN_OMP_CHUNK",
default=8192, # 1 << 13
test_default=2,
convert=convert_int,
@@ -153,15 +153,15 @@ class CunumericRuntimeSettings(Settings):
force_thunk: EnvOnlySetting[str | None] = EnvOnlySetting(
"force_thunk",
- "CUNUMERIC_FORCE_THUNK",
+ "CUPYNUMERIC_FORCE_THUNK",
default=None,
test_default="deferred",
help="""
- Force cuNumeric to always use a specific strategy for backing
+ Force cuPyNumeric to always use a specific strategy for backing
ndarrays: "deferred", i.e. managed by the Legate runtime, which
enables distribution and accelerated operations, but has some
up-front offloading overhead, or "eager", i.e. falling back to
- using a vanilla NumPy array. By default cuNumeric will decide
+ using a vanilla NumPy array. By default cuPyNumeric will decide
this on a per-array basis, based on the size of the array and
the accelerator in use.
@@ -171,12 +171,12 @@ class CunumericRuntimeSettings(Settings):
matmul_cache_size: EnvOnlySetting[int] = EnvOnlySetting(
"matmul_cache_size",
- "CUNUMERIC_MATMUL_CACHE_SIZE",
+ "CUPYNUMERIC_MATMUL_CACHE_SIZE",
default=134217728, # 128MB
test_default=4096, # 4KB
convert=convert_int,
help="""
- Force cuNumeric to keep temporary task slices during matmul
+ Force cuPyNumeric to keep temporary task slices during matmul
computations smaller than this threshold. Whenever the temporary
space needed during computation would exceed this value the task
will be batched over 'k' to fulfill the requirement.
@@ -186,4 +186,4 @@ class CunumericRuntimeSettings(Settings):
)
-settings = CunumericRuntimeSettings()
+settings = CupynumericRuntimeSettings()
diff --git a/cunumeric/types.py b/cupynumeric/types.py
similarity index 95%
rename from cunumeric/types.py
rename to cupynumeric/types.py
index 35f2e012f5..f2fbf83114 100644
--- a/cunumeric/types.py
+++ b/cupynumeric/types.py
@@ -34,4 +34,6 @@
ConvolveMode: TypeAlias = Literal["full", "valid", "same"]
+ConvolveMethod: TypeAlias = Literal["auto", "direct", "fft"]
+
SelectKind: TypeAlias = Literal["introselect"]
diff --git a/cupynumeric_cpp.cmake b/cupynumeric_cpp.cmake
new file mode 100644
index 0000000000..2a56ccbc0a
--- /dev/null
+++ b/cupynumeric_cpp.cmake
@@ -0,0 +1,539 @@
+#=============================================================================
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+##############################################################################
+# - User Options ------------------------------------------------------------
+
+option(BUILD_SHARED_LIBS "Build cuPyNumeric shared libraries" ON)
+option(cupynumeric_EXCLUDE_TBLIS_FROM_ALL "Exclude tblis targets from cuPyNumeric's 'all' target" OFF)
+option(cupynumeric_EXCLUDE_OPENBLAS_FROM_ALL "Exclude OpenBLAS targets from cuPyNumeric's 'all' target" OFF)
+option(cupynumeric_EXCLUDE_LEGATE_FROM_ALL "Exclude legate targets from cuPyNumeric's 'all' target" OFF)
+
+##############################################################################
+# - Project definition -------------------------------------------------------
+
+# Write the version header
+rapids_cmake_write_version_file(include/cupynumeric/version_config.hpp)
+
+# Needed to integrate with LLVM/clang tooling
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+##############################################################################
+# - Build Type ---------------------------------------------------------------
+
+# Set a default build type if none was specified
+rapids_cmake_build_type(Release)
+
+##############################################################################
+# - conda environment --------------------------------------------------------
+
+rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
+
+# We're building python extension libraries, which must always be installed
+# under lib/, even if the system normally uses lib64/. Rapids-cmake currently
+# doesn't realize this when we're going through scikit-build, see
+# https://github.com/rapidsai/rapids-cmake/issues/426
+if(TARGET conda_env)
+ set(CMAKE_INSTALL_LIBDIR "lib")
+endif()
+
+##############################################################################
+# - Dependencies -------------------------------------------------------------
+
+# add third party dependencies using CPM
+rapids_cpm_init(OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/versions.json)
+
+rapids_find_package(OpenMP GLOBAL_TARGETS OpenMP::OpenMP_CXX)
+
+option(Legion_USE_CUDA "Use CUDA" ON)
+option(Legion_USE_OpenMP "Use OpenMP" ${OpenMP_FOUND})
+option(Legion_BOUNDS_CHECKS "Build cuPyNumeric with bounds checks (expensive)" OFF)
+
+###
+# If we find legate already configured on the system, it will report
+# whether it was compiled with bounds checking (Legion_BOUNDS_CHECKS),
+# CUDA (Legion_USE_CUDA), and OpenMP (Legion_USE_OpenMP).
+#
+# We use the same variables as legate because we want to enable/disable
+# each of these features based on how legate was configured (it doesn't
+# make sense to build cuPyNumeric's CUDA bindings if legate wasn't built
+# with CUDA support).
+###
+include(thirdparty/get_legate)
+
+# Use of DEFINED is deliberate. CMAKE_CUDA_ARCHITECTURES may be OFF which we want to leave
+# in place. Legion_CUDA_ARCH is defined by Legate.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+ set(CMAKE_CUDA_ARCHITECTURES "${Legion_CUDA_ARCH}")
+endif()
+
+if(Legion_USE_CUDA)
+ include(Modules/cuda_arch_helpers)
+ # Needs to run before `rapids_cuda_init_architectures`
+ set_cuda_arch_from_names()
+ # Needs to run before `enable_language(CUDA)`
+ rapids_cuda_init_architectures(cupynumeric)
+ message(STATUS "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+ enable_language(CUDA)
+ # Since cupynumeric only enables CUDA optionally we need to manually include
+ # the file that rapids_cuda_init_architectures relies on `project` calling
+ if(CMAKE_PROJECT_cupynumeric_INCLUDE)
+ include("${CMAKE_PROJECT_cupynumeric_INCLUDE}")
+ endif()
+
+ # Must come after enable_language(CUDA)
+ # Use `-isystem ` instead of `-isystem=`
+ # because the former works with clangd intellisense
+ set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-isystem ")
+
+ rapids_find_package(
+ CUDAToolkit REQUIRED
+ BUILD_EXPORT_SET cupynumeric-exports
+ INSTALL_EXPORT_SET cupynumeric-exports
+ )
+
+ include(thirdparty/get_nccl)
+ include(thirdparty/get_cutensor)
+endif()
+
+include(thirdparty/get_openblas)
+
+include(thirdparty/get_tblis)
+
+##############################################################################
+# - cuPyNumeric ----------------------------------------------------------------
+
+add_library(cupynumeric)
+add_library(cupynumeric::cupynumeric ALIAS cupynumeric)
+
+set(cupynumeric_CXX_OPTIONS "")
+set(cupynumeric_CUDA_OPTIONS "")
+
+include(Modules/set_cpu_arch_flags)
+set_cpu_arch_flags(cupynumeric_CXX_OPTIONS)
+
+# Add `src/cupynumeric.mk` sources
+target_sources(cupynumeric PRIVATE
+ src/cupynumeric/ternary/where.cc
+ src/cupynumeric/scan/scan_global.cc
+ src/cupynumeric/scan/scan_local.cc
+ src/cupynumeric/binary/binary_op.cc
+ src/cupynumeric/binary/binary_op_util.cc
+ src/cupynumeric/binary/binary_red.cc
+ src/cupynumeric/bits/packbits.cc
+ src/cupynumeric/bits/unpackbits.cc
+ src/cupynumeric/unary/scalar_unary_red.cc
+ src/cupynumeric/unary/unary_op.cc
+ src/cupynumeric/unary/unary_red.cc
+ src/cupynumeric/unary/convert.cc
+ src/cupynumeric/nullary/arange.cc
+ src/cupynumeric/nullary/eye.cc
+ src/cupynumeric/nullary/fill.cc
+ src/cupynumeric/nullary/window.cc
+ src/cupynumeric/index/advanced_indexing.cc
+ src/cupynumeric/index/choose.cc
+ src/cupynumeric/index/putmask.cc
+ src/cupynumeric/index/repeat.cc
+ src/cupynumeric/index/select.cc
+ src/cupynumeric/index/wrap.cc
+ src/cupynumeric/index/zip.cc
+ src/cupynumeric/item/read.cc
+ src/cupynumeric/item/write.cc
+ src/cupynumeric/matrix/batched_cholesky.cc
+ src/cupynumeric/matrix/contract.cc
+ src/cupynumeric/matrix/diag.cc
+ src/cupynumeric/matrix/geev.cc
+ src/cupynumeric/matrix/gemm.cc
+ src/cupynumeric/matrix/matmul.cc
+ src/cupynumeric/matrix/matvecmul.cc
+ src/cupynumeric/matrix/dot.cc
+ src/cupynumeric/matrix/potrf.cc
+ src/cupynumeric/matrix/qr.cc
+ src/cupynumeric/matrix/solve.cc
+ src/cupynumeric/matrix/svd.cc
+ src/cupynumeric/matrix/syrk.cc
+ src/cupynumeric/matrix/tile.cc
+ src/cupynumeric/matrix/transpose.cc
+ src/cupynumeric/matrix/trilu.cc
+ src/cupynumeric/matrix/trsm.cc
+ src/cupynumeric/matrix/util.cc
+ src/cupynumeric/random/bitgenerator.cc
+ src/cupynumeric/random/randutil/generator_host.cc
+ src/cupynumeric/random/randutil/generator_host_straightforward.cc
+ src/cupynumeric/random/randutil/generator_host_advanced.cc
+ src/cupynumeric/random/rand.cc
+ src/cupynumeric/search/argwhere.cc
+ src/cupynumeric/search/nonzero.cc
+ src/cupynumeric/set/unique.cc
+ src/cupynumeric/set/unique_reduce.cc
+ src/cupynumeric/stat/bincount.cc
+ src/cupynumeric/convolution/convolve.cc
+ src/cupynumeric/transform/flip.cc
+ src/cupynumeric/utilities/repartition.cc
+ src/cupynumeric/arg_redop_register.cc
+ src/cupynumeric/mapper.cc
+ src/cupynumeric/ndarray.cc
+ src/cupynumeric/operators.cc
+ src/cupynumeric/runtime.cc
+ src/cupynumeric/cephes/chbevl.cc
+ src/cupynumeric/cephes/i0.cc
+ src/cupynumeric/stat/histogram.cc
+)
+
+if(Legion_USE_OpenMP)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/ternary/where_omp.cc
+ src/cupynumeric/scan/scan_global_omp.cc
+ src/cupynumeric/scan/scan_local_omp.cc
+ src/cupynumeric/binary/binary_op_omp.cc
+ src/cupynumeric/binary/binary_red_omp.cc
+ src/cupynumeric/bits/packbits_omp.cc
+ src/cupynumeric/bits/unpackbits_omp.cc
+ src/cupynumeric/unary/unary_op_omp.cc
+ src/cupynumeric/unary/scalar_unary_red_omp.cc
+ src/cupynumeric/unary/unary_red_omp.cc
+ src/cupynumeric/unary/convert_omp.cc
+ src/cupynumeric/nullary/arange_omp.cc
+ src/cupynumeric/nullary/eye_omp.cc
+ src/cupynumeric/nullary/fill_omp.cc
+ src/cupynumeric/nullary/window_omp.cc
+ src/cupynumeric/index/advanced_indexing_omp.cc
+ src/cupynumeric/index/choose_omp.cc
+ src/cupynumeric/index/putmask_omp.cc
+ src/cupynumeric/index/repeat_omp.cc
+ src/cupynumeric/index/select_omp.cc
+ src/cupynumeric/index/wrap_omp.cc
+ src/cupynumeric/index/zip_omp.cc
+ src/cupynumeric/matrix/batched_cholesky_omp.cc
+ src/cupynumeric/matrix/contract_omp.cc
+ src/cupynumeric/matrix/diag_omp.cc
+ src/cupynumeric/matrix/geev_omp.cc
+ src/cupynumeric/matrix/gemm_omp.cc
+ src/cupynumeric/matrix/matmul_omp.cc
+ src/cupynumeric/matrix/matvecmul_omp.cc
+ src/cupynumeric/matrix/dot_omp.cc
+ src/cupynumeric/matrix/potrf_omp.cc
+ src/cupynumeric/matrix/qr_omp.cc
+ src/cupynumeric/matrix/solve_omp.cc
+ src/cupynumeric/matrix/svd_omp.cc
+ src/cupynumeric/matrix/syrk_omp.cc
+ src/cupynumeric/matrix/tile_omp.cc
+ src/cupynumeric/matrix/transpose_omp.cc
+ src/cupynumeric/matrix/trilu_omp.cc
+ src/cupynumeric/matrix/trsm_omp.cc
+ src/cupynumeric/random/rand_omp.cc
+ src/cupynumeric/search/argwhere_omp.cc
+ src/cupynumeric/search/nonzero_omp.cc
+ src/cupynumeric/set/unique_omp.cc
+ src/cupynumeric/set/unique_reduce_omp.cc
+ src/cupynumeric/stat/bincount_omp.cc
+ src/cupynumeric/convolution/convolve_omp.cc
+ src/cupynumeric/transform/flip_omp.cc
+ src/cupynumeric/stat/histogram_omp.cc
+ )
+endif()
+
+if(Legion_USE_CUDA)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/ternary/where.cu
+ src/cupynumeric/scan/scan_global.cu
+ src/cupynumeric/scan/scan_local.cu
+ src/cupynumeric/binary/binary_op.cu
+ src/cupynumeric/binary/binary_red.cu
+ src/cupynumeric/bits/packbits.cu
+ src/cupynumeric/bits/unpackbits.cu
+ src/cupynumeric/unary/scalar_unary_red.cu
+ src/cupynumeric/unary/unary_red.cu
+ src/cupynumeric/unary/unary_op.cu
+ src/cupynumeric/unary/convert.cu
+ src/cupynumeric/nullary/arange.cu
+ src/cupynumeric/nullary/eye.cu
+ src/cupynumeric/nullary/fill.cu
+ src/cupynumeric/nullary/window.cu
+ src/cupynumeric/index/advanced_indexing.cu
+ src/cupynumeric/index/choose.cu
+ src/cupynumeric/index/putmask.cu
+ src/cupynumeric/index/repeat.cu
+ src/cupynumeric/index/select.cu
+ src/cupynumeric/index/wrap.cu
+ src/cupynumeric/index/zip.cu
+ src/cupynumeric/item/read.cu
+ src/cupynumeric/item/write.cu
+ src/cupynumeric/matrix/batched_cholesky.cu
+ src/cupynumeric/matrix/contract.cu
+ src/cupynumeric/matrix/diag.cu
+ src/cupynumeric/matrix/geev.cu
+ src/cupynumeric/matrix/gemm.cu
+ src/cupynumeric/matrix/matmul.cu
+ src/cupynumeric/matrix/matvecmul.cu
+ src/cupynumeric/matrix/dot.cu
+ src/cupynumeric/matrix/potrf.cu
+ src/cupynumeric/matrix/qr.cu
+ src/cupynumeric/matrix/solve.cu
+ src/cupynumeric/matrix/svd.cu
+ src/cupynumeric/matrix/syrk.cu
+ src/cupynumeric/matrix/tile.cu
+ src/cupynumeric/matrix/transpose.cu
+ src/cupynumeric/matrix/trilu.cu
+ src/cupynumeric/matrix/trsm.cu
+ src/cupynumeric/random/rand.cu
+ src/cupynumeric/search/argwhere.cu
+ src/cupynumeric/search/nonzero.cu
+ src/cupynumeric/set/unique.cu
+ src/cupynumeric/stat/bincount.cu
+ src/cupynumeric/convolution/convolve.cu
+ src/cupynumeric/fft/fft.cu
+ src/cupynumeric/transform/flip.cu
+ src/cupynumeric/utilities/repartition.cu
+ src/cupynumeric/arg_redop_register.cu
+ src/cupynumeric/cudalibs.cu
+ src/cupynumeric/stat/histogram.cu
+ )
+endif()
+
+# Add `src/cupynumeric/sort/sort.mk` sources
+target_sources(cupynumeric PRIVATE
+ src/cupynumeric/sort/sort.cc
+ src/cupynumeric/sort/searchsorted.cc
+)
+
+if(Legion_USE_OpenMP)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/sort/sort_omp.cc
+ src/cupynumeric/sort/searchsorted_omp.cc
+ )
+endif()
+
+if(Legion_USE_CUDA)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/sort/sort.cu
+ src/cupynumeric/sort/searchsorted.cu
+ src/cupynumeric/sort/cub_sort_bool.cu
+ src/cupynumeric/sort/cub_sort_int8.cu
+ src/cupynumeric/sort/cub_sort_int16.cu
+ src/cupynumeric/sort/cub_sort_int32.cu
+ src/cupynumeric/sort/cub_sort_int64.cu
+ src/cupynumeric/sort/cub_sort_uint8.cu
+ src/cupynumeric/sort/cub_sort_uint16.cu
+ src/cupynumeric/sort/cub_sort_uint32.cu
+ src/cupynumeric/sort/cub_sort_uint64.cu
+ src/cupynumeric/sort/cub_sort_half.cu
+ src/cupynumeric/sort/cub_sort_float.cu
+ src/cupynumeric/sort/cub_sort_double.cu
+ src/cupynumeric/sort/thrust_sort_bool.cu
+ src/cupynumeric/sort/thrust_sort_int8.cu
+ src/cupynumeric/sort/thrust_sort_int16.cu
+ src/cupynumeric/sort/thrust_sort_int32.cu
+ src/cupynumeric/sort/thrust_sort_int64.cu
+ src/cupynumeric/sort/thrust_sort_uint8.cu
+ src/cupynumeric/sort/thrust_sort_uint16.cu
+ src/cupynumeric/sort/thrust_sort_uint32.cu
+ src/cupynumeric/sort/thrust_sort_uint64.cu
+ src/cupynumeric/sort/thrust_sort_half.cu
+ src/cupynumeric/sort/thrust_sort_float.cu
+ src/cupynumeric/sort/thrust_sort_double.cu
+ src/cupynumeric/sort/thrust_sort_complex64.cu
+ src/cupynumeric/sort/thrust_sort_complex128.cu
+ )
+endif()
+
+# Add `src/cupynumeric/random/random.mk` sources
+if(Legion_USE_CUDA)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/random/bitgenerator.cu
+ src/cupynumeric/random/randutil/generator_device.cu
+ src/cupynumeric/random/randutil/generator_device_straightforward.cu
+ src/cupynumeric/random/randutil/generator_device_advanced.cu
+ )
+endif()
+
+# add sources for cusolverMp
+if(Legion_USE_CUDA AND CUSOLVERMP_DIR)
+ target_sources(cupynumeric PRIVATE
+ src/cupynumeric/matrix/mp_potrf.cu
+ src/cupynumeric/matrix/mp_solve.cu
+ )
+endif()
+
+target_sources(cupynumeric PRIVATE
+ # This must always be the last file!
+ # It guarantees we do our registration callback
+ # only after all task variants are recorded
+ src/cupynumeric/cupynumeric.cc
+)
+
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+ target_compile_definitions(cupynumeric PUBLIC "$<$:DEBUG_CUPYNUMERIC>")
+endif()
+
+if(Legion_BOUNDS_CHECKS)
+ target_compile_definitions(cupynumeric PUBLIC "$<$:BOUNDS_CHECKS>")
+endif()
+
+list(APPEND cupynumeric_CUDA_OPTIONS -Xfatbin=-compress-all)
+list(APPEND cupynumeric_CUDA_OPTIONS --expt-extended-lambda)
+list(APPEND cupynumeric_CUDA_OPTIONS --expt-relaxed-constexpr)
+list(APPEND cupynumeric_CXX_OPTIONS -Wno-deprecated-declarations)
+list(APPEND cupynumeric_CUDA_OPTIONS -Wno-deprecated-declarations)
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ set(platform_rpath_origin "\$ORIGIN")
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ set(platform_rpath_origin "@loader_path")
+endif ()
+
+set_target_properties(cupynumeric
+ PROPERTIES BUILD_RPATH "${platform_rpath_origin}"
+ INSTALL_RPATH "${platform_rpath_origin}"
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON
+ INTERFACE_POSITION_INDEPENDENT_CODE ON
+ CUDA_STANDARD 17
+ CUDA_STANDARD_REQUIRED ON
+ LIBRARY_OUTPUT_DIRECTORY lib)
+
+target_link_libraries(cupynumeric
+ PUBLIC legate::legate
+ $
+ PRIVATE BLAS::BLAS
+ tblis::tblis
+ # Add Conda library and include paths
+ $
+ $
+ $
+ $
+ $
+ $)
+
+if(NOT Legion_USE_CUDA AND cupynumeric_cuRAND_INCLUDE_DIR)
+ target_compile_definitions(cupynumeric
+ PUBLIC "$<$:CUPYNUMERIC_CURAND_FOR_CPU_BUILD>")
+ target_include_directories(cupynumeric PRIVATE ${cupynumeric_cuRAND_INCLUDE_DIR})
+endif()
+
+if(Legion_USE_CUDA AND CUSOLVERMP_DIR)
+ message(VERBOSE "cupynumeric: CUSOLVERMP_DIR ${CUSOLVERMP_DIR}")
+ target_compile_definitions(cupynumeric PUBLIC "$<$:CUPYNUMERIC_USE_CUSOLVERMP>")
+ target_include_directories(cupynumeric PRIVATE ${CUSOLVERMP_DIR}/include)
+ target_link_libraries(cupynumeric PRIVATE ${CUSOLVERMP_DIR}/lib/libcusolverMp.so)
+endif()
+
+target_compile_options(cupynumeric
+ PRIVATE "$<$:${cupynumeric_CXX_OPTIONS}>"
+ "$<$:${cupynumeric_CUDA_OPTIONS}>")
+
+target_include_directories(cupynumeric
+ PUBLIC
+ $
+ INTERFACE
+ $
+)
+
+if(Legion_USE_CUDA)
+ file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
+[=[
+SECTIONS
+{
+.nvFatBinSegment : { *(.nvFatBinSegment) }
+.nv_fatbin : { *(.nv_fatbin) }
+}
+]=])
+
+ # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
+ target_link_options(cupynumeric PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
+endif()
+
+##############################################################################
+# - install targets-----------------------------------------------------------
+
+include(CPack)
+include(GNUInstallDirs)
+rapids_cmake_install_lib_dir(lib_dir)
+
+install(TARGETS cupynumeric
+ DESTINATION ${lib_dir}
+ EXPORT cupynumeric-exports)
+
+install(
+ FILES src/cupynumeric.h
+ ${CMAKE_CURRENT_BINARY_DIR}/include/cupynumeric/version_config.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cupynumeric)
+
+install(
+ FILES src/cupynumeric/cupynumeric_c.h
+ src/cupynumeric/ndarray.h
+ src/cupynumeric/ndarray.inl
+ src/cupynumeric/operators.h
+ src/cupynumeric/operators.inl
+ src/cupynumeric/runtime.h
+ src/cupynumeric/slice.h
+ src/cupynumeric/typedefs.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cupynumeric/cupynumeric)
+
+if(cupynumeric_INSTALL_TBLIS)
+ install(DIRECTORY ${tblis_BINARY_DIR}/lib/ DESTINATION ${lib_dir})
+ install(DIRECTORY ${tblis_BINARY_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif()
+
+##############################################################################
+# - install export -----------------------------------------------------------
+
+set(doc_string
+ [=[
+Provide targets for cuPyNumeric, an aspiring drop-in replacement for NumPy at scale.
+
+Imported Targets:
+ - cupynumeric::cupynumeric
+
+]=])
+
+string(JOIN "\n" code_string
+ "set(Legion_USE_CUDA ${Legion_USE_CUDA})"
+ "set(Legion_USE_OpenMP ${Legion_USE_OpenMP})"
+ "set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS})"
+)
+
+if(DEFINED Legion_USE_Python)
+ string(APPEND code_string "\nset(Legion_USE_Python ${Legion_USE_Python})")
+endif()
+
+if(DEFINED Legion_NETWORKS)
+ string(APPEND code_string "\nset(Legion_NETWORKS ${Legion_NETWORKS})")
+endif()
+
+rapids_export(
+ INSTALL cupynumeric
+ EXPORT_SET cupynumeric-exports
+ GLOBAL_TARGETS cupynumeric
+ NAMESPACE cupynumeric::
+ DOCUMENTATION doc_string
+ FINAL_CODE_BLOCK code_string)
+
+# build export targets
+rapids_export(
+ BUILD cupynumeric
+ EXPORT_SET cupynumeric-exports
+ GLOBAL_TARGETS cupynumeric
+ NAMESPACE cupynumeric::
+ DOCUMENTATION doc_string
+ FINAL_CODE_BLOCK code_string)
+
+if(cupynumeric_BUILD_TESTS)
+ include(CTest)
+
+ add_subdirectory(tests/cpp)
+endif()
diff --git a/cunumeric_python.cmake b/cupynumeric_python.cmake
similarity index 69%
rename from cunumeric_python.cmake
rename to cupynumeric_python.cmake
index 3c4b891cfd..1be5b35c62 100644
--- a/cunumeric_python.cmake
+++ b/cupynumeric_python.cmake
@@ -17,25 +17,25 @@
##############################################################################
# - User Options ------------------------------------------------------------
-option(FIND_CUNUMERIC_CPP "Search for existing cuNumeric C++ installations before defaulting to local files"
+option(FIND_CUPYNUMERIC_CPP "Search for existing cuPyNumeric C++ installations before defaulting to local files"
OFF)
##############################################################################
# - Dependencies -------------------------------------------------------------
-# If the user requested it we attempt to find cunumeric.
-if(FIND_CUNUMERIC_CPP)
+# If the user requested it we attempt to find cupynumeric.
+if(FIND_CUPYNUMERIC_CPP)
include("${rapids-cmake-dir}/export/detail/parse_version.cmake")
- rapids_export_parse_version(${cunumeric_version} cunumeric parsed_ver)
- rapids_find_package(cunumeric ${parsed_ver} EXACT CONFIG
- GLOBAL_TARGETS cunumeric::cunumeric
- BUILD_EXPORT_SET cunumeric-python-exports
- INSTALL_EXPORT_SET cunumeric-python-exports)
+ rapids_export_parse_version(${cupynumeric_version} cupynumeric parsed_ver)
+ rapids_find_package(cupynumeric ${parsed_ver} EXACT CONFIG
+ GLOBAL_TARGETS cupynumeric::cupynumeric
+ BUILD_EXPORT_SET cupynumeric-python-exports
+ INSTALL_EXPORT_SET cupynumeric-python-exports)
else()
- set(cunumeric_FOUND OFF)
+ set(cupynumeric_FOUND OFF)
endif()
-if(NOT cunumeric_FOUND)
+if(NOT cupynumeric_FOUND)
set(SKBUILD OFF)
set(Legion_USE_Python ON)
set(Legion_BUILD_BINDINGS ON)
@@ -51,9 +51,9 @@ add_custom_target("generate_install_info_py" ALL
VERBATIM
)
-add_library(cunumeric_python INTERFACE)
-add_library(cunumeric::cunumeric_python ALIAS cunumeric_python)
-target_link_libraries(cunumeric_python INTERFACE legate::legate)
+add_library(cupynumeric_python INTERFACE)
+add_library(cupynumeric::cupynumeric_python ALIAS cupynumeric_python)
+target_link_libraries(cupynumeric_python INTERFACE legate::legate)
# ############################################################################
# - conda environment --------------------------------------------------------
@@ -75,37 +75,37 @@ include(CPack)
include(GNUInstallDirs)
rapids_cmake_install_lib_dir(lib_dir)
-install(TARGETS cunumeric_python
+install(TARGETS cupynumeric_python
DESTINATION ${lib_dir}
- EXPORT cunumeric-python-exports)
+ EXPORT cupynumeric-python-exports)
##############################################################################
# - install export -----------------------------------------------------------
set(doc_string
[=[
-Provide Python targets for cuNumeric, an aspiring drop-in replacement for NumPy at scale.
+Provide Python targets for cuPyNumeric, an aspiring drop-in replacement for NumPy at scale.
Imported Targets:
- - cunumeric::cunumeric_python
+ - cupynumeric::cupynumeric_python
]=])
set(code_string "")
rapids_export(
- INSTALL cunumeric_python
- EXPORT_SET cunumeric-python-exports
- GLOBAL_TARGETS cunumeric_python
- NAMESPACE cunumeric::
+ INSTALL cupynumeric_python
+ EXPORT_SET cupynumeric-python-exports
+ GLOBAL_TARGETS cupynumeric_python
+ NAMESPACE cupynumeric::
DOCUMENTATION doc_string
FINAL_CODE_BLOCK code_string)
# build export targets
rapids_export(
- BUILD cunumeric_python
- EXPORT_SET cunumeric-python-exports
- GLOBAL_TARGETS cunumeric_python
- NAMESPACE cunumeric::
+ BUILD cupynumeric_python
+ EXPORT_SET cupynumeric-python-exports
+ GLOBAL_TARGETS cupynumeric_python
+ NAMESPACE cupynumeric::
DOCUMENTATION doc_string
FINAL_CODE_BLOCK code_string)
diff --git a/docs/cunumeric/source/_templates/layout.html b/docs/cunumeric/source/_templates/layout.html
deleted file mode 100644
index 2f473f38ee..0000000000
--- a/docs/cunumeric/source/_templates/layout.html
+++ /dev/null
@@ -1,13 +0,0 @@
-{% extends "!layout.html" %}
-
-{% block extrahead %}
-
-
-
-{% endblock %}
-
-{% block footer %}
-
-
-
-{% endblock %}
diff --git a/docs/cunumeric/source/api/broadcast.rst b/docs/cunumeric/source/api/broadcast.rst
deleted file mode 100644
index 50d329a2e8..0000000000
--- a/docs/cunumeric/source/api/broadcast.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-.. currentmodule:: cunumeric
-
-cunumeric.broadcast
-===================
-
-.. autoclass:: broadcast
- :members:
\ No newline at end of file
diff --git a/docs/cunumeric/source/api/comparison.rst b/docs/cunumeric/source/api/comparison.rst
deleted file mode 100644
index 139a02d76e..0000000000
--- a/docs/cunumeric/source/api/comparison.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Project comparisons
-===================
-
-Here is a list of NumPy APIs and corresponding cuNumeric implementations.
-
-A dot in the cunumeric column denotes that cuNumeric implementation
-is not provided yet. We welcome contributions for these functions.
-
-NumPy vs cuNumeric APIs
------------------------
-
-.. comparison-table::
diff --git a/docs/cunumeric/source/api/settings.rst b/docs/cunumeric/source/api/settings.rst
deleted file mode 100644
index abc807f0b4..0000000000
--- a/docs/cunumeric/source/api/settings.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Settings
-========
-
-cuNumeric has a number of runtime settings that can be configured through
-environment variables.
-
-.. settings:: settings
- :module: cunumeric.settings
\ No newline at end of file
diff --git a/docs/cunumeric/source/developer/CONTRIBUTING.md b/docs/cunumeric/source/developer/CONTRIBUTING.md
deleted file mode 120000
index 069558fad2..0000000000
--- a/docs/cunumeric/source/developer/CONTRIBUTING.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/docs/cunumeric/source/developer/building.rst b/docs/cunumeric/source/developer/building.rst
deleted file mode 100644
index b4ba151e99..0000000000
--- a/docs/cunumeric/source/developer/building.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-.. _building cunumeric from source:
-
-Building from source
-====================
-
-Basic build
------------
-
-Users must have a working installation of the `Legate`_ library prior to
-installing cuNumeric.
-**Installing cuNumeric by itself will not automatically install Legate.**
-
-As for other dependencies, the Dependencies section on the
-`Legate build instructions`_ also covers cuNumeric, so no additional
-packages are required.
-
-Once Legate is installed, you can simply invoke ``./install.py`` from the
-cuNumeric top-level directory. The build will automatically pick up the
-configuration used when building Legate (e.g. the CUDA Toolkit directory).
-
-Advanced topics
----------------
-
-Building through pip & cmake
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-cuNumeric uses the same cmake/scikit-build-based build workflow as Legate.
-See the `Legate build instructions`_ for an overview.
-
-There are several examples in the ``scripts`` folder. We walk through the steps in
-``build-with-legate-separately-no-install.sh`` here.
-
-We assume a pre-existing Legate build. For details on building Legate,
-consult the `Legate repository`_.
-
-First, the CMake build needs to be configured:
-
-.. code:: sh
-
- $ cmake -S . -B build -GNinja -D legate_ROOT:STRING=path/to/legate/build
-
-We point cuNumeric to the Legate *build* tree, not an installation.
-This generates all build-dependent headers and Python files.
-
-Once configured, we can build the C++ libraries:
-
-.. code:: sh
-
- $ cmake --build build
-
-This will invoke Ninja (or make) to execute the build.
-Once the C++ libraries are available, we can do an editable (development) pip installation.
-
-.. code:: sh
-
- $ SKBUILD_BUILD_OPTIONS="-D FIND_CUNUMERIC_CPP=ON -D cunumeric_ROOT=$(pwd)/build" \
- python3 -m pip install \
- --root / --no-deps --no-build-isolation
- --editable .
-
-The Python source tree and CMake build tree are now available with the environment Python
-for running cuNumeric programs. The diagram below illustrates the
-complete workflow for building both Legate and cuNumeric.
-
-.. image:: /_images/developer-build.png
- :width: 600
- :alt: "notional diagram of cunumeric build process"
-
-.. _Legate: https://github.com/nv-legate/legate.core
-.. _Legate build instructions: https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md
-.. _Legate repository: https://github.com/nv-legate/legate.core
diff --git a/docs/cunumeric/source/index.rst b/docs/cunumeric/source/index.rst
deleted file mode 100644
index afd32f6530..0000000000
--- a/docs/cunumeric/source/index.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-:html_theme.sidebar_secondary.remove:
-
-NVIDIA cuNumeric
-================
-
-cuNumeric is a `Legate`_ library that aims to provide a distributed and
-accelerated drop-in replacement for the `NumPy API`_ on top of the `Legion`_
-runtime.
-
-Using cuNumeric you do things like run the final example of the
-`Python CFD course`_ completely unmodified on 2048 A100 GPUs in a
-`DGX SuperPOD`_ and achieve good weak scaling.
-
-.. toctree::
- :maxdepth: 1
- :caption: Contents:
-
- installation
- user/index
- examples/index
- api/index
- faqs
- developer/index
-
-.. toctree::
- :maxdepth: 1
-
- versions
-
-
-Indices and tables
-------------------
-
-* :ref:`genindex`
-* :ref:`search`
-
-.. _DGX SuperPOD: https://www.nvidia.com/en-us/data-center/dgx-superpod/
-.. _Legate: https://github.com/nv-legate/legate.core
-.. _Legion: https://legion.stanford.edu/
-.. _Numpy API: https://numpy.org/doc/stable/reference/
-.. _Python CFD course: https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb
\ No newline at end of file
diff --git a/docs/cunumeric/source/installation.rst b/docs/cunumeric/source/installation.rst
deleted file mode 100644
index 7e4a3d4720..0000000000
--- a/docs/cunumeric/source/installation.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-Installation
-============
-
-Default conda install
----------------------
-
-cuNumeric is available from
-`conda `_
-on the `legate channel `_.
-Please make sure you have at least conda version 24.1 installed, then create
-a new environment containing cuNumeric:
-
-.. code-block:: sh
-
- conda create -n myenv -c conda-forge -c legate cunumeric
-
-or install it into an existing environment:
-
-.. code-block:: sh
-
- conda install -c conda-forge -c legate cunumeric
-
-Packages with GPU support are available, and will be chosen automatically by
-``conda install`` on systems with GPUs.
-
-In an environment without GPUs available, ``conda install`` will by default
-choose a CPU-only package. To install a version with GPU support in such an
-environment, use environment variable ``CONDA_OVERRIDE_CUDA``:
-
-.. code-block:: sh
-
- CONDA_OVERRIDE_CUDA="12.2" \
- conda install -c conda-forge -c legate cunumeric
-
-Once installed, you can verify the installation by running one of the examples
-from the cuNumeric repository, for instance:
-
-.. code-block:: sh
-
- $ legate examples/black_scholes.py
- Running black scholes on 10K options...
- Elapsed Time: 129.017 ms
-
-Building from source
----------------------
-
-See :ref:`building cunumeric from source` for instructions on building
-cuNumeric manually.
-
-Licenses
---------
-
-This project will download and install additional third-party open source
-software projects at install time. Review the license terms of these open
-source projects before use.
-
-For license information regarding projects bundled directly, see
-:ref:`thirdparty`.
\ No newline at end of file
diff --git a/docs/cunumeric/source/user/howtos/jupyter.rst b/docs/cunumeric/source/user/howtos/jupyter.rst
deleted file mode 100644
index c0c3f8ffdf..0000000000
--- a/docs/cunumeric/source/user/howtos/jupyter.rst
+++ /dev/null
@@ -1,107 +0,0 @@
-Configuring Jupyter kernels
-===========================
-
-Legate supports single-node execution of programs using Jupyter Notebooks.
-Please use the instructions given below to set up IPython kernels that
-will be used in the notebooks.
-
-Setup
------
-
-IPython Kernel
-~~~~~~~~~~~~~~
-
-Inputs that are passed to the Legate launcher will now be passed to the
-notebook through IPython kernels. By default, ``LEGATE_SM_GPU`` kernel will
-be available and set to use one GPU.
-
-For each set of inputs to legate, a new kernel will have to be created using
-``legate-jupyter`` and then selected from the drop-down menu for
-"Select Kernel" from your notebook.
-
-Use the following to list all the installed kernels. By default,
-``LEGATE_SM_GPU`` should be available.
-
-.. code-block:: sh
-
- jupyter kernelspec list
-
-To create a new kernel that corresponds to a particular set of inputs to
-``legate``, say, to run on 2 CPUs with 10GB of memory and 10% of memory
-reserved for eager allocations, run the following:
-
-.. code-block:: sh
-
- legate-jupyter --name "legate_cpus_2" --cpus 2 --sysmem 10000 --eager-alloc-percentage 10
-
- jupyter kernelspec list
-
-This should create a new kernel named ``legate_cpus_2``. The installed kernel
-can then be selected from the notebook to run on two CPUs.
-
-You can also see input arguments that were passed to Legate by the kernel by
-using magic commands from a cell in the notebook (including the % character),
-like below:
-
-.. code-block:: text
-
- %load_ext legate.info
- %legate_info
-
-A sample output from a custom kernel is given below:
-
-.. code-block:: text
-
- Kernel 'legate_cpus_2' configured for 1 node(s)
-
- Cores:
- CPUs to use per rank : 2
- GPUs to use per rank : 0
- OpenMP groups to use per rank : 0
- Threads per OpenMP group : 4
- Utility processors per rank : 2
-
- Memory:
- DRAM memory per rank (in MBs) : 10000
- DRAM memory per NUMA domain per rank (in MBs) : 0
- Framebuffer memory per GPU (in MBs) : 4000
- Zero-copy memory per rank (in MBs) : 32
- Registered CPU-side pinned memory per rank (in MBs) : 0
-
-Running on a remote server
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you intend to run the notebook on a remote server or a laptop, you will
-have to create a tunnel from your localhost to the remote server. Substitute
-remote-server-hostname with the hostname of the remote server you plan to use,
-
-.. code-block:: sh
-
- ssh -4 -t -L 8888:localhost:8002 username@remote-server-hostname ssh -t -L 8002:localhost:8888 remote-server-hostname
-
-and then run on your local machine:
-
-.. code-block:: sh
-
- jupyter notebook --port=8888 --no-browser
-
-This should give a URL where the Jupyter server is running and will look like
-this:
-
-.. code-block:: text
-
- http://localhost:8888/tree?token=
-
-Where ```` will be different each time you launch jupyter. Launch
-the URL from your browser and choose the ``Legate_SM_GPU`` kernel. This ensures
-that the underlying computations can be run using the resources specified
-in the ``Legate_SM_GPU`` kernel.
-
-For more information on how this works with the runtime, we refer the readers
-to respective sections in Legion and Legate documentation.
-
-Running Jupyter Notebooks
--------------------------
-
-You are now set up to run the notebooks using Jupyter with your configured
-options. Check out the notebooks in the `examples` section.
diff --git a/docs/cunumeric/source/user/usage.rst b/docs/cunumeric/source/user/usage.rst
deleted file mode 100644
index 384e8d74ab..0000000000
--- a/docs/cunumeric/source/user/usage.rst
+++ /dev/null
@@ -1,148 +0,0 @@
-.. _usage:
-
-Usage
-=====
-
-Running cuNumeric programs
---------------------------
-
-Using cuNumeric as a replacement for NumPy is simple. Replace your NumPy import
-statement with cuNumeric:
-
-.. code-block:: python
-
- import numpy as np
-
-becomes
-
-.. code-block:: python
-
- import cunumeric as np
-
-Then, run the application like you usually do. For example, if you had a script
-``main.py`` written in NumPy that adds two vectors,
-
-.. code-block:: python
-
- import numpy as np
- x = np.array([1.0, 2.0, 3.0, 4.0])
- y = np.array([4.0, 3.0, 2.0, 1.0])
- z = x + y
- print(z)
-
-change the import statement to use cuNumeric like below,
-
-.. code-block:: python
-
- import cunumeric as np
- x = np.array([1.0, 2.0, 3.0, 4.0])
- y = np.array([4.0, 3.0, 2.0, 1.0])
- z = x + y
- print(z)
-
-And run the program, like this
-
-.. code-block:: sh
-
- python main.py
-
-By default, this command will use 4 CPUs to run the program, but is
-configurable through the LEGATE_CONFIG environment variable. For
-example, to use 2 GPUs instead, run the following
-
-.. code-block:: sh
-
- LEGATE_CONFIG="--gpus 2" python main.py
-
-For execution with multiple nodes (assuming Legate is installed
-with networking support) users can supply the `--nodes` option.
-
-
-For more information on how resources can be allocated using this
-environment variable, see `Using LEGATE_CONFIG`_.
-
-.. note::
-
- Usage of standard Python is intended as a quick on-ramp for users to try
- out cuNumeric more easily. Several legate command line configuration
- options, especially for multi-node execution, are not available when
- running programs with standard Python. See the output of ``legate --help``
- for more details.
-
-To fully utilize the power of cuNumeric and overcome these restrictions, we
-recommend requesting resource allocation using Legate.
-
-Resource allocation
--------------------
-
-Legate allows you to prescribe the resources required to successfully execute
-your application. Applications can be run on three different types of
-processors, also known as task variants: CPU, OMP, and GPU. The OMP variant
-will use OpenMP threads to parallelize your application while the CPU variant
-will use individual processes per processor. In addition to the number or
-processors, you can also specify the amount of memory required for your
-application on each of these processors.
-
-Check the relevant command line arguments to legate and their default values
-before using them. In summary, if you want to change the number of processors,
-make sure to check out the following arguments in the documentation for legate:
-``--cpus``, ``--omps``, ``--ompthreads``, and ``--gpus``. Similarly, if you
-need to change the amount of memory required for your application, check the
-following arguments: ``--sysmem``, ``--numamem``, and ``--fbmem``.
-
-Legate reserves a fraction of the requested memory, denoted by
-``--eager-alloc-percentage``, to be used eagerly, with the rest used for
-deferred allocations. Reducing this typically helps you run larger problems.
-
-If you encounter errors related to resource allocation, check out our
-:ref:`faqs` to debug them.
-
-Using legate launcher
-~~~~~~~~~~~~~~~~~~~~~
-
-To run the above program using four OpenMP threads using the Legate launcher,
-run the following command
-
-.. code-block:: sh
-
- legate --omps 1 --ompthreads 4 --sysmem 40000 --eager-alloc-percentage 10 ./main.py
-
-This will use one OpenMP group and two OpenMP threads to parallelize the
-application. We defer discussions on changing the OpenMP group to a later
-section.
-
-To run on 8 CPUs and use 40GB of system memory with 10% of that memory reserved
-for eager allocations, use the following command:
-
-.. code-block:: sh
-
- legate --cpus 8 --sysmem 40000 --eager-alloc-percentage 10 ./main.py
-
-To run on multiple GPUs and use 40GB of framebuffer memory per GPU with 10%
-of that memory reserved for eager allocations, use the following command:
-
-.. code-block:: sh
-
- legate --gpus 2 --fbmem 40000 --eager-alloc-percentage 10 ./main.py
-
-Using LEGATE_CONFIG
-~~~~~~~~~~~~~~~~~~~
-
-All of the above commands can also be passed through the environment variable
-``LEGATE_CONFIG`` as shown below:
-
-.. code-block:: sh
-
- LEGATE_CONFIG="--omps 1 --ompthreads 4 --sysmem 40000 --eager-alloc-percentage 10" legate main.py
-
-.. code-block:: sh
-
- LEGATE_CONFIG="--cpus 8 --sysmem 40000 --eager-alloc-percentage 10" legate main.py
-
-.. code-block:: sh
-
- LEGATE_CONFIG="--gpus 2 --fbmem 40000 --eager-alloc-percentage 10" legate main.py
-
-Using the environment variable might be useful for users using the same set of
-resources for their runs where they can just set the environment variable once
-and use ``legate main.py`` for all subsequent runs.
diff --git a/docs/cunumeric/source/versions.rst b/docs/cunumeric/source/versions.rst
deleted file mode 100644
index 1760786d8e..0000000000
--- a/docs/cunumeric/source/versions.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Versions
-========
-
-.. toctree::
- :caption: Versions:
-
- 22.05
- 22.08
- 22.10
- 23.01
- 23.03
- 23.07
- 23.09
- 23.11
diff --git a/docs/cunumeric/switcher.json b/docs/cunumeric/switcher.json
deleted file mode 100644
index e62a26d440..0000000000
--- a/docs/cunumeric/switcher.json
+++ /dev/null
@@ -1,7 +0,0 @@
-[
- {
- "name": "24.06",
- "version": "24.06",
- "url": "https://docs.nvidia.com/cunumeric/24.06/"
- }
-]
\ No newline at end of file
diff --git a/docs/cunumeric/Makefile b/docs/cupynumeric/Makefile
similarity index 100%
rename from docs/cunumeric/Makefile
rename to docs/cupynumeric/Makefile
diff --git a/docs/cunumeric/make.bat b/docs/cupynumeric/make.bat
similarity index 100%
rename from docs/cunumeric/make.bat
rename to docs/cupynumeric/make.bat
diff --git a/docs/cunumeric/source/_images/developer-build.png b/docs/cupynumeric/source/_images/developer-build.png
similarity index 100%
rename from docs/cunumeric/source/_images/developer-build.png
rename to docs/cupynumeric/source/_images/developer-build.png
diff --git a/docs/cunumeric/source/_implemented.rst b/docs/cupynumeric/source/_implemented.rst
similarity index 73%
rename from docs/cunumeric/source/_implemented.rst
rename to docs/cupynumeric/source/_implemented.rst
index f3a76189da..03181433ed 100644
--- a/docs/cunumeric/source/_implemented.rst
+++ b/docs/cupynumeric/source/_implemented.rst
@@ -1,4 +1,4 @@
-.. This page exists to collect references to all cunumeric functions and
+.. This page exists to collect references to all cupynumeric functions and
.. methods that are "implemented". Doing so, any implemented functions or
.. methods that are not present in the docs (but should be) will result in
.. docs build errors
diff --git a/docs/cunumeric/source/_static/.keep b/docs/cupynumeric/source/_static/.keep
similarity index 100%
rename from docs/cunumeric/source/_static/.keep
rename to docs/cupynumeric/source/_static/.keep
diff --git a/docs/cupynumeric/source/_templates/layout.html b/docs/cupynumeric/source/_templates/layout.html
new file mode 100644
index 0000000000..c84d8e5e56
--- /dev/null
+++ b/docs/cupynumeric/source/_templates/layout.html
@@ -0,0 +1,7 @@
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/docs/cunumeric/source/api/_bitgenerator.rst b/docs/cupynumeric/source/api/_bitgenerator.rst
similarity index 50%
rename from docs/cunumeric/source/api/_bitgenerator.rst
rename to docs/cupynumeric/source/api/_bitgenerator.rst
index 32854eff96..0ad24527d2 100644
--- a/docs/cunumeric/source/api/_bitgenerator.rst
+++ b/docs/cupynumeric/source/api/_bitgenerator.rst
@@ -1,7 +1,7 @@
-cunumeric.random.BitGenerator
-=============================
+cupynumeric.random.BitGenerator
+===============================
-.. currentmodule:: cunumeric.random
+.. currentmodule:: cupynumeric.random
.. autoclass:: BitGenerator
diff --git a/docs/cunumeric/source/api/_generator.rst b/docs/cupynumeric/source/api/_generator.rst
similarity index 51%
rename from docs/cunumeric/source/api/_generator.rst
rename to docs/cupynumeric/source/api/_generator.rst
index 539a3c0014..5bffd5501b 100644
--- a/docs/cunumeric/source/api/_generator.rst
+++ b/docs/cupynumeric/source/api/_generator.rst
@@ -1,7 +1,7 @@
-cunumeric.random.Generator
-==========================
+cupynumeric.random.Generator
+============================
-.. currentmodule:: cunumeric.random
+.. currentmodule:: cupynumeric.random
.. autoclass:: Generator
diff --git a/docs/cunumeric/source/api/_grouped.rst b/docs/cupynumeric/source/api/_grouped.rst
similarity index 100%
rename from docs/cunumeric/source/api/_grouped.rst
rename to docs/cupynumeric/source/api/_grouped.rst
diff --git a/docs/cunumeric/source/api/_ndarray.rst b/docs/cupynumeric/source/api/_ndarray.rst
similarity index 95%
rename from docs/cunumeric/source/api/_ndarray.rst
rename to docs/cupynumeric/source/api/_ndarray.rst
index 8e3f03de7d..5dfff107f7 100644
--- a/docs/cunumeric/source/api/_ndarray.rst
+++ b/docs/cupynumeric/source/api/_ndarray.rst
@@ -1,7 +1,7 @@
-cunumeric.ndarray
-=================
+cupynumeric.ndarray
+===================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
.. autoclass:: ndarray
diff --git a/docs/cunumeric/source/api/binary.rst b/docs/cupynumeric/source/api/binary.rst
similarity index 90%
rename from docs/cunumeric/source/api/binary.rst
rename to docs/cupynumeric/source/api/binary.rst
index 237fdc071c..38b0260ab8 100644
--- a/docs/cunumeric/source/api/binary.rst
+++ b/docs/cupynumeric/source/api/binary.rst
@@ -1,7 +1,7 @@
Binary operations
=================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Elementwise bit operations
--------------------------
diff --git a/docs/cupynumeric/source/api/broadcast.rst b/docs/cupynumeric/source/api/broadcast.rst
new file mode 100644
index 0000000000..df9197044c
--- /dev/null
+++ b/docs/cupynumeric/source/api/broadcast.rst
@@ -0,0 +1,7 @@
+.. currentmodule:: cupynumeric
+
+cupynumeric.broadcast
+=====================
+
+.. autoclass:: broadcast
+ :members:
\ No newline at end of file
diff --git a/docs/cunumeric/source/api/classes.rst b/docs/cupynumeric/source/api/classes.rst
similarity index 100%
rename from docs/cunumeric/source/api/classes.rst
rename to docs/cupynumeric/source/api/classes.rst
diff --git a/docs/cupynumeric/source/api/comparison.rst b/docs/cupynumeric/source/api/comparison.rst
new file mode 100644
index 0000000000..eda6dddecb
--- /dev/null
+++ b/docs/cupynumeric/source/api/comparison.rst
@@ -0,0 +1,12 @@
+Project comparisons
+===================
+
+Here is a list of NumPy APIs and corresponding cuPyNumeric implementations.
+
+A dot in the cupynumeric column denotes that a cuPyNumeric implementation
+is not provided yet. We welcome contributions for these functions.
+
+NumPy vs cuPyNumeric APIs
+-------------------------
+
+.. comparison-table::
diff --git a/docs/cunumeric/source/api/creation.rst b/docs/cupynumeric/source/api/creation.rst
similarity index 94%
rename from docs/cunumeric/source/api/creation.rst
rename to docs/cupynumeric/source/api/creation.rst
index 153db24475..e35f6ab4cb 100644
--- a/docs/cunumeric/source/api/creation.rst
+++ b/docs/cupynumeric/source/api/creation.rst
@@ -1,7 +1,7 @@
Array creation routines
=======================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
From shape or value
-------------------
diff --git a/docs/cunumeric/source/api/datatype.rst b/docs/cupynumeric/source/api/datatype.rst
similarity index 81%
rename from docs/cunumeric/source/api/datatype.rst
rename to docs/cupynumeric/source/api/datatype.rst
index 1e4d521e95..bb5667fd05 100644
--- a/docs/cunumeric/source/api/datatype.rst
+++ b/docs/cupynumeric/source/api/datatype.rst
@@ -1,7 +1,7 @@
Data type routines
==================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Data type testing
-----------------
diff --git a/docs/cunumeric/source/api/fft.rst b/docs/cupynumeric/source/api/fft.rst
similarity index 76%
rename from docs/cunumeric/source/api/fft.rst
rename to docs/cupynumeric/source/api/fft.rst
index 4dce08d136..6ffe039d9f 100644
--- a/docs/cunumeric/source/api/fft.rst
+++ b/docs/cupynumeric/source/api/fft.rst
@@ -1,7 +1,7 @@
-.. module:: cunumeric.fft
+.. module:: cupynumeric.fft
-Discrete Fourier Transform (:mod:`cunumeric.fft`)
-==================================================
+Discrete Fourier Transform (:mod:`cupynumeric.fft`)
+===================================================
Standard FFTs
---------------
diff --git a/docs/cunumeric/source/api/index.rst b/docs/cupynumeric/source/api/index.rst
similarity index 77%
rename from docs/cunumeric/source/api/index.rst
rename to docs/cupynumeric/source/api/index.rst
index ea740628ec..d57ccba21e 100644
--- a/docs/cunumeric/source/api/index.rst
+++ b/docs/cupynumeric/source/api/index.rst
@@ -1,7 +1,7 @@
API Reference
=============
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
.. toctree::
:maxdepth: 2
diff --git a/docs/cunumeric/source/api/indexing.rst b/docs/cupynumeric/source/api/indexing.rst
similarity index 95%
rename from docs/cunumeric/source/api/indexing.rst
rename to docs/cupynumeric/source/api/indexing.rst
index 2723a2d317..3468e893ee 100644
--- a/docs/cunumeric/source/api/indexing.rst
+++ b/docs/cupynumeric/source/api/indexing.rst
@@ -1,7 +1,7 @@
Indexing routines
=================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Generating index arrays
-----------------------
diff --git a/docs/cunumeric/source/api/io.rst b/docs/cupynumeric/source/api/io.rst
similarity index 82%
rename from docs/cunumeric/source/api/io.rst
rename to docs/cupynumeric/source/api/io.rst
index 0fd4ee4b3a..a5ba6f6709 100644
--- a/docs/cunumeric/source/api/io.rst
+++ b/docs/cupynumeric/source/api/io.rst
@@ -1,7 +1,7 @@
Input and output
================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
NumPy binary files (npy, npz)
-----------------------------
diff --git a/docs/cunumeric/source/api/linalg.rst b/docs/cupynumeric/source/api/linalg.rst
similarity index 68%
rename from docs/cunumeric/source/api/linalg.rst
rename to docs/cupynumeric/source/api/linalg.rst
index 5d94889803..c3beaf9c61 100644
--- a/docs/cunumeric/source/api/linalg.rst
+++ b/docs/cupynumeric/source/api/linalg.rst
@@ -1,9 +1,9 @@
-.. module:: cunumeric.linalg
+.. module:: cupynumeric.linalg
-Linear algebra (:mod:`cunumeric.linalg`)
-========================================
+Linear algebra (:mod:`cupynumeric.linalg`)
+==========================================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Matrix and vector products
--------------------------
@@ -29,6 +29,8 @@ Decompositions
:toctree: generated/
linalg.cholesky
+ linalg.eig
+ linalg.eigvals
linalg.qr
linalg.svd
@@ -49,3 +51,12 @@ Solving equations and inverting matrices
:toctree: generated/
linalg.solve
+
+
+Matrix Functions
+----------------
+
+.. autosummary::
+ :toctree: generated/
+
+ linalg.expm
diff --git a/docs/cunumeric/source/api/logic.rst b/docs/cupynumeric/source/api/logic.rst
similarity index 95%
rename from docs/cunumeric/source/api/logic.rst
rename to docs/cupynumeric/source/api/logic.rst
index abc016c653..1ab6c7873c 100644
--- a/docs/cunumeric/source/api/logic.rst
+++ b/docs/cupynumeric/source/api/logic.rst
@@ -1,7 +1,7 @@
Logic functions
===============
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Truth value testing
-------------------
diff --git a/docs/cunumeric/source/api/manipulation.rst b/docs/cupynumeric/source/api/manipulation.rst
similarity index 93%
rename from docs/cunumeric/source/api/manipulation.rst
rename to docs/cupynumeric/source/api/manipulation.rst
index 6f8bf6f33f..b1d3f54c32 100644
--- a/docs/cunumeric/source/api/manipulation.rst
+++ b/docs/cupynumeric/source/api/manipulation.rst
@@ -1,7 +1,7 @@
Array manipulation routines
===========================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Basic operations
----------------
@@ -32,7 +32,7 @@ Transpose-like operations
swapaxes
transpose
-See also :attr:`cunumeric.ndarray.T` property.
+See also :attr:`cupynumeric.ndarray.T` property.
Changing number of dimensions
-----------------------------
diff --git a/docs/cunumeric/source/api/math.rst b/docs/cupynumeric/source/api/math.rst
similarity index 98%
rename from docs/cunumeric/source/api/math.rst
rename to docs/cupynumeric/source/api/math.rst
index ef40212852..5764a93727 100644
--- a/docs/cunumeric/source/api/math.rst
+++ b/docs/cupynumeric/source/api/math.rst
@@ -1,7 +1,7 @@
Mathematical functions
======================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Trigonometric functions
-----------------------
diff --git a/docs/cunumeric/source/api/ndarray.rst b/docs/cupynumeric/source/api/ndarray.rst
similarity index 97%
rename from docs/cunumeric/source/api/ndarray.rst
rename to docs/cupynumeric/source/api/ndarray.rst
index 4efec7e0a1..50bfd1a2a3 100644
--- a/docs/cunumeric/source/api/ndarray.rst
+++ b/docs/cupynumeric/source/api/ndarray.rst
@@ -1,7 +1,7 @@
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
-The N-Dimensional array (:class:`cunumeric.ndarray`)
-====================================================
+The N-Dimensional array (:class:`cupynumeric.ndarray`)
+======================================================
Constructing arrays
-------------------
diff --git a/docs/cunumeric/source/api/random.rst b/docs/cupynumeric/source/api/random.rst
similarity index 89%
rename from docs/cunumeric/source/api/random.rst
rename to docs/cupynumeric/source/api/random.rst
index 0cf5a61a99..79a0f2adbd 100644
--- a/docs/cunumeric/source/api/random.rst
+++ b/docs/cupynumeric/source/api/random.rst
@@ -1,7 +1,7 @@
-.. module:: cunumeric.random
+.. module:: cupynumeric.random
-Random sampling (:mod:`cunumeric.random`)
-=========================================
+Random sampling (:mod:`cupynumeric.random`)
+===========================================
Random Generator
-----------------
diff --git a/docs/cunumeric/source/api/routines.rst b/docs/cupynumeric/source/api/routines.rst
similarity index 100%
rename from docs/cunumeric/source/api/routines.rst
rename to docs/cupynumeric/source/api/routines.rst
diff --git a/docs/cunumeric/source/api/set.rst b/docs/cupynumeric/source/api/set.rst
similarity index 79%
rename from docs/cunumeric/source/api/set.rst
rename to docs/cupynumeric/source/api/set.rst
index c4299e870d..e797379d13 100644
--- a/docs/cunumeric/source/api/set.rst
+++ b/docs/cupynumeric/source/api/set.rst
@@ -1,7 +1,7 @@
Set routines
============
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Making proper sets
------------------
diff --git a/docs/cupynumeric/source/api/settings.rst b/docs/cupynumeric/source/api/settings.rst
new file mode 100644
index 0000000000..6a424f0fbc
--- /dev/null
+++ b/docs/cupynumeric/source/api/settings.rst
@@ -0,0 +1,8 @@
+Settings
+========
+
+cuPyNumeric has a number of runtime settings that can be configured through
+environment variables.
+
+.. settings:: settings
+ :module: cupynumeric.settings
\ No newline at end of file
diff --git a/docs/cunumeric/source/api/sorting.rst b/docs/cupynumeric/source/api/sorting.rst
similarity index 93%
rename from docs/cunumeric/source/api/sorting.rst
rename to docs/cupynumeric/source/api/sorting.rst
index 86d8e65dc0..ab5570cfde 100644
--- a/docs/cunumeric/source/api/sorting.rst
+++ b/docs/cupynumeric/source/api/sorting.rst
@@ -1,7 +1,7 @@
Sorting, searching, and counting
================================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Sorting
-------
diff --git a/docs/cunumeric/source/api/statistics.rst b/docs/cupynumeric/source/api/statistics.rst
similarity index 94%
rename from docs/cunumeric/source/api/statistics.rst
rename to docs/cupynumeric/source/api/statistics.rst
index 5fb0cdc95f..9430ea3240 100644
--- a/docs/cunumeric/source/api/statistics.rst
+++ b/docs/cupynumeric/source/api/statistics.rst
@@ -1,7 +1,7 @@
Statistics
==========
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Order statistics
----------------
diff --git a/docs/cunumeric/source/api/window.rst b/docs/cupynumeric/source/api/window.rst
similarity index 85%
rename from docs/cunumeric/source/api/window.rst
rename to docs/cupynumeric/source/api/window.rst
index 28058d21fd..e50dc58984 100644
--- a/docs/cunumeric/source/api/window.rst
+++ b/docs/cupynumeric/source/api/window.rst
@@ -1,7 +1,7 @@
Window functions
======================
-.. currentmodule:: cunumeric
+.. currentmodule:: cupynumeric
Various windows
-----------------------
diff --git a/docs/cunumeric/source/conf.py b/docs/cupynumeric/source/conf.py
similarity index 54%
rename from docs/cunumeric/source/conf.py
rename to docs/cupynumeric/source/conf.py
index 592defb55c..832a83ebac 100644
--- a/docs/cunumeric/source/conf.py
+++ b/docs/cupynumeric/source/conf.py
@@ -15,20 +15,37 @@
from os import getenv
-from cunumeric import __version__
+import cupynumeric
-SWITCHER_PROD = "https://docs.nvidia.com/cunumeric/switcher.json"
+SWITCHER_PROD = "https://docs.nvidia.com/cupynumeric/switcher.json"
SWITCHER_DEV = "http://localhost:8000/switcher.json"
JSON_URL = SWITCHER_DEV if getenv("SWITCHER_DEV") == "1" else SWITCHER_PROD
+ANNOTATE = getenv("LEGATE_ANNOTATION_DOCS") == "1"
+
+# This is the "YY.MM" version string that we want users to see
+BASE_VERSION = ".".join(cupynumeric.__version__.split(".", 2)[:2])
+
+# make sure BASE VERSION is formatted as expected
+_yy, _mm = BASE_VERSION.split(".")
+assert _yy.isdigit()
+assert _mm.isdigit()
+
# -- Project information -----------------------------------------------------
-project = "NVIDIA cuNumeric"
-if "dev" in __version__:
- project += f" ({__version__})"
+project = "NVIDIA cuPyNumeric"
copyright = "2024, NVIDIA"
author = "NVIDIA Corporation"
+if "dev" in cupynumeric.__version__ or "rc" in cupynumeric.__version__:
+ # for dev/rc versions just use the entire version with everything, and
+ # add it to the page title as well, for easy recognition
+ version = release = cupynumeric.__version__
+ project += f" ({cupynumeric.__version__})"
+else:
+ # otherwise, we actually only want the YY.MM to be visible for releases
+ version = release = BASE_VERSION
+
# -- General configuration ---------------------------------------------------
extensions = [
@@ -42,10 +59,10 @@
"myst_parser",
"nbsphinx",
"legate._sphinxext.settings",
- "cunumeric._sphinxext.comparison_table",
- "cunumeric._sphinxext.implemented_index",
- "cunumeric._sphinxext.missing_refs",
- "cunumeric._sphinxext.ufunc_formatter",
+ "cupynumeric._sphinxext.comparison_table",
+ "cupynumeric._sphinxext.implemented_index",
+ "cupynumeric._sphinxext.missing_refs",
+ "cupynumeric._sphinxext.ufunc_formatter",
]
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
@@ -55,43 +72,25 @@
html_context = {
# "default_mode": "light",
"AUTHOR": author,
- "DESCRIPTION": "cuNumeric documentation site.",
+ "DESCRIPTION": "cuPyNumeric documentation site.",
}
html_static_path = ["_static"]
-# This is pretty kludgy but the nv theme is not publicly available to
-# install on CI, etc. We will use the pydata theme in those situations
-if getenv("NV_THEME") == "1":
- html_theme = "nvidia_sphinx_theme"
-
- html_theme_options = {
- "switcher": {
- "json_url": JSON_URL,
- "navbar_start": ["navbar-logo", "version-switcher"],
- "version_match": ".".join(__version__.split(".", 2)[:2]),
- }
- }
-
-else:
- html_theme = "pydata_sphinx_theme"
-
- html_theme_options = {
- "footer_start": ["copyright"],
- "github_url": "https://github.com/nv-legate/cunumeric",
- # https://github.com/pydata/pydata-sphinx-theme/issues/1220
- "icon_links": [],
- "logo": {
- "text": project,
- "link": "https://nv-legate.github.io/cunumeric",
- },
- "navbar_align": "left",
- "navbar_end": ["navbar-icon-links", "theme-switcher"],
- "primary_sidebar_end": ["indices.html"],
- "secondary_sidebar_items": ["page-toc"],
- "show_nav_level": 2,
- "show_toc_level": 2,
- }
+html_theme = "nvidia_sphinx_theme"
+
+html_theme_options = {
+ "switcher": {
+ "json_url": JSON_URL,
+ "navbar_start": ["navbar-logo", "version-switcher"],
+ "version_match": BASE_VERSION,
+ },
+ "extra_footer": [
+ "This project, i.e., cuPyNumeric, is separate and independent of the CuPy project. CuPy is a registered trademark of Preferred Networks.", # NOQA
+ '', # NOQA
+ ],
+ "show_version_warning_banner": True,
+}
templates_path = ["_templates"]
@@ -116,4 +115,6 @@
def setup(app):
+ if ANNOTATE:
+ app.add_js_file("https://hypothes.is/embed.js", kind="hypothesis")
app.add_css_file("params.css")
diff --git a/docs/cupynumeric/source/developer/CONTRIBUTING.md b/docs/cupynumeric/source/developer/CONTRIBUTING.md
new file mode 100644
index 0000000000..8dacfa72c3
--- /dev/null
+++ b/docs/cupynumeric/source/developer/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing to cuPyNumeric
+
+cuPyNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community.
+
+## How to begin
+
+Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cupynumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea.
+
+Once you are ready to start development, we ask you to work on a [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) of our repository. The next step is to create a [pull request](https://help.github.com/en/articles/about-pull-requests). Feel free to open the pull request as soon as you begin your development (just mark it [as a draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/)) or when you are ready to have your contribution merged.
+
+## The Legalese: Developer Certificate of Origin
+
+cuPyNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute. To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/)):
+
+````
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+````
+
+### How Do I Sign the DCO?
+
+Fortunately, it does not take much work to sign the DCO. The only thing that you have to do is to mark all your commits with a `Signed-off-by` line that looks like that:
+
+````
+Signed-off-by: Your Name <your.email@example.com>
+````
+
+Please use your real name and a valid email address at which you can be reached. For legal reasons, we will not be able to accept contributions that use pseudonyms in the signature. You can simply add this line at the end of all your commits manually, or you can use the `-s` or the `--signoff` options provided by Git to automatically tack on the signature.
+
+## Review Process
+
+We are really grateful that you are thinking of contributing to cuPyNumeric. We will make every effort to review your contributions as soon as possible.
+
+As we suggested at the beginning of this document, it will be really helpful to start with an issue unless your proposed change is really trivial. An issue will help to save work in the review process (e.g., maybe somebody is already working on exactly the same thing you want to work on). After you open your pull request (PR), there usually will be a community feedback that often will require further changes to your contribution (the usual open-source process). Usually, this will conclude in the PR being merged by a maintainer, but on rare occasions a PR may be rejected. This may happen, for example, if the PR appears abandoned (no response to the community feedback) or if the PR does not seem to be approaching community acceptance in a reasonable time frame. In any case, an explanation will always be given why a PR is closed. Even if a PR is closed for some reason, it may always be reopened if the situation evolves (feel free to comment on closed PRs to discuss reopening them).
+
+## Code Formatting Requirements
+
+cuPyNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. Once pre-commit is installed in the cuPyNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation.
+
+We hope that the automation of our formatting checks will make it easy to comply with our coding standards. If you encounter problems with code formatting, however, please let us know in a comment on your PR, and we will do our best to help.
diff --git a/docs/cupynumeric/source/developer/building.rst b/docs/cupynumeric/source/developer/building.rst
new file mode 100644
index 0000000000..25e61cc940
--- /dev/null
+++ b/docs/cupynumeric/source/developer/building.rst
@@ -0,0 +1,108 @@
+.. _building cupynumeric from source:
+
+Building from source
+====================
+
+Basic build
+-----------
+
+Users must have a working installation of the `Legate`_ library prior to
+installing cuPyNumeric.
+**Installing cuPyNumeric by itself will not automatically install Legate.**
+
+See below for a list of cuPyNumeric's dependencies. The easiest way to set up a
+build environment that includes all of cuPyNumeric dependencies is to use the
+``scripts/generate-conda-envs.py`` script from the `Legate build instructions`_,
+passing the ``--cupynumeric`` flag.
+
+Once all dependencies are installed, you can simply invoke ``./install.py`` from
+the cuPyNumeric top-level directory. The build will automatically pick up the
+configuration used when building Legate (e.g. the CUDA Toolkit directory).
+
+Dependencies
+------------
+
+OpenBLAS
+~~~~~~~~
+
+Used for implementing linear algebra routines on CPUs.
+
+If you want to use a custom build of OpenBLAS, you will need to get a
+Fortran compiler, e.g. by pulling ``fortran-compiler`` from conda-forge.
+
+If using a build of Legate that includes OpenMP support, then you need a build
+of OpenBLAS configured with the following options:
+
+* ``USE_THREAD=1``
+* ``USE_OPENMP=1``
+* ``NUM_PARALLEL=32`` (or at least as many as the NUMA domains on the target
+ machine) -- The ``NUM_PARALLEL`` flag defines how many instances of OpenBLAS's
+ calculation API can run in parallel. Legate will typically instantiate a
+ separate OpenMP group per NUMA domain, and each group can launch independent
+ BLAS work. If ``NUM_PARALLEL`` is not high enough, some of this parallel work
+ will be serialized.
+
+TBLIS
+~~~~~
+
+Used for implementing tensor contraction routines on CPUs.
+
+This library will be automatically downloaded and built during cuPyNumeric
+installation.
+
+cuPyNumeric requires a build of TBLIS configured as follows:
+
+.. code-block:: none
+
+ --with-label-type=int32_t --with-length-type=int64_t --with-stride-type=int64_t
+
+and additionally ``--enable-thread-model=openmp`` if using a build of Legate
+that includes OpenMP support.
+
+Advanced topics
+---------------
+
+Building through pip & cmake
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+cuPyNumeric uses a cmake/scikit-build-based build workflow. There are several
+examples in the ``scripts`` directory, showing how to build different
+configurations of cuPyNumeric. We walk through the steps in
+``build-with-legate-separately-no-install.sh`` here. We assume a pre-existing
+Legate build.
+
+First, the CMake build needs to be configured:
+
+.. code:: sh
+
+ $ cmake -S . -B build -GNinja -D legate_ROOT:STRING=path/to/legate/build
+
+We point cuPyNumeric to the Legate *build* tree, not an installation.
+This generates all build-dependent headers and Python files.
+
+Once configured, we can build the C++ libraries:
+
+.. code:: sh
+
+ $ cmake --build build
+
+This will invoke Ninja (or make) to execute the build.
+Once the C++ libraries are available, we can do an editable (development) pip installation.
+
+.. code:: sh
+
+ $ SKBUILD_BUILD_OPTIONS="-D FIND_CUPYNUMERIC_CPP=ON -D cupynumeric_ROOT=$(pwd)/build" \
+ python3 -m pip install \
+ --root / --no-deps --no-build-isolation \
+ --editable .
+
+The Python source tree and CMake build tree are now available with the environment Python
+for running cuPyNumeric programs. The diagram below illustrates the
+complete workflow for building both Legate and cuPyNumeric.
+
+.. image:: /_images/developer-build.png
+ :width: 600
+ :alt: "notional diagram of cupynumeric build process"
+
+.. _Legate: https://github.com/nv-legate/legate
+.. _Legate build instructions: https://docs.nvidia.com/legate/latest/BUILD.html#dependencies
diff --git a/docs/cunumeric/source/developer/index.rst b/docs/cupynumeric/source/developer/index.rst
similarity index 100%
rename from docs/cunumeric/source/developer/index.rst
rename to docs/cupynumeric/source/developer/index.rst
diff --git a/docs/cunumeric/source/developer/testing.rst b/docs/cupynumeric/source/developer/testing.rst
similarity index 97%
rename from docs/cunumeric/source/developer/testing.rst
rename to docs/cupynumeric/source/developer/testing.rst
index f5485b7874..55aa39e366 100644
--- a/docs/cunumeric/source/developer/testing.rst
+++ b/docs/cupynumeric/source/developer/testing.rst
@@ -4,7 +4,7 @@ Running tests
Basic usage
-----------
-The simplest way to run the cuNumeric test suite is to use the ``test.py``
+The simplest way to run the cuPyNumeric test suite is to use the ``test.py``
test driver script.
.. code-block:: sh
diff --git a/docs/cunumeric/source/examples/black_scholes.ipynb b/docs/cupynumeric/source/examples/black_scholes.ipynb
similarity index 99%
rename from docs/cunumeric/source/examples/black_scholes.ipynb
rename to docs/cupynumeric/source/examples/black_scholes.ipynb
index a091201f63..e5868463a1 100644
--- a/docs/cunumeric/source/examples/black_scholes.ipynb
+++ b/docs/cupynumeric/source/examples/black_scholes.ipynb
@@ -41,7 +41,7 @@
"id": "5b787e94-e440-4e1c-bd66-29faf9b59041",
"metadata": {},
"source": [
- "To get started, `import cunumeric as np` (just the same way we would import `numpy`)"
+ "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)"
]
},
{
@@ -51,7 +51,7 @@
"metadata": {},
"outputs": [],
"source": [
- "import cunumeric as np # instead of numpy"
+ "import cupynumeric as np # instead of numpy"
]
},
{
@@ -162,7 +162,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "/home/bryan/work/legate.core/legate/core/context.py:280: RuntimeWarning: cuNumeric has not implemented numpy.result_type and is falling back to canonical numpy. You may notice significantly decreased performance for this function call.\n",
+ "/home/bryan/work/legate.core/legate/core/context.py:280: RuntimeWarning: cuPyNumeric has not implemented numpy.result_type and is falling back to canonical numpy. You may notice significantly decreased performance for this function call.\n",
" result = func(*args, **kwargs)\n",
"Elapsed Time: 45.659 ms\n"
]
diff --git a/docs/cunumeric/source/examples/cholesky.ipynb b/docs/cupynumeric/source/examples/cholesky.ipynb
similarity index 87%
rename from docs/cunumeric/source/examples/cholesky.ipynb
rename to docs/cupynumeric/source/examples/cholesky.ipynb
index 0e82d20ee2..ee39c6ec07 100644
--- a/docs/cunumeric/source/examples/cholesky.ipynb
+++ b/docs/cupynumeric/source/examples/cholesky.ipynb
@@ -9,7 +9,7 @@
"\n",
"A [Cholesky decomposition](https://en.wikipedia.org/wiki/Cholesky_decomposition) is a useful factorization of Hermitian, positive-definite matrices into the product of a lower triangular matrix $L$ with its conjugate transpose $L^{*}$.\n",
"\n",
- "Numpy has a function [numpy.linalg.cholesky](https://numpy.org/doc/stable/reference/generated/numpy.linalg.cholesky.html) built-in for computing Cholesky decompositions. Cunumeric also implements this function, and it can be used as an immediate drop-in replacement.\n",
+ "Numpy has a function [numpy.linalg.cholesky](https://numpy.org/doc/stable/reference/generated/numpy.linalg.cholesky.html) built-in for computing Cholesky decompositions. cuPyNumeric also implements this function, and it can be used as an immediate drop-in replacement.\n",
"\n",
"\n",
"License
\n",
@@ -37,7 +37,7 @@
"id": "389cd191-ccda-4597-8e08-8d01ac226bee",
"metadata": {},
"source": [
- "To get started, `import cunumeric as np` (just the same way we would import `numpy`)\n"
+ "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)\n"
]
},
{
@@ -49,7 +49,7 @@
},
"outputs": [],
"source": [
- "import cunumeric as np # instead of numpy"
+ "import cupynumeric as np # instead of numpy"
]
},
{
@@ -57,7 +57,7 @@
"id": "9ef2bc57-e703-40ce-8aaa-d45408259c7a",
"metadata": {},
"source": [
- "At this point we can call `np.linalg.cholesky`, exactly how we would with Numpy, but will get the result computed by Cunumeric's `cholesky` function. Let's quickly try it out with a simple identitity matrix:"
+ "At this point we can call `np.linalg.cholesky`, exactly how we would with Numpy, but will get the result computed by cuPyNumeric's `cholesky` function. Let's quickly try it out with a simple identity matrix:"
]
},
{
@@ -96,7 +96,7 @@
"tags": []
},
"source": [
- "We'd like to get some information about how well Cunumeric's `cholesky` function performs. In order to obtain accurate timings, we need to use the `time` function from `legate.timing`. Let's define a helper function `cholesky_timed` that calls the `time` function for us, and prints out the results as well:"
+ "We'd like to get some information about how well cuPyNumeric's `cholesky` function performs. In order to obtain accurate timings, we need to use the `time` function from `legate.timing`. Let's define a helper function `cholesky_timed` that calls the `time` function for us, and prints out the results as well:"
]
},
{
diff --git a/docs/cunumeric/source/examples/compact_finite_difference.ipynb b/docs/cupynumeric/source/examples/compact_finite_difference.ipynb
similarity index 100%
rename from docs/cunumeric/source/examples/compact_finite_difference.ipynb
rename to docs/cupynumeric/source/examples/compact_finite_difference.ipynb
diff --git a/docs/cunumeric/source/examples/edge_detection.ipynb b/docs/cupynumeric/source/examples/edge_detection.ipynb
similarity index 99%
rename from docs/cunumeric/source/examples/edge_detection.ipynb
rename to docs/cupynumeric/source/examples/edge_detection.ipynb
index 6836020ee9..c83093f02c 100644
--- a/docs/cunumeric/source/examples/edge_detection.ipynb
+++ b/docs/cupynumeric/source/examples/edge_detection.ipynb
@@ -16,7 +16,7 @@
"## Learning Outcomes\n",
"This example identifies edges in an image using Sobol edge detection algorithm and is implemented using NumPy and SciPy. An edge is defined as an abrupt change in intensity of the image. The Sobol edge detection algorithm uses a kernel in each direction to compute derivative of intensity of the image. The gradient of the intensity will help us determine the locations where changes in intensity are abrupt, which can then be used to detect edges in an image.\n",
"\n",
- "This example uses the following packages in addition to NumPy/cuNumeric: Scipy, Matplotlib, PIL"
+ "This example uses the following packages in addition to NumPy/cuPyNumeric: Scipy, Matplotlib, PIL"
]
},
{
@@ -68,7 +68,7 @@
"id": "78273013-cea0-4c28-a376-c3c40e681276",
"metadata": {},
"source": [
- "Since NumPy's `convolve` API does not allow two-dimensional arrays and our image is represented in an two-dimensional array, we will use the `convolve` API from SciPy for this example. cuNumeric's implementation of `convolve` permits two-dimensional array and will be used if `cuNumeric` is imported instead of `NumPy`. Try changing the import statement from \"import numpy as np\" to \"import cunumeric as np\"!"
+ "Since NumPy's `convolve` API does not allow two-dimensional arrays and our image is represented in a two-dimensional array, we will use the `convolve` API from SciPy for this example. cuPyNumeric's implementation of `convolve` permits two-dimensional arrays and will be used if `cuPyNumeric` is imported instead of `NumPy`. Try changing the import statement from \"import numpy as np\" to \"import cupynumeric as np\"!"
]
},
{
@@ -85,7 +85,7 @@
" kernel: ndarray\n",
" Kernel to compute the gradient in x or y as per Sobel Edge Detector\n",
" mode: str\n",
- " The default convolution mode. Note that cuNumeric only\n",
+ " The default convolution mode. Note that cuPyNumeric only\n",
" supports the convolution mode \"same\".\n",
"\n",
" Notes:\n",
@@ -95,7 +95,7 @@
" The image was taken from:\n",
" https://docs.nvidia.com/vpi/algo_canny_edge_detector.html\n",
" \"\"\"\n",
- " if np.__name__ == \"cunumeric\":\n",
+ " if np.__name__ == \"cupynumeric\":\n",
" return np.convolve(array, kernel, mode)\n",
" return convolve(array, kernel, mode)"
]
diff --git a/docs/cunumeric/source/examples/image.png b/docs/cupynumeric/source/examples/image.png
similarity index 100%
rename from docs/cunumeric/source/examples/image.png
rename to docs/cupynumeric/source/examples/image.png
diff --git a/docs/cunumeric/source/examples/index.rst b/docs/cupynumeric/source/examples/index.rst
similarity index 93%
rename from docs/cunumeric/source/examples/index.rst
rename to docs/cupynumeric/source/examples/index.rst
index 6c2f0cfba9..bd1adf4e2a 100644
--- a/docs/cunumeric/source/examples/index.rst
+++ b/docs/cupynumeric/source/examples/index.rst
@@ -11,3 +11,4 @@ Examples
edge_detection
newton_raphson_2d
compact_finite_difference
+ torchswe
diff --git a/docs/cunumeric/source/examples/kmeans.ipynb b/docs/cupynumeric/source/examples/kmeans.ipynb
similarity index 99%
rename from docs/cunumeric/source/examples/kmeans.ipynb
rename to docs/cupynumeric/source/examples/kmeans.ipynb
index 5118b4ef18..29e78003c0 100644
--- a/docs/cunumeric/source/examples/kmeans.ipynb
+++ b/docs/cupynumeric/source/examples/kmeans.ipynb
@@ -14,7 +14,7 @@
"metadata": {},
"source": [
"## Learning Outcomes\n",
- "This example teaches how to implement k-means clustering algorithm using NumPy and is based on the k-means example in cuNumeric. \n",
+ "This example teaches how to implement k-means clustering algorithm using NumPy and is based on the k-means example in cuPyNumeric. \n",
"\n",
"In this example, you will learn:\n",
"* how to compute pairwise distances using `newaxis`\n",
diff --git a/docs/cunumeric/source/examples/newton_raphson_2d.ipynb b/docs/cupynumeric/source/examples/newton_raphson_2d.ipynb
similarity index 91%
rename from docs/cunumeric/source/examples/newton_raphson_2d.ipynb
rename to docs/cupynumeric/source/examples/newton_raphson_2d.ipynb
index 3ab628a284..43a7edf747 100644
--- a/docs/cunumeric/source/examples/newton_raphson_2d.ipynb
+++ b/docs/cupynumeric/source/examples/newton_raphson_2d.ipynb
@@ -16,7 +16,7 @@
"## Learning Outcomes\n",
"This example teaches how to compute the solution for systems of equations in two variables using NumPy. There are two equations, $f_{1}(x,y)$ and $f_{2}(x, y)$, with two variables each, $x$ and $y$. We seek to find a solution that satisfies these two equations using Newton's method. To understand Newton's method in multiple dimensions, please see [this](https://wiki.math.ntnu.no/_media/tma4125/2017v/newton.pdf) note by Markus Grasmair.\n",
"\n",
- "The example also teaches how to interpret a warning from cuNumeric when the import statement is changed from importing numpy to importing cuNumeric.\n",
+ "The example also teaches how to interpret a warning from cuPyNumeric when the import statement is changed from importing numpy to importing cuPyNumeric.\n",
"\n",
"---"
]
@@ -106,15 +106,15 @@
"id": "a91752f1-5ca8-44dd-9a26-525cdf87ab51",
"metadata": {},
"source": [
- "When you switch the import statement from importing to importing cunumeric, you might see a warning like this:\n",
+ "When you switch the import statement from importing numpy to importing cupynumeric, you might see a warning like this:\n",
"\n",
"---\n",
"\n",
- "*RuntimeWarning: cuNumeric has not implemented inv and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.*\n",
+ "*RuntimeWarning: cuPyNumeric has not implemented inv and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.*\n",
"\n",
"---\n",
"\n",
- "This means that cuNumeric has not implemented the `linalg.inv` API and is falling back to NumPy's implementation. This means that the API would be *eagerly* executed using NumPy's single-threaded implementation. If the API was intended to be invoked from a GPU, the data will get transferred from the GPU to the CPU before the API is executed. This can have performance implications, as indicated by the warning."
+ "This means that cuPyNumeric has not implemented the `linalg.inv` API and is falling back to NumPy's implementation. This means that the API would be *eagerly* executed using NumPy's single-threaded implementation. If the API was intended to be invoked from a GPU, the data will get transferred from the GPU to the CPU before the API is executed. This can have performance implications, as indicated by the warning."
]
},
{
diff --git a/docs/cunumeric/source/examples/stencil.ipynb b/docs/cupynumeric/source/examples/stencil.ipynb
similarity index 99%
rename from docs/cunumeric/source/examples/stencil.ipynb
rename to docs/cupynumeric/source/examples/stencil.ipynb
index 95b91744c6..72a635efae 100644
--- a/docs/cunumeric/source/examples/stencil.ipynb
+++ b/docs/cupynumeric/source/examples/stencil.ipynb
@@ -33,7 +33,7 @@
"id": "35c48e6f-1bde-4aac-af55-b7218cc22491",
"metadata": {},
"source": [
- "To get started, `import cunumeric as np` (just the same way we would import `numpy`)\n"
+ "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)\n"
]
},
{
@@ -45,7 +45,7 @@
},
"outputs": [],
"source": [
- "import cunumeric as np # instead of numpy"
+ "import cupynumeric as np # instead of numpy"
]
},
{
diff --git a/docs/cupynumeric/source/examples/torchswe.ipynb b/docs/cupynumeric/source/examples/torchswe.ipynb
new file mode 100644
index 0000000000..c4b6173b9e
--- /dev/null
+++ b/docs/cupynumeric/source/examples/torchswe.ipynb
@@ -0,0 +1,219 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5be6c57b-7cae-4fc1-b78f-899becabc6ee",
+ "metadata": {},
+ "source": [
+ "# TorchSWE case study\n",
+ "\n",
+ "\n",
+ "[TorchSWE](https://github.com/piyueh/TorchSWE) is a shallow-water solver created by Dr. Pi-Yueh Chuang and Prof. Lorena Barba that solves the vertically averaged Navier-Stokes equations using MPI and CuPy. It can simulate free-surface water flow in rivers, channels, and coastal areas, as well as model flood inundation. Given a topography, TorchSWE can predict flood-prone areas and the height of water inundation, making it a valuable tool for risk mapping.\n",
+ "\n",
+ "High-resolution numerical simulations—such as those on real topographies requiring hundreds of millions of data points—demand distributed computation across multiple GPUs. Although scalability is achievable with MPI4Py and CuPy, this approach requires manually partitioning the problem and managing inter-GPU data communication, which are complex and error-prone tasks.\n",
+ "\n",
+ "cuPyNumeric enables a distributed implementation of TorchSWE using only NumPy operations, without the complexities of MPI+CuPy. After porting TorchSWE to cuPyNumeric by removing all domain decomposition logic, it scaled effortlessly across multiple GPUs and nodes without further code modifications. This scalability enabled high-fidelity simulations exceeding 1.2 billion data points using 32 GPUs, allowing researchers to tackle critical scientific problems in flood inundation modeling without needing specialized distributed computing expertise. Overall, the cuPyNumeric implementation reduced the lines of code by over 20%, and simplified development and maintenance by eliminating complex logic for managing distribution and communication.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0402fb01-748b-48d9-9caa-80e7510ade80",
+ "metadata": {},
+ "source": [
+ "\n",
+ "Deep dive into the TorchSWE code implementation
\n",
+ "\n",
+ " Original code details
\n",
+ "\n",
+ "TorchSWE uses stencil operations to model shallow-water equations on a 2D grid, where each point is updated based on neighboring values, simulating water flow dynamics. The stencil computations are structured to update each grid cell iteratively, based on data from surrounding cells, mimicking fluid behavior over time. Below is an example that mimics the basic structure of the stencil logic from the TorchSWE repository:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "640f0b62-f70f-4d8a-86c5-7b4739e60a33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ " \n",
+ "# Example dimensions for the grid\n",
+ "nx, ny = 128, 128\n",
+ "grid = np.ones((nx, ny)) # Initialize the grid with \"1\"\n",
+ "\n",
+ "# Stencil operation \n",
+ "for i in range(1, nx - 1):\n",
+ " for j in range(1, ny - 1):\n",
+ " grid[i, j] = (grid[i + 1, j] + grid[i - 1, j] + grid[i, j + 1] + grid[i, j - 1]) / 4\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0281b3f4-5a48-40cc-9ec8-0fc9d7fd760c",
+ "metadata": {},
+ "source": [
+ "This code iteratively updates cell `h[i, j]` using adjacent cells, representing a basic averaging stencil operation that can be extended to various boundary conditions and flow dynamics in the shallow-water model. For full context, refer to [TorchSWE on GitHub](https://github.com/piyueh/TorchSWE).\n",
+ "\n",
+ "Parallelizing stencil operations for multi-GPU systems is challenging. When arrays are partitioned across multiple GPUs, any update to a cell requires the updated values to be shared between GPUs to maintain consistency across boundaries. This communication overhead and synchronization make parallelizing stencil code complex and difficult to implement efficiently on multi-GPU architectures.\n",
+ "\n",
+ "Below, we outline TorchSWE’s MPI4Py logic in more detail to highlight the complexity involved in this implementation.\n",
+ "Here’s an example code snippet that mirrors the TorchSWE MPI logic, implementing a simple MPI stencil operation from above:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0d7db631-3ae9-41ca-a0f1-07390349fbd0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mpi4py import MPI\n",
+ "import cupy as cp\n",
+ "\n",
+ "num_timesteps=10\n",
+ "\n",
+ "def set_device(comm: MPI.Comm):\n",
+ " # Device selection for each rank on multi-GPU nodes (TorchSWE-specific)\n",
+ " n_gpus = cp.cuda.runtime.getDeviceCount()\n",
+ " local_rank = comm.Get_rank() % n_gpus\n",
+ " cp.cuda.runtime.setDevice(local_rank)\n",
+ "\n",
+ "comm = MPI.COMM_WORLD\n",
+ "rank = comm.Get_rank()\n",
+ "size = comm.Get_size()\n",
+ "\n",
+ "# Determine grid size and decompose domain\n",
+ "gnx, gny = 126,126 # global grid dimensions\n",
+ "local_nx, local_ny = gnx // size, gny # local grid dimensions per rank\n",
+ "local_grid = cp.ones((local_nx + 2, local_ny + 2)) # with halo boundaries\n",
+ "\n",
+ "# Set up MPI data types and boundaries\n",
+ "send_type, recv_type = MPI.DOUBLE.Create_subarray((local_nx + 2, local_ny + 2), (local_nx, local_ny), (1, 1)), MPI.DOUBLE.Create_subarray((local_nx + 2, local_ny + 2), (local_nx, local_ny), (1, 1))\n",
+ "send_type.Commit()\n",
+ "recv_type.Commit()\n",
+ "\n",
+ "# Stencil computation loop\n",
+ "for timestep in range(num_timesteps):\n",
+ " # Boundary exchange with non-blocking sends/receives\n",
+ " reqs = []\n",
+ " if rank > 0:\n",
+ " reqs.append(comm.Isend(local_grid[1, :], dest=rank - 1))\n",
+ " reqs.append(comm.Irecv(local_grid[0, :], source=rank - 1))\n",
+ " if rank < size - 1:\n",
+ " reqs.append(comm.Isend(local_grid[local_nx, :], dest=rank + 1))\n",
+ " reqs.append(comm.Irecv(local_grid[local_nx + 1, :], source=rank + 1))\n",
+ "\n",
+ " # Ensure all sends/receives are complete\n",
+ " MPI.Request.Waitall(reqs)\n",
+ "\n",
+ " # Perform stencil operation\n",
+ " for i in range(1, local_nx + 1):\n",
+ " for j in range(1, local_ny + 1):\n",
+ " local_grid[i, j] = 0.25 * (local_grid[i - 1, j] + local_grid[i + 1, j] +\n",
+ " local_grid[i, j - 1] + local_grid[i, j + 1])\n",
+ "\n",
+ "# Clean up MPI data types\n",
+ "send_type.Free()\n",
+ "recv_type.Free()\n",
+ "MPI.Finalize()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "660621f9-2bc9-49a3-be59-cde1ce87df65",
+ "metadata": {},
+ "source": [
+ "This example follows TorchSWE's approach to domain decomposition and parallelization as in the original implementation. It starts with MPI initialization and sets up logic to manage GPU assignment per rank, dividing the global grid into subdomains. Each rank is responsible for a local subgrid with added halo rows to hold neighboring data. Once the domain is decomposed, the user must ensure proper communication of data at processor boundaries, accounting for datatype differences between CuPy and MPI4Py. For optimal performance, the appropriate type of point-to-point communication, such as non-blocking send/recv, must be selected, as incorrect implementation can cause deadlock. Users must also handle varying numbers of neighboring ranks on domain boundaries and ensure data exchange across mesh, topography, and solution variables. Non-blocking `Isend` and `Irecv` functions handle boundary data exchanges, allowing each rank to receive necessary data for stencil computations. After a `Waitall` synchronization step, each rank performs computations on its subdomain. Finally, custom MPI data types are freed, and `MPI_Finalize()` concludes the environment.\n",
+ "\n",
+ "The actual TorchSWE code has additional complexities specific to its use of multiple arrays, GPU memory management, one-sided communications etc.\n",
+ "For the complete implementation, you can refer to the [TorchSWE repository](https://github.com/piyueh/TorchSWE).\n",
+ "\n",
+ "Explicit distributed logic, like that in TorchSWE, is difficult to debug and maintain throughout the lifespan of simulation codes. Most applications, including TorchSWE, require specialized validation tests to ensure correct outputs. This results in significant programming effort and further complicates development. \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e93aa24e-fc18-4f69-819d-59b5997aa087",
+ "metadata": {},
+ "source": [
+ "cuPyNumeric Implementation
\n",
+ "\n",
+ "In the [cuPyNumeric version of TorchSWE](https://github.com/shriram-jagan/TorchSWE), stencil operations are implemented using distributed array handling from cuPyNumeric, simplifying the code and removing the need for manual partitioning or boundary synchronization. The code operates similarly to NumPy slicing but scales across multiple GPUs. For example, the stencil computation in this version would typically involve using simple array slices like below (instead of the nested loops with integrated MPI logic as in the original implementation).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b6e15757-a681-4a09-9f82-6304adf82fb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cupynumeric as np\n",
+ " \n",
+ "# Example dimensions\n",
+ "nx, ny = 128, 128\n",
+ "\n",
+ "# Initialize the array h\n",
+ "grid = np.ones((nx, ny))\n",
+ "\n",
+ "# Stencil operation using slicing\n",
+ "grid[1:-1, 1:-1] = (\n",
+ " grid[2:, 1:-1] + # Below\n",
+ " grid[:-2, 1:-1] + # Above\n",
+ " grid[1:-1, 2:] + # Right\n",
+ " grid[1:-1, :-2] # Left\n",
+ ") / 4\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f29f5387-3408-4bff-948d-55519412de31",
+ "metadata": {},
+ "source": [
+ "This operation is automatically managed across nodes and GPUs without needing MPI-specific code. More details can be found in the [cuPyNumeric port of TorchSWE](https://github.com/shriram-jagan/TorchSWE).\n",
+ "\n",
+ "The cuPyNumeric version of TorchSWE eliminates 600 lines of code related to domain decomposition, communication, synchronization, and validation that would otherwise be needed when using MPI4Py with CuPy. These 600 lines require substantial knowledge of distributed computing from domain scientists. By using cuPyNumeric, the simplified NumPy code scales efficiently to 1024 GPUs, making high-fidelity flood modeling accessible without requiring specialized expertise in distributed systems."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7e5d6565-ceda-4b61-8826-b6ae5aff3c83",
+ "metadata": {},
+ "source": [
+ "Conclusion
\n",
+ "\n",
+ "cuPyNumeric significantly simplifies the development and maintenance of distributed simulations, such as TorchSWE, by abstracting complex parallelization, synchronization, and communication logic. This eliminates the need for specialized HPC knowledge and reduces the risk of errors, allowing domain scientists to focus on their research. With cuPyNumeric, large-scale simulations can scale efficiently across large HPC systems, enhancing productivity, reducing programming effort, and lowering development costs. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb3a186a-3ea7-4150-8ec0-7760ad2adf1f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/cunumeric/source/faqs.rst b/docs/cupynumeric/source/faqs.rst
similarity index 72%
rename from docs/cunumeric/source/faqs.rst
rename to docs/cupynumeric/source/faqs.rst
index 553bc16710..7d542437f1 100644
--- a/docs/cunumeric/source/faqs.rst
+++ b/docs/cupynumeric/source/faqs.rst
@@ -10,20 +10,20 @@ What are the different task variants available in Legate?
Legate offers three different task variants: CPU, OMP, and GPU. A task variant
determines the type of processor Legate chooses to perform the computations.
-What is the difference between Legate and cuNumeric?
-----------------------------------------------------
+What is the difference between Legate and cuPyNumeric?
+------------------------------------------------------
Legate is a task-based runtime software stack that enables development of
scalable and composable libraries for distributed and accelerated computing.
-cuNumeric is one of the foundational libraries built using Legate and aspires
+cuPyNumeric is one of the foundational libraries built using Legate and aspires
to be a distributed and accelerated drop-in replacement library for NumPy, an
-array programming library widely used in scientific computing. cuNumeric scales
+array programming library widely used in scientific computing. cuPyNumeric scales
idiomatic NumPy programs to multiple GPUs and CPUs and seamlessly interoperates
with other Legate libraries.
-Check out this `blog post `_
-to learn more about cuNumeric.
+Check out this `blog post `_
+to learn more about cuPyNumeric.
When to use python vs legate?
-----------------------------
@@ -45,9 +45,9 @@ What does this warning mean?
.. code-block:: text
- RuntimeWarning: cuNumeric has not implemented and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.
+ RuntimeWarning: cuPyNumeric has not implemented and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.
-This means that the NumPy has not been implemented in cuNumeric and that
+This means that the NumPy API has not been implemented in cuPyNumeric and that
the Legate runtime is falling back to using NumPy’s implementation which will
be single-threaded execution and can lead to decreased performance for that
function call.
@@ -101,14 +101,13 @@ How to handle Out-Of-Memory errors?
.. code-block:: text
- [0 - 7fb9fc426000] 0.985000 {5}{cunumeric.mapper}: Mapper cunumeric on Node 0 failed to allocate 144000000 bytes on memory 1e00000000000000 (of kind SYSTEM_MEM: Visible to all processors on a node) for region requirement 1 of Task cunumeric::WhereTask[./script.py:90] (UID 39).
+ [0 - 7fda18f26000] 0.805182 {5}{cunumeric.mapper}: Failed to allocate 8388608 bytes on memory 1e00000000000000 (of kind SYSTEM_MEM) for region requirement(s) 1 of Task cupynumeric::BinaryOpTask[oom.py:24] (UID 18)
The above error indicates that the application ran out of memory during
execution. More granular details on the type of memory, the task that triggered
-the error are provided in the error message, but this usually indicates that
-resources (add more cores/threads/ GPUs, or increase the amount of system
-memory or framebuffer memory) or decrease the problem size and confirm that you
-are able to run the program to completion.
+the error, and what was using up the available memory are provided in the error
+message. If possible, try increasing the amount of system memory or framebuffer
+memory allocated to the program, or decrease the problem size.
Reducing the ``--eager-alloc-percentage`` to, say, 10 or less can also help
since this reduces the amount of available memory available to the eager memory
@@ -121,12 +120,12 @@ Why are the results different from NumPy?
While a majority of the APIs will give the same result as NumPy, some APIs
might be implemented differently from that of NumPy which might lead to
differences in results. One such example is, :ref:`reshape`, which returns a
-copy of the array in cuNumeric but returns a view in NumPy. Another example
+copy of the array in cuPyNumeric but returns a view in NumPy. Another example
is :ref:`astype` which does *not* return a copy by default, where NumPy does.
Such differences in implementation are noted in the documentation of the
-cuNumeric APIs, please review them before opening an issue on the
-`cuNumeric issue tracker `_.
+cuPyNumeric APIs, please review them before opening an issue on the
+`cuPyNumeric issue tracker `_.
Why doesn’t Legate use my GPU?
------------------------------
@@ -148,20 +147,20 @@ How do I time the execution of my application?
----------------------------------------------
Check out the :ref:`benchmarking` section for information on how to accurately
-measure cuNumeric execution.
+measure cuPyNumeric execution.
-Why is cuNumeric slower than NumPy on my laptop?
-------------------------------------------------
+Why is cuPyNumeric slower than NumPy on my laptop?
+--------------------------------------------------
-For small problem sizes, cuNumeric might be slower than NumPy. We suggest you
+For small problem sizes, cuPyNumeric might be slower than NumPy. We suggest you
increase the problem size and correspondingly increase the resources needed
for the problem size as described in the Usage section. Take a look at our
:ref:`practices` on how to do that.
-Why is cuNumeric slower than cuPy on my laptop?
------------------------------------------------
+Why is cuPyNumeric slower than CuPy on my laptop?
+-------------------------------------------------
-For small problem sizes, cuNumeric might be slower than cuPy. We suggest you
+For small problem sizes, cuPyNumeric might be slower than CuPy. We suggest you
increase the problem size and correspondingly increase the resources needed for
the problem size as described in the :ref:`Usage` section. Take a look at
performance :ref:`practices`.
@@ -169,7 +168,7 @@ performance :ref:`practices`.
How do I use Jupyter Notebooks?
-------------------------------
-Notebooks are useful for experimentation and evaluation on a single node.
+See https://docs.nvidia.com/legate/latest/jupyter.html.
How to pass Legion and Realm arguments?
---------------------------------------
@@ -191,19 +190,17 @@ What are the defaults?
The default values for several input arguments to Legate are mentioned in
Legate's documentation.
-Are there resources where I can read more about Legate?
--------------------------------------------------------
+Where can I read more about cuPyNumeric?
+----------------------------------------
-Check out this `blog post `_
-to learn more about cuNumeric.
+Check out this `blog post `_
+or this `tutorial `_
+to learn more about cuPyNumeric.
-Technical questions?
---------------------
+Questions?
+----------
-For technical questions about Cunumeric and Legate-based tools, please visit
+For technical questions about cuPyNumeric and Legate-based tools, please visit
the `community discussion forum `_.
-Other questions?
-----------------
-
-Follow us on `GitHub `_ or reach out to us there.
+If you have other questions, please contact us at *legate@nvidia.com*.
diff --git a/docs/cupynumeric/source/index.rst b/docs/cupynumeric/source/index.rst
new file mode 100644
index 0000000000..43ca3f8347
--- /dev/null
+++ b/docs/cupynumeric/source/index.rst
@@ -0,0 +1,38 @@
+:html_theme.sidebar_secondary.remove:
+
+NVIDIA cuPyNumeric
+==================
+
+cuPyNumeric is a library that aims to provide a distributed and accelerated
+drop-in replacement for `NumPy`_ built on top of the `Legate`_ framework.
+
+With cuPyNumeric you can write code productively in Python, using the familiar
+NumPy API, and have your program scale with no code changes from single-CPU
+computers to multi-node-multi-GPU clusters.
+
+For example, you can run `the final example of the Python CFD course`_
+completely unmodified on 2048 A100 GPUs in a `DGX SuperPOD`_ and achieve
+good weak scaling.
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Contents:
+
+ installation
+ user/index
+ examples/index
+ api/index
+ faqs
+ developer/index
+
+
+Indices and tables
+------------------
+
+* :ref:`genindex`
+* :ref:`search`
+
+.. _NumPy: https://numpy.org/
+.. _Legate: https://github.com/nv-legate/legate
+.. _DGX SuperPOD: https://www.nvidia.com/en-us/data-center/dgx-superpod/
+.. _the final example of the Python CFD course: https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb
\ No newline at end of file
diff --git a/docs/cupynumeric/source/installation.rst b/docs/cupynumeric/source/installation.rst
new file mode 100644
index 0000000000..1f1e88dafc
--- /dev/null
+++ b/docs/cupynumeric/source/installation.rst
@@ -0,0 +1,96 @@
+Installation
+============
+
+Installing Conda Packages
+-------------------------
+
+cuPyNumeric supports the
+`same platforms as Legate `_.
+
+cuPyNumeric is available from
+`conda `_
+on the `legate channel `_.
+
+.. note::
+ conda version >= 24.1 required
+
+.. code-block:: bash
+
+ # with a new environment
+ $ conda create -n myenv -c conda-forge -c legate cupynumeric
+
+ # =========== OR =========== #
+
+ # into an existing environment
+ $ conda install -c conda-forge -c legate cupynumeric
+
+Installing PyPI Packages
+------------------------
+
+cuPyNumeric is also available from `PyPI
+`_. To install, run the following
+command:
+
+.. code-block:: bash
+
+ # into existing environment
+ $ pip install nvidia-cupynumeric
+
+ # =========== OR =========== #
+
+ # into new environment
+ $ python -m venv myenv
+ $ source myenv/bin/activate
+ $ pip install nvidia-cupynumeric
+
+This will install the latest version of cuPyNumeric and the corresponding
+version of `Legate `_.
+
+The cuPyNumeric package on PyPI is multi-node and multi-rank capable. Please
+check `Legate `_ documentation to find more
+details about running on multiple nodes.
+
+Verify your Installation
+------------------------
+
+You can verify the installation by running one of the
+`examples `_.
+
+For instance:
+
+.. code-block:: sh
+
+ $ legate examples/black_scholes.py
+ Running black scholes on 10K options...
+ Elapsed Time: 129.017 ms
+
+Conda and GPU / CPU Variants
+----------------------------
+
+``conda`` automatically installs the right variant for the system:
+* CPU variant if no NVIDIA GPU is detected
+* GPU variant if an NVIDIA GPU is detected
+
+To override this behavior and force install a version with GPU support, use the
+following (with the desired CUDA version):
+
+.. code-block:: sh
+
+ $ CONDA_OVERRIDE_CUDA="12.2" conda install -c conda-forge -c legate cupynumeric
+
+
+Building from source
+---------------------
+
+See :ref:`building cupynumeric from source` for instructions on building
+cuPyNumeric manually.
+
+Licenses
+--------
+
+This project will download and install additional third-party open source
+software projects at install time. Review the license terms of these open
+source projects before use.
+
+For license information regarding projects bundled directly, see
+:ref:`thirdparty`.
\ No newline at end of file
diff --git a/docs/cunumeric/source/oss-licenses.rst b/docs/cupynumeric/source/oss-licenses.rst
similarity index 77%
rename from docs/cunumeric/source/oss-licenses.rst
rename to docs/cupynumeric/source/oss-licenses.rst
index a6a9b0226b..84c0d96456 100644
--- a/docs/cunumeric/source/oss-licenses.rst
+++ b/docs/cupynumeric/source/oss-licenses.rst
@@ -5,6 +5,42 @@
Third-party notices
===================
+NumPy
+-----
+
+.. code-block:: none
+
+ Copyright (c) 2005-2025, NumPy Developers.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the NumPy Developers nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
TBLIS
-----
diff --git a/docs/cunumeric/source/user/advanced.rst b/docs/cupynumeric/source/user/advanced.rst
similarity index 92%
rename from docs/cunumeric/source/user/advanced.rst
rename to docs/cupynumeric/source/user/advanced.rst
index 2fdd96d974..b6bbc31fc6 100644
--- a/docs/cunumeric/source/user/advanced.rst
+++ b/docs/cupynumeric/source/user/advanced.rst
@@ -9,7 +9,7 @@ Multi-node execution
Using ``legate``
~~~~~~~~~~~~~~~~
-Cunumeric programs can be run in parallel by using the ``--nodes`` option to
+cuPyNumeric programs can be run in parallel by using the ``--nodes`` option to
the ``legate`` driver, followed by the number of nodes to be used.
When running on 2+ nodes, a task launcher must be specified.
diff --git a/docs/cunumeric/source/user/differences.rst b/docs/cupynumeric/source/user/differences.rst
similarity index 77%
rename from docs/cunumeric/source/user/differences.rst
rename to docs/cupynumeric/source/user/differences.rst
index 5195ccdd37..efab90df11 100644
--- a/docs/cunumeric/source/user/differences.rst
+++ b/docs/cupynumeric/source/user/differences.rst
@@ -3,10 +3,10 @@ Differences with Numpy
Supported shapes and datatypes
------------------------------
-cuNumeric natively supports arrays of dimensionality only up to the maximum
+cuPyNumeric natively supports arrays of dimensionality only up to the maximum
number of dimensions supported by the linked build of Legate.
-cuNumeric natively supports only numerical datatypes, and doesn't support
+cuPyNumeric natively supports only numerical datatypes, and doesn't support
extended-precision floats (e.g. `np.float128`).
Trying to use an unsupported number of dimensions or datatype will trigger a
@@ -15,7 +15,7 @@ fallback to base NumPy.
Returning a copy instead of a view
----------------------------------
-Some functions that return a view in Numpy return a copy in cuNumeric. These
+Some functions that return a view in Numpy return a copy in cuPyNumeric. These
include:
* ``np.diag``
@@ -46,21 +46,21 @@ Scalar return values
--------------------
NumPy will occasionally convert a 0d array to a python-level scalar, but
-cuNumeric avoids doing that, because in our system an array value can
+cuPyNumeric avoids doing that, because in our system an array value can
potentially represent an asynchronous computation. As a result, sometimes
-cuNumeric will return 0d arrays (possibly deferred), in cases where NumPy
+cuPyNumeric will return 0d arrays (possibly deferred), in cases where NumPy
returns a scalar.
Indexing behavior
-----------------
-``x[:,True]`` works differently from NumPy. cuNumeric broadcasts it up to the
+``x[:,True]`` works differently from NumPy. cuPyNumeric broadcasts it up to the
corresponding dimension, whereas NumPy adds a dimension.
Additionally ``[]`` does not work for advanced indexing since ``[]`` is
``float64`` by default.
-cuNumeric doesn't support non-unit steps on index expressions, e.g. `arr[::2]`.
+cuPyNumeric doesn't support non-unit steps on index expressions, e.g. `arr[::2]`.
Duplicate indices on advanced indexing expressions produce undefined behavior.
This is also the case in NumPy but the current NumPy implementation happens
diff --git a/docs/cunumeric/source/user/howtos/benchmarking.rst b/docs/cupynumeric/source/user/howtos/benchmarking.rst
similarity index 94%
rename from docs/cunumeric/source/user/howtos/benchmarking.rst
rename to docs/cupynumeric/source/user/howtos/benchmarking.rst
index f744e10683..2be87f8483 100644
--- a/docs/cunumeric/source/user/howtos/benchmarking.rst
+++ b/docs/cupynumeric/source/user/howtos/benchmarking.rst
@@ -7,7 +7,7 @@ Using Legate timing tools
-------------------------
Use legate's timing API to measure elapsed time, rather than standard Python
-timers. cuNumeric executes work asynchronously when possible, and a standard
+timers. cuPyNumeric executes work asynchronously when possible, and a standard
Python timer will only measure the time taken to launch the work, not the time
spent in actual computation.
@@ -18,7 +18,7 @@ Here is an example of how to measure elapsed time in milliseconds:
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
from legate.timing import time
init() # Initialization step
diff --git a/docs/cunumeric/source/user/howtos/index.rst b/docs/cupynumeric/source/user/howtos/index.rst
similarity index 89%
rename from docs/cunumeric/source/user/howtos/index.rst
rename to docs/cupynumeric/source/user/howtos/index.rst
index 1e07c8f0b2..72140ffd72 100644
--- a/docs/cunumeric/source/user/howtos/index.rst
+++ b/docs/cupynumeric/source/user/howtos/index.rst
@@ -6,5 +6,4 @@ Howtos
measuring
benchmarking
- jupyter
patching
diff --git a/docs/cunumeric/source/user/howtos/measuring.rst b/docs/cupynumeric/source/user/howtos/measuring.rst
similarity index 56%
rename from docs/cunumeric/source/user/howtos/measuring.rst
rename to docs/cupynumeric/source/user/howtos/measuring.rst
index 3513a86287..4e146a3868 100644
--- a/docs/cunumeric/source/user/howtos/measuring.rst
+++ b/docs/cupynumeric/source/user/howtos/measuring.rst
@@ -3,42 +3,42 @@
Measure API coverage
====================
-cuNumeric does not currently implment all of NumPy's APIs. If necessary,
-cuNumeric will fall back to using NumPy directly to complete a compuation.
-When running applications that use cuNumeric, the command line options below
+cuPyNumeric does not currently implement all of NumPy's APIs. If necessary,
+cuPyNumeric will fall back to using NumPy directly to complete a computation.
+When running applications that use cuPyNumeric, the command line options below
may be used to generate coverage reports that show which APIs are implemented
-and optimized by cuNumeric and which APIs required falling back to NumPy.
+and optimized by cuPyNumeric and which APIs required falling back to NumPy.
Overall coverage report
~~~~~~~~~~~~~~~~~~~~~~~
-The environment variable ``CUNUMERIC_REPORT_COVERAGE`` may be used to print an
-overall percentage of cunumeric coverage:
+The environment variable ``CUPYNUMERIC_REPORT_COVERAGE`` may be used to print an
+overall percentage of cupynumeric coverage:
.. code-block:: sh
- CUNUMERIC_REPORT_COVERAGE=1 legate test.py
+ CUPYNUMERIC_REPORT_COVERAGE=1 legate test.py
After execution completes, the percentage of NumPy API calls that were handled
-by cunumeric is printed:
+by cupynumeric is printed:
.. code-block::
- cuNumeric API coverage: 26/26 (100.0%)
+ cuPyNumeric API coverage: 26/26 (100.0%)
Detailed coverage report
~~~~~~~~~~~~~~~~~~~~~~~~
-The environment variable ``CUNUMERIC_REPORT_DUMP_CSV`` may be used to save a
+The environment variable ``CUPYNUMERIC_REPORT_DUMP_CSV`` may be used to save a
detailed coverage report:
.. code-block:: sh
- CUNUMERIC_REPORT_COVERAGE=1 CUNUMERIC_REPORT_DUMP_CSV="out.csv" legate test.py
+ CUPYNUMERIC_REPORT_COVERAGE=1 CUPYNUMERIC_REPORT_DUMP_CSV="out.csv" legate test.py
After execution completes, a CSV file will be saved to the specified location
(in this case ``out.csv``). The file shows exactly what NumPy API functions
-were called, whether the are implemented by cunumeric, and the location of
+were called, whether they are implemented by cupynumeric, and the location of
the call site:
.. code-block::
@@ -56,12 +56,12 @@ the call site:
Call stack reporting
~~~~~~~~~~~~~~~~~~~~
-The environment variable ``CUNUMERIC_REPORT_DUMP_CALLSTACK`` may be added to
+The environment variable ``CUPYNUMERIC_REPORT_DUMP_CALLSTACK`` may be added to
include full call stack information in a CSV report:
.. code-block:: sh
- CUNUMERIC_REPORT_COVERAGE=1 CUNUMERIC_REPORT_DUMP_CALLSTACK=1 CUNUMERIC_REPORT_DUMP_CALLSTACK=1 legate test.py
+ CUPYNUMERIC_REPORT_COVERAGE=1 CUPYNUMERIC_REPORT_DUMP_CALLSTACK=1 legate test.py
After execution completes, the CSV output file have full call stack
information in the location column, with individual stack frames separated
diff --git a/docs/cunumeric/source/user/howtos/patching.rst b/docs/cupynumeric/source/user/howtos/patching.rst
similarity index 64%
rename from docs/cunumeric/source/user/howtos/patching.rst
rename to docs/cupynumeric/source/user/howtos/patching.rst
index cdac9223cf..576e9396c1 100644
--- a/docs/cunumeric/source/user/howtos/patching.rst
+++ b/docs/cupynumeric/source/user/howtos/patching.rst
@@ -2,7 +2,7 @@ Trying Numpy code without changes
=================================
The ``lgpatch`` script (in the same location as the ``legate`` executable) can
-help facilitate quick demonstrations of ``cunumeric`` on existing codebases
+help facilitate quick demonstrations of ``cupynumeric`` on existing codebases
that make use of ``numpy``.
To use this tool, invoke it as shown below, with the name of the program to
@@ -23,13 +23,13 @@ For example, here is a small ``test.py`` program that imports and uses various
input = np.eye(10, dtype=np.float32)
np.linalg.cholesky(input)
-You can invoke ``lgpatch`` to run ``test.py`` using ``cunumeric`` functions
+You can invoke ``lgpatch`` to run ``test.py`` using ``cupynumeric`` functions
instead, without any changes to the original source code. Any standard
-``cunumeric`` runtime options (e.g. for :ref:`measuring api coverage`) may
+``cupynumeric`` runtime options (e.g. for :ref:`measuring api coverage`) may
also be used:
.. code-block:: sh
- $ CUNUMERIC_REPORT_COVERAGE=1 LEGATE_CONFIG="--cpus 4" lgpatch test.py -patch numpy
- cuNumeric API coverage: 4/4 (100.0%)
+ $ CUPYNUMERIC_REPORT_COVERAGE=1 LEGATE_CONFIG="--cpus 4" lgpatch test.py -patch numpy
+ cuPyNumeric API coverage: 4/4 (100.0%)
diff --git a/docs/cunumeric/source/user/index.rst b/docs/cupynumeric/source/user/index.rst
similarity index 100%
rename from docs/cunumeric/source/user/index.rst
rename to docs/cupynumeric/source/user/index.rst
diff --git a/docs/cunumeric/source/user/practices.rst b/docs/cupynumeric/source/user/practices.rst
similarity index 91%
rename from docs/cunumeric/source/user/practices.rst
rename to docs/cupynumeric/source/user/practices.rst
index c064fe8f6e..063a7a0fb7 100644
--- a/docs/cunumeric/source/user/practices.rst
+++ b/docs/cupynumeric/source/user/practices.rst
@@ -8,7 +8,7 @@ General Recommendations
Following the basics of numpy as documented
`here `_ is highly recommended.
-Here we highlight some of the anti-patterns and best practices for cuNumeric
+Here we highlight some of the anti-patterns and best practices for cuPyNumeric
to avoid commonly encountered problems related to performance. In general,
array-based computations are recommended.
@@ -16,14 +16,14 @@ Availability of each API (e.g., single CPU or Multiple GPUs/Multiple CPUs,
etc.) is noted in the docstring of the API. This would be useful to know while
designing the application since it can impact the scalability.
-Guidelines on using cuNumeric APIs
-----------------------------------
+Guidelines on using cuPyNumeric APIs
+------------------------------------
-Use cuNumeric or NumPy arrays, AVOID native lists
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Use cuPyNumeric or NumPy arrays, AVOID native lists
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Create a cuNumeric array from data structures native to Python like lists,
-tuples, etc., and operate on the cuNumeric array, as shown in the example
+Create a cuPyNumeric array from data structures native to Python like lists,
+tuples, etc., and operate on the cuPyNumeric array, as shown in the example
below. Find more details on this here:
.. https://numpy.org/doc/stable/user/basics.creation.html
@@ -37,7 +37,7 @@ below. Find more details on this here:
for val in x:
y.append(val + 2)
- # Recommended: Create a cuNumeric array and use array-based operations
+ # Recommended: Create a cuPyNumeric array and use array-based operations
y = np.array(x)
y = x + 2
@@ -48,7 +48,7 @@ thus performing an array-based operation.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
def transform(input):
return (input + 3) * 4
@@ -121,7 +121,7 @@ performance.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
# Not recommended: don't use nonzero to get indices
indices = np.nonzero(h < 0)
@@ -141,7 +141,7 @@ condition is met, which can be described using the ``putmask`` API.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
# We need to update elements of x from y based on a condition
cond = y < tol
@@ -177,12 +177,12 @@ Use mathematical functions, AVOID element-wise loops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When there are nested element-wise operations, it is recommended that they
-are translated to array-based operations using equivalent cuNumeric APIs, if
+are translated to array-based operations using equivalent cuPyNumeric APIs, if
possible. Here is an example:
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
# Not recommended: Naive element-wise implementation
for i in range(ny):
@@ -208,14 +208,14 @@ can also make it run slower, so we recommend using it as sparingly as possible.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
x = np.ones((3,4))
y = x.reshape((12,))
y[0] = 42
- assert x[0,0] == 42 # succeeds in NumPy, fails in cuNumeric
+ assert x[0,0] == 42 # succeeds in NumPy, fails in cuPyNumeric
Stack results in a performance penalty
......................................
@@ -231,8 +231,8 @@ Faster I/O Routines
As of 23.07, we recommend using `h5py `_ to perform I/O.
-Guidelines on designing cuNumeric applications
-----------------------------------------------
+Guidelines on designing cuPyNumeric applications
+------------------------------------------------
Use output arguments to reduce memory allocation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -242,7 +242,7 @@ intermediate array in our implementation.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
# Acceptable
x = x + y
@@ -338,10 +338,10 @@ here.
.. code-block:: python
- import cunumeric as np
+ import cupynumeric as np
# compute() does some computations and returns a multi-dimensional
- # cuNumeric array. The application stops after the iterative computation
+ # cuPyNumeric array. The application stops after the iterative computation
# is converged
# Acceptable: Performing convergence checks every iteration
diff --git a/docs/cupynumeric/source/user/usage.rst b/docs/cupynumeric/source/user/usage.rst
new file mode 100644
index 0000000000..aebdad2763
--- /dev/null
+++ b/docs/cupynumeric/source/user/usage.rst
@@ -0,0 +1,50 @@
+.. _usage:
+
+Usage
+=====
+
+Using cuPyNumeric as a replacement for NumPy is simple. Replace your NumPy import
+statement with cuPyNumeric:
+
+.. code-block:: python
+
+ import numpy as np
+
+becomes
+
+.. code-block:: python
+
+ import cupynumeric as np
+
+Then, run the application like you usually do. For example, if you had a script
+``main.py`` written in NumPy that adds two vectors,
+
+.. code-block:: python
+
+ import numpy as np
+ x = np.array([1.0, 2.0, 3.0, 4.0])
+ y = np.array([4.0, 3.0, 2.0, 1.0])
+ z = x + y
+ print(z)
+
+change the import statement to use cuPyNumeric like below,
+
+.. code-block:: python
+
+ import cupynumeric as np
+ x = np.array([1.0, 2.0, 3.0, 4.0])
+ y = np.array([4.0, 3.0, 2.0, 1.0])
+ z = x + y
+ print(z)
+
+And run the program, like this
+
+.. code-block:: sh
+
+ python main.py
+
+By default this invocation will use all the hardware resources (e.g. CPU cores,
+RAM, GPUs) available on the current machine.
+
+For more information on controlling the resource allocation, running on multiple
+nodes etc. see https://docs.nvidia.com/legate/latest/usage.html.
diff --git a/docs/cupynumeric/switcher.json b/docs/cupynumeric/switcher.json
new file mode 100644
index 0000000000..7d049e3dd7
--- /dev/null
+++ b/docs/cupynumeric/switcher.json
@@ -0,0 +1,23 @@
+[
+ {
+ "name": "24.11",
+ "version": "24.11",
+ "url": "https://docs.nvidia.com/cupynumeric/24.11/"
+ },
+ {
+ "name": "25.01",
+ "version": "25.01",
+ "url": "https://docs.nvidia.com/cupynumeric/25.01/"
+ },
+ {
+ "name": "25.03",
+ "version": "25.03",
+ "url": "https://docs.nvidia.com/cupynumeric/25.03/"
+ },
+ {
+ "name": "25.05",
+ "version": "25.05",
+ "preferred": true,
+ "url": "https://docs.nvidia.com/cupynumeric/25.05/"
+ }
+]
diff --git a/examples/benchmark.py b/examples/benchmark.py
index d882e120fb..29a7f4a451 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -21,8 +21,7 @@
class Timer(Protocol):
- def start(self):
- ...
+ def start(self): ...
def stop(self):
"""
@@ -32,7 +31,7 @@ def stop(self):
...
-class CuNumericTimer(Timer):
+class CuPyNumericTimer(Timer):
def __init__(self):
self._start_time = None
@@ -112,9 +111,9 @@ def parse_args(parser):
)
args, _ = parser.parse_known_args()
if args.package == "legate":
- import cunumeric as np
+ import cupynumeric as np
- timer = CuNumericTimer()
+ timer = CuPyNumericTimer()
elif args.package == "cupy":
import cupy as np
diff --git a/examples/cpp/gemm/CMakeLists.txt b/examples/cpp/gemm/CMakeLists.txt
new file mode 100644
index 0000000000..91c1ff2723
--- /dev/null
+++ b/examples/cpp/gemm/CMakeLists.txt
@@ -0,0 +1,31 @@
+#=============================================================================
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR)
+
+project(gemm VERSION 0.1 LANGUAGES C CXX)
+
+if (NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+find_package(cupynumeric REQUIRED)
+
+add_executable(gemm gemm.cc)
+
+target_link_libraries(gemm PRIVATE cupynumeric::cupynumeric)
+
+install(TARGETS gemm DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/cmake-install")
diff --git a/examples/cpp/gemm/build.sh b/examples/cpp/gemm/build.sh
new file mode 100755
index 0000000000..53ed6d6c09
--- /dev/null
+++ b/examples/cpp/gemm/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+legate_root=`python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'`
+echo "Using Legate at $legate_root"
+cupynumeric_root=`python -c 'import cupynumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'`
+echo "Using cuPyNumeric at $cupynumeric_root"
+cmake -S . -B build -D legate_ROOT="$legate_root" -D cupynumeric_ROOT="$cupynumeric_root" -D CMAKE_BUILD_TYPE=RelWithDebInfo
+cmake --build build --parallel 8
diff --git a/examples/cpp/gemm/gemm.cc b/examples/cpp/gemm/gemm.cc
new file mode 100644
index 0000000000..7ddc290522
--- /dev/null
+++ b/examples/cpp/gemm/gemm.cc
@@ -0,0 +1,111 @@
+/* Copyright 2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace gemm {
+
+struct Config {
+ bool timing{false};
+ std::int32_t iter{100};
+ std::int32_t warmup{5};
+ std::uint64_t N{100};
+};
+
+[[nodiscard]] std::tuple<cupynumeric::NDArray, cupynumeric::NDArray, cupynumeric::NDArray>
+initialize(std::uint64_t N, const legate::Type& ft)
+{
+ auto A = cupynumeric::random({N, N}).as_type(ft);
+ auto B = cupynumeric::random({N, N}).as_type(ft);
+ auto C = cupynumeric::zeros({N, N}, ft);
+ return {A, B, C};
+}
+
+[[nodiscard]] std::size_t total_flops(std::uint64_t M, std::uint64_t N, std::uint64_t K)
+{
+ return M * N * (2 * K - 1);
+}
+
+[[nodiscard]] std::size_t total_space(std::uint64_t M,
+ std::uint64_t N,
+ std::uint64_t K,
+ const legate::Type& ft)
+{
+ return (M * N + M * K + K * N) * ft.size();
+}
+
+void run_gemm(const Config& config)
+{
+ const auto ft = legate::float32();
+ const auto N = config.N;
+ std::printf("Problem Size: M=%lu N=%lu K=%lu\n", N, N, N);
+ std::printf("Total Iterations: %d\n", config.iter);
+ const auto flops = total_flops(N, N, N);
+ std::printf("Total Flops: %lf GFLOPS/iter\n", flops / 1e9);
+ const auto space = total_space(N, N, N, ft);
+ std::printf("Total Size: %lf MB\n", space / 1e6);
+  auto [A, B, C] = initialize(config.N, ft);
+
+ auto start = legate::timing::measure_microseconds();
+ auto max_iter = config.iter + config.warmup;
+ for (int32_t iter = 0; iter < max_iter; ++iter) {
+ if (iter == config.warmup) {
+ start = legate::timing::measure_microseconds();
+ }
+ C.dot(A, B);
+ // We need to rotate the matrices to keep Legate honest
+ // about moving data so it can't just duplicate A and B
+ // on the first iteration and reuse them, this means
+ // that A, B, C all need to be square
+  { auto tmp = A; A = B; B = C; C = tmp; }  // comma-operator form would only do C = B
+ }
+ auto stop = legate::timing::measure_microseconds();
+
+ const auto total = (stop.value() - start.value()) / 1e3;
+ std::printf("Elapsed Time: %lf ms\n", total);
+ const auto average = total / config.iter;
+ std::printf("Average GEMM: %lf ms\n", average);
+ std::printf("FLOPS/s: %lf GFLOPS/s\n", flops / (average * 1e6));
+}
+
+} // namespace gemm
+
+int main(int argc, char** argv)
+{
+ legate::start();
+
+ cupynumeric::initialize(argc, argv);
+
+ gemm::Config config{};
+
+ Realm::CommandLineParser cp;
+ cp.add_option_int("--iter", config.iter)
+ .add_option_int("--warmup", config.warmup)
+ .add_option_int("--num", config.N)
+ .add_option_bool("--time", config.timing)
+ .parse_command_line(argc, argv);
+
+ gemm::run_gemm(config);
+
+ return legate::finish();
+}
diff --git a/examples/cpp/stencil/CMakeLists.txt b/examples/cpp/stencil/CMakeLists.txt
index d17920c4a3..3def9488f9 100644
--- a/examples/cpp/stencil/CMakeLists.txt
+++ b/examples/cpp/stencil/CMakeLists.txt
@@ -22,10 +22,10 @@ if (NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()
-find_package(cunumeric REQUIRED)
+find_package(cupynumeric REQUIRED)
add_executable(stencil stencil.cc)
-target_link_libraries(stencil PRIVATE cunumeric::cunumeric)
+target_link_libraries(stencil PRIVATE cupynumeric::cupynumeric)
install(TARGETS stencil DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/cmake-install")
diff --git a/examples/cpp/stencil/build.sh b/examples/cpp/stencil/build.sh
index 485365ae3c..1eac0fe8d9 100755
--- a/examples/cpp/stencil/build.sh
+++ b/examples/cpp/stencil/build.sh
@@ -16,7 +16,7 @@
legate_root=`python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'`
echo "Using Legate at $legate_root"
-cunumeric_root=`python -c 'import cunumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'`
-echo "Using cuNumeric at $cunumeric_root"
-cmake -S . -B build -D legate_ROOT="$legate_root" -D cunumeric_ROOT="$cunumeric_root" -D CMAKE_BUILD_TYPE=Debug
+cupynumeric_root=`python -c 'import cupynumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'`
+echo "Using cuPyNumeric at $cupynumeric_root"
+cmake -S . -B build -D legate_ROOT="$legate_root" -D cupynumeric_ROOT="$cupynumeric_root" -D CMAKE_BUILD_TYPE=Debug
cmake --build build --parallel 8
diff --git a/examples/cpp/stencil/stencil.cc b/examples/cpp/stencil/stencil.cc
index 600535123c..022b3f222c 100644
--- a/examples/cpp/stencil/stencil.cc
+++ b/examples/cpp/stencil/stencil.cc
@@ -15,15 +15,15 @@
*/
#include "legate.h"
-#include "cunumeric.h"
+#include "cupynumeric.h"
#include "realm/cmdline.h"
#include
namespace stencil {
-using cunumeric::open;
-using cunumeric::slice;
+using cupynumeric::open;
+using cupynumeric::slice;
struct Config {
bool timing{false};
@@ -32,7 +32,7 @@ struct Config {
uint64_t N{100};
};
-void print_array(cunumeric::NDArray array)
+void print_array(cupynumeric::NDArray array)
{
auto acc = array.get_read_accessor();
auto& shape = array.shape();
@@ -49,9 +49,9 @@ void print_array(cunumeric::NDArray array)
std::cerr << std::move(ss).str();
}
-cunumeric::NDArray initialize(uint64_t N)
+cupynumeric::NDArray initialize(uint64_t N)
{
- auto grid = cunumeric::zeros({N + 2, N + 2});
+ auto grid = cupynumeric::zeros({N + 2, N + 2});
grid[{slice(), slice(0, 1)}].assign(legate::Scalar{-273.15});
grid[{slice(), slice(-1, open)}].assign(legate::Scalar{-273.15});
grid[{slice(-1, open), slice()}].assign(legate::Scalar{-273.15});
@@ -84,7 +84,7 @@ int main(int argc, char** argv)
auto result = legate::start(argc, argv);
assert(result == 0);
- cunumeric::initialize(argc, argv);
+ cupynumeric::initialize(argc, argv);
stencil::Config config{};
diff --git a/examples/gemm.py b/examples/gemm.py
index 183f65a7b2..3830459be7 100644
--- a/examples/gemm.py
+++ b/examples/gemm.py
@@ -21,8 +21,8 @@
def initialize(M, N, K, ft):
- A = np.random.rand(N, N).astype(ft)
- B = np.random.rand(N, N).astype(ft)
+ A = np.random.uniform(size=(N, N), dtype=ft)
+ B = np.random.uniform(size=(N, N), dtype=ft)
C = np.zeros((N, N), dtype=ft)
return A, B, C
diff --git a/examples/richardson_lucy.py b/examples/richardson_lucy.py
index b024d46e75..25ed2154ee 100644
--- a/examples/richardson_lucy.py
+++ b/examples/richardson_lucy.py
@@ -22,7 +22,9 @@
# A simplified implementation of Richardson-Lucy deconvolution
-def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
+def run_richardson_lucy(
+ shape, filter_shape, num_iter, warmup, timing, conv_method
+):
image = np.random.rand(*shape).astype(float_type)
psf = np.random.rand(*filter_shape).astype(float_type)
im_deconv = np.full(image.shape, 0.5, dtype=float_type)
@@ -33,13 +35,16 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
for idx in range(num_iter + warmup):
if idx == warmup:
timer.start()
- conv = np.convolve(im_deconv, psf, mode="same")
+ conv = np.convolve(im_deconv, psf, mode="same", method=conv_method)
relative_blur = image / conv
- im_deconv *= np.convolve(relative_blur, psf_mirror, mode="same")
+ im_deconv *= np.convolve(
+ relative_blur, psf_mirror, mode="same", method=conv_method
+ )
total = timer.stop()
if timing:
print("Elapsed Time: " + str(total) + " ms")
+ return total
if __name__ == "__main__":
@@ -109,6 +114,13 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
action="store_true",
help="perform timing",
)
+ parser.add_argument(
+ "--conv-method",
+ dest="conv_method",
+ type=str,
+ default="auto",
+ help="convolution method (auto by default)",
+ )
args, np, timer = parse_args(parser)
@@ -122,5 +134,6 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
args.I,
args.warmup,
args.timing,
+ args.conv_method,
),
)
diff --git a/examples/scan.py b/examples/scan.py
index 09acad0608..00907a60af 100644
--- a/examples/scan.py
+++ b/examples/scan.py
@@ -62,7 +62,7 @@ def check_scan(OP, A, B, ax):
else:
print("FAIL!")
print(f"INPUT : {A}")
- print(f"CUNUMERIC: {B}")
+ print(f"CUPYNUMERIC: {B}")
print(f"NUMPY : {C}")
assert False
diff --git a/install.py b/install.py
index f58e19ec05..ae0639a7dd 100755
--- a/install.py
+++ b/install.py
@@ -108,13 +108,15 @@ def find_cmake_val(pattern, filepath):
def was_previously_built_with_different_build_isolation(
- isolated, cunumeric_build_dir
+ isolated, cupynumeric_build_dir
):
if (
- cunumeric_build_dir is not None
- and os.path.exists(cunumeric_build_dir)
+ cupynumeric_build_dir is not None
+ and os.path.exists(cupynumeric_build_dir)
and os.path.exists(
- cmake_cache := os.path.join(cunumeric_build_dir, "CMakeCache.txt")
+ cmake_cache := os.path.join(
+ cupynumeric_build_dir, "CMakeCache.txt"
+ )
)
):
try:
@@ -154,8 +156,8 @@ def find_legate_cmake_dir() -> Path:
# conda env.
return path
- # Possibly installed in an editable installation, in which case legate-config.cmake
- # and friends will live in the root binary directory.
+ # Possibly installed in an editable installation, in which case legate
+ # config.cmake and friends will live in the root binary directory.
root_path = path.root
assert isinstance(root_path, str)
while not any(p.name == "legate-config.cmake" for p in path.iterdir()):
@@ -167,7 +169,7 @@ def find_legate_cmake_dir() -> Path:
return path
-def install_cunumeric(
+def install_cupynumeric(
arch,
build_isolation,
with_tests,
@@ -251,7 +253,7 @@ def install_cunumeric(
dirname = os.path.dirname
realpath = os.path.realpath
- cunumeric_dir = dirname(realpath(__file__))
+ cupynumeric_dir = dirname(realpath(__file__))
if thread_count is None:
thread_count = multiprocessing.cpu_count()
@@ -260,7 +262,7 @@ def validate_path(path):
if path is None or (path := str(path)) == "":
return None
if not os.path.isabs(path):
- path = join(cunumeric_dir, path)
+ path = join(cupynumeric_dir, path)
if not exists(path := realpath(path)):
print(f"Error: path does not exist: {path}")
sys.exit(1)
@@ -288,20 +290,20 @@ def validate_path(path):
print("cutensor_dir: ", cutensor_dir)
print("openblas_dir: ", openblas_dir)
- skbuild_dir = join(cunumeric_dir, "_skbuild")
- cunumeric_build_dir = scikit_build_cmake_build_dir(skbuild_dir)
+ skbuild_dir = join(cupynumeric_dir, "_skbuild")
+ cupynumeric_build_dir = scikit_build_cmake_build_dir(skbuild_dir)
if was_previously_built_with_different_build_isolation(
- build_isolation and not editable, cunumeric_build_dir
+ build_isolation and not editable, cupynumeric_build_dir
):
print("Performing a clean build to accommodate build isolation.")
clean_first = True
cmd_env = dict(os.environ.items())
- # Explicitly uninstall cunumeric if doing a clean/isolated build.
+ # Explicitly uninstall cupynumeric if doing a clean/isolated build.
#
- # A prior installation may have built and installed cunumeric C++
+ # A prior installation may have built and installed cupynumeric C++
# dependencies (like BLAS or tblis).
#
# CMake will find and use them for the current build, which would normally
@@ -313,23 +315,23 @@ def validate_path(path):
# these dependencies, triggering CMake to build and install them again.
if clean_first or (build_isolation and not editable):
execute_command(
- [sys.executable, "-m", "pip", "uninstall", "-y", "cunumeric"],
+ [sys.executable, "-m", "pip", "uninstall", "-y", "cupynumeric"],
verbose,
ignore_errors=True,
- cwd=cunumeric_dir,
+ cwd=cupynumeric_dir,
env=cmd_env,
)
if clean_first:
shutil.rmtree(skbuild_dir, ignore_errors=True)
- shutil.rmtree(join(cunumeric_dir, "dist"), ignore_errors=True)
- shutil.rmtree(join(cunumeric_dir, "build"), ignore_errors=True)
+ shutil.rmtree(join(cupynumeric_dir, "dist"), ignore_errors=True)
+ shutil.rmtree(join(cupynumeric_dir, "build"), ignore_errors=True)
shutil.rmtree(
- join(cunumeric_dir, "cunumeric.egg-info"),
+ join(cupynumeric_dir, "cupynumeric.egg-info"),
ignore_errors=True,
)
- # Configure and build cuNumeric via setup.py
+ # Configure and build cuPyNumeric via setup.py
pip_install_cmd = [sys.executable, "-m", "pip", "install"]
install_dir = None
@@ -376,8 +378,8 @@ def validate_path(path):
cmake_flags += f"""\
-DCMAKE_BUILD_TYPE={(
- "Debug" if debug else "RelWithDebInfo" if debug_release else "Release"
-)}
+ "Debug" if debug else "RelWithDebInfo" if debug_release else "Release"
+ )}
-DBUILD_SHARED_LIBS=ON
-DCMAKE_CUDA_ARCHITECTURES={str(arch)}
-DLegion_MAX_DIM={str(maxdim)}
@@ -389,7 +391,7 @@ def validate_path(path):
-DLegion_USE_LLVM={("ON" if llvm else "OFF")}
-DLegion_NETWORKS={";".join(networks)}
-DLegion_USE_HDF5={("ON" if hdf else "OFF")}
--Dcunumeric_BUILD_TESTS={("ON" if with_tests else "OFF")}
+-Dcupynumeric_BUILD_TESTS={("ON" if with_tests else "OFF")}
""".splitlines()
if march:
@@ -412,7 +414,7 @@ def validate_path(path):
cmake_flags += ["-Dcutensor_DIR=%s" % cutensor_dir]
# A custom path to cuRAND is ignored when CUDA support is available
if cuda and curand_dir is not None:
- cmake_flags += ["-Dcunumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir]
+ cmake_flags += ["-Dcupynumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir]
cmake_flags += ["-Dlegate_ROOT=%s" % str(legate_dir)]
cmake_flags += ["-DCMAKE_BUILD_PARALLEL_LEVEL=%s" % thread_count]
@@ -433,18 +435,18 @@ def validate_path(path):
}
)
- execute_command(pip_install_cmd, verbose, cwd=cunumeric_dir, env=cmd_env)
+ execute_command(pip_install_cmd, verbose, cwd=cupynumeric_dir, env=cmd_env)
def driver():
- parser = argparse.ArgumentParser(description="Install cuNumeric.")
+ parser = argparse.ArgumentParser(description="Install cuPyNumeric.")
parser.add_argument(
"--debug",
dest="debug",
action="store_true",
required=False,
default=os.environ.get("DEBUG", "0") == "1",
- help="Build cuNumeric with no optimizations.",
+ help="Build cuPyNumeric with no optimizations.",
)
parser.add_argument(
"--debug-release",
@@ -452,7 +454,7 @@ def driver():
action="store_true",
required=False,
default=os.environ.get("DEBUG_RELEASE", "0") == "1",
- help="Build cuNumeric with optimizations, but include debugging "
+ help="Build cuPyNumeric with optimizations, but include debugging "
"symbols.",
)
parser.add_argument(
@@ -461,7 +463,7 @@ def driver():
action="store_true",
required=False,
default=False,
- help="Build cuNumeric tests.",
+ help="Build cuPyNumeric tests.",
)
parser.add_argument(
"--check-bounds",
@@ -469,21 +471,21 @@ def driver():
action="store_true",
required=False,
default=False,
- help="Build cuNumeric with bounds checks.",
+ help="Build cuPyNumeric with bounds checks.",
)
parser.add_argument(
"--max-dim",
dest="maxdim",
type=int,
default=int(os.environ.get("LEGION_MAX_DIM", 4)),
- help="Maximum number of dimensions that cuNumeric will support",
+ help="Maximum number of dimensions that cuPyNumeric will support",
)
parser.add_argument(
"--max-fields",
dest="maxfields",
type=int,
default=int(os.environ.get("LEGION_MAX_FIELDS", 256)),
- help="Maximum number of fields that cuNumeric will support",
+ help="Maximum number of fields that cuPyNumeric will support",
)
parser.add_argument(
"--network",
@@ -510,7 +512,7 @@ def driver():
default=os.environ.get("OPENBLAS_PATH"),
help="Path to OpenBLAS installation directory. Note that providing a "
"user-defined BLAS library may lead to dynamic library conflicts with "
- "BLAS loaded by Python's Numpy. When using cuNumeric's BLAS, this "
+ "BLAS loaded by Python's Numpy. When using cuPyNumeric's BLAS, this "
"issue is prevented by a custom library name.",
)
parser.add_argument(
@@ -579,7 +581,7 @@ def driver():
"--cuda",
action=BooleanFlag,
default=os.environ.get("USE_CUDA", "0") == "1",
- help="Build cuNumeric with CUDA support.",
+ help="Build cuPyNumeric with CUDA support.",
)
parser.add_argument(
"--with-cuda",
@@ -601,7 +603,7 @@ def driver():
"--openmp",
action=BooleanFlag,
default=os.environ.get("USE_OPENMP", "0") == "1",
- help="Build cuNumeric with OpenMP support.",
+ help="Build cuPyNumeric with OpenMP support.",
)
parser.add_argument(
"--march",
@@ -616,7 +618,7 @@ def driver():
action="store_true",
required=False,
default=os.environ.get("USE_LLVM", "0") == "1",
- help="Build cuNumeric with LLVM support.",
+ help="Build cuPyNumeric with LLVM support.",
)
parser.add_argument(
"--hdf5",
@@ -625,7 +627,7 @@ def driver():
action="store_true",
required=False,
default=os.environ.get("USE_HDF", "0") == "1",
- help="Build cuNumeric with HDF support.",
+ help="Build cuPyNumeric with HDF support.",
)
parser.add_argument(
"--spy",
@@ -633,7 +635,7 @@ def driver():
action="store_true",
required=False,
default=os.environ.get("USE_SPY", "0") == "1",
- help="Build cuNumeric with detailed Legion Spy enabled.",
+ help="Build cuPyNumeric with detailed Legion Spy enabled.",
)
parser.add_argument(
"--conduit",
@@ -645,7 +647,7 @@ def driver():
# See https://github.com/nv-legate/legate.core/issues/294.
choices=["ibv", "ucx", "aries", "mpi"],
default=os.environ.get("CONDUIT"),
- help="Build cuNumeric with specified GASNet conduit.",
+ help="Build cuPyNumeric with specified GASNet conduit.",
)
parser.add_argument(
"--clean",
@@ -701,7 +703,7 @@ def driver():
)
args, unknown = parser.parse_known_args()
- install_cunumeric(unknown=unknown, **vars(args))
+ install_cupynumeric(unknown=unknown, **vars(args))
if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
index 022a0f0a97..cc807dbb1d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,8 +90,8 @@ warn_unused_configs = true
# legate files need to be listed here for now
# since they are included in the type check
module = [
- "cunumeric.install_info",
- "cunumeric._version",
+ "cupynumeric.install_info",
+ "cupynumeric._version",
"legate._version",
"legate.__main__",
"legate.install_info",
diff --git a/scripts/api_compare.py b/scripts/api_compare.py
index 37923157e2..7f3561fa22 100644
--- a/scripts/api_compare.py
+++ b/scripts/api_compare.py
@@ -18,9 +18,9 @@
import sys
from dataclasses import astuple, dataclass
-from cunumeric._sphinxext._comparison_config import GROUPED_CONFIGS
-from cunumeric._sphinxext._comparison_util import filter_names
-from cunumeric.coverage import is_implemented
+from cupynumeric._sphinxext._comparison_config import GROUPED_CONFIGS
+from cupynumeric._sphinxext._comparison_util import filter_names
+from cupynumeric.coverage import is_implemented
@dataclass
@@ -35,16 +35,20 @@ def get_namespaces(attr):
import cupy
import numpy
- import cunumeric
+ import cupynumeric
if attr is None:
- return numpy, cunumeric, cupy
+ return numpy, cupynumeric, cupy
- return getattr(numpy, attr), getattr(cunumeric, attr), getattr(cupy, attr)
+ return (
+ getattr(numpy, attr),
+ getattr(cupynumeric, attr),
+ getattr(cupy, attr),
+ )
def write_rows(rows):
- headers = ("group", "numpy", "cunumeric", "cupy")
+ headers = ("group", "numpy", "cupynumeric", "cupy")
writer = csv.writer(sys.stdout)
writer.writerow(headers)
for row in rows:
diff --git a/scripts/build/python/cupynumeric/CMakeLists.txt b/scripts/build/python/cupynumeric/CMakeLists.txt
new file mode 100644
index 0000000000..f0fa381c3a
--- /dev/null
+++ b/scripts/build/python/cupynumeric/CMakeLists.txt
@@ -0,0 +1,39 @@
+#=============================================================================
+# Copyright 2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+cmake_minimum_required(VERSION 3.26.4)
+
+project(cupynumeric-python VERSION 25.05.00 LANGUAGES CXX)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CUPYNUMERIC_BUILD_PIP_WHEELS ON)
+
+add_subdirectory(../../../.. cupynumeric-all)
+
+set(rpaths
+ "$ORIGIN/../../legate/lib64"
+ "$ORIGIN/../../cutensor/lib"
+ "$ORIGIN/../../nvidia/cublas/lib"
+ "$ORIGIN/../../nvidia/cufft/lib"
+ "$ORIGIN/../../nvidia/cusolver/lib"
+ "$ORIGIN/../../nvidia/cusparse/lib"
+ "$ORIGIN/../../nvidia/nvjitlink/lib"
+)
+set_property(
+ TARGET cupynumeric
+ PROPERTY INSTALL_RPATH ${rpaths}
+ APPEND
+)
diff --git a/scripts/build/python/cupynumeric/pyproject.toml b/scripts/build/python/cupynumeric/pyproject.toml
new file mode 100644
index 0000000000..ae17766497
--- /dev/null
+++ b/scripts/build/python/cupynumeric/pyproject.toml
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+
+[build-system]
+requires = [
+ "scikit-build-core",
+ "cython>=3.0.1",
+ "rich",
+]
+build-backend = "scikit_build_core.build"
+# NOTE: "python-requires" is not a valid [build-system] key (PEP 518); use "requires-python" under [project]
+
+[project]
+name = "nvidia-cupynumeric"
+authors = [{name = "NVIDIA Corporation"}]
+license = {text = "Apache-2.0"}
+description = "cuPyNumeric - drop in replacement for NumPy"
+classifiers = [
+ "Intended Audience :: Developers",
+ "Topic :: Database",
+ "Topic :: Scientific/Engineering",
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12"
+]
+dependencies = [
+ "numpy!=2.1.0",
+ "cffi",
+ "opt_einsum",
+ "legate==25.5.*,>=0.0.0a0",
+ "cutensor-cu12",
+ "nvidia-cublas-cu12",
+ "nvidia-cufft-cu12",
+ "nvidia-cusolver-cu12",
+ "nvidia-cusparse-cu12",
+ "nvidia-nvjitlink-cu12",
+]
+dynamic = ["version"]
+
+[project.urls]
+homepage = "https://github.com/nv-legate/cupynumeric"
+
+[project.entry-points."cmake.prefix"]
+cupynumeric = "cupynumeric"
+
+[tool.scikit-build.cmake]
+version = ">=3.26.4"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.setuptools_scm"
+
+[tool.scikit-build.sdist]
+include = [
+ "../../../../cupynumeric/_version.py",
+]
+
+[tool.setuptools_scm]
+write_to = "cupynumeric/_version.py"
+root = "../../../../"
+
+[tool.scikit-build.build]
+verbose = true
+
+[tool.scikit-build.logging]
+level = "DEBUG"
+
+[tool.scikit-build.wheel]
+exclude = ["**.pyx", "**CMakeLists.txt", "**.pxd"]
+install-dir = "cupynumeric"
+
+[tool.scikit-build]
+build-dir = "buildwheel"
+
+[tool.scikit-build.wheel.packages]
+"cupynumeric" = "../../../../cupynumeric"
diff --git a/scripts/conda-build.sh b/scripts/conda-build.sh
index 47a4528274..a01c27ef29 100755
--- a/scripts/conda-build.sh
+++ b/scripts/conda-build.sh
@@ -1,11 +1,11 @@
#! /usr/bin/env bash
-# mamba create -n cunumeric_build python=$PYTHON_VERSION boa git
+# mamba create -n cupynumeric_build python=$PYTHON_VERSION boa git
cd $(dirname "$(realpath "$0")")/..
-mkdir -p /tmp/conda-build/cunumeric
-rm -rf /tmp/conda-build/cunumeric/*
+mkdir -p /tmp/conda-build/cupynumeric
+rm -rf /tmp/conda-build/cupynumeric/*
PYTHON_VERSION="${PYTHON_VERSION:-3.10}"
@@ -15,7 +15,7 @@ conda mambabuild \
--override-channels \
-c conda-forge -c https://github.com/nv-legate/ucx-package/raw/main \
-c file:///tmp/conda-build/legate_core \
- --croot /tmp/conda-build/cunumeric \
+ --croot /tmp/conda-build/cupynumeric \
--no-test \
--no-verify \
--no-build-id \
diff --git a/setup.cfg b/setup.cfg
index fb6cf969a2..fd1da9c82a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,10 +3,10 @@
[versioneer]
VCS = git
style = pep440
-versionfile_source = cunumeric/_version.py
-versionfile_build = cunumeric/_version.py
+versionfile_source = cupynumeric/_version.py
+versionfile_build = cupynumeric/_version.py
tag_prefix = v
-parentdir_prefix = cunumeric-
+parentdir_prefix = cupynumeric-
[flake8]
exclude = __init__.py
@@ -31,7 +31,7 @@ known_legion=
legion_cffi
legion_top
known_first_party=
- cunumeric
+ cupynumeric
default_section=THIRDPARTY
sections=FUTURE,STDLIB,THIRDPARTY,LEGION,FIRSTPARTY,LOCALFOLDER
skip=
diff --git a/setup.py b/setup.py
index 530216c86b..bc9b8918c1 100644
--- a/setup.py
+++ b/setup.py
@@ -21,10 +21,10 @@
import versioneer
setup(
- name="cunumeric",
+ name="cupynumeric",
version=versioneer.get_version(),
description="An Aspiring Drop-In Replacement for NumPy at Scale",
- url="https://github.com/nv-legate/cunumeric",
+ url="https://github.com/nv-legate/cupynumeric",
author="NVIDIA Corporation",
license="Apache 2.0",
classifiers=[
@@ -39,11 +39,11 @@
],
packages=find_packages(
where=".",
- include=["cunumeric*"],
+ include=["cupynumeric*"],
),
- package_data={"cunumeric": ["_sphinxext/_templates/*.rst"]},
+ package_data={"cupynumeric": ["_sphinxext/_templates/*.rst"]},
include_package_data=True,
cmdclass=versioneer.get_cmdclass(),
- install_requires=["numpy>=1.22,<2"],
+ install_requires=["cffi", "numpy>=1.22,<2", "opt_einsum>=3.3"],
zip_safe=False,
)
diff --git a/src/cunumeric/cunumeric.cc b/src/cunumeric/cunumeric.cc
deleted file mode 100644
index 2f62991118..0000000000
--- a/src/cunumeric/cunumeric.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "cunumeric/cunumeric_c.h"
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/mapper.h"
-#include "cunumeric/runtime.h"
-#include "cunumeric/unary/unary_red_util.h"
-
-using namespace legate;
-
-namespace cunumeric {
-
-static const char* const cunumeric_library_name = "cunumeric";
-
-/*static*/ TaskRegistrar& CuNumericRegistrar::get_registrar()
-{
- static TaskRegistrar registrar;
- return registrar;
-}
-
-void unload_cudalibs() noexcept
-{
- auto machine = legate::get_machine();
-
- auto num_gpus = machine.count(legate::mapping::TaskTarget::GPU);
- if (0 == num_gpus) {
- return;
- }
-
- auto runtime = legate::Runtime::get_runtime();
- auto library = runtime->find_library(cunumeric_library_name);
-
- // Issue an execution fence so all outstanding tasks are done before we start destroying handles
- runtime->issue_execution_fence();
-
- runtime->submit(
- runtime->create_task(library,
- legate::LocalTaskID{CuNumericOpCode::CUNUMERIC_UNLOAD_CUDALIBS},
- legate::tuple{num_gpus}));
-}
-
-void registration_callback()
-{
- ResourceConfig config;
- config.max_tasks = CUNUMERIC_MAX_TASKS;
- config.max_reduction_ops = CUNUMERIC_MAX_REDOPS;
-
- auto runtime = legate::Runtime::get_runtime();
- auto library =
- runtime->create_library(cunumeric_library_name, config, std::make_unique());
-
- CuNumericRegistrar::get_registrar().register_all_tasks(library);
- CuNumericRuntime::initialize(runtime, library);
-
- legate::register_shutdown_callback(unload_cudalibs);
-}
-
-} // namespace cunumeric
-
-extern "C" {
-
-void cunumeric_perform_registration(void) { cunumeric::registration_callback(); }
-
-bool cunumeric_has_cusolvermp()
-{
- return LEGATE_DEFINED(LEGATE_USE_CUDA) && LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP);
-}
-}
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
deleted file mode 100644
index c569f786e1..0000000000
--- a/src/cunumeric/cunumeric_c.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef __CUNUMERIC_C_H__
-#define __CUNUMERIC_C_H__
-
-// Match these to CuNumericOpCode in config.py
-// Also, sort these alphabetically except the first one for easy lookup later
-enum CuNumericOpCode {
- _CUNUMERIC_OP_CODE_BASE = 0,
- CUNUMERIC_ADVANCED_INDEXING,
- CUNUMERIC_ARANGE,
- CUNUMERIC_ARGWHERE,
- CUNUMERIC_BATCHED_CHOLESKY,
- CUNUMERIC_BINARY_OP,
- CUNUMERIC_BINARY_RED,
- CUNUMERIC_BINCOUNT,
- CUNUMERIC_BITGENERATOR,
- CUNUMERIC_CHOOSE,
- CUNUMERIC_CONTRACT,
- CUNUMERIC_CONVERT,
- CUNUMERIC_CONVOLVE,
- CUNUMERIC_SCAN_GLOBAL,
- CUNUMERIC_SCAN_LOCAL,
- CUNUMERIC_DIAG,
- CUNUMERIC_DOT,
- CUNUMERIC_EYE,
- CUNUMERIC_FFT,
- CUNUMERIC_FILL,
- CUNUMERIC_FLIP,
- CUNUMERIC_GEMM,
- CUNUMERIC_HISTOGRAM,
- CUNUMERIC_LOAD_CUDALIBS,
- CUNUMERIC_MATMUL,
- CUNUMERIC_MATVECMUL,
- CUNUMERIC_MP_POTRF,
- CUNUMERIC_MP_SOLVE,
- CUNUMERIC_NONZERO,
- CUNUMERIC_PACKBITS,
- CUNUMERIC_POTRF,
- CUNUMERIC_PUTMASK,
- CUNUMERIC_QR,
- CUNUMERIC_RAND,
- CUNUMERIC_READ,
- CUNUMERIC_REPEAT,
- CUNUMERIC_SCALAR_UNARY_RED,
- CUNUMERIC_SEARCHSORTED,
- CUNUMERIC_SELECT,
- CUNUMERIC_SOLVE,
- CUNUMERIC_SORT,
- CUNUMERIC_SVD,
- CUNUMERIC_SYRK,
- CUNUMERIC_TILE,
- CUNUMERIC_TRANSPOSE_COPY_2D,
- CUNUMERIC_TRILU,
- CUNUMERIC_TRSM,
- CUNUMERIC_UNARY_OP,
- CUNUMERIC_UNARY_RED,
- CUNUMERIC_UNIQUE,
- CUNUMERIC_UNIQUE_REDUCE,
- CUNUMERIC_UNLOAD_CUDALIBS,
- CUNUMERIC_UNPACKBITS,
- CUNUMERIC_WHERE,
- CUNUMERIC_WINDOW,
- CUNUMERIC_WRAP,
- CUNUMERIC_WRITE,
- CUNUMERIC_ZIP,
-};
-
-// Match these to UnaryOpCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericUnaryOpCode {
- CUNUMERIC_UOP_ABSOLUTE = 1,
- CUNUMERIC_UOP_ANGLE,
- CUNUMERIC_UOP_ARCCOS,
- CUNUMERIC_UOP_ARCCOSH,
- CUNUMERIC_UOP_ARCSIN,
- CUNUMERIC_UOP_ARCSINH,
- CUNUMERIC_UOP_ARCTAN,
- CUNUMERIC_UOP_ARCTANH,
- CUNUMERIC_UOP_CBRT,
- CUNUMERIC_UOP_CEIL,
- CUNUMERIC_UOP_CLIP,
- CUNUMERIC_UOP_CONJ,
- CUNUMERIC_UOP_COPY,
- CUNUMERIC_UOP_COS,
- CUNUMERIC_UOP_COSH,
- CUNUMERIC_UOP_DEG2RAD,
- CUNUMERIC_UOP_EXP,
- CUNUMERIC_UOP_EXP2,
- CUNUMERIC_UOP_EXPM1,
- CUNUMERIC_UOP_FLOOR,
- CUNUMERIC_UOP_FREXP,
- CUNUMERIC_UOP_GETARG,
- CUNUMERIC_UOP_IMAG,
- CUNUMERIC_UOP_INVERT,
- CUNUMERIC_UOP_ISFINITE,
- CUNUMERIC_UOP_ISINF,
- CUNUMERIC_UOP_ISNAN,
- CUNUMERIC_UOP_LOG,
- CUNUMERIC_UOP_LOG10,
- CUNUMERIC_UOP_LOG1P,
- CUNUMERIC_UOP_LOG2,
- CUNUMERIC_UOP_LOGICAL_NOT,
- CUNUMERIC_UOP_MODF,
- CUNUMERIC_UOP_NEGATIVE,
- CUNUMERIC_UOP_POSITIVE,
- CUNUMERIC_UOP_RAD2DEG,
- CUNUMERIC_UOP_REAL,
- CUNUMERIC_UOP_RECIPROCAL,
- CUNUMERIC_UOP_RINT,
- CUNUMERIC_UOP_ROUND,
- CUNUMERIC_UOP_SIGN,
- CUNUMERIC_UOP_SIGNBIT,
- CUNUMERIC_UOP_SIN,
- CUNUMERIC_UOP_SINH,
- CUNUMERIC_UOP_SQRT,
- CUNUMERIC_UOP_SQUARE,
- CUNUMERIC_UOP_TAN,
- CUNUMERIC_UOP_TANH,
- CUNUMERIC_UOP_TRUNC,
-};
-
-// Match these to UnaryRedCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericUnaryRedCode {
- CUNUMERIC_RED_ALL = 1,
- CUNUMERIC_RED_ANY,
- CUNUMERIC_RED_ARGMAX,
- CUNUMERIC_RED_ARGMIN,
- CUNUMERIC_RED_CONTAINS,
- CUNUMERIC_RED_COUNT_NONZERO,
- CUNUMERIC_RED_MAX,
- CUNUMERIC_RED_MIN,
- CUNUMERIC_RED_NANARGMAX,
- CUNUMERIC_RED_NANARGMIN,
- CUNUMERIC_RED_NANMAX,
- CUNUMERIC_RED_NANMIN,
- CUNUMERIC_RED_NANPROD,
- CUNUMERIC_RED_NANSUM,
- CUNUMERIC_RED_PROD,
- CUNUMERIC_RED_SUM,
- CUNUMERIC_RED_SUM_SQUARES,
- CUNUMERIC_RED_VARIANCE
-};
-
-// Match these to BinaryOpCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericBinaryOpCode {
- CUNUMERIC_BINOP_ADD = 1,
- CUNUMERIC_BINOP_ARCTAN2,
- CUNUMERIC_BINOP_BITWISE_AND,
- CUNUMERIC_BINOP_BITWISE_OR,
- CUNUMERIC_BINOP_BITWISE_XOR,
- CUNUMERIC_BINOP_COPYSIGN,
- CUNUMERIC_BINOP_DIVIDE,
- CUNUMERIC_BINOP_EQUAL,
- CUNUMERIC_BINOP_FLOAT_POWER,
- CUNUMERIC_BINOP_FLOOR_DIVIDE,
- CUNUMERIC_BINOP_FMOD,
- CUNUMERIC_BINOP_GCD,
- CUNUMERIC_BINOP_GREATER,
- CUNUMERIC_BINOP_GREATER_EQUAL,
- CUNUMERIC_BINOP_HYPOT,
- CUNUMERIC_BINOP_ISCLOSE,
- CUNUMERIC_BINOP_LCM,
- CUNUMERIC_BINOP_LDEXP,
- CUNUMERIC_BINOP_LEFT_SHIFT,
- CUNUMERIC_BINOP_LESS,
- CUNUMERIC_BINOP_LESS_EQUAL,
- CUNUMERIC_BINOP_LOGADDEXP,
- CUNUMERIC_BINOP_LOGADDEXP2,
- CUNUMERIC_BINOP_LOGICAL_AND,
- CUNUMERIC_BINOP_LOGICAL_OR,
- CUNUMERIC_BINOP_LOGICAL_XOR,
- CUNUMERIC_BINOP_MAXIMUM,
- CUNUMERIC_BINOP_MINIMUM,
- CUNUMERIC_BINOP_MOD,
- CUNUMERIC_BINOP_MULTIPLY,
- CUNUMERIC_BINOP_NEXTAFTER,
- CUNUMERIC_BINOP_NOT_EQUAL,
- CUNUMERIC_BINOP_POWER,
- CUNUMERIC_BINOP_RIGHT_SHIFT,
- CUNUMERIC_BINOP_SUBTRACT,
-};
-
-// Match these to WindowOpCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericWindowOpCode {
- CUNUMERIC_WINDOW_BARLETT = 1,
- CUNUMERIC_WINDOW_BLACKMAN,
- CUNUMERIC_WINDOW_HAMMING,
- CUNUMERIC_WINDOW_HANNING,
- CUNUMERIC_WINDOW_KAISER,
-};
-
-// Match these to CuNumericRedopCode in config.py
-enum CuNumericRedopID {
- CUNUMERIC_ARGMAX_REDOP = 1,
- CUNUMERIC_ARGMIN_REDOP = 2,
-};
-
-enum CuNumericBounds {
- CUNUMERIC_MAX_REDOPS = 1024,
- CUNUMERIC_MAX_TASKS = 1048576,
-};
-
-// Match these to ScanCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericScanCode {
- CUNUMERIC_SCAN_PROD = 1,
- CUNUMERIC_SCAN_SUM,
-};
-
-// Match these to ConvertCode in config.py
-// Also, sort these alphabetically for easy lookup later
-enum CuNumericConvertCode {
- CUNUMERIC_CONVERT_NAN_NOOP = 1,
- CUNUMERIC_CONVERT_NAN_PROD,
- CUNUMERIC_CONVERT_NAN_SUM,
-};
-
-// Match these to BitGeneratorOperation in config.py
-enum CuNumericBitGeneratorOperation {
- CUNUMERIC_BITGENOP_CREATE = 1,
- CUNUMERIC_BITGENOP_DESTROY = 2,
- CUNUMERIC_BITGENOP_RAND_RAW = 3,
- CUNUMERIC_BITGENOP_DISTRIBUTION = 4,
-};
-
-// Match these to BitGeneratorType in config.py
-enum CuNumericBitGeneratorType {
- CUNUMERIC_BITGENTYPE_DEFAULT = 0,
- CUNUMERIC_BITGENTYPE_XORWOW = 1,
- CUNUMERIC_BITGENTYPE_MRG32K3A = 2,
- CUNUMERIC_BITGENTYPE_MTGP32 = 3,
- CUNUMERIC_BITGENTYPE_MT19937 = 4,
- CUNUMERIC_BITGENTYPE_PHILOX4_32_10 = 5,
-};
-
-// Match these to BitGeneratorDistribution in config.py
-enum CuNumericBitGeneratorDistribution {
- CUNUMERIC_BITGENDIST_INTEGERS_16 = 1,
- CUNUMERIC_BITGENDIST_INTEGERS_32,
- CUNUMERIC_BITGENDIST_INTEGERS_64,
- CUNUMERIC_BITGENDIST_UNIFORM_32,
- CUNUMERIC_BITGENDIST_UNIFORM_64,
- CUNUMERIC_BITGENDIST_LOGNORMAL_32,
- CUNUMERIC_BITGENDIST_LOGNORMAL_64,
- CUNUMERIC_BITGENDIST_NORMAL_32,
- CUNUMERIC_BITGENDIST_NORMAL_64,
- CUNUMERIC_BITGENDIST_POISSON,
- CUNUMERIC_BITGENDIST_EXPONENTIAL_32,
- CUNUMERIC_BITGENDIST_EXPONENTIAL_64,
- CUNUMERIC_BITGENDIST_GUMBEL_32,
- CUNUMERIC_BITGENDIST_GUMBEL_64,
- CUNUMERIC_BITGENDIST_LAPLACE_32,
- CUNUMERIC_BITGENDIST_LAPLACE_64,
- CUNUMERIC_BITGENDIST_LOGISTIC_32,
- CUNUMERIC_BITGENDIST_LOGISTIC_64,
- CUNUMERIC_BITGENDIST_PARETO_32,
- CUNUMERIC_BITGENDIST_PARETO_64,
- CUNUMERIC_BITGENDIST_POWER_32,
- CUNUMERIC_BITGENDIST_POWER_64,
- CUNUMERIC_BITGENDIST_RAYLEIGH_32,
- CUNUMERIC_BITGENDIST_RAYLEIGH_64,
- CUNUMERIC_BITGENDIST_CAUCHY_32,
- CUNUMERIC_BITGENDIST_CAUCHY_64,
- CUNUMERIC_BITGENDIST_TRIANGULAR_32,
- CUNUMERIC_BITGENDIST_TRIANGULAR_64,
- CUNUMERIC_BITGENDIST_WEIBULL_32,
- CUNUMERIC_BITGENDIST_WEIBULL_64,
- CUNUMERIC_BITGENDIST_BYTES,
- CUNUMERIC_BITGENDIST_BETA_32,
- CUNUMERIC_BITGENDIST_BETA_64,
- CUNUMERIC_BITGENDIST_F_32,
- CUNUMERIC_BITGENDIST_F_64,
- CUNUMERIC_BITGENDIST_LOGSERIES,
- CUNUMERIC_BITGENDIST_NONCENTRAL_F_32,
- CUNUMERIC_BITGENDIST_NONCENTRAL_F_64,
- CUNUMERIC_BITGENDIST_CHISQUARE_32,
- CUNUMERIC_BITGENDIST_CHISQUARE_64,
- CUNUMERIC_BITGENDIST_GAMMA_32,
- CUNUMERIC_BITGENDIST_GAMMA_64,
- CUNUMERIC_BITGENDIST_STANDARD_T_32,
- CUNUMERIC_BITGENDIST_STANDARD_T_64,
- CUNUMERIC_BITGENDIST_HYPERGEOMETRIC,
- CUNUMERIC_BITGENDIST_VONMISES_32,
- CUNUMERIC_BITGENDIST_VONMISES_64,
- CUNUMERIC_BITGENDIST_ZIPF,
- CUNUMERIC_BITGENDIST_GEOMETRIC,
- CUNUMERIC_BITGENDIST_WALD_32,
- CUNUMERIC_BITGENDIST_WALD_64,
- CUNUMERIC_BITGENDIST_BINOMIAL,
- CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL,
-};
-
-// These fft types match CuNumericFFTType in config.py and cufftType
-enum CuNumericFFTType {
- CUNUMERIC_FFT_R2C = 0x2a, // Real to complex (interleaved)
- CUNUMERIC_FFT_C2R = 0x2c, // Complex (interleaved) to real
- CUNUMERIC_FFT_C2C = 0x29, // Complex to complex (interleaved)
- CUNUMERIC_FFT_D2Z = 0x6a, // Double to double-complex (interleaved)
- CUNUMERIC_FFT_Z2D = 0x6c, // Double-complex (interleaved) to double
- CUNUMERIC_FFT_Z2Z = 0x69 // Double-complex to double-complex (interleaved)
-};
-
-// These fft types match CuNumericFFTDirection in config.py and cufftDirection
-enum CuNumericFFTDirection { CUNUMERIC_FFT_FORWARD = -1, CUNUMERIC_FFT_INVERSE = 1 };
-
-// Match these to Bitorder in config.py
-enum CuNumericBitorder { CUNUMERIC_BITORDER_BIG = 0, CUNUMERIC_BITORDER_LITTLE = 1 };
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct ReductionOpIds {
- int argmax_redop_id;
- int argmin_redop_id;
-} ReductionOpIds;
-
-void cunumeric_perform_registration();
-bool cunumeric_has_cusolvermp();
-
-unsigned cunumeric_max_eager_volume();
-
-unsigned cunumeric_matmul_cache_size();
-
-struct ReductionOpIds cunumeric_register_reduction_ops(int code);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // __CUNUMERIC_C_H__
diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
deleted file mode 100644
index 711ee0363e..0000000000
--- a/src/cunumeric/mapper.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "cunumeric/mapper.h"
-
-using namespace legate;
-using namespace legate::mapping;
-
-namespace cunumeric {
-
-TaskTarget CuNumericMapper::task_target(const legate::mapping::Task& task,
- const std::vector& options)
-{
- return *options.begin();
-}
-
-Scalar CuNumericMapper::tunable_value(TunableID tunable_id)
-{
- LEGATE_ABORT("cuNumeric does not use any tunable values");
-}
-
-std::vector CuNumericMapper::store_mappings(
- const mapping::Task& task, const std::vector& options)
-{
- switch (static_cast(task.task_id())) {
- case CUNUMERIC_CONVOLVE: {
- std::vector mappings;
- auto inputs = task.inputs();
- mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front()));
- mappings.push_back(StoreMapping::default_mapping(inputs[1].data(), options.front()));
- auto& input_mapping = mappings.back();
- for (uint32_t idx = 2; idx < inputs.size(); ++idx) {
- input_mapping.add_store(inputs[idx].data());
- }
- return mappings;
- }
- case CUNUMERIC_FFT: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front()));
- mappings.push_back(
- StoreMapping::default_mapping(outputs[0].data(), options.front(), true /*exact*/));
- return mappings;
- }
- case CUNUMERIC_TRANSPOSE_COPY_2D: {
- std::vector mappings;
- auto output = task.output(0);
- mappings.push_back(StoreMapping::default_mapping(output.data(), options.front()));
- mappings.back().policy().ordering.set_fortran_order();
- mappings.back().policy().exact = true;
- return std::move(mappings);
- }
- case CUNUMERIC_MATMUL: {
- std::vector mappings;
- auto inputA = task.input(1);
- auto inputB = task.input(2);
-
- mappings.push_back(
- StoreMapping::default_mapping(inputA.data(), options.front(), true /*exact*/));
- mappings.back().policy().redundant = true;
- mappings.push_back(
- StoreMapping::default_mapping(inputB.data(), options.front(), true /*exact*/));
- mappings.back().policy().redundant = true;
-
- auto outputC = task.output(0);
- mappings.push_back(
- StoreMapping::default_mapping(outputC.data(), options.front(), true /*exact*/));
-
- return mappings;
- }
- case CUNUMERIC_MATVECMUL:
- case CUNUMERIC_UNIQUE_REDUCE: {
- // TODO: Our actual requirements are a little less strict than this; we require each array or
- // vector to have a stride of 1 on at least one dimension.
- std::vector mappings;
- auto inputs = task.inputs();
- auto reductions = task.reductions();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- }
- for (auto& reduction : reductions) {
- mappings.push_back(
- StoreMapping::default_mapping(reduction.data(), options.front(), true /*exact*/));
- }
- return mappings;
- }
- case CUNUMERIC_POTRF:
- case CUNUMERIC_QR:
- case CUNUMERIC_TRSM:
- case CUNUMERIC_SOLVE:
- case CUNUMERIC_SVD:
- case CUNUMERIC_SYRK:
- case CUNUMERIC_GEMM:
- case CUNUMERIC_MP_POTRF:
- case CUNUMERIC_MP_SOLVE: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- mappings.back().policy().ordering.set_fortran_order();
- }
- for (auto& output : outputs) {
- mappings.push_back(
- StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/));
- mappings.back().policy().ordering.set_fortran_order();
- }
- return mappings;
- }
- // CHANGE: If this code is changed, make sure all layouts are
- // consistent with those assumed in batched_cholesky.cu, etc
- case CUNUMERIC_BATCHED_CHOLESKY: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- mappings.reserve(inputs.size() + outputs.size());
- for (auto& input : inputs) {
- mappings.push_back(StoreMapping::default_mapping(input.data(), options.front()));
- mappings.back().policy().exact = true;
- mappings.back().policy().ordering.set_c_order();
- }
- for (auto& output : outputs) {
- mappings.push_back(StoreMapping::default_mapping(output.data(), options.front()));
- mappings.back().policy().exact = true;
- mappings.back().policy().ordering.set_c_order();
- }
- return std::move(mappings);
- }
- case CUNUMERIC_TRILU: {
- if (task.scalars().size() == 2) {
- return {};
- }
- // If we're here, this task was the post-processing for Cholesky.
- // So we will request fortran ordering
- std::vector mappings;
- auto input = task.input(0);
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- mappings.back().policy().ordering.set_fortran_order();
- return mappings;
- }
- case CUNUMERIC_SEARCHSORTED: {
- std::vector mappings;
- auto inputs = task.inputs();
- mappings.push_back(
- StoreMapping::default_mapping(inputs[0].data(), options.front(), true /*exact*/));
- return mappings;
- }
- case CUNUMERIC_SORT: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- }
- for (auto& output : outputs) {
- mappings.push_back(
- StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/));
- }
- return mappings;
- }
- case CUNUMERIC_SCAN_LOCAL: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- }
- for (auto& output : outputs) {
- mappings.push_back(
- StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/));
- }
- return mappings;
- }
- case CUNUMERIC_SCAN_GLOBAL: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- }
- for (auto& output : outputs) {
- mappings.push_back(
- StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/));
- }
- return mappings;
- }
- case CUNUMERIC_BITGENERATOR: {
- std::vector mappings;
- auto inputs = task.inputs();
- auto outputs = task.outputs();
- for (auto& input : inputs) {
- mappings.push_back(
- StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/));
- }
- for (auto& output : outputs) {
- mappings.push_back(
- StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/));
- }
- return mappings;
- }
- default: {
- return {};
- }
- }
- assert(false);
- return {};
-}
-
-} // namespace cunumeric
diff --git a/src/cunumeric/matrix/batched_cholesky.h b/src/cunumeric/matrix/batched_cholesky.h
deleted file mode 100644
index 94713beffe..0000000000
--- a/src/cunumeric/matrix/batched_cholesky.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#pragma once
-
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/cunumeric_c.h"
-
-namespace cunumeric {
-
-class BatchedCholeskyTask : public CuNumericTask {
- public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BATCHED_CHOLESKY};
-
- public:
- static void cpu_variant(legate::TaskContext context);
-#if LEGATE_DEFINED(LEGATE_USE_OPENMP)
- static void omp_variant(legate::TaskContext context);
-#endif
-#if LEGATE_DEFINED(LEGATE_USE_CUDA)
- static void gpu_variant(legate::TaskContext context);
-#endif
-};
-
-} // namespace cunumeric
diff --git a/src/cunumeric/matrix/potrf.h b/src/cunumeric/matrix/potrf.h
deleted file mode 100644
index d2928df9fc..0000000000
--- a/src/cunumeric/matrix/potrf.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#pragma once
-
-#include "cunumeric/cunumeric_task.h"
-
-namespace cunumeric {
-
-class PotrfTask : public CuNumericTask {
- public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_POTRF};
-
- public:
- static void cpu_variant(legate::TaskContext context);
-#if LEGATE_DEFINED(LEGATE_USE_OPENMP)
- static void omp_variant(legate::TaskContext context);
-#endif
-#if LEGATE_DEFINED(LEGATE_USE_CUDA)
- static void gpu_variant(legate::TaskContext context);
-#endif
-};
-
-} // namespace cunumeric
diff --git a/src/cunumeric/random/bitgenerator_util.h b/src/cunumeric/random/bitgenerator_util.h
deleted file mode 100644
index 0a726a9f08..0000000000
--- a/src/cunumeric/random/bitgenerator_util.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#pragma once
-
-#include "cunumeric/cunumeric_task.h"
-
-namespace cunumeric {
-
-// Match these to BitGeneratorOperation in config.py
-enum class BitGeneratorOperation : int32_t {
- CREATE = CUNUMERIC_BITGENOP_CREATE,
- DESTROY = CUNUMERIC_BITGENOP_DESTROY,
- RAND_RAW = CUNUMERIC_BITGENOP_RAND_RAW,
- DISTRIBUTION = CUNUMERIC_BITGENOP_DISTRIBUTION,
-};
-
-// Match these to BitGeneratorType in config.py
-enum class BitGeneratorType : uint32_t {
- DEFAULT = CUNUMERIC_BITGENTYPE_DEFAULT,
- XORWOW = CUNUMERIC_BITGENTYPE_XORWOW,
- MRG32K3A = CUNUMERIC_BITGENTYPE_MRG32K3A,
- MTGP32 = CUNUMERIC_BITGENTYPE_MTGP32,
- MT19937 = CUNUMERIC_BITGENTYPE_MT19937,
- PHILOX4_32_10 = CUNUMERIC_BITGENTYPE_PHILOX4_32_10,
-};
-
-// Match these to BitGeneratorDistribution in config.py
-enum class BitGeneratorDistribution : int32_t {
- INTEGERS_16 = CUNUMERIC_BITGENDIST_INTEGERS_16,
- INTEGERS_32 = CUNUMERIC_BITGENDIST_INTEGERS_32,
- INTEGERS_64 = CUNUMERIC_BITGENDIST_INTEGERS_64,
- UNIFORM_32 = CUNUMERIC_BITGENDIST_UNIFORM_32,
- UNIFORM_64 = CUNUMERIC_BITGENDIST_UNIFORM_64,
- LOGNORMAL_32 = CUNUMERIC_BITGENDIST_LOGNORMAL_32,
- LOGNORMAL_64 = CUNUMERIC_BITGENDIST_LOGNORMAL_64,
- NORMAL_32 = CUNUMERIC_BITGENDIST_NORMAL_32,
- NORMAL_64 = CUNUMERIC_BITGENDIST_NORMAL_64,
- POISSON = CUNUMERIC_BITGENDIST_POISSON,
- EXPONENTIAL_32 = CUNUMERIC_BITGENDIST_EXPONENTIAL_32,
- EXPONENTIAL_64 = CUNUMERIC_BITGENDIST_EXPONENTIAL_64,
- GUMBEL_32 = CUNUMERIC_BITGENDIST_GUMBEL_32,
- GUMBEL_64 = CUNUMERIC_BITGENDIST_GUMBEL_64,
- LAPLACE_32 = CUNUMERIC_BITGENDIST_LAPLACE_32,
- LAPLACE_64 = CUNUMERIC_BITGENDIST_LAPLACE_64,
- LOGISTIC_32 = CUNUMERIC_BITGENDIST_LOGISTIC_32,
- LOGISTIC_64 = CUNUMERIC_BITGENDIST_LOGISTIC_64,
- PARETO_32 = CUNUMERIC_BITGENDIST_PARETO_32,
- PARETO_64 = CUNUMERIC_BITGENDIST_PARETO_64,
- POWER_32 = CUNUMERIC_BITGENDIST_POWER_32,
- POWER_64 = CUNUMERIC_BITGENDIST_POWER_64,
- RAYLEIGH_32 = CUNUMERIC_BITGENDIST_RAYLEIGH_32,
- RAYLEIGH_64 = CUNUMERIC_BITGENDIST_RAYLEIGH_64,
- CAUCHY_32 = CUNUMERIC_BITGENDIST_CAUCHY_32,
- CAUCHY_64 = CUNUMERIC_BITGENDIST_CAUCHY_64,
- TRIANGULAR_32 = CUNUMERIC_BITGENDIST_TRIANGULAR_32,
- TRIANGULAR_64 = CUNUMERIC_BITGENDIST_TRIANGULAR_64,
- WEIBULL_32 = CUNUMERIC_BITGENDIST_WEIBULL_32,
- WEIBULL_64 = CUNUMERIC_BITGENDIST_WEIBULL_64,
- BYTES = CUNUMERIC_BITGENDIST_BYTES,
- BETA_32 = CUNUMERIC_BITGENDIST_BETA_32,
- BETA_64 = CUNUMERIC_BITGENDIST_BETA_64,
- F_32 = CUNUMERIC_BITGENDIST_F_32,
- F_64 = CUNUMERIC_BITGENDIST_F_64,
- LOGSERIES = CUNUMERIC_BITGENDIST_LOGSERIES,
- NONCENTRAL_F_32 = CUNUMERIC_BITGENDIST_NONCENTRAL_F_32,
- NONCENTRAL_F_64 = CUNUMERIC_BITGENDIST_NONCENTRAL_F_64,
- CHISQUARE_32 = CUNUMERIC_BITGENDIST_CHISQUARE_32,
- CHISQUARE_64 = CUNUMERIC_BITGENDIST_CHISQUARE_64,
- GAMMA_32 = CUNUMERIC_BITGENDIST_GAMMA_32,
- GAMMA_64 = CUNUMERIC_BITGENDIST_GAMMA_64,
- STANDARD_T_32 = CUNUMERIC_BITGENDIST_STANDARD_T_32,
- STANDARD_T_64 = CUNUMERIC_BITGENDIST_STANDARD_T_64,
- HYPERGEOMETRIC = CUNUMERIC_BITGENDIST_HYPERGEOMETRIC,
- VONMISES_32 = CUNUMERIC_BITGENDIST_VONMISES_32,
- VONMISES_64 = CUNUMERIC_BITGENDIST_VONMISES_64,
- ZIPF = CUNUMERIC_BITGENDIST_ZIPF,
- GEOMETRIC = CUNUMERIC_BITGENDIST_GEOMETRIC,
- WALD_32 = CUNUMERIC_BITGENDIST_WALD_32,
- WALD_64 = CUNUMERIC_BITGENDIST_WALD_64,
- BINOMIAL = CUNUMERIC_BITGENDIST_BINOMIAL,
- NEGATIVE_BINOMIAL = CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL,
-};
-
-} // namespace cunumeric
diff --git a/src/cunumeric/runtime.cc b/src/cunumeric/runtime.cc
deleted file mode 100644
index ff6afc92fc..0000000000
--- a/src/cunumeric/runtime.cc
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "env_defaults.h"
-#include "cunumeric/runtime.h"
-
-#include "cunumeric/ndarray.h"
-#include "cunumeric/unary/unary_red_util.h"
-
-#include <charconv>
-#include <cstdlib>
-#include <stdexcept>
-
-namespace cunumeric {
-
-/*static*/ CuNumericRuntime* CuNumericRuntime::runtime_;
-
-extern void bootstrapping_callback(Legion::Machine machine,
- Legion::Runtime* runtime,
- const std::set<Legion::Processor>& local_procs);
-
-void initialize(int32_t argc, char** argv) { cunumeric_perform_registration(); }
-
-CuNumericRuntime::CuNumericRuntime(legate::Runtime* legate_runtime, legate::Library library)
- : legate_runtime_(legate_runtime), library_(library)
-{
-}
-
-NDArray CuNumericRuntime::create_array(const legate::Type& type)
-{
- auto store = legate_runtime_->create_store(type);
- return NDArray(std::move(store));
-}
-
-NDArray CuNumericRuntime::create_array(std::vector<uint64_t> shape,
- const legate::Type& type,
- bool optimize_scalar)
-{
- auto store = legate_runtime_->create_store(legate::Shape{shape}, type, optimize_scalar);
- return NDArray(std::move(store));
-}
-
-NDArray CuNumericRuntime::create_array(legate::LogicalStore&& store)
-{
- return NDArray(std::move(store));
-}
-
-NDArray CuNumericRuntime::create_array(const legate::Type& type, int32_t dim)
-{
- auto store = legate_runtime_->create_store(type, dim);
- return NDArray(std::move(store));
-}
-
-legate::LogicalStore CuNumericRuntime::create_scalar_store(const Scalar& value)
-{
- return legate_runtime_->create_store(value);
-}
-
-legate::Type CuNumericRuntime::get_argred_type(const legate::Type& value_type)
-{
- auto finder = argred_types_.find(value_type.code());
- if (finder != argred_types_.end()) {
- return finder->second;
- }
-
- auto argred_type = legate::struct_type({legate::int64(), value_type}, true /*align*/);
- argred_types_.insert({value_type.code(), argred_type});
- return argred_type;
-}
-
-legate::AutoTask CuNumericRuntime::create_task(CuNumericOpCode op_code)
-{
- return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code});
-}
-
-legate::ManualTask CuNumericRuntime::create_task(CuNumericOpCode op_code,
- const legate::tuple<std::uint64_t>& launch_shape)
-{
- return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code}, launch_shape);
-}
-
-void CuNumericRuntime::submit(legate::AutoTask&& task) { legate_runtime_->submit(std::move(task)); }
-
-void CuNumericRuntime::submit(legate::ManualTask&& task)
-{
- legate_runtime_->submit(std::move(task));
-}
-
-uint32_t CuNumericRuntime::get_next_random_epoch() { return next_epoch_++; }
-
-/*static*/ CuNumericRuntime* CuNumericRuntime::get_runtime() { return runtime_; }
-
-/*static*/ void CuNumericRuntime::initialize(legate::Runtime* legate_runtime,
- legate::Library library)
-{
- runtime_ = new CuNumericRuntime(legate_runtime, library);
-}
-
-namespace {
-
-std::uint32_t extract_env(const char* env_name,
- std::uint32_t default_value,
- std::uint32_t test_value)
-{
- auto parse_value = [](const char* value_char) {
- auto value_sv = std::string_view{value_char};
-
- std::uint32_t result{};
- if (auto&& [_, ec] = std::from_chars(value_sv.begin(), value_sv.end(), result);
- ec != std::errc{}) {
- throw std::runtime_error{std::make_error_code(ec).message()};
- }
-
- return result;
- };
-
- if (const auto* env_value = std::getenv(env_name); env_value) {
- return parse_value(env_value);
- }
-
- if (const auto* is_in_test_mode = std::getenv("LEGATE_TEST");
- is_in_test_mode && parse_value(is_in_test_mode)) {
- return test_value;
- }
-
- return default_value;
-}
-
-} // namespace
-
-} // namespace cunumeric
-
-extern "C" {
-
-unsigned cunumeric_max_eager_volume()
-{
- static const auto min_gpu_chunk =
- cunumeric::extract_env("CUNUMERIC_MIN_GPU_CHUNK", MIN_GPU_CHUNK_DEFAULT, MIN_GPU_CHUNK_TEST);
- static const auto min_cpu_chunk =
- cunumeric::extract_env("CUNUMERIC_MIN_CPU_CHUNK", MIN_CPU_CHUNK_DEFAULT, MIN_CPU_CHUNK_TEST);
- static const auto min_omp_chunk =
- cunumeric::extract_env("CUNUMERIC_MIN_OMP_CHUNK", MIN_OMP_CHUNK_DEFAULT, MIN_OMP_CHUNK_TEST);
-
- auto machine = legate::get_machine();
-
- if (machine.count(legate::mapping::TaskTarget::GPU) > 0) {
- return min_gpu_chunk;
- }
- if (machine.count(legate::mapping::TaskTarget::OMP) > 0) {
- return min_omp_chunk;
- }
- return min_cpu_chunk;
-}
-
-unsigned cunumeric_matmul_cache_size()
-{
- static const auto max_cache_size = cunumeric::extract_env(
- "CUNUMERIC_MATMUL_CACHE_SIZE", MATMUL_CACHE_SIZE_DEFAULT, MATMUL_CACHE_SIZE_TEST);
- return max_cache_size;
-}
-
-} // extern "C"
diff --git a/src/cunumeric/set/unique.h b/src/cunumeric/set/unique.h
deleted file mode 100644
index ab6ed17cd7..0000000000
--- a/src/cunumeric/set/unique.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2024 NVIDIA Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#pragma once
-
-#include "cunumeric/cunumeric_task.h"
-
-namespace cunumeric {
-
-class UniqueTask : public CuNumericTask<UniqueTask> {
- public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNIQUE};
-
- public:
- static void cpu_variant(legate::TaskContext context);
-#if LEGATE_DEFINED(LEGATE_USE_OPENMP)
- static void omp_variant(legate::TaskContext context);
-#endif
-#if LEGATE_DEFINED(LEGATE_USE_CUDA)
- static void gpu_variant(legate::TaskContext context);
-#endif
-};
-
-} // namespace cunumeric
diff --git a/src/cunumeric.h b/src/cupynumeric.h
similarity index 85%
rename from src/cunumeric.h
rename to src/cupynumeric.h
index dfd752c834..fe598bd438 100644
--- a/src/cunumeric.h
+++ b/src/cupynumeric.h
@@ -14,6 +14,6 @@
*
*/
-#include "cunumeric/ndarray.h"
-#include "cunumeric/operators.h"
-#include "cunumeric/slice.h"
+#include "cupynumeric/ndarray.h"
+#include "cupynumeric/operators.h"
+#include "cupynumeric/slice.h"
diff --git a/src/cunumeric/arg.h b/src/cupynumeric/arg.h
similarity index 96%
rename from src/cunumeric/arg.h
rename to src/cupynumeric/arg.h
index 1dd91b12b1..70803223d4 100644
--- a/src/cunumeric/arg.h
+++ b/src/cupynumeric/arg.h
@@ -18,7 +18,7 @@
#include "legate.h"
-namespace cunumeric {
+namespace cupynumeric {
template
class Argval {
@@ -95,6 +95,6 @@ class ArgminReduction {
}
};
-} // namespace cunumeric
+} // namespace cupynumeric
-#include "cunumeric/arg.inl"
+#include "cupynumeric/arg.inl"
diff --git a/src/cunumeric/arg.inl b/src/cupynumeric/arg.inl
similarity index 98%
rename from src/cunumeric/arg.inl
rename to src/cupynumeric/arg.inl
index 5c0ba9b689..995b516486 100644
--- a/src/cunumeric/arg.inl
+++ b/src/cupynumeric/arg.inl
@@ -19,7 +19,7 @@
// Useful for IDEs
#include "arg.h"
-namespace cunumeric {
+namespace cupynumeric {
template
__CUDA_HD__ Argval::Argval(T v) : arg(LLONG_MIN), arg_value(v)
@@ -143,4 +143,4 @@ DECLARE_IDENTITIES(uint64_t)
#undef DECLARE_ARGMIN_IDENTITY
#undef DECLARE_ARGMAX_IDENTITY
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/arg_redop_register.cc b/src/cupynumeric/arg_redop_register.cc
similarity index 89%
rename from src/cunumeric/arg_redop_register.cc
rename to src/cupynumeric/arg_redop_register.cc
index 7fdf450ac3..2e7372bb06 100644
--- a/src/cunumeric/arg_redop_register.cc
+++ b/src/cupynumeric/arg_redop_register.cc
@@ -14,11 +14,11 @@
*
*/
-#include "cunumeric/arg_redop_register.h"
+#include "cupynumeric/arg_redop_register.h"
#include
-namespace cunumeric {
+namespace cupynumeric {
#define DEFINE_ARGMAX_IDENTITY(TYPE) \
template <> \
@@ -58,15 +58,15 @@ register_reduction_op_fn::register_reduction_op_fn::next_reduction_operator_id()
return legate::LocalRedopID{next_redop_id++};
}
-} // namespace cunumeric
+} // namespace cupynumeric
#if !LEGATE_DEFINED(LEGATE_USE_CUDA)
extern "C" {
-ReductionOpIds cunumeric_register_reduction_ops(int code)
+ReductionOpIds cupynumeric_register_reduction_ops(int code)
{
return legate::type_dispatch(static_cast<legate::Type::Code>(code),
- cunumeric::register_reduction_op_fn{});
+ cupynumeric::register_reduction_op_fn{});
}
}
#endif
diff --git a/src/cunumeric/arg_redop_register.cu b/src/cupynumeric/arg_redop_register.cu
similarity index 79%
rename from src/cunumeric/arg_redop_register.cu
rename to src/cupynumeric/arg_redop_register.cu
index 076d02a029..c48ed286a3 100644
--- a/src/cunumeric/arg_redop_register.cu
+++ b/src/cupynumeric/arg_redop_register.cu
@@ -14,13 +14,13 @@
*
*/
-#include "cunumeric/arg_redop_register.h"
+#include "cupynumeric/arg_redop_register.h"
extern "C" {
-ReductionOpIds cunumeric_register_reduction_ops(int code)
+ReductionOpIds cupynumeric_register_reduction_ops(int code)
{
return legate::type_dispatch(static_cast<legate::Type::Code>(code),
- cunumeric::register_reduction_op_fn{});
+ cupynumeric::register_reduction_op_fn{});
}
}
diff --git a/src/cunumeric/arg_redop_register.h b/src/cupynumeric/arg_redop_register.h
similarity index 89%
rename from src/cunumeric/arg_redop_register.h
rename to src/cupynumeric/arg_redop_register.h
index 05b764c8e0..68e6e65a63 100644
--- a/src/cunumeric/arg_redop_register.h
+++ b/src/cupynumeric/arg_redop_register.h
@@ -17,10 +17,10 @@
#pragma once
#include "legate.h"
-#include "cunumeric/cunumeric_c.h"
-#include "cunumeric/arg.h"
+#include "cupynumeric/cupynumeric_c.h"
+#include "cupynumeric/arg.h"
-namespace cunumeric {
+namespace cupynumeric {
struct register_reduction_op_fn {
template ::value>* = nullptr>
@@ -29,7 +29,7 @@ struct register_reduction_op_fn {
using VAL = legate::type_of;
ReductionOpIds result;
auto runtime = legate::Runtime::get_runtime();
- auto context = runtime->find_library("cunumeric");
+ auto context = runtime->find_library("cupynumeric");
result.argmax_redop_id = static_cast(
context.register_reduction_operator>(next_reduction_operator_id()));
result.argmin_redop_id = static_cast(
@@ -47,4 +47,4 @@ struct register_reduction_op_fn {
static legate::LocalRedopID next_reduction_operator_id();
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op.cc b/src/cupynumeric/binary/binary_op.cc
similarity index 87%
rename from src/cunumeric/binary/binary_op.cc
rename to src/cupynumeric/binary/binary_op.cc
index 64a810d981..e8a271b729 100644
--- a/src/cunumeric/binary/binary_op.cc
+++ b/src/cupynumeric/binary/binary_op.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/binary/binary_op.h"
-#include "cunumeric/binary/binary_op_template.inl"
+#include "cupynumeric/binary/binary_op.h"
+#include "cupynumeric/binary/binary_op_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -60,7 +60,10 @@ struct BinaryOpImplBody {
namespace // unnamed
{
-static void __attribute__((constructor)) register_tasks(void) { BinaryOpTask::register_variants(); }
+static const auto cupynumeric_reg_task_ = []() -> char {
+ BinaryOpTask::register_variants();
+ return 0;
+}();
} // namespace
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op.cu b/src/cupynumeric/binary/binary_op.cu
similarity index 92%
rename from src/cunumeric/binary/binary_op.cu
rename to src/cupynumeric/binary/binary_op.cu
index d00fa66e7d..ea7f68f4c9 100644
--- a/src/cunumeric/binary/binary_op.cu
+++ b/src/cupynumeric/binary/binary_op.cu
@@ -14,12 +14,12 @@
*
*/
-#include "cunumeric/binary/binary_op.h"
-#include "cunumeric/binary/binary_op_template.inl"
+#include "cupynumeric/binary/binary_op.h"
+#include "cupynumeric/binary/binary_op_template.inl"
-#include "cunumeric/cuda_help.h"
+#include "cupynumeric/cuda_help.h"
-namespace cunumeric {
+namespace cupynumeric {
template
static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
@@ -82,7 +82,7 @@ struct BinaryOpImplBody {
generic_kernel<<>>(
volume, func, out, in1, in2, pitches, rect);
}
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
};
@@ -91,4 +91,4 @@ struct BinaryOpImplBody {
binary_op_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op.h b/src/cupynumeric/binary/binary_op.h
similarity index 77%
rename from src/cunumeric/binary/binary_op.h
rename to src/cupynumeric/binary/binary_op.h
index 8bdf29d7d5..34ac087835 100644
--- a/src/cunumeric/binary/binary_op.h
+++ b/src/cupynumeric/binary/binary_op.h
@@ -16,10 +16,10 @@
#pragma once
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/binary/binary_op_util.h"
+#include "cupynumeric/cupynumeric_task.h"
+#include "cupynumeric/binary/binary_op_util.h"
-namespace cunumeric {
+namespace cupynumeric {
struct BinaryOpArgs {
legate::PhysicalStore in1;
@@ -29,9 +29,10 @@ struct BinaryOpArgs {
std::vector args;
};
-class BinaryOpTask : public CuNumericTask<BinaryOpTask> {
+class BinaryOpTask : public CuPyNumericTask<BinaryOpTask> {
public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BINARY_OP};
+ static inline const auto TASK_CONFIG =
+ legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BINARY_OP}};
public:
static void cpu_variant(legate::TaskContext context);
@@ -43,4 +44,4 @@ class BinaryOpTask : public CuNumericTask {
#endif
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op_omp.cc b/src/cupynumeric/binary/binary_op_omp.cc
similarity index 92%
rename from src/cunumeric/binary/binary_op_omp.cc
rename to src/cupynumeric/binary/binary_op_omp.cc
index 684296a53a..9d4542d5f9 100644
--- a/src/cunumeric/binary/binary_op_omp.cc
+++ b/src/cupynumeric/binary/binary_op_omp.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/binary/binary_op.h"
-#include "cunumeric/binary/binary_op_template.inl"
+#include "cupynumeric/binary/binary_op.h"
+#include "cupynumeric/binary/binary_op_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -60,4 +60,4 @@ struct BinaryOpImplBody {
binary_op_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op_template.inl b/src/cupynumeric/binary/binary_op_template.inl
similarity index 94%
rename from src/cunumeric/binary/binary_op_template.inl
rename to src/cupynumeric/binary/binary_op_template.inl
index e3f5acbf44..01869a1922 100644
--- a/src/cunumeric/binary/binary_op_template.inl
+++ b/src/cupynumeric/binary/binary_op_template.inl
@@ -17,11 +17,11 @@
#pragma once
// Useful for IDEs
-#include "cunumeric/binary/binary_op.h"
-#include "cunumeric/binary/binary_op_util.h"
-#include "cunumeric/pitches.h"
+#include "cupynumeric/binary/binary_op.h"
+#include "cupynumeric/binary/binary_op_util.h"
+#include "cupynumeric/pitches.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -94,4 +94,4 @@ static void binary_op_template(TaskContext& context)
op_dispatch(args.op_code, BinaryOpDispatch{}, args);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op_util.cc b/src/cupynumeric/binary/binary_op_util.cc
similarity index 90%
rename from src/cunumeric/binary/binary_op_util.cc
rename to src/cupynumeric/binary/binary_op_util.cc
index 180c9d9c02..0f90b40b12 100644
--- a/src/cunumeric/binary/binary_op_util.cc
+++ b/src/cupynumeric/binary/binary_op_util.cc
@@ -14,13 +14,13 @@
*
*/
-#include "cunumeric/binary/binary_op_util.h"
+#include "cupynumeric/binary/binary_op_util.h"
-namespace cunumeric {
+namespace cupynumeric {
std::vector broadcast_shapes(std::vector arrays)
{
-#ifdef DEBUG_CUNUMERIC
+#ifdef DEBUG_CUPYNUMERIC
assert(!arrays.empty());
#endif
int32_t dim = 0;
@@ -46,4 +46,4 @@ std::vector broadcast_shapes(std::vector arrays)
return result;
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_op_util.h b/src/cupynumeric/binary/binary_op_util.h
similarity index 94%
rename from src/cunumeric/binary/binary_op_util.h
rename to src/cupynumeric/binary/binary_op_util.h
index 55189409ea..84c8a88cdb 100644
--- a/src/cunumeric/binary/binary_op_util.h
+++ b/src/cupynumeric/binary/binary_op_util.h
@@ -16,47 +16,47 @@
#pragma once
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/ndarray.h"
+#include "cupynumeric/cupynumeric_task.h"
+#include "cupynumeric/ndarray.h"
-namespace cunumeric {
+namespace cupynumeric {
enum class BinaryOpCode : int {
- ADD = CUNUMERIC_BINOP_ADD,
- ARCTAN2 = CUNUMERIC_BINOP_ARCTAN2,
- BITWISE_AND = CUNUMERIC_BINOP_BITWISE_AND,
- BITWISE_OR = CUNUMERIC_BINOP_BITWISE_OR,
- BITWISE_XOR = CUNUMERIC_BINOP_BITWISE_XOR,
- COPYSIGN = CUNUMERIC_BINOP_COPYSIGN,
- DIVIDE = CUNUMERIC_BINOP_DIVIDE,
- EQUAL = CUNUMERIC_BINOP_EQUAL,
- FLOAT_POWER = CUNUMERIC_BINOP_FLOAT_POWER,
- FLOOR_DIVIDE = CUNUMERIC_BINOP_FLOOR_DIVIDE,
- FMOD = CUNUMERIC_BINOP_FMOD,
- GCD = CUNUMERIC_BINOP_GCD,
- GREATER = CUNUMERIC_BINOP_GREATER,
- GREATER_EQUAL = CUNUMERIC_BINOP_GREATER_EQUAL,
- HYPOT = CUNUMERIC_BINOP_HYPOT,
- ISCLOSE = CUNUMERIC_BINOP_ISCLOSE,
- LCM = CUNUMERIC_BINOP_LCM,
- LDEXP = CUNUMERIC_BINOP_LDEXP,
- LEFT_SHIFT = CUNUMERIC_BINOP_LEFT_SHIFT,
- LESS = CUNUMERIC_BINOP_LESS,
- LESS_EQUAL = CUNUMERIC_BINOP_LESS_EQUAL,
- LOGADDEXP = CUNUMERIC_BINOP_LOGADDEXP,
- LOGADDEXP2 = CUNUMERIC_BINOP_LOGADDEXP2,
- LOGICAL_AND = CUNUMERIC_BINOP_LOGICAL_AND,
- LOGICAL_OR = CUNUMERIC_BINOP_LOGICAL_OR,
- LOGICAL_XOR = CUNUMERIC_BINOP_LOGICAL_XOR,
- MAXIMUM = CUNUMERIC_BINOP_MAXIMUM,
- MINIMUM = CUNUMERIC_BINOP_MINIMUM,
- MOD = CUNUMERIC_BINOP_MOD,
- MULTIPLY = CUNUMERIC_BINOP_MULTIPLY,
- NEXTAFTER = CUNUMERIC_BINOP_NEXTAFTER,
- NOT_EQUAL = CUNUMERIC_BINOP_NOT_EQUAL,
- POWER = CUNUMERIC_BINOP_POWER,
- RIGHT_SHIFT = CUNUMERIC_BINOP_RIGHT_SHIFT,
- SUBTRACT = CUNUMERIC_BINOP_SUBTRACT,
+ ADD = CUPYNUMERIC_BINOP_ADD,
+ ARCTAN2 = CUPYNUMERIC_BINOP_ARCTAN2,
+ BITWISE_AND = CUPYNUMERIC_BINOP_BITWISE_AND,
+ BITWISE_OR = CUPYNUMERIC_BINOP_BITWISE_OR,
+ BITWISE_XOR = CUPYNUMERIC_BINOP_BITWISE_XOR,
+ COPYSIGN = CUPYNUMERIC_BINOP_COPYSIGN,
+ DIVIDE = CUPYNUMERIC_BINOP_DIVIDE,
+ EQUAL = CUPYNUMERIC_BINOP_EQUAL,
+ FLOAT_POWER = CUPYNUMERIC_BINOP_FLOAT_POWER,
+ FLOOR_DIVIDE = CUPYNUMERIC_BINOP_FLOOR_DIVIDE,
+ FMOD = CUPYNUMERIC_BINOP_FMOD,
+ GCD = CUPYNUMERIC_BINOP_GCD,
+ GREATER = CUPYNUMERIC_BINOP_GREATER,
+ GREATER_EQUAL = CUPYNUMERIC_BINOP_GREATER_EQUAL,
+ HYPOT = CUPYNUMERIC_BINOP_HYPOT,
+ ISCLOSE = CUPYNUMERIC_BINOP_ISCLOSE,
+ LCM = CUPYNUMERIC_BINOP_LCM,
+ LDEXP = CUPYNUMERIC_BINOP_LDEXP,
+ LEFT_SHIFT = CUPYNUMERIC_BINOP_LEFT_SHIFT,
+ LESS = CUPYNUMERIC_BINOP_LESS,
+ LESS_EQUAL = CUPYNUMERIC_BINOP_LESS_EQUAL,
+ LOGADDEXP = CUPYNUMERIC_BINOP_LOGADDEXP,
+ LOGADDEXP2 = CUPYNUMERIC_BINOP_LOGADDEXP2,
+ LOGICAL_AND = CUPYNUMERIC_BINOP_LOGICAL_AND,
+ LOGICAL_OR = CUPYNUMERIC_BINOP_LOGICAL_OR,
+ LOGICAL_XOR = CUPYNUMERIC_BINOP_LOGICAL_XOR,
+ MAXIMUM = CUPYNUMERIC_BINOP_MAXIMUM,
+ MINIMUM = CUPYNUMERIC_BINOP_MINIMUM,
+ MOD = CUPYNUMERIC_BINOP_MOD,
+ MULTIPLY = CUPYNUMERIC_BINOP_MULTIPLY,
+ NEXTAFTER = CUPYNUMERIC_BINOP_NEXTAFTER,
+ NOT_EQUAL = CUPYNUMERIC_BINOP_NOT_EQUAL,
+ POWER = CUPYNUMERIC_BINOP_POWER,
+ RIGHT_SHIFT = CUPYNUMERIC_BINOP_RIGHT_SHIFT,
+ SUBTRACT = CUPYNUMERIC_BINOP_SUBTRACT,
};
template
@@ -913,4 +913,4 @@ using rhs2_of_binary_op = typename RHS2OfBinaryOp::type;
std::vector broadcast_shapes(std::vector arrays);
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_red.cc b/src/cupynumeric/binary/binary_red.cc
similarity index 89%
rename from src/cunumeric/binary/binary_red.cc
rename to src/cupynumeric/binary/binary_red.cc
index 576347b37d..89ad585ccf 100644
--- a/src/cunumeric/binary/binary_red.cc
+++ b/src/cupynumeric/binary/binary_red.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/binary/binary_red.h"
-#include "cunumeric/binary/binary_red_template.inl"
+#include "cupynumeric/binary/binary_red.h"
+#include "cupynumeric/binary/binary_red_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -66,10 +66,10 @@ struct BinaryRedImplBody {
namespace // unnamed
{
-static void __attribute__((constructor)) register_tasks(void)
-{
+const auto cupynumeric_reg_task_ = []() -> char {
BinaryRedTask::register_variants();
-}
+ return 0;
+}();
} // namespace
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_red.cu b/src/cupynumeric/binary/binary_red.cu
similarity index 92%
rename from src/cunumeric/binary/binary_red.cu
rename to src/cupynumeric/binary/binary_red.cu
index 4623e43bdc..47544a5ab4 100644
--- a/src/cunumeric/binary/binary_red.cu
+++ b/src/cupynumeric/binary/binary_red.cu
@@ -14,12 +14,12 @@
*
*/
-#include "cunumeric/binary/binary_red.h"
-#include "cunumeric/binary/binary_red_template.inl"
+#include "cupynumeric/binary/binary_red.h"
+#include "cupynumeric/binary/binary_red_template.inl"
-#include "cunumeric/cuda_help.h"
+#include "cupynumeric/cuda_help.h"
-namespace cunumeric {
+namespace cupynumeric {
template
static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
@@ -82,7 +82,7 @@ struct BinaryRedImplBody {
}
copy_kernel<<<1, 1, 0, stream>>>(result, out);
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
};
@@ -91,4 +91,4 @@ struct BinaryRedImplBody {
binary_red_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_red.h b/src/cupynumeric/binary/binary_red.h
similarity index 72%
rename from src/cunumeric/binary/binary_red.h
rename to src/cupynumeric/binary/binary_red.h
index 28ca9f030f..906300e95b 100644
--- a/src/cunumeric/binary/binary_red.h
+++ b/src/cupynumeric/binary/binary_red.h
@@ -16,10 +16,10 @@
#pragma once
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/binary/binary_op_util.h"
+#include "cupynumeric/cupynumeric_task.h"
+#include "cupynumeric/binary/binary_op_util.h"
-namespace cunumeric {
+namespace cupynumeric {
struct BinaryRedArgs {
legate::PhysicalStore out;
@@ -29,9 +29,12 @@ struct BinaryRedArgs {
std::vector args;
};
-class BinaryRedTask : public CuNumericTask<BinaryRedTask> {
+class BinaryRedTask : public CuPyNumericTask<BinaryRedTask> {
public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BINARY_RED};
+ static inline const auto TASK_CONFIG =
+ legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BINARY_RED}};
+
+ static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
public:
static void cpu_variant(legate::TaskContext context);
@@ -43,4 +46,4 @@ class BinaryRedTask : public CuNumericTask {
#endif
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_red_omp.cc b/src/cupynumeric/binary/binary_red_omp.cc
similarity index 92%
rename from src/cunumeric/binary/binary_red_omp.cc
rename to src/cupynumeric/binary/binary_red_omp.cc
index f3823c2031..021f99943b 100644
--- a/src/cunumeric/binary/binary_red_omp.cc
+++ b/src/cupynumeric/binary/binary_red_omp.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/binary/binary_red.h"
-#include "cunumeric/binary/binary_red_template.inl"
+#include "cupynumeric/binary/binary_red.h"
+#include "cupynumeric/binary/binary_red_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -65,4 +65,4 @@ struct BinaryRedImplBody {
binary_red_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/binary/binary_red_template.inl b/src/cupynumeric/binary/binary_red_template.inl
similarity index 94%
rename from src/cunumeric/binary/binary_red_template.inl
rename to src/cupynumeric/binary/binary_red_template.inl
index e1971f5b45..15bdf9201f 100644
--- a/src/cunumeric/binary/binary_red_template.inl
+++ b/src/cupynumeric/binary/binary_red_template.inl
@@ -17,11 +17,11 @@
#pragma once
// Useful for IDEs
-#include "cunumeric/binary/binary_red.h"
-#include "cunumeric/binary/binary_op_util.h"
-#include "cunumeric/pitches.h"
+#include "cupynumeric/binary/binary_red.h"
+#include "cupynumeric/binary/binary_op_util.h"
+#include "cupynumeric/pitches.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -98,4 +98,4 @@ static void binary_red_template(TaskContext& context)
reduce_op_dispatch(args.op_code, BinaryRedDispatch{}, args);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/bits_util.h b/src/cupynumeric/bits/bits_util.h
similarity index 79%
rename from src/cunumeric/bits/bits_util.h
rename to src/cupynumeric/bits/bits_util.h
index bd3294f19f..3e8cd6d077 100644
--- a/src/cunumeric/bits/bits_util.h
+++ b/src/cupynumeric/bits/bits_util.h
@@ -16,13 +16,13 @@
#pragma once
-#include "cunumeric/cunumeric_c.h"
+#include "cupynumeric/cupynumeric_c.h"
-namespace cunumeric {
+namespace cupynumeric {
enum class Bitorder {
- BIG = CUNUMERIC_BITORDER_BIG,
- LITTLE = CUNUMERIC_BITORDER_LITTLE,
+ BIG = CUPYNUMERIC_BITORDER_BIG,
+ LITTLE = CUPYNUMERIC_BITORDER_LITTLE,
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/packbits.cc b/src/cupynumeric/bits/packbits.cc
similarity index 88%
rename from src/cunumeric/bits/packbits.cc
rename to src/cupynumeric/bits/packbits.cc
index 41b056c1d8..f563a5d4fe 100644
--- a/src/cunumeric/bits/packbits.cc
+++ b/src/cupynumeric/bits/packbits.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/bits/packbits.h"
-#include "cunumeric/bits/packbits_template.inl"
+#include "cupynumeric/bits/packbits.h"
+#include "cupynumeric/bits/packbits_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -57,7 +57,10 @@ struct PackbitsImplBody {
namespace // unnamed
{
-static void __attribute__((constructor)) register_tasks(void) { PackbitsTask::register_variants(); }
+static const auto cupynumeric_reg_task_ = []() -> char {
+ PackbitsTask::register_variants();
+ return 0;
+}();
} // namespace
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/packbits.cu b/src/cupynumeric/bits/packbits.cu
similarity index 93%
rename from src/cunumeric/bits/packbits.cu
rename to src/cupynumeric/bits/packbits.cu
index 541bed6e8a..2252275cfd 100644
--- a/src/cunumeric/bits/packbits.cu
+++ b/src/cupynumeric/bits/packbits.cu
@@ -14,11 +14,11 @@
*
*/
-#include "cunumeric/bits/packbits.h"
-#include "cunumeric/bits/packbits_template.inl"
-#include "cunumeric/cuda_help.h"
+#include "cupynumeric/bits/packbits.h"
+#include "cupynumeric/bits/packbits_template.inl"
+#include "cupynumeric/cuda_help.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -76,7 +76,7 @@ struct PackbitsImplBody {
in_hi_axis,
axis);
}
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
};
@@ -85,4 +85,4 @@ struct PackbitsImplBody {
packbits_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/packbits.h b/src/cupynumeric/bits/packbits.h
similarity index 91%
rename from src/cunumeric/bits/packbits.h
rename to src/cupynumeric/bits/packbits.h
index f24497fe73..6d32bef8a1 100644
--- a/src/cunumeric/bits/packbits.h
+++ b/src/cupynumeric/bits/packbits.h
@@ -16,10 +16,10 @@
#pragma once
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/bits/bits_util.h"
+#include "cupynumeric/cupynumeric_task.h"
+#include "cupynumeric/bits/bits_util.h"
-namespace cunumeric {
+namespace cupynumeric {
template
struct Pack;
@@ -101,9 +101,10 @@ struct Pack {
}
};
-class PackbitsTask : public CuNumericTask<PackbitsTask> {
+class PackbitsTask : public CuPyNumericTask<PackbitsTask> {
public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_PACKBITS};
+ static inline const auto TASK_CONFIG =
+ legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_PACKBITS}};
public:
static void cpu_variant(legate::TaskContext context);
@@ -115,4 +116,4 @@ class PackbitsTask : public CuNumericTask {
#endif
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/packbits_omp.cc b/src/cupynumeric/bits/packbits_omp.cc
similarity index 93%
rename from src/cunumeric/bits/packbits_omp.cc
rename to src/cupynumeric/bits/packbits_omp.cc
index c4dd57dd8c..18b39c9a55 100644
--- a/src/cunumeric/bits/packbits_omp.cc
+++ b/src/cupynumeric/bits/packbits_omp.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/bits/packbits.h"
-#include "cunumeric/bits/packbits_template.inl"
+#include "cupynumeric/bits/packbits.h"
+#include "cupynumeric/bits/packbits_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -57,4 +57,4 @@ struct PackbitsImplBody {
packbits_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/packbits_template.inl b/src/cupynumeric/bits/packbits_template.inl
similarity index 95%
rename from src/cunumeric/bits/packbits_template.inl
rename to src/cupynumeric/bits/packbits_template.inl
index 9046b85410..6b84138f0b 100644
--- a/src/cunumeric/bits/packbits_template.inl
+++ b/src/cupynumeric/bits/packbits_template.inl
@@ -17,10 +17,10 @@
#pragma once
// Useful for IDEs
-#include "cunumeric/bits/packbits.h"
-#include "cunumeric/pitches.h"
+#include "cupynumeric/bits/packbits.h"
+#include "cupynumeric/pitches.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -50,13 +50,13 @@ struct PackbitsImpl {
auto aligned_rect = out_rect;
int64_t axis_extent = in_rect.hi[axis] - in_rect.lo[axis] + 1;
aligned_rect.hi[axis] = aligned_rect.lo[axis] + axis_extent / 8 - 1;
-#ifdef DEBUG_CUNUMERIC
+#ifdef DEBUG_CUPYNUMERIC
assert(aligned_rect.hi[axis] <= out_rect.hi[axis]);
#endif
auto unaligned_rect = out_rect;
unaligned_rect.lo[axis] = aligned_rect.hi[axis] + 1;
-#ifdef DEBUG_CUNUMERIC
+#ifdef DEBUG_CUPYNUMERIC
assert(unaligned_rect.union_bbox(aligned_rect) == out_rect);
#endif
@@ -106,4 +106,4 @@ static void packbits_template(TaskContext& context)
}
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/unpackbits.cc b/src/cupynumeric/bits/unpackbits.cc
similarity index 86%
rename from src/cunumeric/bits/unpackbits.cc
rename to src/cupynumeric/bits/unpackbits.cc
index 15217c5e86..2be36a8287 100644
--- a/src/cunumeric/bits/unpackbits.cc
+++ b/src/cupynumeric/bits/unpackbits.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/bits/unpackbits.h"
-#include "cunumeric/bits/unpackbits_template.inl"
+#include "cupynumeric/bits/unpackbits.h"
+#include "cupynumeric/bits/unpackbits_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -45,10 +45,10 @@ struct UnpackbitsImplBody {
namespace // unnamed
{
-static void __attribute__((constructor)) register_tasks(void)
-{
+const auto cupynumeric_reg_task_ = []() -> char {
UnpackbitsTask::register_variants();
-}
+ return 0;
+}();
} // namespace
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/unpackbits.cu b/src/cupynumeric/bits/unpackbits.cu
similarity index 89%
rename from src/cunumeric/bits/unpackbits.cu
rename to src/cupynumeric/bits/unpackbits.cu
index 71413618a6..f1b5b66890 100644
--- a/src/cunumeric/bits/unpackbits.cu
+++ b/src/cupynumeric/bits/unpackbits.cu
@@ -14,12 +14,12 @@
*
*/
-#include "cunumeric/bits/unpackbits.h"
-#include "cunumeric/bits/unpackbits_template.inl"
+#include "cupynumeric/bits/unpackbits.h"
+#include "cupynumeric/bits/unpackbits_template.inl"
-#include "cunumeric/cuda_help.h"
+#include "cupynumeric/cuda_help.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -55,7 +55,7 @@ struct UnpackbitsImplBody {
const size_t blocks = (in_volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
generic_kernel<<>>(
in_volume, unpack, out, in, in_pitches, in_rect.lo, axis);
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
};
@@ -64,4 +64,4 @@ struct UnpackbitsImplBody {
unpackbits_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/unpackbits.h b/src/cupynumeric/bits/unpackbits.h
similarity index 86%
rename from src/cunumeric/bits/unpackbits.h
rename to src/cupynumeric/bits/unpackbits.h
index 96b5d39e03..92061ae43b 100644
--- a/src/cunumeric/bits/unpackbits.h
+++ b/src/cupynumeric/bits/unpackbits.h
@@ -16,10 +16,10 @@
#pragma once
-#include "cunumeric/cunumeric_task.h"
-#include "cunumeric/bits/bits_util.h"
+#include "cupynumeric/cupynumeric_task.h"
+#include "cupynumeric/bits/bits_util.h"
-namespace cunumeric {
+namespace cupynumeric {
template
struct Unpack;
@@ -58,9 +58,10 @@ struct Unpack {
}
};
-class UnpackbitsTask : public CuNumericTask {
+class UnpackbitsTask : public CuPyNumericTask {
public:
- static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNPACKBITS};
+ static inline const auto TASK_CONFIG =
+ legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNPACKBITS}};
public:
static void cpu_variant(legate::TaskContext context);
@@ -72,4 +73,4 @@ class UnpackbitsTask : public CuNumericTask {
#endif
};
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/unpackbits_omp.cc b/src/cupynumeric/bits/unpackbits_omp.cc
similarity index 90%
rename from src/cunumeric/bits/unpackbits_omp.cc
rename to src/cupynumeric/bits/unpackbits_omp.cc
index 02151be529..3f12a5355d 100644
--- a/src/cunumeric/bits/unpackbits_omp.cc
+++ b/src/cupynumeric/bits/unpackbits_omp.cc
@@ -14,10 +14,10 @@
*
*/
-#include "cunumeric/bits/unpackbits.h"
-#include "cunumeric/bits/unpackbits_template.inl"
+#include "cupynumeric/bits/unpackbits.h"
+#include "cupynumeric/bits/unpackbits_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -44,4 +44,4 @@ struct UnpackbitsImplBody {
unpackbits_template(context);
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/bits/unpackbits_template.inl b/src/cupynumeric/bits/unpackbits_template.inl
similarity index 94%
rename from src/cunumeric/bits/unpackbits_template.inl
rename to src/cupynumeric/bits/unpackbits_template.inl
index 0763818c47..2a710b8c01 100644
--- a/src/cunumeric/bits/unpackbits_template.inl
+++ b/src/cupynumeric/bits/unpackbits_template.inl
@@ -17,10 +17,10 @@
#pragma once
// Useful for IDEs
-#include "cunumeric/bits/unpackbits.h"
-#include "cunumeric/pitches.h"
+#include "cupynumeric/bits/unpackbits.h"
+#include "cupynumeric/pitches.h"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -77,4 +77,4 @@ static void unpackbits_template(TaskContext& context)
}
}
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/cephes/chbevl.cc b/src/cupynumeric/cephes/chbevl.cc
similarity index 100%
rename from src/cunumeric/cephes/chbevl.cc
rename to src/cupynumeric/cephes/chbevl.cc
diff --git a/src/cunumeric/cephes/i0.cc b/src/cupynumeric/cephes/i0.cc
similarity index 100%
rename from src/cunumeric/cephes/i0.cc
rename to src/cupynumeric/cephes/i0.cc
diff --git a/src/cunumeric/convolution/convolve.cc b/src/cupynumeric/convolution/convolve.cc
similarity index 96%
rename from src/cunumeric/convolution/convolve.cc
rename to src/cupynumeric/convolution/convolve.cc
index 653933507e..9335606175 100644
--- a/src/cunumeric/convolution/convolve.cc
+++ b/src/cupynumeric/convolution/convolve.cc
@@ -14,11 +14,11 @@
*
*/
-#include "cunumeric/divmod.h"
-#include "cunumeric/convolution/convolve.h"
-#include "cunumeric/convolution/convolve_template.inl"
+#include "cupynumeric/divmod.h"
+#include "cupynumeric/convolution/convolve.h"
+#include "cupynumeric/convolution/convolve_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
// This is the easy to understand functional specification of the
// algorithm, but it is commented out in favor of the faster one
@@ -82,7 +82,8 @@ struct ConvolveImplBody {
AccessorRO in,
const Rect& root_rect,
const Rect& subrect,
- const Rect& filter_rect) const
+ const Rect& filter_rect,
+ CuPyNumericConvolveMethod method) const
{
const Point one = Point::ONES();
Point extents = filter_rect.hi - filter_rect.lo + one;
@@ -272,7 +273,10 @@ struct ConvolveImplBody {
namespace // unnamed
{
-static void __attribute__((constructor)) register_tasks(void) { ConvolveTask::register_variants(); }
+static const auto cupynumeric_reg_task_ = []() -> char {
+ ConvolveTask::register_variants();
+ return 0;
+}();
} // namespace
-} // namespace cunumeric
+} // namespace cupynumeric
diff --git a/src/cunumeric/convolution/convolve.cu b/src/cupynumeric/convolution/convolve.cu
similarity index 93%
rename from src/cunumeric/convolution/convolve.cu
rename to src/cupynumeric/convolution/convolve.cu
index c2c271577a..6cdacd3b9f 100644
--- a/src/cunumeric/convolution/convolve.cu
+++ b/src/cupynumeric/convolution/convolve.cu
@@ -14,12 +14,12 @@
*
*/
-#include "cunumeric/divmod.h"
-#include "cunumeric/cuda_help.h"
-#include "cunumeric/convolution/convolve.h"
-#include "cunumeric/convolution/convolve_template.inl"
+#include "cupynumeric/divmod.h"
+#include "cupynumeric/cuda_help.h"
+#include "cupynumeric/convolution/convolve.h"
+#include "cupynumeric/convolution/convolve_template.inl"
-namespace cunumeric {
+namespace cupynumeric {
using namespace legate;
@@ -744,7 +744,7 @@ __host__ static inline void launch_small_tile_kernel(AccessorWO out,
out, filter, in, root_rect, subrect, filter_rect, args);
}
}
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
template
@@ -766,24 +766,24 @@ __host__ void direct_convolution(AccessorWO out,
static unsigned long long mask = 0;
if (!(mask & (1 << device))) {
if (properties.sharedMemPerBlock < max_smem_size) {
- CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1,
- cudaFuncAttributeMaxDynamicSharedMemorySize,
- max_smem_size));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2,
- cudaFuncAttributeMaxDynamicSharedMemorySize,
- max_smem_size));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_large_tile,
- cudaFuncAttributeMaxDynamicSharedMemorySize,
- max_smem_size));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ max_smem_size));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ max_smem_size));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_large_tile,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ max_smem_size));
}
if (sizeof(VAL) >= 8) {
// Only need to set this on the first invocation
- CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1,
- cudaSharedMemBankSizeEightByte));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2,
- cudaSharedMemBankSizeEightByte));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_large_tile,
- cudaSharedMemBankSizeEightByte));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1,
+ cudaSharedMemBankSizeEightByte));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2,
+ cudaSharedMemBankSizeEightByte));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(
+ convolution_large_tile, cudaSharedMemBankSizeEightByte));
}
// Make sure we have enough bits for every device
assert(device < (8 * sizeof(mask)));
@@ -848,7 +848,7 @@ __host__ void direct_convolution(AccessorWO out,
}
if (out_dense) {
size_t bytes = sizeof(VAL) * out_pitch;
- CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(out_ptr, 0, bytes));
+ CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(out_ptr, 0, bytes));
} else {
out_pitch = 1;
ConvolutionInitArgs args;
@@ -1168,7 +1168,7 @@ __host__ void direct_convolution(AccessorWO out,
one,
args);
}
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
}
}
@@ -1299,7 +1299,8 @@ __host__ static inline void cufft_convolution(AccessorWO out,
AccessorRO in,
const Rect& root_rect,
const Rect& subrect,
- const Rect& filter_rect)
+ const Rect& filter_rect,
+ CuPyNumericConvolveMethod method)
{
int device = get_device_ordinal();
auto& properties = get_device_properties();
@@ -1310,19 +1311,19 @@ __host__ static inline void cufft_convolution(AccessorWO out,
static unsigned long long mask = 0;
if (!(mask & (1 << device))) {
if (properties.sharedMemPerBlock < max_smem_size) {
- CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1,
- cudaFuncAttributeMaxDynamicSharedMemorySize,
- max_smem_size));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2,
- cudaFuncAttributeMaxDynamicSharedMemorySize,
- max_smem_size));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ max_smem_size));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ max_smem_size));
}
if (sizeof(VAL) >= 8) {
// Only need to set this on the first invocation
- CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1,
- cudaSharedMemBankSizeEightByte));
- CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2,
- cudaSharedMemBankSizeEightByte));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1,
+ cudaSharedMemBankSizeEightByte));
+ CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2,
+ cudaSharedMemBankSizeEightByte));
}
// Make sure we have enough bits for every device
assert(device < (8 * sizeof(mask)));
@@ -1354,7 +1355,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
for (int d = 0; d < DIM; d++) {
smem_size *= (tile[d] + 2 * centers[d]);
}
- if (smem_size <= max_smem_size) {
+ if (method != CUPYNUMERIC_CONVOLVE_FFT && smem_size <= max_smem_size) {
launch_small_tile_kernel(out,
filter,
in,
@@ -1405,7 +1406,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
// Zero pad and copy in the input data
auto signal_buffer = create_buffer(buffersize, Memory::GPU_FB_MEM, 128 /*alignment*/);
VAL* signal_ptr = signal_buffer.ptr(zero);
- CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream));
+ CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream));
// Check to see if the input pointer is dense and we can do this with a CUDA memcpy
size_t strides[DIM];
const VAL* input_ptr = in.ptr(input_bounds, strides);
@@ -1421,7 +1422,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
// Zero pad and copy in the filter data
auto filter_buffer = create_buffer(buffersize, Memory::GPU_FB_MEM, 128 /*alignment*/);
VAL* filter_ptr = filter_buffer.ptr(zero);
- CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream));
+ CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream));
const VAL* filt_ptr = filter.ptr(filter_rect, strides);
pitch = 1;
for (int d = DIM - 1; d >= 0; d--) {
@@ -1432,7 +1433,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
copy_into_buffer<<>>(
filter, filter_buffer, filter_rect.lo, copy_pitches, pitch);
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
auto forward_plan = get_cufft_plan(ForwardPlanType::value, cufftPlanParams(fftsize));
auto backward_plan = get_cufft_plan(BackwardPlanType::value, cufftPlanParams(fftsize));
@@ -1455,7 +1456,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
// FFT the filter data
cufft_execute_forward(forward_plan.handle(), filter_ptr, filter_ptr);
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
// Perform the pointwise multiplcation
{
@@ -1492,13 +1493,13 @@ __host__ static inline void cufft_convolution(AccessorWO out,
copy_from_buffer<<>>(
filter_ptr, out, buffer_offset, subrect.lo, copy_pitches, fft_pitches, pitch, scaling_factor);
- CUNUMERIC_CHECK_CUDA_STREAM(stream);
+ CUPYNUMERIC_CHECK_CUDA_STREAM(stream);
#if 0
// This is useful debugging code for finding the output
VAL *buffer = (VAL*)malloc(buffervolume*sizeof(VAL));
- CUNUMERIC_CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) );
- CUNUMERIC_CHECK_CUDA( cudaStreamSynchronize(stream) );
+ CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) );
+ CUPYNUMERIC_CHECK_CUDA( cudaStreamSynchronize(stream) );
for (unsigned idx = 0; idx < buffervolume; idx++) {
if ((idx % fftsize[DIM-1]) == 0)
printf("\n");
@@ -1515,7 +1516,7 @@ __host__ static inline void cufft_convolution(AccessorWO out,
/////////////
template
-struct UseCUFFT {
+struct CanUseCUFFT {
static constexpr bool value = 1 <= DIM && DIM <= 3 && std::is_floating_point::value;
};
@@ -1523,24 +1524,34 @@ template
struct ConvolveImplBody {
using VAL = type_of;
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
__host__ void dispatch(AccessorWO<_VAL, _DIM> out,
AccessorRO<_VAL, _DIM> filter,
AccessorRO<_VAL, _DIM> in,
const Rect<_DIM>& root_rect,
const Rect<_DIM>& subrect,
- const Rect<_DIM>& filter_rect) const
+ const Rect<_DIM>& filter_rect,
+ CuPyNumericConvolveMethod method) const
{
- cufft_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect);
+ if (method == CUPYNUMERIC_CONVOLVE_DIRECT) {
+ direct_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect);
+ } else {
+ cufft_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect, method);
+ }
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
__host__ void dispatch(AccessorWO<_VAL, _DIM> out,
AccessorRO<_VAL, _DIM> filter,
AccessorRO<_VAL, _DIM> in,
const Rect<_DIM>& root_rect,
const Rect<_DIM>& subrect,
- const Rect<_DIM>& filter_rect) const
+ const Rect<_DIM>& filter_rect,
+ CuPyNumericConvolveMethod method) const
{
direct_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect);
}
@@ -1550,9 +1561,10 @@ struct ConvolveImplBody {
AccessorRO in,
const Rect& root_rect,
const Rect& subrect,
- const Rect& filter_rect) const
+ const Rect& filter_rect,
+ CuPyNumericConvolveMethod method) const
{
- dispatch(out, filter, in, root_rect, subrect, filter_rect);
+ dispatch(out, filter, in, root_rect, subrect, filter_rect, method);
}
};
@@ -1561,4 +1573,4 @@ struct ConvolveImplBody {
convolve_template