diff --git a/.gitattributes b/.gitattributes index 8ae3c80128..1215d42fca 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -cunumeric/_version.py export-subst +cupynumeric/_version.py export-subst diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5ac9b710d8..b310312985 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,4 @@ # Code Ownership -.github @marcinz @m3vaz @sandeepd-nv @mag1cp1n -continuous_integration @marcinz @m3vaz @sandeepd-nv @mag1cp1n -conda @marcinz @m3vaz @sandeepd-nv @mag1cp1n +.github @nv-legate/devops-reviewers +continuous_integration @nv-legate/devops-reviewers +conda @nv-legate/devops-reviewers diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 39f252254f..74fb1d45b1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -8,7 +8,7 @@ body: value: "# Bug report" - type: markdown attributes: - value: Thank you for reporting a bug and helping us improve Cunumeric! + value: Thank you for reporting a bug and helping us improve cuPyNumeric! 
- type: markdown attributes: value: > @@ -29,7 +29,7 @@ body: Platform : Linux-6.8.0-40-generic-x86_64-with-glibc2.35 Legion : (failed to detect) Legate : 24.05.00+255.g2656afbd - Cunumeric : 24.05.00+132.gc4741d57 + cuPyNumeric : 24.05.00+132.gc4741d57 Numpy : 1.26.4 Scipy : 1.13.1 Numba : (failed to detect) diff --git a/.github/workflows/ci-gh-docs.yml b/.github/workflows/ci-gh-docs.yml new file mode 100644 index 0000000000..349cf79e95 --- /dev/null +++ b/.github/workflows/ci-gh-docs.yml @@ -0,0 +1,46 @@ +--- +name: Docs + +concurrency: + group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-docs-on-{0}-from-{1}', github.event_name, github.ref_name) }} + cancel-in-progress: true + +on: + push: + branches: + - "pull-request/[0-9]+" + - "branch-*" + - "main" + merge_group: + +jobs: + build-and-test: + name: Build documentation (${{ matrix.platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}, ucx enabled) + strategy: + fail-fast: false + matrix: + platform: + - linux + target-device: + - gpu + build-mode: + - release + uses: + ./.github/workflows/gh-build-docs.yml + with: + platform: ${{ matrix.platform }} + target-device: ${{ matrix.target-device }} + build-mode: ${{ matrix.build-mode }} + build-type: ci + upload-docs-to-gh-pages: false + secrets: inherit + + docs-pass: + if: always() + needs: + - build-and-test + runs-on: linux-amd64-cpu4 + steps: + - name: Check job results + if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + run: exit 1 diff --git a/.github/workflows/ci-gh-nightly-release.yml b/.github/workflows/ci-gh-nightly-release.yml index 0b214d2c63..46b887687c 100644 --- a/.github/workflows/ci-gh-nightly-release.yml +++ b/.github/workflows/ci-gh-nightly-release.yml @@ -30,11 +30,42 @@ jobs: uses: ./.github/workflows/gh-build-and-test.yml with: - build-type: release - dependencies-workflow: ci-gh-nightly-release.yml + build-type: nightly platform: ${{ 
matrix.platform }} python-version: ${{ matrix.python-version }} target-device: ${{ matrix.target-device }} upload-enabled: ${{ matrix.upload-enabled }} - waive-gpu-tests: ${{ github.workflow == 'Build Release package' && matrix.platform == 'linux-aarch64' }} + refname: ${{ github.ref_name }} + default-branch: ${{ github.event.repository.default_branch }} + secrets: inherit + + build-nightly-docs: + name: Build Nightly documentation (${{ matrix.platform }}, ${{ matrix.target-device }}, ${{ matrix.build-mode }}, ucx enabled) + strategy: + fail-fast: false + matrix: + platform: + - linux + target-device: + - gpu + build-mode: + - release + uses: + ./.github/workflows/gh-build-docs.yml + with: + platform: ${{ matrix.platform }} + target-device: ${{ matrix.target-device }} + build-mode: ${{ matrix.build-mode }} + build-type: nightly + upload-docs-to-gh-pages: true + secrets: inherit + + push_code: + name: Nightly source release + uses: + nv-legate/legate-gh-ci/.github/workflows/gh-push-code.yml@nightly_push_to_external_repo + with: + runs-on: linux-amd64-cpu4 + source-repo: "${{ github.repository_owner }}/cupynumeric.internal" + dest-repo: "${{ github.repository_owner }}/cupynumeric" secrets: inherit diff --git a/.github/workflows/ci-gh-validate-legate-sha.yml b/.github/workflows/ci-gh-validate-legate-sha.yml index d15982ca3c..9e2309a233 100644 --- a/.github/workflows/ci-gh-validate-legate-sha.yml +++ b/.github/workflows/ci-gh-validate-legate-sha.yml @@ -20,7 +20,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: - path: cunumeric.internal + path: cupynumeric.internal - name: Set up environment run: | @@ -30,7 +30,7 @@ jobs: - name: Parse versions.json shell: bash --noprofile --norc -xeuo pipefail {0} run: | - DEPENDENCIES_FILE="cunumeric.internal/cmake/versions.json" + DEPENDENCIES_FILE="cupynumeric.internal/cmake/versions.json" GIT_REPO=$(jq -r '.packages.legate.repo' ${DEPENDENCIES_FILE}) GIT_ORG=$(jq -r '.packages.legate.org' 
${DEPENDENCIES_FILE}) GIT_TAG=$(jq -r '.packages.legate.git_tag' ${DEPENDENCIES_FILE}) diff --git a/.github/workflows/ci-gh-release.yml b/.github/workflows/ci-gh.yml similarity index 67% rename from .github/workflows/ci-gh-release.yml rename to .github/workflows/ci-gh.yml index 654fad29ef..4bb50dd233 100644 --- a/.github/workflows/ci-gh-release.yml +++ b/.github/workflows/ci-gh.yml @@ -1,4 +1,4 @@ -name: Build Release package +name: Build CI package concurrency: group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }} @@ -33,10 +33,21 @@ jobs: uses: ./.github/workflows/gh-build-and-test.yml with: - build-type: release + build-type: ci platform: ${{ matrix.platform }} - python-version: "3.10" + python-version: ${{ matrix.python-version }} target-device: ${{ matrix.target-device }} upload-enabled: ${{ matrix.upload-enabled }} - waive-gpu-tests: ${{ github.workflow == 'Build Release package' && matrix.platform == 'linux-aarch64' }} + refname: ${{ github.ref_name }} + default-branch: ${{ github.event.repository.default_branch }} secrets: inherit + + tests-pass: + if: always() + needs: + - build-and-test + runs-on: linux-amd64-cpu4 + steps: + - name: Check job results + if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + run: exit 1 diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index b9890641a0..06bd77b2ff 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -13,18 +13,16 @@ on: upload-enabled: type: boolean required: true - waive-gpu-tests: - required: true - type: boolean - description: Waive GPU tests based on specific configuration python-version: required: false type: string default: "3.12" - dependencies-workflow: - required: false + refname: + required: true + type: string + default-branch: + required: true type: string - 
default: ci-gh.yml jobs: setup-build: @@ -51,14 +49,14 @@ jobs: needs: setup-build name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, Python ${{ inputs.python-version }})" uses: - nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.17 + nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.29 with: + build-has-tests: ${{ !inputs.upload-enabled }} build-mode: "" build-type: ${{ inputs.build-type }} client-repo: ${{ github.event.repository.name }} - dependencies-file: "cmake/versions.json" - dependencies-workflow: ${{ inputs.dependencies-workflow }} - legate-gh-ci-tag: "v1.17" + dependencies-file: "" + legate-gh-ci-tag: "v1.29" network: "ucx" platform: ${{ inputs.platform }} python-version: ${{ inputs.python-version }} @@ -68,45 +66,29 @@ jobs: use-container: ${{ inputs.platform == 'linux' || inputs.platform == 'linux-aarch64' }} secrets: inherit - nightly-exists: - needs: setup-build - name: "Check if legate.internal nightly exists for SHA specified in versions.json (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }})" - uses: - nv-legate/legate-gh-ci/.github/workflows/gh-check-if-nightly-exists-for-all-dependencies.yml@v1.17 - with: - build-mode: "" - build-type: ${{ inputs.build-type }} - client-repo: ${{ github.event.repository.name }} - dependencies-file: "cmake/versions.json" - legate-gh-ci-tag: "v1.17" - network: "ucx" - platform: ${{ inputs.platform }} - python-version: ${{ inputs.python-version }} - runs-on: linux-amd64-cpu4 - target-device: ${{ inputs.target-device }} - upload-enabled: ${{ inputs.upload-enabled }} - secrets: inherit - upload: needs: build if: ${{ github.repository_owner == 'nv-legate' && contains(github.workflow, 'release') && inputs.upload-enabled == true }} name: Upload package to Server uses: - nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.17 + nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.29 with: + build-has-tests: ${{ 
!inputs.upload-enabled }} build-mode: "" build-type: ${{ inputs.build-type }} client-repo: ${{ github.event.repository.name }} - legate-gh-ci-tag: "v1.17" + legate-gh-ci-tag: "v1.29" name: Upload package to Server network: "ucx" - pkgSubString: "cunumeric-" + pkgSubString: "cupynumeric-" platform: ${{ inputs.platform }} python-version: ${{ inputs.python-version }} - repos-Root: "cunumeric" + repos-Root: "cupynumeric" target-device: ${{ inputs.target-device }} - upload-action: "upload-package" + upload-action: "upload-package-Anaconda" upload-enabled: ${{ inputs.upload-enabled }} + refname: ${{ inputs.refname }} + default-branch: ${{ inputs.default-branch }} secrets: inherit setup-test: @@ -121,20 +103,25 @@ jobs: - id: set-matrix run: | set -xeuo pipefail + MATRIX_JSON='{"include": [' + RUNNERS=( - 'linux-amd64-gpu-v100-latest-1:gpu:gpu:linux' 'linux-amd64-2gpu:gpu:2gpu:linux' + 'linux-amd64-gpu-l4-latest-1:gpu:gpu:linux' 'linux-amd64-2gpu:gpu:2gpu:linux' 'linux-amd64-cpu16:cpu:cpu:linux' 'linux-arm64-cpu16:cpu:cpu:linux-aarch64' 'linux-aarch64-2gpu:gpu:2gpu:linux-aarch64' 'linux-aarch64-2gpu:gpu:gpu:linux-aarch64' 'macos-latest:cpu:cpu:mac') + TEST_CONFIGS=( '1 CPU test:test --cpus 1 --debug:cpu' '1 CPU test:test --cpus 1 --debug:gpu' '2 CPU test:test --cpus 2 --debug:cpu' '2 CPU test:test --cpus 2 --debug:gpu' - # set the number of workers manually because nvidia runners report 6 gpus when onyl one is really available - # this workaround can be removed when the number of available gpus is reported correctly (when we run on VMs) - 'GPU test:test --use cuda --gpus 1 -j 7 --debug:gpu' + # Set the number of workers manually because nvidia runners report 6 + # gpus when only one is really available this workaround can be + # removed when the number of available gpus is reported correctly + # (when we run on VMs) + 'GPU test:test --use cuda --gpus 1 --debug:gpu' '2 GPU test:test --use cuda --gpus 2 --debug:2gpu' 'OpenMP test:test --use openmp --omps 1 --ompthreads 2 
--debug:gpu' 'OpenMP test:test --use openmp --omps 1 --ompthreads 2 --debug:cpu' @@ -143,31 +130,49 @@ jobs: 'Eager execution test:test --use eager --debug:gpu' 'Eager execution test:test --use eager --debug:cpu' 'mypy:mypy:cpu' - 'Documentation:docs:cpu' 'Unit tests:unit:cpu' + 'CPP tests:cpp:cpu' + # TODO: Uncomment the following lines once + # https://github.com/nv-legate/cupynumeric.internal/issues/654 has + # been fixed. + # 'CPP tests:cpp:gpu' + # 'CPP tests:cpp:2gpu' ) + for RUNNER in "${RUNNERS[@]}"; do IFS=':' read -ra RUNNER_INFO <<< "$RUNNER" RUNNER_NAME=${RUNNER_INFO[0]} RUNNER_TYPE=${RUNNER_INFO[1]} RUNNER_DEVICE=${RUNNER_INFO[2]} RUNNER_PLATFORM=${RUNNER_INFO[3]} + if [[ "$RUNNER_TYPE" == "${{ inputs.target-device }}" && "$RUNNER_PLATFORM" == "${{ inputs.platform }}" ]]; then + for TEST_CONFIG in "${TEST_CONFIGS[@]}"; do IFS=':' read -ra CONFIG_INFO <<< "$TEST_CONFIG" TEST_NAME=${CONFIG_INFO[0]} TEST_OPTIONS=${CONFIG_INFO[1]} TEST_TARGET_DEVICE=${CONFIG_INFO[2]} + + # Note: we don't have enough linux-aarch64 GPU runners to + # support per commit testing. This is why these tests are waived + # here. + WAIVE_TEST="${{ inputs.target-device == 'gpu' && inputs.build-type == 'ci' && inputs.platform == 'linux-aarch64' }}" + if [[ "$TEST_TARGET_DEVICE" == "$RUNNER_DEVICE" ]]; then - if ! 
[[ "$TEST_NAME" =~ "GPU" && "${{ inputs.waive-gpu-tests }}" == 'true' ]]; then + if [[ "${WAIVE_TEST}" == "false" ]]; then MATRIX_JSON+="{\"runner\": {\"name\": \"$RUNNER_NAME\", \"type\": \"$RUNNER_TYPE\", \"platform\": \"$RUNNER_PLATFORM\"}, \"test-config\": {\"name\": \"$TEST_NAME\", \"test-options\": \"$TEST_OPTIONS\"}}," fi fi done fi done - MATRIX_JSON=$(echo "$MATRIX_JSON" | sed 's/,$//') # Remove the trailing comma + + # Remove the trailing comma + MATRIX_JSON=$(echo "$MATRIX_JSON" | sed 's/,$//') + # Terminate JSON expression MATRIX_JSON+=']}' + echo "matrix=$MATRIX_JSON" >> $GITHUB_OUTPUT test: @@ -180,13 +185,14 @@ jobs: matrix: ${{fromJson(needs.setup-test.outputs.matrix)}} uses: - nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.17 + nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.29 with: + build-has-tests: ${{ !inputs.upload-enabled }} build-mode: "" build-type: ${{ inputs.build-type }} client-repo: ${{ github.event.repository.name }} has-gpu: ${{ matrix.runner.type == 'gpu' }} - legate-gh-ci-tag: "v1.17" + legate-gh-ci-tag: "v1.29" name: ${{ matrix.test-config.name }} network: "ucx" platform: ${{ inputs.platform }} @@ -200,21 +206,24 @@ jobs: updateTestStatus: needs: test name: Update Test status on Server - if: ${{ (github.repository_owner == 'nv-legate') && contains(github.workflow, 'Nightly') && (inputs.upload-enabled == true) }} + if: ${{ false }} uses: - nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.17 + nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.29 with: + build-has-tests: ${{ !inputs.upload-enabled }} build-mode: "" build-type: ${{ inputs.build-type }} client-repo: ${{ github.event.repository.name }} - legate-gh-ci-tag: "v1.17" + legate-gh-ci-tag: "v1.29" name: UpdateTestStatus network: "ucx" - pkgSubString: "cunumeric-" + pkgSubString: "cupynumeric-" platform: ${{ inputs.platform }} python-version: ${{ inputs.python-version }} - repos-Root: "cunumeric" + repos-Root: 
"cupynumeric" target-device: ${{ inputs.target-device }} upload-action: "update-test-status" upload-enabled: true + refname: ${{ inputs.refname }} + default-branch: ${{ inputs.default-branch }} secrets: inherit diff --git a/.github/workflows/gh-build-docs.yml b/.github/workflows/gh-build-docs.yml new file mode 100644 index 0000000000..57dd3bf54e --- /dev/null +++ b/.github/workflows/gh-build-docs.yml @@ -0,0 +1,122 @@ +--- +on: + workflow_call: + inputs: + platform: + type: string + required: true + target-device: + type: string + required: true + build-mode: + type: string + required: true + build-type: + type: string + required: true + upload-docs-to-gh-pages: + type: boolean + required: false + default: false + +jobs: + build-cupynumeric: + if: ${{ github.repository_owner == 'nv-legate' }} + uses: + nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.29 + with: + build-has-tests: false + client-repo: ${{ github.event.repository.name }} + target-device: ${{ inputs.target-device }} + runs-on: ${{ (inputs.platform == 'linux' && 'linux-amd64-cpu16') || (inputs.platform == 'mac' && 'macos-latest') }} + build-type: ${{ inputs.build-type }} + use-container: ${{ inputs.platform == 'linux' }} + platform: ${{ inputs.platform }} + dependencies-file: "" + legate-gh-ci-tag: "v1.29" + build-mode: ${{ inputs.build-mode }} + upload-enabled: false + network: "ucx" + secrets: inherit + + + build-docs: + needs: + - build-cupynumeric + name: Build cupynumeric docs (${{ inputs.platform }}, ${{ inputs.target-device }}) + + uses: + nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.29 + with: + build-has-tests: false + build-mode: ${{ inputs.build-mode }} + build-type: ${{ inputs.build-type }} + output-build-type: docs + client-repo: ${{ github.event.repository.name }} + has-gpu: false + legate-gh-ci-tag: "v1.29" + name: Build documentation + network: "ucx" + platform: ${{ inputs.platform }} + python-version: ${{ inputs.python-version }} + runs-on: ${{ 
(inputs.platform == 'linux' && 'linux-amd64-gpu-l4-latest-1') || (inputs.platform == 'mac' && 'macos-latest') }} + target-device: ${{ inputs.target-device }} + test-options: docs + upload-enabled: false + secrets: inherit + + + upload-docs-to-gh-pages: + if: ${{ inputs.upload-docs-to-gh-pages && github.ref_name == 'main' }} + needs: + - build-docs + runs-on: ${{ (inputs.platform == 'linux' && 'linux-amd64-cpu4') || (inputs.platform == 'mac' && 'macos-latest') }} + steps: + - name: Set environment variables + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + echo "${{ needs.build-docs.outputs.output-artifact-name }}" + + ARTIFACTS_DIR=$(realpath "$(pwd)/../artifacts") + echo "ARTIFACTS_DIR=${ARTIFACTS_DIR}" >> $GITHUB_ENV + + mkdir -p "${ARTIFACTS_DIR}" + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: ${{ needs.build-docs.outputs.output-artifact-name }} + path: ${{ env.ARTIFACTS_DIR }} + + - name: Display structure of downloaded artifacts + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pwd + ls -lahR ${{ env.ARTIFACTS_DIR }} + + - name: Find index.html's parent folder + shell: bash --noprofile --norc -xeuo pipefail {0} + id: find_docs_dir + run: | + FILE_PATH="$( + find "${{ env.ARTIFACTS_DIR }}" -name "index.html" -printf '%d %p\n' \ + | sort -nk1 \ + | cut -d' ' -f2- \ + | head -n 1 + )" + if [ -z "${FILE_PATH}" ]; then + echo "index.html not found" >&2 + exit 1 + fi + PARENT_DIR=$(dirname "${FILE_PATH}") + echo "docs_dir=${PARENT_DIR}" >> "${GITHUB_OUTPUT}" + + - name: Checkout + uses: actions/checkout@v4 + + - name: Deploy + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: ${{ steps.find_docs_dir.outputs.docs_dir }} + token: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS }} + repository-name: "nv-legate/cupynumeric" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000000..3fd2c7f62e --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,46 
@@ +name: pr + +on: + push: + branches: + - "pull-request/[0-9]+" + - "branch-*" + - "main" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -eou pipefail {0} + +jobs: + legate-sha: + runs-on: linux-amd64-cpu4 + outputs: + LEGATE_SHA: ${{ steps.legate-sha.outputs.sha }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Get the Legate SHA + id: legate-sha + run: | + sha=$(jq -r .packages.legate.git_tag cmake/versions.json) + echo "sha=$sha" >> $GITHUB_OUTPUT + wheels-build: + needs: legate-sha + secrets: inherit + uses: ./.github/workflows/wheels-build.yml + with: + build-type: pull-request + legate-sha: ${{ needs.legate-sha.outputs.LEGATE_SHA }} + wheels-test: + needs: [wheels-build, legate-sha] + secrets: inherit + uses: ./.github/workflows/wheels-test.yml + with: + build-type: pull-request + legate-sha: ${{ needs.legate-sha.outputs.LEGATE_SHA }} diff --git a/.github/workflows/wheels-build.yml b/.github/workflows/wheels-build.yml new file mode 100644 index 0000000000..fa91eebee0 --- /dev/null +++ b/.github/workflows/wheels-build.yml @@ -0,0 +1,134 @@ +on: + workflow_call: + inputs: + build-type: + required: true + type: string + legate-sha: + type: string + required: true + branch: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu16" + cuda_ver: + type: string + default: "12.5.1" + linux_ver: + type: string + default: "rockylinux8" + script: + type: string + default: "continuous_integration/scripts/build_wheel_linux.bash" + matrix_filter: + type: string + default: "." 
+ +defaults: + run: + shell: bash -eou pipefail {0} + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: linux-amd64-cpu4 + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.10', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + - { ARCH: 'amd64', PY_VER: '3.11', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + - { ARCH: 'amd64', PY_VER: '3.12', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + # arm64 + - { ARCH: 'arm64', PY_VER: '3.10', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + - { ARCH: 'arm64', PY_VER: '3.11', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + - { ARCH: 'arm64', PY_VER: '3.12', TARGET_DEV: 'gpu', BUILD_MODE: 'release' } + " + + MATRIX="$( + yq -n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + build: + name: ${{ matrix.ARCH }}, py${{ matrix.PY_VER }}, ${{ matrix.TARGET_DEV }}, ${{ matrix.BUILD_MODE }} + needs: compute-matrix + timeout-minutes: 480 + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}" + container: + image: rapidsai/ci-wheel:cuda${{ inputs.cuda_ver }}-${{ inputs.linux_ver }}-py${{ matrix.PY_VER }} + env: + BUILD_MODE: ${{ matrix.BUILD_MODE }} + steps: + - name: Get the SHA + id: get-sha + run: | + sha=$(echo ${{github.sha}} | head -c 10) + echo "sha=$sha" >> $GITHUB_OUTPUT + - if: github.repository_owner == 'nv-legate' + name: Get AWS credentials for sccache bucket + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-2 + role-duration-seconds: 28800 # 8 hours + role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-nv-legate + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Add default paths to the env + run: | + echo "$(pwd)"/continuous_integration/scripts/tools >> "${GITHUB_PATH}" + - name: Download the legate wheel + env: + BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda${{ inputs.cuda_ver }}-py${{ matrix.PY_VER }} + GH_TOKEN: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS_RO }} + run: | + legate-gh-download-artifact ${{ inputs.legate-sha }} "legate-wheel-${{ env.BUILD_NAME }}" "wheel" + - name: Wheel build + run: ${{ inputs.script }} + env: + STEP_NAME: "C++ build" + GH_TOKEN: ${{ github.token }} + - name: Wheel upload + env: + BUILD_SHA: ${{ steps.get-sha.outputs.sha }} + BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda${{ inputs.cuda_ver }}-py${{ matrix.PY_VER }} + uses: actions/upload-artifact@v4 + with: + name: cupynumeric-wheel-${{ env.BUILD_NAME }}-g${{ 
env.BUILD_SHA }} + path: final-dist/*.whl diff --git a/.github/workflows/wheels-test.yml b/.github/workflows/wheels-test.yml new file mode 100644 index 0000000000..a0db1b5145 --- /dev/null +++ b/.github/workflows/wheels-test.yml @@ -0,0 +1,129 @@ +on: + workflow_call: + inputs: + build-type: + required: true + type: string + legate-sha: + type: string + required: true + branch: + type: string + sha: + type: string + repo: + type: string + node_type: + type: string + default: "cpu16" + cuda_ver: + type: string + default: "12.8.0" + script: + type: string + default: "continuous_integration/scripts/test_wheel_linux.bash" + matrix_filter: + type: string + default: "." + +defaults: + run: + shell: bash -eou pipefail {0} + +permissions: + actions: read + checks: none + contents: read + deployments: none + discussions: none + id-token: write + issues: none + packages: read + pages: none + pull-requests: read + repository-projects: none + security-events: none + statuses: none + +jobs: + compute-matrix: + runs-on: linux-amd64-cpu4 + outputs: + MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} + steps: + - name: Compute Build Matrix + id: compute-matrix + run: | + set -eo pipefail + + # please keep the matrices sorted in ascending order by the following: + # + # [ARCH, PY_VER, CUDA_VER, LINUX_VER] + # + export MATRIX=" + # amd64 + - { ARCH: 'amd64', PY_VER: '3.10', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.11', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu22.04' } + - { ARCH: 'amd64', PY_VER: '3.12', TARGET_DEV: 'gpu', GPU: 'l4', LINUX_VER: 'ubuntu24.04' } + # arm64 - disabled due to ARM GPU runner availability + # - { ARCH: 'arm64', PY_VER: '3.10', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu22.04' } + # - { ARCH: 'arm64', PY_VER: '3.11', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu22.04' } + # - { ARCH: 'arm64', PY_VER: '3.12', TARGET_DEV: 'gpu', GPU: 'a100', LINUX_VER: 'ubuntu24.04' } + " + + MATRIX="$( + yq 
-n -o json 'env(MATRIX)' | \ + jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end' + )" + + echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" + + build: + name: ${{ matrix.ARCH }}, py${{ matrix.PY_VER }}, ${{ matrix.LINUX_VER }}, ${{ matrix.GPU }} + needs: compute-matrix + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} + runs-on: ${{ matrix.ARCH == 'arm64' && 'linux-aarch64-2gpu' || format('linux-{0}-gpu-{1}-latest-1', matrix.ARCH, matrix.GPU) }} + container: + image: rapidsai/citestwheel:cuda${{ inputs.cuda_ver }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - name: Get the SHA + id: get-sha + run: | + sha=$(echo ${{github.sha}} | head -c 10) + echo "sha=$sha" >> $GITHUB_OUTPUT + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + - name: Add default paths to the env + run: | + echo $(pwd)/continuous_integration/scripts/tools >> "${GITHUB_PATH}" + - name: Run nvidia-smi to make sure GPU is working + run: nvidia-smi + - name: Setup proxy cache + uses: nv-gha-runners/setup-proxy-cache@main + continue-on-error: true + # Skip the cache on RDS Lab nodes + if: ${{ matrix.GPU != 'v100' && matrix.GPU != 'a100' }} + - name: Download the legate wheel + env: + BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda12.5.1-py${{ matrix.PY_VER }} + GH_TOKEN: ${{ secrets.NV_LEGATE_INTER_REPOS_ACCESS_RO }} + run: | + legate-gh-download-artifact ${{ inputs.legate-sha }} "legate-wheel-${{ env.BUILD_NAME }}" "wheel" + - name: Download the wheel from the build job + env: + BUILD_SHA: ${{ steps.get-sha.outputs.sha }} + BUILD_NAME: ${{ matrix.ARCH }}-${{ matrix.TARGET_DEV }}-cuda12.5.1-py${{ matrix.PY_VER }} + uses: actions/download-artifact@v4 + with: + path: final-dist + name: 
cupynumeric-wheel-${{ env.BUILD_NAME }}-g${{ env.BUILD_SHA }} + - name: Run tests + run: ${{ inputs.script }} diff --git a/.gitignore b/.gitignore index 84244ce827..d4ccc950aa 100644 --- a/.gitignore +++ b/.gitignore @@ -27,11 +27,11 @@ legion gasnet* legion_defines.h realm_defines.h -cunumeric/install_info.py +cupynumeric/install_info.py /build/* -/docs/cunumeric/build -/docs/cunumeric/source/api/generated -/docs/cunumeric/source/comparison/comparison_table.rst.inc +/docs/cupynumeric/build +/docs/cupynumeric/source/api/generated +/docs/cupynumeric/source/comparison/comparison_table.rst.inc *.egg-info .cache .vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8b84bdb52..db16bb5d34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,25 +1,26 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.5.1' + rev: 'v1.15.0' hooks: - id: mypy language: system pass_filenames: false - args: ['cunumeric'] + args: ['cupynumeric'] - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 6.0.1 hooks: - - id: isort + - id: isort - repo: https://github.com/psf/black - rev: 23.9.1 + rev: 25.1.0 hooks: - - id: black + - id: black + args: ["--target-version", "py310"] - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 + rev: 7.2.0 hooks: - - id: flake8 + - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v16.0.6' # Use the sha / tag you want to point at + rev: 'v20.1.0' # Use the sha / tag you want to point at hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ @@ -40,7 +41,7 @@ repos: 'types_or': [c++, c, cuda] require_serial: false stages: [pre-commit] - exclude: '^src/cunumeric/cunumeric_c\.h$' + exclude: '^src/cupynumeric/cupynumeric_c\.h$' ci: skip: [mypy] diff --git a/CMakeLists.txt b/CMakeLists.txt index 55cd0547c2..866be2eab4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,32 +14,10 @@ # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -if(POLICY CMP0077) - cmake_policy(SET CMP0077 NEW) - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) -endif() - -if(POLICY CMP0096) - cmake_policy(SET CMP0096 NEW) - set(CMAKE_POLICY_DEFAULT_CMP0096 NEW) -endif() - -if(POLICY CMP0135) - # make the timestamps of ExternalProject_ADD match the download time - # https://cmake.org/cmake/help/latest/policy/CMP0135.html - cmake_policy(SET CMP0135 NEW) - set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) -endif() - -if(POLICY CMP0132) - # Avoid an inconsistency, where cmake would only set the CC/CXX env vars on - # the first run, but not subsequent ones. This would come up when building - # TBLIS. - cmake_policy(SET CMP0132 NEW) - set(CMAKE_POLICY_DEFAULT_CMP0132 NEW) -endif() +cmake_path(SET CUPYNUMERIC_CMAKE_DIR NORMALIZE "${CMAKE_CURRENT_LIST_DIR}/cmake") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") set(CMAKE_CXX_STANDARD 17 CACHE STRING "" FORCE) set(CMAKE_CXX_STANDARD_REQUIRED ON CACHE STRING "" FORCE) @@ -50,8 +28,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON CACHE STRING "" FORCE) ############################################################################## # - Download and initialize RAPIDS CMake helpers ----------------------------- -set(rapids-cmake-version 24.04) -set(rapids-cmake-sha "365322aca32fd6ecd7027f5d7ec7be50b7f3cc2a") +set(rapids-cmake-version 24.12) +set(rapids-cmake-sha "4cb2123dc08ef5d47ecdc9cc51c96bea7b5bb79c") if(NOT EXISTS ${CMAKE_BINARY_DIR}/RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${rapids-cmake-version}/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -63,7 +41,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -set(cunumeric_version 24.09.00) +set(cupynumeric_version 25.05.00) # For now we want the optimization flags to match on both normal make and 
cmake # builds so we override the cmake defaults here for release, this changes @@ -78,40 +56,40 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g") if(NOT SKBUILD) - project(cunumeric VERSION ${cunumeric_version} LANGUAGES C CXX) - include(cunumeric_cpp.cmake) + project(cupynumeric VERSION ${cupynumeric_version} LANGUAGES C CXX) + include(cupynumeric_cpp.cmake) else() project( - cunumeric_python - VERSION ${cunumeric_version} + cupynumeric_python + VERSION ${cupynumeric_version} LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. C CXX) - include(cunumeric_python.cmake) + include(cupynumeric_python.cmake) endif() if(CMAKE_GENERATOR STREQUAL "Ninja") - function(add_touch_cunumeric_ninja_build_target) + function(add_touch_cupynumeric_ninja_build_target) set(_suf ) if(SKBUILD) set(_suf "_python") endif() - add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL + add_custom_target("touch_cupynumeric${_suf}_ninja_build" ALL COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja" COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild" VERBATIM ) - foreach(_dep IN ITEMS cunumeric cunumeric_python + foreach(_dep IN ITEMS cupynumeric cupynumeric_python legate legate_python Legion LegionRuntime Realm RealmRuntime Regent) if(TARGET ${_dep}) - add_dependencies("touch_cunumeric${_suf}_ninja_build" ${_dep}) + add_dependencies("touch_cupynumeric${_suf}_ninja_build" ${_dep}) endif() endforeach() endfunction() - add_touch_cunumeric_ninja_build_target() + add_touch_cupynumeric_ninja_build_target() endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e083cc3c0c..b4ac11a6a5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,16 
+1,16 @@ -# Contributing to cuNumeric +# Contributing to cuPyNumeric -CuNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community. +CuPyNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community. ## How to begin -Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cunumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea. +Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cupynumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea. Once you are ready to start development, we ask you to work on a [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) of our repository. The next step is to create a (pull request)[https://help.github.com/en/articles/about-pull-requests]. Feel free to open the pull request as soon as you begin your development (just mark it [as a draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/)) or when you are ready to have your contribution merged. ## The Legalese: Developer Certificate of Origin -CuNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute. 
To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/): +CuPyNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute. To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/)): ```` Developer Certificate of Origin @@ -61,12 +61,12 @@ Please use your real name and a valid email address at which you can be reached. ## Review Process -We are really grateful that you are thinking of contributing to cuNumeric. We will make every effort to review your contributions as soon as possible. +We are really grateful that you are thinking of contributing to cuPyNumeric. We will make every effort to review your contributions as soon as possible. As we suggested at the beginning of this document, it will be really helpful to start with an issue unless your proposed change is really trivial. An issue will help to save work in the review process (e.g., maybe somebody is already working on exactly the same thing you want to work on).
After you open your pull request (PR), there usually will be a community feedback that often will require further changes to your contribution (the usual open-source process). Usually, this will conclude in the PR being merged by a maintainer, but on rare occasions a PR may be rejected. This may happen, for example, if the PR appears abandoned (no response to the community feedback) or if the PR does not seem to be approaching community acceptance in a reasonable time frame. In any case, an explanation will always be given why a PR is closed. Even if a PR is closed for some reason, it may always be reopened if the situation evolves (feel free to comment on closed PRs to discuss reopening them). ## Code Formatting Requirements -CuNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. Once pre-commit is installed in the cuNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation. +CuPyNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. 
Once pre-commit is installed in the cuPyNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation. We hope that the automation of our formatting checks will make it easy to comply with our coding standards. If you encounter problems with code formatting, however, please let us know in a comment on your PR, and we will do our best to help. diff --git a/MANIFEST.in b/MANIFEST.in index 8f77ed2002..3eb2279b7b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ include versioneer.py -include cunumeric/_version.py -include cunumeric/py.typed +include cupynumeric/_version.py +include cupynumeric/py.typed diff --git a/README.md b/README.md index 7e42b9e92d..97428aacf7 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,19 @@ limitations under the License. --> -[![Build Nightly release package](https://github.com/nv-legate/cunumeric.internal/actions/workflows/ci-gh-nightly-release.yml/badge.svg)](https://github.com/nv-legate/cunumeric.internal/actions/workflows/ci-gh-nightly-release.yml) +[![Build Nightly release package](https://github.com/nv-legate/cupynumeric.internal/actions/workflows/ci-gh-nightly-release.yml/badge.svg)](https://github.com/nv-legate/cupynumeric.internal/actions/workflows/ci-gh-nightly-release.yml) -# cuNumeric +# cuPyNumeric -cuNumeric is a [Legate](https://github.com/nv-legate/legate.core) library -that aims to provide a distributed and accelerated drop-in replacement for the -[NumPy API](https://numpy.org/doc/stable/reference/) on top of the -[Legion](https://legion.stanford.edu) runtime. Using cuNumeric you can do things like run +cuPyNumeric is a library that aims to provide a distributed and accelerated +drop-in replacement for [NumPy](https://numpy.org/) built on top of the +[Legate](https://github.com/nv-legate/legate) framework. 
+ +With cuPyNumeric you can write code productively in Python, using the familiar +NumPy API, and have your program scale with no code changes from single-CPU +computers to multi-node-multi-GPU clusters. + +For example, you can run [the final example of the Python CFD course](https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb) completely unmodified on 2048 A100 GPUs in a [DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/) @@ -30,7 +35,7 @@ and achieve good weak scaling. drawing -cuNumeric works best for programs that have very large arrays of data +cuPyNumeric works best for programs that have very large arrays of data that cannot fit in the memory of a single GPU or a single node and need to span multiple nodes and GPUs. While our implementation of the current NumPy API is still incomplete, programs that use unimplemented features @@ -39,16 +44,16 @@ canonical NumPy implementation. ## Installation -cuNumeric is available from [conda](https://docs.conda.io/projects/conda/en/latest/index.html) -on the [legate channel](https://anaconda.org/legate/cunumeric). -See https://docs.nvidia.com/cunumeric/latest/installation.html for +cuPyNumeric is available from [conda](https://docs.conda.io/projects/conda/en/latest/index.html) +on the [legate channel](https://anaconda.org/legate/cupynumeric). +See https://docs.nvidia.com/cupynumeric/latest/installation.html for details about different install configurations, or building -cuNumeric from source. +cuPyNumeric from source. ## Documentation -The cuNumeric documentation can be found -[here](https://docs.nvidia.com/cunumeric). +The cuPyNumeric documentation can be found +[here](https://docs.nvidia.com/cupynumeric). ## Contributing @@ -56,7 +61,10 @@ See the discussion on contributing in [CONTRIBUTING.md](CONTRIBUTING.md). 
## Contact -For technical questions about Cunumeric and Legate-based tools, please visit +For technical questions about cuPyNumeric and Legate-based tools, please visit the [community discussion forum](https://github.com/nv-legate/discussion). If you have other questions, please contact us at legate(at)nvidia.com. + +## Note +*This project, i.e., cuPyNumeric, is separate and independent of the CuPy project. CuPy is a registered trademark of Preferred Networks.* diff --git a/cmake/generate_install_info_py.cmake b/cmake/generate_install_info_py.cmake index 190641a463..724640cbb7 100644 --- a/cmake/generate_install_info_py.cmake +++ b/cmake/generate_install_info_py.cmake @@ -17,8 +17,8 @@ execute_process( COMMAND ${CMAKE_C_COMPILER} -E -DLEGATE_USE_PYTHON_CFFI - -I "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric" - -P "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric/cunumeric_c.h" + -I "${CMAKE_CURRENT_LIST_DIR}/../src/cupynumeric" + -P "${CMAKE_CURRENT_LIST_DIR}/../src/cupynumeric/cupynumeric_c.h" ECHO_ERROR_VARIABLE OUTPUT_VARIABLE header COMMAND_ERROR_IS_FATAL ANY @@ -26,6 +26,6 @@ execute_process( set(libpath "") configure_file( - "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py.in" - "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py" + "${CMAKE_CURRENT_LIST_DIR}/../cupynumeric/install_info.py.in" + "${CMAKE_CURRENT_LIST_DIR}/../cupynumeric/install_info.py" @ONLY) diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index b8fcb1c356..7951bd2919 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -14,17 +14,61 @@ # limitations under the License. 
#============================================================================= +# This is based on the similar function for Legion in the Legate code +function(cupynumeric_maybe_override_legate user_repository user_branch user_version) + # CPM_ARGS GIT_TAG and GIT_REPOSITORY don't do anything if you have already overridden + # those options via a rapids_cpm_package_override() call. So we have to conditionally + # override the defaults (by creating a temporary json file in build dir) only if the + # user sets them. + + # See https://github.com/rapidsai/rapids-cmake/issues/575. Specifically, this function + # is pretty much identical to + # https://github.com/rapidsai/rapids-cmake/issues/575#issuecomment-2045374410. + cmake_path(SET legate_overrides_json NORMALIZE + "${CUPYNUMERIC_CMAKE_DIR}/versions.json") + if(user_repository OR user_branch OR user_version) + # The user has set either one of these, time to create our cludge. + file(READ "${legate_overrides_json}" default_legate_json) + set(new_legate_json "${default_legate_json}") + + if(user_repository) + string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "git_url" + "\"${user_repository}\"") + endif() + + if(user_branch) + string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "git_tag" + "\"${user_branch}\"") + endif() + + if(user_version) + string(JSON new_legate_json SET "${new_legate_json}" "packages" "Legate" "version" + "\"${user_version}\"") + endif() + + string(JSON eq_json EQUAL "${default_legate_json}" "${new_legate_json}") + if(NOT eq_json) + cmake_path(SET legate_overrides_json NORMALIZE + "${CMAKE_CURRENT_BINARY_DIR}/versions.json") + file(WRITE "${legate_overrides_json}" "${new_legate_json}") + endif() + endif() + rapids_cpm_package_override("${legate_overrides_json}") +endfunction() + function(find_or_configure_legate) + set(options) set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL) + set(multiValueArgs) cmake_parse_arguments(PKG "${options}" 
"${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - include("${rapids-cmake-dir}/export/detail/parse_version.cmake") - rapids_export_parse_version(${PKG_VERSION} legate PKG_VERSION) + cupynumeric_maybe_override_legate("${PKG_REPOSITORY}" "${PKG_BRANCH}" "${PKG_VERSION}") include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) - set(version ${PKG_VERSION}) + string(REPLACE "00" "0" version "${version}") + set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) if(PKG_BRANCH) set(git_branch "${PKG_BRANCH}") @@ -35,16 +79,17 @@ function(find_or_configure_legate) set(FIND_PKG_ARGS GLOBAL_TARGETS legate::legate - BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports) + BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports) # First try to find legate via find_package() # so the `Legion_USE_*` variables are visible # Use QUIET find by default. set(_find_mode QUIET) - # If legate_DIR/legate_ROOT are defined as something other than empty or NOTFOUND - # use a REQUIRED find so that the build does not silently download legate. - if(legate_DIR OR legate_ROOT) + # If legate_DIR/legate_ROOT or CUPYNUMERIC_BUILD_PIP_WHEELS are defined as + # something other than empty or NOTFOUND use a REQUIRED find so that the + # build does not silently download legate. 
+ if(legate_DIR OR legate_ROOT OR CUPYNUMERIC_BUILD_PIP_WHEELS) set(_find_mode REQUIRED) endif() rapids_find_package(legate ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) @@ -55,11 +100,11 @@ function(find_or_configure_legate) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) get_cpm_git_args(legate_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch}) - message(VERBOSE "cunumeric: legate version: ${version}") - message(VERBOSE "cunumeric: legate git_repo: ${git_repo}") - message(VERBOSE "cunumeric: legate git_branch: ${git_branch}") - message(VERBOSE "cunumeric: legate exclude_from_all: ${exclude_from_all}") - message(VERBOSE "cunumeric: legate legate_cpm_git_args: ${legate_cpm_git_args}") + message(VERBOSE "cupynumeric: legate version: ${version}") + message(VERBOSE "cupynumeric: legate git_repo: ${git_repo}") + message(VERBOSE "cupynumeric: legate git_branch: ${git_branch}") + message(VERBOSE "cupynumeric: legate exclude_from_all: ${exclude_from_all}") + message(VERBOSE "cupynumeric: legate legate_cpm_git_args: ${legate_cpm_git_args}") rapids_cpm_find(legate ${version} ${FIND_PKG_ARGS} CPM_ARGS @@ -70,35 +115,33 @@ function(find_or_configure_legate) endif() set(Legion_USE_CUDA ${Legion_USE_CUDA} PARENT_SCOPE) + set(Legion_CUDA_ARCH ${Legion_CUDA_ARCH} PARENT_SCOPE) set(Legion_USE_OpenMP ${Legion_USE_OpenMP} PARENT_SCOPE) set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS} PARENT_SCOPE) message(VERBOSE "Legion_USE_CUDA=${Legion_USE_CUDA}") + message(VERBOSE "Legion_CUDA_ARCH=${Legion_CUDA_ARCH}") message(VERBOSE "Legion_USE_OpenMP=${Legion_USE_OpenMP}") message(VERBOSE "Legion_BOUNDS_CHECKS=${Legion_BOUNDS_CHECKS}") endfunction() -foreach(_var IN ITEMS "cunumeric_LEGATE_VERSION" - "cunumeric_LEGATE_BRANCH" - "cunumeric_LEGATE_REPOSITORY" - "cunumeric_EXCLUDE_LEGATE_FROM_ALL") +foreach(_var IN ITEMS "cupynumeric_LEGATE_VERSION" + "cupynumeric_LEGATE_BRANCH" + "cupynumeric_LEGATE_REPOSITORY" + "cupynumeric_EXCLUDE_LEGATE_FROM_ALL") 
if(DEFINED ${_var}) - # Create a cunumeric_LEGATE_BRANCH variable in the current scope either from the existing + # Create a cupynumeric_LEGATE_BRANCH variable in the current scope either from the existing # current-scope variable, or the cache variable. set(${_var} "${${_var}}") - # Remove cunumeric_LEGATE_BRANCH from the CMakeCache.txt. This ensures reconfiguring the same - # build dir without passing `-Dcunumeric_LEGATE_BRANCH=` reverts to the value in versions.json - # instead of reusing the previous `-Dcunumeric_LEGATE_BRANCH=` value. + # Remove cupynumeric_LEGATE_BRANCH from the CMakeCache.txt. This ensures reconfiguring the same + # build dir without passing `-Dcupynumeric_LEGATE_BRANCH=` reverts to the value in versions.json + # instead of reusing the previous `-Dcupynumeric_LEGATE_BRANCH=` value. unset(${_var} CACHE) endif() endforeach() -if(NOT DEFINED cunumeric_LEGATE_VERSION) - set(cunumeric_LEGATE_VERSION "${cunumeric_VERSION}") -endif() - -find_or_configure_legate(VERSION ${cunumeric_LEGATE_VERSION} - REPOSITORY ${cunumeric_LEGATE_REPOSITORY} - BRANCH ${cunumeric_LEGATE_BRANCH} - EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_LEGATE_FROM_ALL} +find_or_configure_legate(VERSION ${cupynumeric_LEGATE_VERSION} + REPOSITORY ${cupynumeric_LEGATE_REPOSITORY} + BRANCH ${cupynumeric_LEGATE_BRANCH} + EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_LEGATE_FROM_ALL} ) diff --git a/cmake/thirdparty/get_openblas.cmake b/cmake/thirdparty/get_openblas.cmake index d4e4454a09..b384ecf024 100644 --- a/cmake/thirdparty/get_openblas.cmake +++ b/cmake/thirdparty/get_openblas.cmake @@ -22,7 +22,7 @@ function(find_or_configure_OpenBLAS) set(BLAS_name "OpenBLAS") set(BLAS_target "openblas") - # cuNumeric presently requires OpenBLAS + # cuPyNumeric presently requires OpenBLAS set(BLA_VENDOR OpenBLAS) # TODO: should we find (or build) 64-bit BLAS? 
@@ -35,8 +35,8 @@ function(find_or_configure_OpenBLAS) set(FIND_PKG_ARGS ${PKG_VERSION} GLOBAL_TARGETS ${BLAS_target} - BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports) + BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) if(PKG_BRANCH) @@ -50,10 +50,24 @@ function(find_or_configure_OpenBLAS) set(CMAKE_POLICY_DEFAULT_CMP0048 OLD) set(CMAKE_POLICY_DEFAULT_CMP0054 NEW) + # Force a base CPU type for the openblas build. + set(_target HASWELL) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(_target ARMV8) + endif() + + # BLAS emits a bunch of warnings, -w is the "silence all warnings" flag for clang and + # GCC + if(MSVC) + message(FATAL_ERROR "Don't know how to silence warnings with MSVC") + endif() + set(c_flags "${CMAKE_C_FLAGS} -w") + set(f_flags "${CMAKE_Fortran_FLAGS} -w") rapids_cpm_find(BLAS ${FIND_PKG_ARGS} CPM_ARGS ${BLAS_cpm_git_args} EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + SYSTEM TRUE OPTIONS "USE_CUDA 0" "C_LAPACK ON" "USE_THREAD ON" @@ -62,7 +76,10 @@ function(find_or_configure_OpenBLAS) "BUILD_WITHOUT_CBLAS OFF" "BUILD_WITHOUT_LAPACK OFF" "INTERFACE64 ${INTERFACE64}" - "USE_OPENMP ${Legion_USE_OpenMP}") + "TARGET ${_target}" + "USE_OPENMP ${Legion_USE_OpenMP}" + "CMAKE_C_FLAGS ${c_flags}" + "CMAKE_Fortran_FLAGS ${f_flags}") set(CMAKE_POLICY_DEFAULT_CMP0048 ${CMP0048_orig}) set(CMAKE_POLICY_DEFAULT_CMP0054 ${CMP0054_orig}) @@ -89,7 +106,7 @@ function(find_or_configure_OpenBLAS) $ # contains cblas.h and f77blas.h $ - ) + ) string(JOIN "\n" code_string "if(NOT TARGET BLAS::BLAS)" @@ -105,35 +122,35 @@ function(find_or_configure_OpenBLAS) FINAL_CODE_BLOCK code_string) # Do `CPMFindPackage(BLAS)` in build dir - rapids_export_package(BUILD BLAS cunumeric-exports + rapids_export_package(BUILD BLAS cupynumeric-exports VERSION ${PKG_VERSION} GLOBAL_TARGETS ${BLAS_target}) # Tell cmake where 
it can find the generated blas-config.cmake include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD BLAS [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cunumeric-exports) + rapids_export_find_package_root(BUILD BLAS [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cupynumeric-exports) endif() endfunction() -if(NOT DEFINED cunumeric_OPENBLAS_VERSION) +if(NOT DEFINED cupynumeric_OPENBLAS_VERSION) # Before v0.3.18, OpenBLAS's throws CMake errors when configuring - set(cunumeric_OPENBLAS_VERSION "0.3.20") + set(cupynumeric_OPENBLAS_VERSION "0.3.29") endif() -if(NOT DEFINED cunumeric_OPENBLAS_BRANCH) - set(cunumeric_OPENBLAS_BRANCH "") +if(NOT DEFINED cupynumeric_OPENBLAS_BRANCH) + set(cupynumeric_OPENBLAS_BRANCH "") endif() -if(NOT DEFINED cunumeric_OPENBLAS_TAG) - set(cunumeric_OPENBLAS_TAG v${cunumeric_OPENBLAS_VERSION}) +if(NOT DEFINED cupynumeric_OPENBLAS_TAG) + set(cupynumeric_OPENBLAS_TAG v${cupynumeric_OPENBLAS_VERSION}) endif() -if(NOT DEFINED cunumeric_OPENBLAS_REPOSITORY) - set(cunumeric_OPENBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git) +if(NOT DEFINED cupynumeric_OPENBLAS_REPOSITORY) + set(cupynumeric_OPENBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git) endif() -find_or_configure_OpenBLAS(VERSION ${cunumeric_OPENBLAS_VERSION} - REPOSITORY ${cunumeric_OPENBLAS_REPOSITORY} - BRANCH ${cunumeric_OPENBLAS_BRANCH} - PINNED_TAG ${cunumeric_OPENBLAS_TAG} - EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_OPENBLAS_FROM_ALL} +find_or_configure_OpenBLAS(VERSION ${cupynumeric_OPENBLAS_VERSION} + REPOSITORY ${cupynumeric_OPENBLAS_REPOSITORY} + BRANCH ${cupynumeric_OPENBLAS_BRANCH} + PINNED_TAG ${cupynumeric_OPENBLAS_TAG} + EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_OPENBLAS_FROM_ALL} ) diff --git a/cmake/thirdparty/get_tblis.cmake b/cmake/thirdparty/get_tblis.cmake index dbe0d4e935..164923601b 100644 --- a/cmake/thirdparty/get_tblis.cmake +++ b/cmake/thirdparty/get_tblis.cmake @@ -34,14 +34,14 @@ function(find_or_configure_tblis) 
HEADER_NAMES "tblis/tblis.h" LIBRARY_NAMES "libtblis${lib_suffix}" NO_CONFIG - BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports + BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports ) rapids_cpm_find(tblis ${PKG_VERSION} GLOBAL_TARGETS tblis::tblis - BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports + BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports CPM_ARGS ${tblis_cpm_git_args} EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} @@ -95,8 +95,8 @@ function(find_or_configure_tblis) set(ENV{CC} "${_CC}") set(ENV{CXX} "${_CXX}") - message(VERBOSE "cunumeric: ENV{CC}=\"$ENV{CC}\"") - message(VERBOSE "cunumeric: ENV{CXX}=\"$ENV{CXX}\"") + message(VERBOSE "cupynumeric: ENV{CC}=\"$ENV{CC}\"") + message(VERBOSE "cupynumeric: ENV{CXX}=\"$ENV{CXX}\"") set(tblis_verbosity "--enable-silent-rules") if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.25") @@ -167,20 +167,20 @@ function(find_or_configure_tblis) endif() set(tblis_BINARY_DIR ${tblis_BINARY_DIR} PARENT_SCOPE) - set(cunumeric_INSTALL_TBLIS ${should_build_tblis} PARENT_SCOPE) + set(cupynumeric_INSTALL_TBLIS ${should_build_tblis} PARENT_SCOPE) endfunction() -if(NOT DEFINED cunumeric_TBLIS_BRANCH) - set(cunumeric_TBLIS_BRANCH arm-build) +if(NOT DEFINED cupynumeric_TBLIS_BRANCH) + set(cupynumeric_TBLIS_BRANCH arm-build) endif() -if(NOT DEFINED cunumeric_TBLIS_REPOSITORY) - set(cunumeric_TBLIS_REPOSITORY https://github.com/nv-legate/tblis.git) +if(NOT DEFINED cupynumeric_TBLIS_REPOSITORY) + set(cupynumeric_TBLIS_REPOSITORY https://github.com/nv-legate/tblis.git) endif() find_or_configure_tblis(VERSION 1.2.0 - REPOSITORY ${cunumeric_TBLIS_REPOSITORY} - BRANCH ${cunumeric_TBLIS_BRANCH} - EXCLUDE_FROM_ALL ${cunumeric_EXCLUDE_TBLIS_FROM_ALL} + REPOSITORY ${cupynumeric_TBLIS_REPOSITORY} + BRANCH ${cupynumeric_TBLIS_BRANCH} + EXCLUDE_FROM_ALL ${cupynumeric_EXCLUDE_TBLIS_FROM_ALL} USE_OPENMP ${Legion_USE_OpenMP} ) diff --git a/cmake/versions.json 
b/cmake/versions.json index 1240dad06f..a8b44699a5 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,15 +1,17 @@ { "packages" : { "legate" : { - "repo": "legate.core.internal", + "repo": "legate.internal", "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<>-python${{ inputs.python-version }}-${{ inputs.target-device }}-release-with_tests-${{ inputs.network }}-<>", "org": "nv-legate", + "artifact_workflow": "ci-gh.yml", "nightly_workflow": "ci-gh-nightly-release.yml", - "version": "24.09.00", - "git_url" : "git@github.com:nv-legate/legate.core.internal.git", + "version": "25.05.00", + "git_url" : "git@github.com:nv-legate/legate.internal.git", "git_shallow": false, "always_download": false, - "git_tag" : "32137a65cf40c56db1db9f76bb508ade81da000a" + "git_tag" : "fe71160b63291c1d073090ad2cb7a11c618d958a", + "anaconda_label": "experimental" } } } diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index c78cbbcca9..4cbf5d1afb 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -26,7 +26,13 @@ if [ -z "$CPU_ONLY" ]; then else # When we build without cuda, we need to provide the location of curand CMAKE_ARGS+=" --Dcunumeric_cuRAND_INCLUDE_DIR=$PREFIX/targets/x86_64-linux/include" +-Dcupynumeric_cuRAND_INCLUDE_DIR=$PREFIX/targets/x86_64-linux/include" +fi + +# We rely on an environment variable to determine if we need to build cpp tests +if [[ "$BUILD_TESTS" == "1" ]]; then + CMAKE_ARGS+=" +-Dcupynumeric_BUILD_TESTS=ON" fi export CMAKE_GENERATOR=Ninja @@ -45,8 +51,8 @@ cmake --build build -j$CPU_COUNT --verbose cmake --install build CMAKE_ARGS=" --DFIND_CUNUMERIC_CPP=ON --Dcunumeric_ROOT=$PREFIX" +-DFIND_CUPYNUMERIC_CPP=ON +-Dcupynumeric_ROOT=$PREFIX" SKBUILD_BUILD_OPTIONS=-j$CPU_COUNT \ $PYTHON -m pip install \ diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index d7db54b8bc..d29e2a1279 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -1,12 +1,16 
@@ -{% set name = "cunumeric" %} +{% set name = "cupynumeric" %} {% if gpu_enabled == "true" %} {% set gpu_enabled_bool = true %} + {% set cpu_gpu_tag='_gpu' %} {% elif gpu_enabled == "false" %} {% set gpu_enabled_bool = false %} + {% set cpu_gpu_tag='_cpu' %} {% else %} {# We need to have a default value for the initial pass over the recipe #} {% set gpu_enabled_bool = false %} + {% set cpu_gpu_tag='_cpu' %} {% endif %} + {% if upload_build == "true" %} {% set upload_build_bool = true %} {% elif upload_build == "false" %} @@ -15,6 +19,14 @@ {# We need to have a default value for the initial pass over the recipe #} {% set upload_build_bool = false %} {% endif %} +{% if build_tests == "true" %} + {% set build_tests_bool = true %} +{% elif build_tests == "false" %} + {% set build_tests_bool = false %} +{% else %} + {# We need to have a default value for the initial pass over the recipe #} + {% set build_tests_bool = false %} +{% endif %} ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} @@ -37,11 +49,13 @@ ## Note: default values are only given to make conda build work. They should not be necessary in principle. 
{% elif 'dev' in environ.get('GIT_DESCRIBE_TAG', placeholder_version) %} {% set version = (environ.get('GIT_DESCRIBE_TAG', placeholder_version) ~ environ.get('GIT_DESCRIBE_NUMBER', '')).lstrip('v') %} - {% set legate_version = (version.rsplit('.',1)[0] ~ ".dev" ~ "|>=" ~ version.rsplit('.',1)[0]) %} + {% set legate_version_default = (version.rsplit('.',1)[0] ~ ".dev" ~ "|>=" ~ version.rsplit('.',1)[0]) %} + {% set legate_version = os.environ.get("LEGATE_VERSION", legate_version_default) %} {% else %} {% set version = environ.get('GIT_DESCRIBE_TAG', placeholder_version).lstrip('v') %} - {% set legate_version = version %} + {% set legate_version = os.environ.get("LEGATE_VERSION", version) %} {% endif %} +{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag.strip('_') ]) %} package: name: {{ name|lower }} @@ -61,18 +75,13 @@ build: number: {{ build_number }} missing_dso_whitelist: - '*libcuda.so*' -{% if gpu_enabled_bool %} -{% set cpu_gpu_tag='_gpu' %} -{% else %} -{% set cpu_gpu_tag='_cpu' %} -{% endif %} {% set upload_tag='' if upload_build_bool else '_with_tests' %} {% if use_local_path is not defined %} # use git hash - string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ PKG_BUILDNUM }}{{ cpu_gpu_tag }}{{ upload_tag }}" + string: "cuda{{ cuda_major }}_py{{ py_version }}{{ cpu_gpu_tag }}{{ upload_tag }}_{{ GIT_DESCRIBE_HASH }}_{{ PKG_BUILDNUM }}" {% else %} # do not use git hash - string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ PKG_BUILDNUM }}{{ cpu_gpu_tag }}{{ upload_tag }}" + string: "cuda{{ cuda_major }}_py{{ py_version }}{{ cpu_gpu_tag }}{{ upload_tag }}_{{ PKG_BUILDNUM }}" {% endif %} script_env: - SCCACHE_BUCKET @@ -80,8 +89,17 @@ build: - SCCACHE_IDLE_TIMEOUT - SCCACHE_S3_KEY_PREFIX - SCCACHE_S3_KEY_PREFIX + - SCCACHE_S3_USE_SSL + - SCCACHE_S3_NO_CREDENTIALS - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - 
CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER +{% if build_tests_bool %} + - BUILD_TESTS=1 +{% endif %} {% if not gpu_enabled_bool %} - CPU_ONLY=1 # The CPU-only packages having more track_features than the GPU builds helps @@ -115,8 +133,8 @@ requirements: - python - scikit-build - openblas =* =*openmp* + - legate ={{ legate_version }}={{ legate_buildstr }} {% if gpu_enabled_bool %} - - legate >={{ legate_version }} =*_gpu* - cuda-cccl - cutensor >=2.0 =*_* - libcublas-dev @@ -125,16 +143,16 @@ requirements: - libcurand-dev - libcufile-dev - cuda-version ={{ cuda_version }} -{% else %} - - legate >={{ legate_version }} =*_cpu* {% endif %} run: + - cffi - numpy {{ numpy_version }} - opt_einsum >=3.3 - scipy - openblas =* =*openmp* {% if gpu_enabled_bool %} + - cupy - libnvjitlink - libcusparse - cutensor >=2.0 =*_* @@ -148,16 +166,16 @@ requirements: - __glibc >=2.17 # [linux] about: - home: https://github.com/nv-legate/cunumeric + home: https://github.com/nv-legate/cupynumeric license: Apache-2.0 license_file: LICENSE summary: 'Drop-in Replacment for NumPy' description: | - cuNumeric is a Legate library that aims to provide + cuPyNumeric is a Legate library that aims to provide a distributed and accelerated drop-in replacement for the NumPy API on top of the Legion runtime. 
- doc_url: https://github.com/nv-legate/cunumeric - dev_url: https://github.com/nv-legate/cunumeric + doc_url: https://github.com/nv-legate/cupynumeric + dev_url: https://github.com/nv-legate/cupynumeric extra: recipe-maintainers: diff --git a/continuous_integration/requirements-build.txt b/continuous_integration/requirements-build.txt new file mode 100644 index 0000000000..6ce8269a49 --- /dev/null +++ b/continuous_integration/requirements-build.txt @@ -0,0 +1,10 @@ +--extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple +--extra-index-url=https://pypi.nvidia.com +cmake>=3.26.4,!=3.30.0 +ninja +nvidia-nccl-cu12 +cutensor-cu12 +scikit-build +scikit-build-core[pyproject]>=0.10.0 +setuptools_scm +cython diff --git a/continuous_integration/scripts/build b/continuous_integration/scripts/build index f68c5ba800..8287fc517a 100755 --- a/continuous_integration/scripts/build +++ b/continuous_integration/scripts/build @@ -2,22 +2,19 @@ set -x build_release_product() { - set -xeo pipefail; + set -xeuo pipefail; echo "RUNNING build_release_product" mkdir -p /tmp/env_yaml /tmp/conda-build /tmp/out - cp -r "${ARTIFACTS_DIR}/conda-build/legate" /tmp/conda-build/ - local conda_build_args=(); - conda_build_args+=(--override-channels); - # The channel sequence below needs to be preserved + conda_build_args+=(-c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL}); + conda_build_args+=(-c legate/label/ucc140); conda_build_args+=(-c conda-forge); conda_build_args+=(--override-channels); - conda_build_args+=(-c file:///tmp/conda-build/legate); - conda_build_args+=(--croot /tmp/conda-build/cunumeric); + conda_build_args+=(--croot /tmp/conda-build/cupynumeric); conda_build_args+=(--numpy 1.22); conda_build_args+=(--no-test); conda_build_args+=(--no-verify); @@ -32,13 +29,29 @@ build_release_product() { UPLOAD_BUILD=true [ "${UPLOAD_ENABLED:-}" = "OFF" ] && UPLOAD_BUILD=false - variantOpts=$(printf "{\"gpu_enabled\": [$GPU_ENABLED], \"upload_build\": 
[$UPLOAD_BUILD], \"python\": [$PYTHON_VERSION]}") + variantOpts=$(printf "{\"gpu_enabled\": [$GPU_ENABLED], \"build_tests\": [$BUILD_TESTS], \"upload_build\": [$UPLOAD_BUILD], \"python\": [$PYTHON_VERSION]}") conda_build_args+=(--variants "$variantOpts") - # https://github.com/nv-legate/cunumeric.internal/pull/351#issuecomment-2286922486 + # https://github.com/nv-legate/cupynumeric.internal/pull/351#issuecomment-2286922486 export CONDA_OVERRIDE_CUDA="${CUDA_VERSION}" - conda mambabuild "${conda_build_args[@]}" "${REPO_DIR}/conda/conda-build"; + + # Use the new .conda format. + conda config --set conda_build.pkg_format 2 + + # Set up the SCCACHE environment variables + export CI=true + source "${REPO_DIR}/continuous_integration/scripts/tools/legate-configure-sccache" + sccache --zero-stats + + set +u; + + # For whatever reason, the default buffering of conda/mamba is not sufficient, and + # leads to garbled output in CI (mixing conda output and whatever build.sh prints). So + # we need to force unbuffered output. 
+ stdbuf -o0 -e0 conda mambabuild "${conda_build_args[@]}" "${REPO_DIR}/conda/conda-build"; + + sccache --show-adv-stats copy_release_artifacts } @@ -52,53 +65,20 @@ copy_release_artifacts() { ls -lahR $ARTIFACTS_DIR } -copy_ci_artifacts() { - set -xeuo pipefail; - echo Copying CI artifacts - - mkdir -p "$ARTIFACTS_DIR" - - cp -r /tmp/out "$ARTIFACTS_DIR" - cp -r /tmp/conda-build "$ARTIFACTS_DIR" -} - -build_ci_product() { - set -xeuo pipefail; - - printf "\n\n\n\n********* BUILDING CUNUMERIC CPP *********\n" - build-cunumeric-cpp; - - printf "\n\n\n\n********* BUILDING CUNUMERIC WHEEL *********\n" - build-cunumeric-wheel; - - printf "\n\n\n\n********* BUILDING CUNUMERIC CONDA *********\n" - build-cunumeric-conda; - - copy_ci_artifacts; -} - -build_cunumeric_fake() { - set -xeuo pipefail; - - mkdir -p /tmp/out /tmp/conda-build/legate /tmp/conda-build/cunumeric - touch /tmp/out/legate-23.11.00-dummy.tar.bz2 - touch /tmp/conda-build/legate/dummy.txt - touch /tmp/conda-build/cunumeric/dummy.txt - - copy_ci_artifacts -} - build_project() { . setup-utils; init_build_env "$@"; + . conda-dnld-utils; + setup_conda_channel; + generate_legate_version + case "$BUILD_TYPE" in - ci) build_ci_product;; - release) build_release_product;; + ci) build_release_product;; + nightly) build_release_product;; *) return 1;; esac } - (build_project "$@"); diff --git a/continuous_integration/scripts/build-cunumeric-conda b/continuous_integration/scripts/build-cunumeric-conda deleted file mode 100755 index e1b83ca699..0000000000 --- a/continuous_integration/scripts/build-cunumeric-conda +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env bash - -build_cunumeric_conda_package() { - set -xeuo pipefail; - - local python_version="${PYTHON_VERSION:-}"; - - if [ -z "${python_version}" ]; then - python_version="$(python3 --version 2>&1 | cut -d' ' -f2 | cut -d'.' 
-f3 --complement)"; - fi - - mkdir -p /tmp/conda-build /tmp/out - cp -r "${ARTIFACTS_DIR}/conda-build/legate" /tmp/conda-build/ - - local conda_build_args=(); - conda_build_args+=(--override-channels); - conda_build_args+=(-c conda-forge); - # the ucx channel is only necessary as a WAR until the real ucx 1.17 package is available on conda-forge - conda_build_args+=(-c https://github.com/nv-legate/ucx-package/raw/main); - conda_build_args+=(-c file:///tmp/conda-build/legate); - conda_build_args+=(--croot /tmp/conda-build/cunumeric); - conda_build_args+=(--numpy 1.22); - conda_build_args+=(--python ${python_version}); - conda_build_args+=(--no-test); - conda_build_args+=(--no-verify); - conda_build_args+=(--no-build-id); - conda_build_args+=("--build-id-pat=''"); - conda_build_args+=(--no-include-recipe); - conda_build_args+=(--no-anaconda-upload); - - GPU_ENABLED=true - [ "${USE_CUDA}" = "OFF" ] && GPU_ENABLED=false - - UPLOAD_BUILD=true - [ "${UPLOAD_ENABLED:-}" = "OFF" ] && UPLOAD_BUILD=false - - conda_build_args+=(--variants "{gpu_enabled:${GPU_ENABLED},python:${python_version}}"); - - rm -rf /tmp/conda-build/cunumeric; - mkdir -p /tmp/conda-build/cunumeric; - - # Synthesize new cunumeric conda-build build.sh script - - cat < "${REPO_DIR}/conda/conda-build/conda_build_config.yaml" -gpu_enabled: - - "${GPU_ENABLED}" - -upload_build: - - "${UPLOAD_BUILD}" - -python: - - "${python_version}" - -numpy_version: - - ">=1.22,<2" - -cmake_version: - - ">=3.20.1,!=3.23.0" - -use_local_path: - - "true" - -numpy: - - 1.22 - -package_version: - - "$(git -C "${REPO_DIR}" describe --abbrev=0 --tags | sed 's/[a-zA-Z]//g' | cut -d '.' 
-f -2).00" -EOF - - cat <<"EOF" > "${REPO_DIR}/conda/conda-build/build.sh" -# Install cunumeric C++ libs -tar -C "$PREFIX" --exclude="*.a" --strip-components=1 -xvf /tmp/out/cunumeric-*-Linux.tar.gz; - -# Install cunumeric Python wheel -pip install --no-deps --root / --prefix "$PREFIX" /tmp/out/cunumeric-*.whl; -EOF - - git -C "${REPO_DIR}" add .; - git -C "${REPO_DIR}" commit --allow-empty --allow-empty-message -n -m ""; - - # Build cuNumeric conda package - set +ux - eval "$(conda shell.bash hook)" - conda deactivate - conda create -n build - conda activate build - set -ux - conda install boa - - CUDA=${CUDA_VERSION} \ - conda mambabuild ${conda_build_args[@]} "${REPO_DIR}/conda/conda-build"; - - git -C "${REPO_DIR}" reset --hard HEAD~1; - - cp /tmp/conda-build/cunumeric/linux-64/cunumeric-*.tar.bz2 /tmp/out/; - - { set +x; } 2>/dev/null; -} - -(build_cunumeric_conda_package "$@"); diff --git a/continuous_integration/scripts/build-cunumeric-cpp b/continuous_integration/scripts/build-cunumeric-cpp deleted file mode 100755 index e608ec385d..0000000000 --- a/continuous_integration/scripts/build-cunumeric-cpp +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -build_cunumeric_cpp() { - set -xeuo pipefail; - - # Build + package cuNumeric C++ libs - local cmake_args=(${CMAKE_ARGS:-}); - cmake_args+=(-DBUILD_SHARED_LIBS=ON); - cmake_args+=(-DBUILD_MARCH=${BUILD_MARCH}); - cmake_args+=(-DCMAKE_BUILD_TYPE=Release); - cmake_args+=(-DCMAKE_CUDA_ARCHITECTURES=RAPIDS); - cmake_args+=(-DCMAKE_BUILD_PARALLEL_LEVEL=${JOBS:-$(nproc --ignore=1)}); - cmake_args+=(${@}); - - cmake -S "${REPO_DIR}" -B "${REPO_DIR}/build" ${cmake_args[@]} -GNinja; - - sccache --show-stats; - - time cmake --build "${REPO_DIR}/build" --verbose --parallel ${JOBS:-$(nproc --ignore=1)}; - - sccache --show-stats; - - ( - mkdir -p /tmp/out; - cd "${REPO_DIR}/build"; - cpack -G TGZ; - cp ./*-Linux.tar.gz /tmp/out/; - ); - - { set +x; } 2>/dev/null; -} - -(build_cunumeric_cpp "$@"); diff --git 
a/continuous_integration/scripts/build-cunumeric-wheel b/continuous_integration/scripts/build-cunumeric-wheel deleted file mode 100755 index 93ae353118..0000000000 --- a/continuous_integration/scripts/build-cunumeric-wheel +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -build_cunumeric_wheel() { - set -xeuo pipefail; - - mkdir -p /tmp/out; - - local pip_args=(-vv); - pip_args+=(--wheel-dir /tmp/out); - - if type conda 2>&1 >/dev/null; then - pip_args+=(--no-deps); - pip_args+=(--no-build-isolation); - fi - - local cmake_args=(${CMAKE_ARGS:-}); - cmake_args+=("-DFIND_CUNUMERIC_CPP=ON"); - - pwd - echo $REPO_DIR - ls -lahR $REPO_DIR - - cmake_args+=("-Dcunumeric_ROOT=$REPO_DIR/build"); - - # Build + package cuNumeric Python wheel - CMAKE_ARGS="${cmake_args[@]}" \ - pip wheel ${pip_args[@]} "${REPO_DIR}"; - - { set +x; } 2>/dev/null; -} - -(build_cunumeric_wheel "$@"); diff --git a/continuous_integration/scripts/build_wheel_linux.bash b/continuous_integration/scripts/build_wheel_linux.bash new file mode 100755 index 0000000000..fdd14e668e --- /dev/null +++ b/continuous_integration/scripts/build_wheel_linux.bash @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +set -euo pipefail + +echo "Building a wheel..." + +pwd + +ls -lah + +ls -lh wheel + +# Configure and enable sccache for the build. 
+source legate-configure-sccache +export CMAKE_BUILD_PARALLEL_LEVEL=${PARALLEL_LEVEL} + +if [[ "${CI:-false}" == "true" ]]; then + echo "Installing extra system packages" + dnf install -y gcc-toolset-11-libatomic-devel + # Enable gcc-toolset-11 environment + source /opt/rh/gcc-toolset-11/enable + # Verify compiler version + gcc --version + g++ --version +fi + +echo "PATH: ${PATH}" + +if [[ "${CUPYNUMERIC_DIR:-}" == "" ]]; then + # If we are running in an action then GITHUB_WORKSPACE is set. + if [[ "${GITHUB_WORKSPACE:-}" == "" ]]; then + script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" + CUPYNUMERIC_DIR="$(realpath "${script_dir}"/../../)" + else + # Simple path witin GitHub actions workflows. + CUPYNUMERIC_DIR="${GITHUB_WORKSPACE}" + fi + export CUPYNUMERIC_DIR +fi +package_name="cupynumeric" +package_dir="${CUPYNUMERIC_DIR}/scripts/build/python/cupynumeric" + +# This is all very hackish and needs to be fixed up. +echo "Installing build requirements" +python -m pip install -v --prefer-binary -r continuous_integration/requirements-build.txt + +# Install the legate wheel that was downloaded. +pip install wheel/*.whl + +sitepkgs=$(python -c 'import site; print(site.getsitepackages()[0], end="")') +# Add in the symbolic links for cuTensor so that CMake can find it (hack) +ln -fs "${sitepkgs}"/cutensor/lib/libcutensor.so.2 "${sitepkgs}"/cutensor/lib/libcutensor.so +ln -fs "${sitepkgs}"/cutensor/lib/libcutensorMg.so.2 "${sitepkgs}"/cutensor/lib/libcutensorMg.so + +# TODO(cryos): https://github.com/nv-legate/cupynumeric.internal/issues/666 +# This is a very hackish way to generate the version for now. 
+scm_version=$(python -m setuptools_scm -c "${CUPYNUMERIC_DIR}"/scripts/build/python/cupynumeric/pyproject.toml) +export SETUPTOOLS_SCM_PRETEND_VERSION="${scm_version}" +echo "Building wheels with version '${scm_version}'" + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + +# The cupynumeric build system defaults to -march=native, which is not going to work +# for packages we want to reuse! Set some reasonable defaults for the wheels. +ARCH=$(uname -m) +echo "Building on architecture: ${ARCH}" +if [[ "$ARCH" == "aarch64" ]]; then + BUILD_MARCH=armv8-a +else + BUILD_MARCH=haswell +fi + +echo "Building ${package_name}" +# TODO(cryos): https://github.com/nv-legate/legate.internal/issues/1894 +# Improve the use of CMAKE_PREFIX_PATH to find legate and cutensor once +# scikit-build supports it. +CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${sitepkgs}/legate;${sitepkgs}/cutensor" +export CMAKE_ARGS +SKBUILD_CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES:STRING=all-major;-DBUILD_SHARED_LIBS:BOOL=ON;-DBUILD_MARCH=${BUILD_MARCH}" +export SKBUILD_CMAKE_ARGS +echo "SKBUILD_CMAKE_ARGS='${SKBUILD_CMAKE_ARGS}'" + +# TODO: Remove this hackish removal of scikit-build files needed as conda +# uses scikit-build and wheels are using scikit-build-core. Migrate conda to +# be consistent with legate and wheels. If not deleted we get inconsistent +# metadata failure during the pip wheel build. +mv "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py.bak +echo "Removed scikit-build _version.py file" +ls -lah + +echo "Building wheel..." +cd "${package_dir}" + +sccache --zero-stats + +python -m pip wheel \ + -w "${CUPYNUMERIC_DIR}"/dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . 
+ +sccache --show-adv-stats + +echo "Show dist contents" +pwd +ls -lh "${CUPYNUMERIC_DIR}"/dist + +echo "Repairing the wheel" +mkdir -p "${CUPYNUMERIC_DIR}"/final-dist +python -m auditwheel repair \ + --exclude libnvJitLink.so* \ + --exclude libcuda.so* \ + --exclude liblegate.so* \ + --exclude libcublas.so* \ + --exclude libcublasLt.so* \ + --exclude libnccl.so* \ + --exclude libcusparse.so* \ + --exclude libcutensor.so* \ + --exclude libcufft.so* \ + --exclude libcusolver.so* \ + --exclude liblegion-legate.so* \ + --exclude librealm-legate.so* \ + -w "${CUPYNUMERIC_DIR}"/final-dist \ + "${CUPYNUMERIC_DIR}"/dist/*.whl + +echo "Wheel has been repaired. Contents:" +ls -lh "${CUPYNUMERIC_DIR}"/final-dist + +echo "Restoring scikit-build _verion.py file" +mv "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py.bak "${CUPYNUMERIC_DIR}"/cupynumeric/_version.py diff --git a/continuous_integration/scripts/conda-dnld-utils b/continuous_integration/scripts/conda-dnld-utils new file mode 100755 index 0000000000..ddf1c6bbe5 --- /dev/null +++ b/continuous_integration/scripts/conda-dnld-utils @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -x + +generate_legate_version() { + legate_json_version="$(jq -r '.packages.legate.version' ${REPO_DIR}/cmake/versions.json)"; + legate_SHA="$(jq -r '.packages.legate.git_tag' ${REPO_DIR}/cmake/versions.json)"; + legate_hash="g${legate_SHA:0:8}" + export LEGATE_VERSION="${legate_json_version}*" + export LEGATE_BUILDSTR="*${legate_hash}*" + echo "LEGATE_VERSION=${LEGATE_VERSION} : LEGATE_BUILDSTR=${LEGATE_BUILDSTR}" +} + +verify_legate_version() { + legate-mamba-retry search legate=${LEGATE_VERSION} --channel https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL} + if [ $? -ne 0 ]; then + echo "Error: conda search failed for legate." >&2; exit 1 + fi +} + +setup_conda_channel() { + if ! 
command -v jq &> /dev/null; then + echo "Installing jq" + apt-get update -q + apt-get -q install -y jq + fi + # strict channel ordering is required for prioritizing packages from artifacts + conda config --set channel_priority strict + legate_conda_label="$(jq -r '.packages.legate.anaconda_label' ${REPO_DIR}/cmake/versions.json)"; + export CONDA_CHANNEL="legate" + export CONDA_LABEL="${legate_conda_label}" + echo "CONDA_CHANNEL=${CONDA_CHANNEL} : CONDA_LABEL=${CONDA_LABEL}" +} diff --git a/continuous_integration/scripts/make-conda-env b/continuous_integration/scripts/make-conda-env index 597d9ee613..eacc3c5891 100755 --- a/continuous_integration/scripts/make-conda-env +++ b/continuous_integration/scripts/make-conda-env @@ -4,39 +4,36 @@ set -x . conda-utils -make_ci_env() { - set -xeuo pipefail - yaml_file=$(find "${ARTIFACTS_DIR}" -name "environment*.yaml" | head -n 1) +make_release_env() { + legate-mamba-retry create -q -y -n "${CONDA_ENV}" -c conda-forge boa +} - sed -i '$ d' ${yaml_file} - echo " - legate" >> "${yaml_file}" - sed -i "/channels:/!b;:a;n;/^- /ba;i\- ${ARTIFACTS_DIR}/conda-build/legate" ${yaml_file} - [ "${USE_CUDA}" = "ON" ] && - echo " - libcublas-dev" >> "${yaml_file}" && - echo " - libcufft-dev" >> "${yaml_file}" && - echo " - libcurand-dev" >> "${yaml_file}" && - echo " - libcusolver-dev" >> "${yaml_file}"; +make_docs_env() { + set -xeuo pipefail - echo "YAML file..." - cat "${yaml_file}" + export DEBIAN_FRONTEND=non-interactive + export CONDA_ENV=legate - mkdir -p /tmp/out; + # Run package updates and install packages + apt-get update + apt-get install -y numactl make - cp "${yaml_file}" /tmp/out + legate-mamba-retry create -yn "${CONDA_ENV}" pandoc doxygen - mamba env create -n legate -f "$yaml_file" -} + . 
conda-utils; + activate_conda_env; -make_release_env() { - mamba create -q -y -n "${CONDA_ENV}" -c conda-forge boa + # mamba install -y pandoc doxygen + pip install ipython jinja2 "markdown<3.4.0" myst-parser nbsphinx sphinx-copybutton "sphinx>=8" nvidia-sphinx-theme cffi } make_conda_env() { set -xeuo pipefail case "$1" in - ci) make_ci_env;; - release) make_release_env;; + ci) make_release_env;; + nightly) make_release_env;; + docs) make_docs_env;; *) return 1;; esac diff --git a/continuous_integration/scripts/test b/continuous_integration/scripts/test index 60bf105959..0bdb65d914 100755 --- a/continuous_integration/scripts/test +++ b/continuous_integration/scripts/test @@ -5,35 +5,48 @@ set -x setup_env() { set -xeuo pipefail + . conda-dnld-utils + setup_conda_channel; export DEBIAN_FRONTEND=non-interactive # Run package updates and install packages apt-get update apt-get install -y numactl make - mamba create -yn legate -c "${ARTIFACTS_DIR}/conda-build/legate" -c "${ARTIFACTS_DIR}/conda-build/cunumeric" -c conda-forge legate cunumeric + legate-mamba-retry search --override-channels -c "${ARTIFACTS_DIR}/conda-build/cupynumeric" --info cupynumeric + + # This requires strict channel priority to work (prioritize local channel) + legate-mamba-retry create -y -n legate -c "${ARTIFACTS_DIR}/conda-build/cupynumeric" -c https://conda.anaconda.org/${CONDA_CHANNEL}/label/${CONDA_LABEL} -c legate/label/ucc140 -c conda-forge legate cupynumeric } setup_test_env() { - mamba install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-mock "pytest" types-docutils pynvml psutil + legate-mamba-retry install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-mock "pytest" types-docutils pynvml psutil pip install tifffile } setup_docs_env() { - mamba install -y pandoc doxygen - pip install ipython jinja2 "markdown<3.4.0" "pydata-sphinx-theme>=0.13" myst-parser nbsphinx sphinx-copybutton "sphinx>=4.4.0" + 
legate-mamba-retry install -y pandoc doxygen + pip install ipython jinja2 "markdown<3.4.0" myst-parser nbsphinx sphinx-copybutton "sphinx>=8" nvidia-sphinx-theme cffi } setup_mypy_env() { - mamba install -y "mypy>=0.961" jinja2 nbsphinx sphinx-copybutton "sphinx>=4.4.0" types-docutils + legate-mamba-retry install -y "mypy>=0.961" jinja2 nbsphinx sphinx-copybutton "sphinx>=4.4.0" types-docutils } setup_unit_env() { - mamba install -y pytest pytest-mock mock + legate-mamba-retry install -y pytest pytest-mock mock cffi +} + +run_legate_issue() { + if command -v "legate-issue" &> /dev/null; then + legate-issue + else + echo "WARNING: legate-issue not found." + fi } -test_cunumeric() { +test_cupynumeric() { set -xeo pipefail . conda-utils; @@ -51,27 +64,42 @@ test_cunumeric() { echo "Executing tests..." shift; setup_test_env; + run_legate_issue; ./test.py -vv --timeout 300 "$@" ;; "mypy") echo "Installing and executing mypy..." shift; setup_mypy_env; - mypy cunumeric + run_legate_issue; + mypy cupynumeric ;; "docs") echo "Building docs..." shift; setup_docs_env; - cd docs/cunumeric + run_legate_issue; + cd docs/cupynumeric make clean html + # ls -lah . + echo Copying artifacts + cd build/html + cp -r . "${OUTPUT_ARTIFACTS_DIR}" ;; "unit") echo "Running Unit tests..." shift; setup_unit_env; + run_legate_issue; LEGATE_AUTO_CONFIG=0 pytest tests/unit ;; + "cpp") + echo "Running CPP tests..." 
+ shift; + run_legate_issue; + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/legate/deps:${LD_LIBRARY_PATH:-} + REALM_BACKTRACE=1 LEGATE_TEST=1 LEGATE_LOG_MAPPING=1 ${CONDA_PREFIX}/bin/cpp_tests + ;; *) echo "Invalid command: $1" return 1 @@ -79,4 +107,4 @@ test_cunumeric() { esac } -(test_cunumeric "$@"); +(test_cupynumeric "$@"); diff --git a/continuous_integration/scripts/test_wheel_linux.bash b/continuous_integration/scripts/test_wheel_linux.bash new file mode 100755 index 0000000000..4414d96003 --- /dev/null +++ b/continuous_integration/scripts/test_wheel_linux.bash @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +set -euo pipefail + +echo "Are my wheels there???" + +ls -lh + +ls -lh wheel +ls -lh final-dist + +# Install legate first and then cupynumeric. +pip install wheel/*.whl final-dist/*.whl + +echo "Let's explore the wheels and see if they are installed correctly." +sitepkgs=$(python -c 'import site; print(site.getsitepackages()[0], end="")') +echo "=== cupynumeric ===" +ls -lh "${sitepkgs}/cupynumeric" +echo "=== legate ===" +ls -lh "${sitepkgs}/legate" + +echo "Lamest of proof of life tests for legate" +export LEGATE_SHOW_CONFIG=1 +export LEGATE_CONFIG="--fbmem 1024" +export LEGION_DEFAULT_ARGS="-ll:show_rsrv" + +# Attempt to run the tests... 
+mv cupynumeric cupynumeric-moved +pip install pytest pynvml psutil scipy + +echo "Attempt to run an example" +legate examples/gemm.py + +echo "Example done, attempt to import cupynumeric" +python -c 'import cupynumeric as np' +echo "Maybe that worked" + +echo "Running the CPU tests" +python test.py +echo "Done" + +echo "Running the GPU tests" +python test.py --use cuda +echo "Done" diff --git a/continuous_integration/scripts/tools/legate-configure-sccache b/continuous_integration/scripts/tools/legate-configure-sccache new file mode 100755 index 0000000000..bd7a5e0be5 --- /dev/null +++ b/continuous_integration/scripts/tools/legate-configure-sccache @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# A utility script that configures sccache environment variables + +export CMAKE_CUDA_COMPILER_LAUNCHER=sccache +export CMAKE_CXX_COMPILER_LAUNCHER=sccache +export CMAKE_C_COMPILER_LAUNCHER=sccache +export RUSTC_WRAPPER=sccache +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-$(nproc --all --ignore=2)} +export SCCACHE_BUCKET=rapids-sccache-east +export SCCACHE_IDLE_TIMEOUT=32768 +export SCCACHE_REGION=us-east-2 +export SCCACHE_S3_KEY_PREFIX=legate-cunumeric-dev +export SCCACHE_S3_NO_CREDENTIALS=false +export SCCACHE_S3_USE_SSL=true + +if [[ "${CI:-false}" == "false" ]]; then + # Configure sccache for read-only mode since no credentials + # are available in local builds. + export SCCACHE_S3_NO_CREDENTIALS=true +fi diff --git a/continuous_integration/scripts/tools/legate-gh-download-artifact b/continuous_integration/scripts/tools/legate-gh-download-artifact new file mode 100755 index 0000000000..5accd59edd --- /dev/null +++ b/continuous_integration/scripts/tools/legate-gh-download-artifact @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# A utility script adapted from https://github.com/rapidsai/gha-tools/blob/main/tools/rapids-download-from-github +# Given a git SHA, artifact name and output path grab the artifact from the run. + +set -euo pipefail + +# Default values for the environment variables. +LEGATE_REPO_NAME=${LEGATE_REPO_NAME:-"nv-legate/legate.internal"} + +# Check if the script was called with exactly 1 argument +if [[ ${#} -ne 3 ]]; then + echo "Error: This script requires exactly 3 arguments (the git SHA, the artifact name, and the output path)." + echo "You provided ${#} arguments." + echo "Usage: ${0} git-sha artifact-name output-path" + exit 1 +fi + +# Poppulate our variables from the arguments. +run_id=$(legate-gh-run-id "${1}") +artifact_name="${2}" +output_path="${3}" + +echo "Downloading and decompressing artifact ${artifact_name} from run ${run_id} to ${output_path}" + +gh run download "${run_id}" \ + --repo "${LEGATE_REPO_NAME}" \ + --name "${artifact_name}" \ + --dir "${output_path}" + +echo -n "${output_path}" diff --git a/continuous_integration/scripts/tools/legate-gh-run-id b/continuous_integration/scripts/tools/legate-gh-run-id new file mode 100755 index 0000000000..339a674296 --- /dev/null +++ b/continuous_integration/scripts/tools/legate-gh-run-id @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# A utility script adapted from https://github.com/rapidsai/gha-tools/blob/main/tools/rapids-github-run-id +# This gets the GitHub run ID for the specified workflow and commit SHA. + +set -euo pipefail + +# Default values for the environment variables. 
+LEGATE_WORKFLOW_NAME=${LEGATE_WORKFLOW_NAME:-"pr"} +LEGATE_REF_NAME=${LEGATE_REF_NAME:-"main"} +LEGATE_REPO_NAME=${LEGATE_REPO_NAME:-"nv-legate/legate.internal"} + +# Check if the script was called with exactly 1 argument +if [[ ${#} -ne 1 ]]; then + echo "Error: This script requires exactly 1 argument (the git SHA). You provided ${#}" + echo "Usage: ${0} git-sha" + exit 1 +fi + +gh_run_id=$(gh run list \ + --repo "${LEGATE_REPO_NAME}" \ + --branch "${LEGATE_REF_NAME}" \ + --workflow "${LEGATE_WORKFLOW_NAME}" \ + --commit "${1}" \ + --json databaseId --jq '.[0] | .databaseId') + +echo -n "${gh_run_id}" diff --git a/cunumeric/config.py b/cunumeric/config.py deleted file mode 100644 index 310ca9f416..0000000000 --- a/cunumeric/config.py +++ /dev/null @@ -1,824 +0,0 @@ -# Copyright 2024 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from __future__ import annotations - -import os -import platform -from abc import abstractmethod -from ctypes import CDLL, RTLD_GLOBAL -from enum import IntEnum, unique -from typing import TYPE_CHECKING, Any, cast - -import cffi # type: ignore -import numpy as np - -if TYPE_CHECKING: - import numpy.typing as npt - - -class _ReductionOpIds: - argmax_redop_id: int - argmin_redop_id: int - - -class _CunumericSharedLib: - CUNUMERIC_ADVANCED_INDEXING: int - CUNUMERIC_ARANGE: int - CUNUMERIC_ARGWHERE: int - CUNUMERIC_BATCHED_CHOLESKY: int - CUNUMERIC_BINARY_OP: int - CUNUMERIC_BINARY_RED: int - CUNUMERIC_BINCOUNT: int - CUNUMERIC_BINOP_ADD: int - CUNUMERIC_BINOP_ARCTAN2: int - CUNUMERIC_BINOP_BITWISE_AND: int - CUNUMERIC_BINOP_BITWISE_OR: int - CUNUMERIC_BINOP_BITWISE_XOR: int - CUNUMERIC_BINOP_COPYSIGN: int - CUNUMERIC_BINOP_DIVIDE: int - CUNUMERIC_BINOP_EQUAL: int - CUNUMERIC_BINOP_FLOAT_POWER: int - CUNUMERIC_BINOP_FLOOR_DIVIDE: int - CUNUMERIC_BINOP_FMOD: int - CUNUMERIC_BINOP_GCD: int - CUNUMERIC_BINOP_GREATER: int - CUNUMERIC_BINOP_GREATER_EQUAL: int - CUNUMERIC_BINOP_HYPOT: int - CUNUMERIC_BINOP_ISCLOSE: int - CUNUMERIC_BINOP_LCM: int - CUNUMERIC_BINOP_LDEXP: int - CUNUMERIC_BINOP_LEFT_SHIFT: int - CUNUMERIC_BINOP_LESS: int - CUNUMERIC_BINOP_LESS_EQUAL: int - CUNUMERIC_BINOP_LOGADDEXP2: int - CUNUMERIC_BINOP_LOGADDEXP: int - CUNUMERIC_BINOP_LOGICAL_AND: int - CUNUMERIC_BINOP_LOGICAL_OR: int - CUNUMERIC_BINOP_LOGICAL_XOR: int - CUNUMERIC_BINOP_MAXIMUM: int - CUNUMERIC_BINOP_MINIMUM: int - CUNUMERIC_BINOP_MOD: int - CUNUMERIC_BINOP_MULTIPLY: int - CUNUMERIC_BINOP_NEXTAFTER: int - CUNUMERIC_BINOP_NOT_EQUAL: int - CUNUMERIC_BINOP_POWER: int - CUNUMERIC_BINOP_RIGHT_SHIFT: int - CUNUMERIC_BINOP_SUBTRACT: int - CUNUMERIC_BITGENERATOR: int - CUNUMERIC_BITGENOP_DISTRIBUTION: int - CUNUMERIC_BITGENTYPE_DEFAULT: int - CUNUMERIC_BITGENTYPE_XORWOW: int - CUNUMERIC_BITGENTYPE_MRG32K3A: int - CUNUMERIC_BITGENTYPE_MTGP32: int - CUNUMERIC_BITGENTYPE_MT19937: int - 
CUNUMERIC_BITGENTYPE_PHILOX4_32_10: int - CUNUMERIC_BITGENDIST_INTEGERS_16: int - CUNUMERIC_BITGENDIST_INTEGERS_32: int - CUNUMERIC_BITGENDIST_INTEGERS_64: int - CUNUMERIC_BITGENDIST_UNIFORM_32: int - CUNUMERIC_BITGENDIST_UNIFORM_64: int - CUNUMERIC_BITGENDIST_LOGNORMAL_32: int - CUNUMERIC_BITGENDIST_LOGNORMAL_64: int - CUNUMERIC_BITGENDIST_NORMAL_32: int - CUNUMERIC_BITGENDIST_NORMAL_64: int - CUNUMERIC_BITGENDIST_POISSON: int - CUNUMERIC_BITGENDIST_EXPONENTIAL_32: int - CUNUMERIC_BITGENDIST_EXPONENTIAL_64: int - CUNUMERIC_BITGENDIST_GUMBEL_32: int - CUNUMERIC_BITGENDIST_GUMBEL_64: int - CUNUMERIC_BITGENDIST_LAPLACE_32: int - CUNUMERIC_BITGENDIST_LAPLACE_64: int - CUNUMERIC_BITGENDIST_LOGISTIC_32: int - CUNUMERIC_BITGENDIST_LOGISTIC_64: int - CUNUMERIC_BITGENDIST_PARETO_32: int - CUNUMERIC_BITGENDIST_PARETO_64: int - CUNUMERIC_BITGENDIST_POWER_32: int - CUNUMERIC_BITGENDIST_POWER_64: int - CUNUMERIC_BITGENDIST_RAYLEIGH_32: int - CUNUMERIC_BITGENDIST_RAYLEIGH_64: int - CUNUMERIC_BITGENDIST_CAUCHY_32: int - CUNUMERIC_BITGENDIST_CAUCHY_64: int - CUNUMERIC_BITGENDIST_TRIANGULAR_32: int - CUNUMERIC_BITGENDIST_TRIANGULAR_64: int - CUNUMERIC_BITGENDIST_WEIBULL_32: int - CUNUMERIC_BITGENDIST_WEIBULL_64: int - CUNUMERIC_BITGENDIST_BYTES: int - CUNUMERIC_BITGENDIST_BETA_32: int - CUNUMERIC_BITGENDIST_BETA_64: int - CUNUMERIC_BITGENDIST_F_32: int - CUNUMERIC_BITGENDIST_F_64: int - CUNUMERIC_BITGENDIST_LOGSERIES: int - CUNUMERIC_BITGENDIST_NONCENTRAL_F_32: int - CUNUMERIC_BITGENDIST_NONCENTRAL_F_64: int - CUNUMERIC_BITGENDIST_CHISQUARE_32: int - CUNUMERIC_BITGENDIST_CHISQUARE_64: int - CUNUMERIC_BITGENDIST_GAMMA_32: int - CUNUMERIC_BITGENDIST_GAMMA_64: int - CUNUMERIC_BITGENDIST_STANDARD_T_32: int - CUNUMERIC_BITGENDIST_STANDARD_T_64: int - CUNUMERIC_BITGENDIST_HYPERGEOMETRIC: int - CUNUMERIC_BITGENDIST_VONMISES_32: int - CUNUMERIC_BITGENDIST_VONMISES_64: int - CUNUMERIC_BITGENDIST_ZIPF: int - CUNUMERIC_BITGENDIST_GEOMETRIC: int - CUNUMERIC_BITGENDIST_WALD_32: int - 
CUNUMERIC_BITGENDIST_WALD_64: int - CUNUMERIC_BITGENDIST_BINOMIAL: int - CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL: int - CUNUMERIC_BITGENOP_CREATE: int - CUNUMERIC_BITGENOP_DESTROY: int - CUNUMERIC_BITGENOP_RAND_RAW: int - CUNUMERIC_BITORDER_BIG: int - CUNUMERIC_BITORDER_LITTLE: int - CUNUMERIC_CHOOSE: int - CUNUMERIC_CONTRACT: int - CUNUMERIC_CONVERT: int - CUNUMERIC_CONVERT_NAN_NOOP: int - CUNUMERIC_CONVERT_NAN_PROD: int - CUNUMERIC_CONVERT_NAN_SUM: int - CUNUMERIC_CONVOLVE: int - CUNUMERIC_DIAG: int - CUNUMERIC_DOT: int - CUNUMERIC_EYE: int - CUNUMERIC_FFT: int - CUNUMERIC_FFT_C2C: int - CUNUMERIC_FFT_C2R: int - CUNUMERIC_FFT_D2Z: int - CUNUMERIC_FFT_FORWARD: int - CUNUMERIC_FFT_INVERSE: int - CUNUMERIC_FFT_R2C: int - CUNUMERIC_FFT_Z2D: int - CUNUMERIC_FFT_Z2Z: int - CUNUMERIC_FILL: int - CUNUMERIC_FLIP: int - CUNUMERIC_GEMM: int - CUNUMERIC_HISTOGRAM: int - CUNUMERIC_LOAD_CUDALIBS: int - CUNUMERIC_MATMUL: int - CUNUMERIC_MATVECMUL: int - CUNUMERIC_MAX_MAPPERS: int - CUNUMERIC_MAX_REDOPS: int - CUNUMERIC_MAX_TASKS: int - CUNUMERIC_MP_POTRF: int - CUNUMERIC_MP_SOLVE: int - CUNUMERIC_NONZERO: int - CUNUMERIC_PACKBITS: int - CUNUMERIC_POTRF: int - CUNUMERIC_PUTMASK: int - CUNUMERIC_QR: int - CUNUMERIC_RAND: int - CUNUMERIC_READ: int - CUNUMERIC_RED_ALL: int - CUNUMERIC_RED_ANY: int - CUNUMERIC_RED_ARGMAX: int - CUNUMERIC_RED_ARGMIN: int - CUNUMERIC_RED_CONTAINS: int - CUNUMERIC_RED_COUNT_NONZERO: int - CUNUMERIC_RED_MAX: int - CUNUMERIC_RED_MIN: int - CUNUMERIC_RED_NANARGMAX: int - CUNUMERIC_RED_NANARGMIN: int - CUNUMERIC_RED_NANMAX: int - CUNUMERIC_RED_NANMIN: int - CUNUMERIC_RED_NANPROD: int - CUNUMERIC_RED_NANSUM: int - CUNUMERIC_RED_PROD: int - CUNUMERIC_RED_SUM: int - CUNUMERIC_RED_SUM_SQUARES: int - CUNUMERIC_RED_VARIANCE: int - CUNUMERIC_REPEAT: int - CUNUMERIC_SCALAR_UNARY_RED: int - CUNUMERIC_SCAN_GLOBAL: int - CUNUMERIC_SCAN_LOCAL: int - CUNUMERIC_SCAN_PROD: int - CUNUMERIC_SCAN_SUM: int - CUNUMERIC_SEARCHSORTED: int - CUNUMERIC_SELECT: int - 
CUNUMERIC_SOLVE: int - CUNUMERIC_SORT: int - CUNUMERIC_SVD: int - CUNUMERIC_SYRK: int - CUNUMERIC_TILE: int - CUNUMERIC_TRANSPOSE_COPY_2D: int - CUNUMERIC_TRILU: int - CUNUMERIC_TRSM: int - CUNUMERIC_UNARY_OP: int - CUNUMERIC_UNARY_RED: int - CUNUMERIC_UNIQUE: int - CUNUMERIC_UNIQUE_REDUCE: int - CUNUMERIC_UNLOAD_CUDALIBS: int - CUNUMERIC_UNPACKBITS: int - CUNUMERIC_UOP_ABSOLUTE: int - CUNUMERIC_UOP_ANGLE: int - CUNUMERIC_UOP_ARCCOS: int - CUNUMERIC_UOP_ARCCOSH: int - CUNUMERIC_UOP_ARCSIN: int - CUNUMERIC_UOP_ARCSINH: int - CUNUMERIC_UOP_ARCTAN: int - CUNUMERIC_UOP_ARCTANH: int - CUNUMERIC_UOP_CBRT: int - CUNUMERIC_UOP_CEIL: int - CUNUMERIC_UOP_CLIP: int - CUNUMERIC_UOP_CONJ: int - CUNUMERIC_UOP_COPY: int - CUNUMERIC_UOP_COS: int - CUNUMERIC_UOP_COSH: int - CUNUMERIC_UOP_DEG2RAD: int - CUNUMERIC_UOP_EXP2: int - CUNUMERIC_UOP_EXP: int - CUNUMERIC_UOP_EXPM1: int - CUNUMERIC_UOP_FLOOR: int - CUNUMERIC_UOP_FREXP: int - CUNUMERIC_UOP_GETARG: int - CUNUMERIC_UOP_IMAG: int - CUNUMERIC_UOP_INVERT: int - CUNUMERIC_UOP_ISFINITE: int - CUNUMERIC_UOP_ISINF: int - CUNUMERIC_UOP_ISNAN: int - CUNUMERIC_UOP_LOG10: int - CUNUMERIC_UOP_LOG1P: int - CUNUMERIC_UOP_LOG2: int - CUNUMERIC_UOP_LOG: int - CUNUMERIC_UOP_LOGICAL_NOT: int - CUNUMERIC_UOP_MODF: int - CUNUMERIC_UOP_NEGATIVE: int - CUNUMERIC_UOP_POSITIVE: int - CUNUMERIC_UOP_RAD2DEG: int - CUNUMERIC_UOP_REAL: int - CUNUMERIC_UOP_RECIPROCAL: int - CUNUMERIC_UOP_RINT: int - CUNUMERIC_UOP_ROUND: int - CUNUMERIC_UOP_SIGN: int - CUNUMERIC_UOP_SIGNBIT: int - CUNUMERIC_UOP_SIN: int - CUNUMERIC_UOP_SINH: int - CUNUMERIC_UOP_SQRT: int - CUNUMERIC_UOP_SQUARE: int - CUNUMERIC_UOP_TAN: int - CUNUMERIC_UOP_TANH: int - CUNUMERIC_UOP_TRUNC: int - CUNUMERIC_WHERE: int - CUNUMERIC_WINDOW: int - CUNUMERIC_WINDOW_BARLETT: int - CUNUMERIC_WINDOW_BLACKMAN: int - CUNUMERIC_WINDOW_HAMMING: int - CUNUMERIC_WINDOW_HANNING: int - CUNUMERIC_WINDOW_KAISER: int - CUNUMERIC_WRAP: int - CUNUMERIC_WRITE: int - CUNUMERIC_ZIP: int - - @abstractmethod - def 
cunumeric_has_cusolvermp(self) -> bool: - ... - - @abstractmethod - def cunumeric_max_eager_volume(self) -> int: - ... - - @abstractmethod - def cunumeric_register_reduction_ops(self, code: int) -> _ReductionOpIds: - ... - - -def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: - # Use an already-opened library handle, which cffi will convert to a - # regular FFI object (using the definitions previously added using - # ffi.cdef), but will not automatically dlclose() on collection. - lib = CDLL(lib_path, mode=RTLD_GLOBAL) - return ffi.dlopen(ffi.cast("void *", lib._handle)) - - -# Load the cuNumeric library first so we have a shard object that -# we can use to initialize all these configuration enumerations -class CuNumericLib: - def __init__(self, name: str) -> None: - self.name = name - - shared_lib_path = self.get_shared_library() - assert shared_lib_path is not None - header = self.get_c_header() - ffi = cffi.FFI() - if header is not None: - ffi.cdef(header) - # Don't use ffi.dlopen(), because that will call dlclose() - # automatically when the object gets collected, thus removing - # symbols that may be needed when destroying C++ objects later - # (e.g. vtable entries, which will be queried for virtual - # destructors), causing errors at shutdown. 
- shared_lib = dlopen_no_autoclose(ffi, shared_lib_path) - self.shared_object = cast(_CunumericSharedLib, shared_lib) - - def register(self) -> None: - from legate.core import get_legate_runtime - - # We need to make sure that the runtime is started - get_legate_runtime() - - callback = getattr( - self.shared_object, "cunumeric_perform_registration" - ) - callback() - - def get_shared_library(self) -> str: - from .install_info import libpath - - return os.path.join( - libpath, "libcunumeric" + self.get_library_extension() - ) - - def get_c_header(self) -> str: - from .install_info import header - - return header - - @staticmethod - def get_library_extension() -> str: - os_name = platform.system() - if os_name == "Linux": - return ".so" - elif os_name == "Darwin": - return ".dylib" - raise RuntimeError(f"unknown platform {os_name!r}") - - -CUNUMERIC_LIB_NAME = "cunumeric" -cunumeric_lib = CuNumericLib(CUNUMERIC_LIB_NAME) -cunumeric_lib.register() -_cunumeric = cunumeric_lib.shared_object - - -# Match these to CuNumericOpCode in cunumeric_c.h -@unique -class CuNumericOpCode(IntEnum): - ADVANCED_INDEXING = _cunumeric.CUNUMERIC_ADVANCED_INDEXING - ARANGE = _cunumeric.CUNUMERIC_ARANGE - ARGWHERE = _cunumeric.CUNUMERIC_ARGWHERE - BATCHED_CHOLESKY = _cunumeric.CUNUMERIC_BATCHED_CHOLESKY - BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP - BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED - BINCOUNT = _cunumeric.CUNUMERIC_BINCOUNT - BITGENERATOR = _cunumeric.CUNUMERIC_BITGENERATOR - CHOOSE = _cunumeric.CUNUMERIC_CHOOSE - CONTRACT = _cunumeric.CUNUMERIC_CONTRACT - CONVERT = _cunumeric.CUNUMERIC_CONVERT - CONVOLVE = _cunumeric.CUNUMERIC_CONVOLVE - DIAG = _cunumeric.CUNUMERIC_DIAG - DOT = _cunumeric.CUNUMERIC_DOT - EYE = _cunumeric.CUNUMERIC_EYE - FFT = _cunumeric.CUNUMERIC_FFT - FILL = _cunumeric.CUNUMERIC_FILL - FLIP = _cunumeric.CUNUMERIC_FLIP - GEMM = _cunumeric.CUNUMERIC_GEMM - HISTOGRAM = _cunumeric.CUNUMERIC_HISTOGRAM - LOAD_CUDALIBS = _cunumeric.CUNUMERIC_LOAD_CUDALIBS - MATMUL 
= _cunumeric.CUNUMERIC_MATMUL - MATVECMUL = _cunumeric.CUNUMERIC_MATVECMUL - MP_POTRF = _cunumeric.CUNUMERIC_MP_POTRF - MP_SOLVE = _cunumeric.CUNUMERIC_MP_SOLVE - NONZERO = _cunumeric.CUNUMERIC_NONZERO - PACKBITS = _cunumeric.CUNUMERIC_PACKBITS - POTRF = _cunumeric.CUNUMERIC_POTRF - PUTMASK = _cunumeric.CUNUMERIC_PUTMASK - QR = _cunumeric.CUNUMERIC_QR - RAND = _cunumeric.CUNUMERIC_RAND - READ = _cunumeric.CUNUMERIC_READ - REPEAT = _cunumeric.CUNUMERIC_REPEAT - SCALAR_UNARY_RED = _cunumeric.CUNUMERIC_SCALAR_UNARY_RED - SCAN_GLOBAL = _cunumeric.CUNUMERIC_SCAN_GLOBAL - SCAN_LOCAL = _cunumeric.CUNUMERIC_SCAN_LOCAL - SEARCHSORTED = _cunumeric.CUNUMERIC_SEARCHSORTED - SELECT = _cunumeric.CUNUMERIC_SELECT - SOLVE = _cunumeric.CUNUMERIC_SOLVE - SORT = _cunumeric.CUNUMERIC_SORT - SVD = _cunumeric.CUNUMERIC_SVD - SYRK = _cunumeric.CUNUMERIC_SYRK - TILE = _cunumeric.CUNUMERIC_TILE - TRANSPOSE_COPY_2D = _cunumeric.CUNUMERIC_TRANSPOSE_COPY_2D - TRILU = _cunumeric.CUNUMERIC_TRILU - TRSM = _cunumeric.CUNUMERIC_TRSM - UNARY_OP = _cunumeric.CUNUMERIC_UNARY_OP - UNARY_RED = _cunumeric.CUNUMERIC_UNARY_RED - UNIQUE = _cunumeric.CUNUMERIC_UNIQUE - UNIQUE_REDUCE = _cunumeric.CUNUMERIC_UNIQUE_REDUCE - UNLOAD_CUDALIBS = _cunumeric.CUNUMERIC_UNLOAD_CUDALIBS - UNPACKBITS = _cunumeric.CUNUMERIC_UNPACKBITS - WHERE = _cunumeric.CUNUMERIC_WHERE - WINDOW = _cunumeric.CUNUMERIC_WINDOW - WRAP = _cunumeric.CUNUMERIC_WRAP - WRITE = _cunumeric.CUNUMERIC_WRITE - ZIP = _cunumeric.CUNUMERIC_ZIP - - -# Match these to CuNumericUnaryOpCode in cunumeric_c.h -@unique -class UnaryOpCode(IntEnum): - ABSOLUTE = _cunumeric.CUNUMERIC_UOP_ABSOLUTE - ANGLE = _cunumeric.CUNUMERIC_UOP_ANGLE - ARCCOS = _cunumeric.CUNUMERIC_UOP_ARCCOS - ARCCOSH = _cunumeric.CUNUMERIC_UOP_ARCCOSH - ARCSIN = _cunumeric.CUNUMERIC_UOP_ARCSIN - ARCSINH = _cunumeric.CUNUMERIC_UOP_ARCSINH - ARCTAN = _cunumeric.CUNUMERIC_UOP_ARCTAN - ARCTANH = _cunumeric.CUNUMERIC_UOP_ARCTANH - CBRT = _cunumeric.CUNUMERIC_UOP_CBRT - CEIL = 
_cunumeric.CUNUMERIC_UOP_CEIL - CLIP = _cunumeric.CUNUMERIC_UOP_CLIP - CONJ = _cunumeric.CUNUMERIC_UOP_CONJ - COPY = _cunumeric.CUNUMERIC_UOP_COPY - COS = _cunumeric.CUNUMERIC_UOP_COS - COSH = _cunumeric.CUNUMERIC_UOP_COSH - DEG2RAD = _cunumeric.CUNUMERIC_UOP_DEG2RAD - EXP = _cunumeric.CUNUMERIC_UOP_EXP - EXP2 = _cunumeric.CUNUMERIC_UOP_EXP2 - EXPM1 = _cunumeric.CUNUMERIC_UOP_EXPM1 - FLOOR = _cunumeric.CUNUMERIC_UOP_FLOOR - FREXP = _cunumeric.CUNUMERIC_UOP_FREXP - GETARG = _cunumeric.CUNUMERIC_UOP_GETARG - IMAG = _cunumeric.CUNUMERIC_UOP_IMAG - INVERT = _cunumeric.CUNUMERIC_UOP_INVERT - ISFINITE = _cunumeric.CUNUMERIC_UOP_ISFINITE - ISINF = _cunumeric.CUNUMERIC_UOP_ISINF - ISNAN = _cunumeric.CUNUMERIC_UOP_ISNAN - LOG = _cunumeric.CUNUMERIC_UOP_LOG - LOG10 = _cunumeric.CUNUMERIC_UOP_LOG10 - LOG1P = _cunumeric.CUNUMERIC_UOP_LOG1P - LOG2 = _cunumeric.CUNUMERIC_UOP_LOG2 - LOGICAL_NOT = _cunumeric.CUNUMERIC_UOP_LOGICAL_NOT - MODF = _cunumeric.CUNUMERIC_UOP_MODF - NEGATIVE = _cunumeric.CUNUMERIC_UOP_NEGATIVE - POSITIVE = _cunumeric.CUNUMERIC_UOP_POSITIVE - RAD2DEG = _cunumeric.CUNUMERIC_UOP_RAD2DEG - REAL = _cunumeric.CUNUMERIC_UOP_REAL - RECIPROCAL = _cunumeric.CUNUMERIC_UOP_RECIPROCAL - RINT = _cunumeric.CUNUMERIC_UOP_RINT - ROUND = _cunumeric.CUNUMERIC_UOP_ROUND - SIGN = _cunumeric.CUNUMERIC_UOP_SIGN - SIGNBIT = _cunumeric.CUNUMERIC_UOP_SIGNBIT - SIN = _cunumeric.CUNUMERIC_UOP_SIN - SINH = _cunumeric.CUNUMERIC_UOP_SINH - SQRT = _cunumeric.CUNUMERIC_UOP_SQRT - SQUARE = _cunumeric.CUNUMERIC_UOP_SQUARE - TAN = _cunumeric.CUNUMERIC_UOP_TAN - TANH = _cunumeric.CUNUMERIC_UOP_TANH - TRUNC = _cunumeric.CUNUMERIC_UOP_TRUNC - - -# Match these to CuNumericUnaryRedCode in cunumeric_c.h -@unique -class UnaryRedCode(IntEnum): - ALL = _cunumeric.CUNUMERIC_RED_ALL - ANY = _cunumeric.CUNUMERIC_RED_ANY - ARGMAX = _cunumeric.CUNUMERIC_RED_ARGMAX - ARGMIN = _cunumeric.CUNUMERIC_RED_ARGMIN - CONTAINS = _cunumeric.CUNUMERIC_RED_CONTAINS - COUNT_NONZERO = 
_cunumeric.CUNUMERIC_RED_COUNT_NONZERO - MAX = _cunumeric.CUNUMERIC_RED_MAX - MIN = _cunumeric.CUNUMERIC_RED_MIN - NANARGMAX = _cunumeric.CUNUMERIC_RED_NANARGMAX - NANARGMIN = _cunumeric.CUNUMERIC_RED_NANARGMIN - NANMAX = _cunumeric.CUNUMERIC_RED_NANMAX - NANMIN = _cunumeric.CUNUMERIC_RED_NANMIN - NANPROD = _cunumeric.CUNUMERIC_RED_NANPROD - NANSUM = _cunumeric.CUNUMERIC_RED_NANSUM - PROD = _cunumeric.CUNUMERIC_RED_PROD - SUM = _cunumeric.CUNUMERIC_RED_SUM - SUM_SQUARES = _cunumeric.CUNUMERIC_RED_SUM_SQUARES - VARIANCE = _cunumeric.CUNUMERIC_RED_VARIANCE - - -# Match these to CuNumericBinaryOpCode in cunumeric_c.h -@unique -class BinaryOpCode(IntEnum): - ADD = _cunumeric.CUNUMERIC_BINOP_ADD - ARCTAN2 = _cunumeric.CUNUMERIC_BINOP_ARCTAN2 - BITWISE_AND = _cunumeric.CUNUMERIC_BINOP_BITWISE_AND - BITWISE_OR = _cunumeric.CUNUMERIC_BINOP_BITWISE_OR - BITWISE_XOR = _cunumeric.CUNUMERIC_BINOP_BITWISE_XOR - COPYSIGN = _cunumeric.CUNUMERIC_BINOP_COPYSIGN - DIVIDE = _cunumeric.CUNUMERIC_BINOP_DIVIDE - EQUAL = _cunumeric.CUNUMERIC_BINOP_EQUAL - FLOAT_POWER = _cunumeric.CUNUMERIC_BINOP_FLOAT_POWER - FLOOR_DIVIDE = _cunumeric.CUNUMERIC_BINOP_FLOOR_DIVIDE - FMOD = _cunumeric.CUNUMERIC_BINOP_FMOD - GCD = _cunumeric.CUNUMERIC_BINOP_GCD - GREATER = _cunumeric.CUNUMERIC_BINOP_GREATER - GREATER_EQUAL = _cunumeric.CUNUMERIC_BINOP_GREATER_EQUAL - HYPOT = _cunumeric.CUNUMERIC_BINOP_HYPOT - ISCLOSE = _cunumeric.CUNUMERIC_BINOP_ISCLOSE - LCM = _cunumeric.CUNUMERIC_BINOP_LCM - LDEXP = _cunumeric.CUNUMERIC_BINOP_LDEXP - LEFT_SHIFT = _cunumeric.CUNUMERIC_BINOP_LEFT_SHIFT - LESS = _cunumeric.CUNUMERIC_BINOP_LESS - LESS_EQUAL = _cunumeric.CUNUMERIC_BINOP_LESS_EQUAL - LOGADDEXP = _cunumeric.CUNUMERIC_BINOP_LOGADDEXP - LOGADDEXP2 = _cunumeric.CUNUMERIC_BINOP_LOGADDEXP2 - LOGICAL_AND = _cunumeric.CUNUMERIC_BINOP_LOGICAL_AND - LOGICAL_OR = _cunumeric.CUNUMERIC_BINOP_LOGICAL_OR - LOGICAL_XOR = _cunumeric.CUNUMERIC_BINOP_LOGICAL_XOR - MAXIMUM = _cunumeric.CUNUMERIC_BINOP_MAXIMUM - MINIMUM = 
_cunumeric.CUNUMERIC_BINOP_MINIMUM - MOD = _cunumeric.CUNUMERIC_BINOP_MOD - MULTIPLY = _cunumeric.CUNUMERIC_BINOP_MULTIPLY - NEXTAFTER = _cunumeric.CUNUMERIC_BINOP_NEXTAFTER - NOT_EQUAL = _cunumeric.CUNUMERIC_BINOP_NOT_EQUAL - POWER = _cunumeric.CUNUMERIC_BINOP_POWER - RIGHT_SHIFT = _cunumeric.CUNUMERIC_BINOP_RIGHT_SHIFT - SUBTRACT = _cunumeric.CUNUMERIC_BINOP_SUBTRACT - - -@unique -class WindowOpCode(IntEnum): - BARLETT = _cunumeric.CUNUMERIC_WINDOW_BARLETT - BLACKMAN = _cunumeric.CUNUMERIC_WINDOW_BLACKMAN - HAMMING = _cunumeric.CUNUMERIC_WINDOW_HAMMING - HANNING = _cunumeric.CUNUMERIC_WINDOW_HANNING - KAISER = _cunumeric.CUNUMERIC_WINDOW_KAISER - - -# Match these to RandGenCode in rand_util.h -@unique -class RandGenCode(IntEnum): - UNIFORM = 1 - NORMAL = 2 - INTEGER = 3 - - -# Match these to CuNumericScanCode in cunumeric_c.h -@unique -class ScanCode(IntEnum): - PROD = _cunumeric.CUNUMERIC_SCAN_PROD - SUM = _cunumeric.CUNUMERIC_SCAN_SUM - - -# Match these to CuNumericConvertCode in cunumeric_c.h -@unique -class ConvertCode(IntEnum): - NOOP = _cunumeric.CUNUMERIC_CONVERT_NAN_NOOP - PROD = _cunumeric.CUNUMERIC_CONVERT_NAN_PROD - SUM = _cunumeric.CUNUMERIC_CONVERT_NAN_SUM - - -# Match these to BitGeneratorOperation in cunumeric_c.h -@unique -class BitGeneratorOperation(IntEnum): - CREATE = _cunumeric.CUNUMERIC_BITGENOP_CREATE - DESTROY = _cunumeric.CUNUMERIC_BITGENOP_DESTROY - RAND_RAW = _cunumeric.CUNUMERIC_BITGENOP_RAND_RAW - DISTRIBUTION = _cunumeric.CUNUMERIC_BITGENOP_DISTRIBUTION - - -# Match these to BitGeneratorType in cunumeric_c.h -@unique -class BitGeneratorType(IntEnum): - DEFAULT = _cunumeric.CUNUMERIC_BITGENTYPE_DEFAULT - XORWOW = _cunumeric.CUNUMERIC_BITGENTYPE_XORWOW - MRG32K3A = _cunumeric.CUNUMERIC_BITGENTYPE_MRG32K3A - MTGP32 = _cunumeric.CUNUMERIC_BITGENTYPE_MTGP32 - MT19937 = _cunumeric.CUNUMERIC_BITGENTYPE_MT19937 - PHILOX4_32_10 = _cunumeric.CUNUMERIC_BITGENTYPE_PHILOX4_32_10 - - -# Match these to BitGeneratorDistribution in cunumeric_c.h 
-@unique -class BitGeneratorDistribution(IntEnum): - INTEGERS_16 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_16 - INTEGERS_32 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_32 - INTEGERS_64 = _cunumeric.CUNUMERIC_BITGENDIST_INTEGERS_64 - UNIFORM_32 = _cunumeric.CUNUMERIC_BITGENDIST_UNIFORM_32 - UNIFORM_64 = _cunumeric.CUNUMERIC_BITGENDIST_UNIFORM_64 - LOGNORMAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_LOGNORMAL_32 - LOGNORMAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_LOGNORMAL_64 - NORMAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_NORMAL_32 - NORMAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_NORMAL_64 - POISSON = _cunumeric.CUNUMERIC_BITGENDIST_POISSON - EXPONENTIAL_32 = _cunumeric.CUNUMERIC_BITGENDIST_EXPONENTIAL_32 - EXPONENTIAL_64 = _cunumeric.CUNUMERIC_BITGENDIST_EXPONENTIAL_64 - GUMBEL_32 = _cunumeric.CUNUMERIC_BITGENDIST_GUMBEL_32 - GUMBEL_64 = _cunumeric.CUNUMERIC_BITGENDIST_GUMBEL_64 - LAPLACE_32 = _cunumeric.CUNUMERIC_BITGENDIST_LAPLACE_32 - LAPLACE_64 = _cunumeric.CUNUMERIC_BITGENDIST_LAPLACE_64 - LOGISTIC_32 = _cunumeric.CUNUMERIC_BITGENDIST_LOGISTIC_32 - LOGISTIC_64 = _cunumeric.CUNUMERIC_BITGENDIST_LOGISTIC_64 - PARETO_32 = _cunumeric.CUNUMERIC_BITGENDIST_PARETO_32 - PARETO_64 = _cunumeric.CUNUMERIC_BITGENDIST_PARETO_64 - POWER_32 = _cunumeric.CUNUMERIC_BITGENDIST_POWER_32 - POWER_64 = _cunumeric.CUNUMERIC_BITGENDIST_POWER_64 - RAYLEIGH_32 = _cunumeric.CUNUMERIC_BITGENDIST_RAYLEIGH_32 - RAYLEIGH_64 = _cunumeric.CUNUMERIC_BITGENDIST_RAYLEIGH_64 - CAUCHY_32 = _cunumeric.CUNUMERIC_BITGENDIST_CAUCHY_32 - CAUCHY_64 = _cunumeric.CUNUMERIC_BITGENDIST_CAUCHY_64 - TRIANGULAR_32 = _cunumeric.CUNUMERIC_BITGENDIST_TRIANGULAR_32 - TRIANGULAR_64 = _cunumeric.CUNUMERIC_BITGENDIST_TRIANGULAR_64 - WEIBULL_32 = _cunumeric.CUNUMERIC_BITGENDIST_WEIBULL_32 - WEIBULL_64 = _cunumeric.CUNUMERIC_BITGENDIST_WEIBULL_64 - BYTES = _cunumeric.CUNUMERIC_BITGENDIST_BYTES - BETA_32 = _cunumeric.CUNUMERIC_BITGENDIST_BETA_32 - BETA_64 = _cunumeric.CUNUMERIC_BITGENDIST_BETA_64 - F_32 = 
_cunumeric.CUNUMERIC_BITGENDIST_F_32 - F_64 = _cunumeric.CUNUMERIC_BITGENDIST_F_64 - LOGSERIES = _cunumeric.CUNUMERIC_BITGENDIST_LOGSERIES - NONCENTRAL_F_32 = _cunumeric.CUNUMERIC_BITGENDIST_NONCENTRAL_F_32 - NONCENTRAL_F_64 = _cunumeric.CUNUMERIC_BITGENDIST_NONCENTRAL_F_64 - CHISQUARE_32 = _cunumeric.CUNUMERIC_BITGENDIST_CHISQUARE_32 - CHISQUARE_64 = _cunumeric.CUNUMERIC_BITGENDIST_CHISQUARE_64 - GAMMA_32 = _cunumeric.CUNUMERIC_BITGENDIST_GAMMA_32 - GAMMA_64 = _cunumeric.CUNUMERIC_BITGENDIST_GAMMA_64 - STANDARD_T_32 = _cunumeric.CUNUMERIC_BITGENDIST_STANDARD_T_32 - STANDARD_T_64 = _cunumeric.CUNUMERIC_BITGENDIST_STANDARD_T_64 - HYPERGEOMETRIC = _cunumeric.CUNUMERIC_BITGENDIST_HYPERGEOMETRIC - VONMISES_32 = _cunumeric.CUNUMERIC_BITGENDIST_VONMISES_32 - VONMISES_64 = _cunumeric.CUNUMERIC_BITGENDIST_VONMISES_64 - ZIPF = _cunumeric.CUNUMERIC_BITGENDIST_ZIPF - GEOMETRIC = _cunumeric.CUNUMERIC_BITGENDIST_GEOMETRIC - WALD_32 = _cunumeric.CUNUMERIC_BITGENDIST_WALD_32 - WALD_64 = _cunumeric.CUNUMERIC_BITGENDIST_WALD_64 - BINOMIAL = _cunumeric.CUNUMERIC_BITGENDIST_BINOMIAL - NEGATIVE_BINOMIAL = _cunumeric.CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL - - -@unique -class TransferType(IntEnum): - DONATE = 0 - MAKE_COPY = 1 - SHARE = 2 - - -# Match these to fftType in fft_util.h -class FFTType: - def __init__( - self, - name: str, - type_id: int, - input_dtype: npt.DTypeLike, - output_dtype: npt.DTypeLike, - single_precision: bool, - complex_type: FFTType | None = None, - ) -> None: - self._name = name - self._type_id = type_id - self._complex_type = self if complex_type is None else complex_type - self._input_dtype = input_dtype - self._output_dtype = output_dtype - self._single_precision = single_precision - - def __str__(self) -> str: - return self._name - - def __repr__(self) -> str: - return str(self) - - @property - def type_id(self) -> int: - return self._type_id - - @property - def complex(self) -> FFTType: - return self._complex_type - - @property - def input_dtype(self) -> 
npt.DTypeLike: - return self._input_dtype - - @property - def output_dtype(self) -> npt.DTypeLike: - return self._output_dtype - - @property - def is_single_precision(self) -> bool: - return self._single_precision - - -FFT_C2C = FFTType( - "C2C", - _cunumeric.CUNUMERIC_FFT_C2C, - np.complex64, - np.complex64, - True, -) - -FFT_Z2Z = FFTType( - "Z2Z", - _cunumeric.CUNUMERIC_FFT_Z2Z, - np.complex128, - np.complex128, - False, -) - -FFT_R2C = FFTType( - "R2C", - _cunumeric.CUNUMERIC_FFT_R2C, - np.float32, - np.complex64, - True, - FFT_C2C, -) - -FFT_C2R = FFTType( - "C2R", - _cunumeric.CUNUMERIC_FFT_C2R, - np.complex64, - np.float32, - True, - FFT_C2C, -) - -FFT_D2Z = FFTType( - "D2Z", - _cunumeric.CUNUMERIC_FFT_D2Z, - np.float64, - np.complex128, - False, - FFT_Z2Z, -) - -FFT_Z2D = FFTType( - "Z2D", - _cunumeric.CUNUMERIC_FFT_Z2D, - np.complex128, - np.float64, - False, - FFT_Z2Z, -) - - -class FFTCode: - @staticmethod - def real_to_complex_code(dtype: npt.DTypeLike) -> FFTType: - if dtype == np.float64: - return FFT_D2Z - elif dtype == np.float32: - return FFT_R2C - else: - raise TypeError( - ( - "Data type for FFT not supported " - "(supported types are float32 and float64)" - ) - ) - - @staticmethod - def complex_to_real_code(dtype: npt.DTypeLike) -> FFTType: - if dtype == np.complex128: - return FFT_Z2D - elif dtype == np.complex64: - return FFT_C2R - else: - raise TypeError( - ( - "Data type for FFT not supported " - "(supported types are complex64 and complex128)" - ) - ) - - -@unique -class FFTDirection(IntEnum): - FORWARD = _cunumeric.CUNUMERIC_FFT_FORWARD - INVERSE = _cunumeric.CUNUMERIC_FFT_INVERSE - - -# Match these to CuNumericBitorder in cunumeric_c.h -@unique -class Bitorder(IntEnum): - BIG = _cunumeric.CUNUMERIC_BITORDER_BIG - LITTLE = _cunumeric.CUNUMERIC_BITORDER_LITTLE - - -@unique -class FFTNormalization(IntEnum): - FORWARD = 1 - INVERSE = 2 - ORTHOGONAL = 3 - - @staticmethod - def from_string(in_string: str) -> FFTNormalization | None: - if 
in_string == "forward": - return FFTNormalization.FORWARD - elif in_string == "ortho": - return FFTNormalization.ORTHOGONAL - elif in_string == "backward" or in_string is None: - return FFTNormalization.INVERSE - else: - return None - - @staticmethod - def reverse(in_string: str | None) -> str: - if in_string == "forward": - return "backward" - elif in_string == "backward" or in_string is None: - return "forward" - else: - return in_string diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake deleted file mode 100644 index 5c324f1dc3..0000000000 --- a/cunumeric_cpp.cmake +++ /dev/null @@ -1,565 +0,0 @@ -#============================================================================= -# Copyright 2024 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -############################################################################## -# - User Options ------------------------------------------------------------ - -option(BUILD_SHARED_LIBS "Build cuNumeric shared libraries" ON) -option(cunumeric_EXCLUDE_TBLIS_FROM_ALL "Exclude tblis targets from cuNumeric's 'all' target" OFF) -option(cunumeric_EXCLUDE_OPENBLAS_FROM_ALL "Exclude OpenBLAS targets from cuNumeric's 'all' target" OFF) -option(cunumeric_EXCLUDE_LEGATE_FROM_ALL "Exclude legate targets from cuNumeric's 'all' target" OFF) - -############################################################################## -# - Project definition ------------------------------------------------------- - -# Write the version header -rapids_cmake_write_version_file(include/cunumeric/version_config.hpp) - -# Needed to integrate with LLVM/clang tooling -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -############################################################################## -# - Build Type --------------------------------------------------------------- - -# Set a default build type if none was specified -rapids_cmake_build_type(Release) - -############################################################################## -# - conda environment -------------------------------------------------------- - -rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) - -# We're building python extension libraries, which must always be installed -# under lib/, even if the system normally uses lib64/. 
Rapids-cmake currently -# doesn't realize this when we're going through scikit-build, see -# https://github.com/rapidsai/rapids-cmake/issues/426 -if(TARGET conda_env) - set(CMAKE_INSTALL_LIBDIR "lib") -endif() - -############################################################################## -# - Dependencies ------------------------------------------------------------- - -# add third party dependencies using CPM -rapids_cpm_init(OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/versions.json) - -rapids_find_package(OpenMP GLOBAL_TARGETS OpenMP::OpenMP_CXX) - -option(Legion_USE_CUDA "Use CUDA" ON) -option(Legion_USE_OpenMP "Use OpenMP" ${OpenMP_FOUND}) -option(Legion_BOUNDS_CHECKS "Build cuNumeric with bounds checks (expensive)" OFF) - -# If legate has CUDA support, then including it in a project will automatically call -# enable_language(CUDA). However, this does not play nice with the rapids-cmake CUDA utils -# which support a wider range of values for CMAKE_CUDA_ARCHITECTURES than cmake does. You -# end up with the following error: -# -# CMAKE_CUDA_ARCHITECTURES: -# -# RAPIDS -# -# is not one of the following: -# -# * a semicolon-separated list of integers, each optionally -# followed by '-real' or '-virtual' -# * a special value: all, all-major, native -# -set(cmake_cuda_arch_backup "${CMAKE_CUDA_ARCHITECTURES}") -set(cmake_cuda_arch_cache_backup "$CACHE{CMAKE_CUDA_ARCHITECTURES}") -if(("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "RAPIDS") OR ("${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "NATIVE")) - unset(CMAKE_CUDA_ARCHITECTURES) - unset(CMAKE_CUDA_ARCHITECTURES CACHE) -endif() - -### -# If we find legate already configured on the system, it will report -# whether it was compiled with bounds checking (Legion_BOUNDS_CHECKS), -# CUDA (Legion_USE_CUDA), and OpenMP (Legion_USE_OpenMP). 
-# -# We use the same variables as legate because we want to enable/disable -# each of these features based on how legate was configured (it doesn't -# make sense to build cuNumeric's CUDA bindings if legate wasn't built -# with CUDA support). -### -include(cmake/thirdparty/get_legate.cmake) - -set(CMAKE_CUDA_ARCHITECTURES "${cmake_cuda_arch_cache_backup}" CACHE STRING "" FORCE) -set(CMAKE_CUDA_ARCHITECTURES "${cmake_cuda_arch_backup}") -unset(cmake_cuda_arch_backup) -unset(cmake_cuda_arch_cache_backup) - -if(Legion_USE_CUDA) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cuda_arch_helpers.cmake) - # Needs to run before `rapids_cuda_init_architectures` - set_cuda_arch_from_names() - # Needs to run before `enable_language(CUDA)` - rapids_cuda_init_architectures(cunumeric) - enable_language(CUDA) - # Since cunumeric only enables CUDA optionally we need to manually include - # the file that rapids_cuda_init_architectures relies on `project` calling - if(CMAKE_PROJECT_cunumeric_INCLUDE) - include("${CMAKE_PROJECT_cunumeric_INCLUDE}") - endif() - - # Must come after enable_language(CUDA) - # Use `-isystem ` instead of `-isystem=` - # because the former works with clangd intellisense - set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-isystem ") - - rapids_find_package( - CUDAToolkit REQUIRED - BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports - ) - - include(cmake/thirdparty/get_nccl.cmake) - include(cmake/thirdparty/get_cutensor.cmake) -endif() - -include(cmake/thirdparty/get_openblas.cmake) - -include(cmake/thirdparty/get_tblis.cmake) - -############################################################################## -# - cuNumeric ---------------------------------------------------------------- - -set(cunumeric_SOURCES "") -set(cunumeric_CXX_DEFS "") -set(cunumeric_CUDA_DEFS "") -set(cunumeric_CXX_OPTIONS "") -set(cunumeric_CUDA_OPTIONS "") - -include(cmake/Modules/set_cpu_arch_flags.cmake) -set_cpu_arch_flags(cunumeric_CXX_OPTIONS) - -# Add 
`src/cunumeric.mk` sources -list(APPEND cunumeric_SOURCES - src/cunumeric/ternary/where.cc - src/cunumeric/scan/scan_global.cc - src/cunumeric/scan/scan_local.cc - src/cunumeric/binary/binary_op.cc - src/cunumeric/binary/binary_op_util.cc - src/cunumeric/binary/binary_red.cc - src/cunumeric/bits/packbits.cc - src/cunumeric/bits/unpackbits.cc - src/cunumeric/unary/scalar_unary_red.cc - src/cunumeric/unary/unary_op.cc - src/cunumeric/unary/unary_red.cc - src/cunumeric/unary/convert.cc - src/cunumeric/nullary/arange.cc - src/cunumeric/nullary/eye.cc - src/cunumeric/nullary/fill.cc - src/cunumeric/nullary/window.cc - src/cunumeric/index/advanced_indexing.cc - src/cunumeric/index/choose.cc - src/cunumeric/index/putmask.cc - src/cunumeric/index/repeat.cc - src/cunumeric/index/select.cc - src/cunumeric/index/wrap.cc - src/cunumeric/index/zip.cc - src/cunumeric/item/read.cc - src/cunumeric/item/write.cc - src/cunumeric/matrix/batched_cholesky.cc - src/cunumeric/matrix/contract.cc - src/cunumeric/matrix/diag.cc - src/cunumeric/matrix/gemm.cc - src/cunumeric/matrix/matmul.cc - src/cunumeric/matrix/matvecmul.cc - src/cunumeric/matrix/dot.cc - src/cunumeric/matrix/potrf.cc - src/cunumeric/matrix/qr.cc - src/cunumeric/matrix/solve.cc - src/cunumeric/matrix/svd.cc - src/cunumeric/matrix/syrk.cc - src/cunumeric/matrix/tile.cc - src/cunumeric/matrix/transpose.cc - src/cunumeric/matrix/trilu.cc - src/cunumeric/matrix/trsm.cc - src/cunumeric/matrix/util.cc - src/cunumeric/random/bitgenerator.cc - src/cunumeric/random/randutil/generator_host.cc - src/cunumeric/random/randutil/generator_host_straightforward.cc - src/cunumeric/random/randutil/generator_host_advanced.cc - src/cunumeric/random/rand.cc - src/cunumeric/search/argwhere.cc - src/cunumeric/search/nonzero.cc - src/cunumeric/set/unique.cc - src/cunumeric/set/unique_reduce.cc - src/cunumeric/stat/bincount.cc - src/cunumeric/convolution/convolve.cc - src/cunumeric/transform/flip.cc - src/cunumeric/utilities/repartition.cc - 
src/cunumeric/arg_redop_register.cc - src/cunumeric/mapper.cc - src/cunumeric/ndarray.cc - src/cunumeric/operators.cc - src/cunumeric/runtime.cc - src/cunumeric/cephes/chbevl.cc - src/cunumeric/cephes/i0.cc - src/cunumeric/stat/histogram.cc -) - -if(Legion_USE_OpenMP) - list(APPEND cunumeric_SOURCES - src/cunumeric/ternary/where_omp.cc - src/cunumeric/scan/scan_global_omp.cc - src/cunumeric/scan/scan_local_omp.cc - src/cunumeric/binary/binary_op_omp.cc - src/cunumeric/binary/binary_red_omp.cc - src/cunumeric/bits/packbits_omp.cc - src/cunumeric/bits/unpackbits_omp.cc - src/cunumeric/unary/unary_op_omp.cc - src/cunumeric/unary/scalar_unary_red_omp.cc - src/cunumeric/unary/unary_red_omp.cc - src/cunumeric/unary/convert_omp.cc - src/cunumeric/nullary/arange_omp.cc - src/cunumeric/nullary/eye_omp.cc - src/cunumeric/nullary/fill_omp.cc - src/cunumeric/nullary/window_omp.cc - src/cunumeric/index/advanced_indexing_omp.cc - src/cunumeric/index/choose_omp.cc - src/cunumeric/index/putmask_omp.cc - src/cunumeric/index/repeat_omp.cc - src/cunumeric/index/select_omp.cc - src/cunumeric/index/wrap_omp.cc - src/cunumeric/index/zip_omp.cc - src/cunumeric/matrix/batched_cholesky_omp.cc - src/cunumeric/matrix/contract_omp.cc - src/cunumeric/matrix/diag_omp.cc - src/cunumeric/matrix/gemm_omp.cc - src/cunumeric/matrix/matmul_omp.cc - src/cunumeric/matrix/matvecmul_omp.cc - src/cunumeric/matrix/dot_omp.cc - src/cunumeric/matrix/potrf_omp.cc - src/cunumeric/matrix/qr_omp.cc - src/cunumeric/matrix/solve_omp.cc - src/cunumeric/matrix/svd_omp.cc - src/cunumeric/matrix/syrk_omp.cc - src/cunumeric/matrix/tile_omp.cc - src/cunumeric/matrix/transpose_omp.cc - src/cunumeric/matrix/trilu_omp.cc - src/cunumeric/matrix/trsm_omp.cc - src/cunumeric/random/rand_omp.cc - src/cunumeric/search/argwhere_omp.cc - src/cunumeric/search/nonzero_omp.cc - src/cunumeric/set/unique_omp.cc - src/cunumeric/set/unique_reduce_omp.cc - src/cunumeric/stat/bincount_omp.cc - src/cunumeric/convolution/convolve_omp.cc - 
src/cunumeric/transform/flip_omp.cc - src/cunumeric/stat/histogram_omp.cc - ) -endif() - -if(Legion_USE_CUDA) - list(APPEND cunumeric_SOURCES - src/cunumeric/ternary/where.cu - src/cunumeric/scan/scan_global.cu - src/cunumeric/scan/scan_local.cu - src/cunumeric/binary/binary_op.cu - src/cunumeric/binary/binary_red.cu - src/cunumeric/bits/packbits.cu - src/cunumeric/bits/unpackbits.cu - src/cunumeric/unary/scalar_unary_red.cu - src/cunumeric/unary/unary_red.cu - src/cunumeric/unary/unary_op.cu - src/cunumeric/unary/convert.cu - src/cunumeric/nullary/arange.cu - src/cunumeric/nullary/eye.cu - src/cunumeric/nullary/fill.cu - src/cunumeric/nullary/window.cu - src/cunumeric/index/advanced_indexing.cu - src/cunumeric/index/choose.cu - src/cunumeric/index/putmask.cu - src/cunumeric/index/repeat.cu - src/cunumeric/index/select.cu - src/cunumeric/index/wrap.cu - src/cunumeric/index/zip.cu - src/cunumeric/item/read.cu - src/cunumeric/item/write.cu - src/cunumeric/matrix/batched_cholesky.cu - src/cunumeric/matrix/contract.cu - src/cunumeric/matrix/diag.cu - src/cunumeric/matrix/gemm.cu - src/cunumeric/matrix/matmul.cu - src/cunumeric/matrix/matvecmul.cu - src/cunumeric/matrix/dot.cu - src/cunumeric/matrix/potrf.cu - src/cunumeric/matrix/qr.cu - src/cunumeric/matrix/solve.cu - src/cunumeric/matrix/svd.cu - src/cunumeric/matrix/syrk.cu - src/cunumeric/matrix/tile.cu - src/cunumeric/matrix/transpose.cu - src/cunumeric/matrix/trilu.cu - src/cunumeric/matrix/trsm.cu - src/cunumeric/random/rand.cu - src/cunumeric/search/argwhere.cu - src/cunumeric/search/nonzero.cu - src/cunumeric/set/unique.cu - src/cunumeric/stat/bincount.cu - src/cunumeric/convolution/convolve.cu - src/cunumeric/fft/fft.cu - src/cunumeric/transform/flip.cu - src/cunumeric/utilities/repartition.cu - src/cunumeric/arg_redop_register.cu - src/cunumeric/cudalibs.cu - src/cunumeric/stat/histogram.cu - ) -endif() - -# Add `src/cunumeric/sort/sort.mk` sources -list(APPEND cunumeric_SOURCES - src/cunumeric/sort/sort.cc 
- src/cunumeric/sort/searchsorted.cc -) - -if(Legion_USE_OpenMP) - list(APPEND cunumeric_SOURCES - src/cunumeric/sort/sort_omp.cc - src/cunumeric/sort/searchsorted_omp.cc - ) -endif() - -if(Legion_USE_CUDA) - list(APPEND cunumeric_SOURCES - src/cunumeric/sort/sort.cu - src/cunumeric/sort/searchsorted.cu - src/cunumeric/sort/cub_sort_bool.cu - src/cunumeric/sort/cub_sort_int8.cu - src/cunumeric/sort/cub_sort_int16.cu - src/cunumeric/sort/cub_sort_int32.cu - src/cunumeric/sort/cub_sort_int64.cu - src/cunumeric/sort/cub_sort_uint8.cu - src/cunumeric/sort/cub_sort_uint16.cu - src/cunumeric/sort/cub_sort_uint32.cu - src/cunumeric/sort/cub_sort_uint64.cu - src/cunumeric/sort/cub_sort_half.cu - src/cunumeric/sort/cub_sort_float.cu - src/cunumeric/sort/cub_sort_double.cu - src/cunumeric/sort/thrust_sort_bool.cu - src/cunumeric/sort/thrust_sort_int8.cu - src/cunumeric/sort/thrust_sort_int16.cu - src/cunumeric/sort/thrust_sort_int32.cu - src/cunumeric/sort/thrust_sort_int64.cu - src/cunumeric/sort/thrust_sort_uint8.cu - src/cunumeric/sort/thrust_sort_uint16.cu - src/cunumeric/sort/thrust_sort_uint32.cu - src/cunumeric/sort/thrust_sort_uint64.cu - src/cunumeric/sort/thrust_sort_half.cu - src/cunumeric/sort/thrust_sort_float.cu - src/cunumeric/sort/thrust_sort_double.cu - src/cunumeric/sort/thrust_sort_complex64.cu - src/cunumeric/sort/thrust_sort_complex128.cu - ) -endif() - -# Add `src/cunumeric/random/random.mk` sources -if(Legion_USE_CUDA) - list(APPEND cunumeric_SOURCES - src/cunumeric/random/bitgenerator.cu - src/cunumeric/random/randutil/generator_device.cu - src/cunumeric/random/randutil/generator_device_straightforward.cu - src/cunumeric/random/randutil/generator_device_advanced.cu -) -endif() - -# add sources for cusolverMp -if(Legion_USE_CUDA AND CUSOLVERMP_DIR) - list(APPEND cunumeric_SOURCES - src/cunumeric/matrix/mp_potrf.cu - src/cunumeric/matrix/mp_solve.cu - ) -endif() - -list(APPEND cunumeric_SOURCES - # This must always be the last file! 
- # It guarantees we do our registration callback - # only after all task variants are recorded - src/cunumeric/cunumeric.cc -) - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND cunumeric_CXX_DEFS DEBUG_CUNUMERIC) - list(APPEND cunumeric_CUDA_DEFS DEBUG_CUNUMERIC) -endif() - -if(Legion_BOUNDS_CHECKS) - list(APPEND cunumeric_CXX_DEFS BOUNDS_CHECKS) - list(APPEND cunumeric_CUDA_DEFS BOUNDS_CHECKS) -endif() - -list(APPEND cunumeric_CUDA_OPTIONS -Xfatbin=-compress-all) -list(APPEND cunumeric_CUDA_OPTIONS --expt-extended-lambda) -list(APPEND cunumeric_CUDA_OPTIONS --expt-relaxed-constexpr) -list(APPEND cunumeric_CXX_OPTIONS -Wno-deprecated-declarations) -list(APPEND cunumeric_CUDA_OPTIONS -Wno-deprecated-declarations) - -add_library(cunumeric ${cunumeric_SOURCES}) -add_library(cunumeric::cunumeric ALIAS cunumeric) - -if (CMAKE_SYSTEM_NAME STREQUAL "Linux") - set(platform_rpath_origin "\$ORIGIN") -elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") - set(platform_rpath_origin "@loader_path") -endif () - -set_target_properties(cunumeric - PROPERTIES BUILD_RPATH "${platform_rpath_origin}" - INSTALL_RPATH "${platform_rpath_origin}" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - LIBRARY_OUTPUT_DIRECTORY lib) - -target_link_libraries(cunumeric - PUBLIC legate::legate - $ - PRIVATE BLAS::BLAS - tblis::tblis - # Add Conda library and include paths - $ - $ - $ - $ - $ - $) - -if(NOT Legion_USE_CUDA AND cunumeric_cuRAND_INCLUDE_DIR) - list(APPEND cunumeric_CXX_DEFS CUNUMERIC_CURAND_FOR_CPU_BUILD) - target_include_directories(cunumeric PRIVATE ${cunumeric_cuRAND_INCLUDE_DIR}) -endif() - -if(Legion_USE_CUDA AND CUSOLVERMP_DIR) - message(VERBOSE "cunumeric: CUSOLVERMP_DIR ${CUSOLVERMP_DIR}") - list(APPEND cunumeric_CXX_DEFS CUNUMERIC_USE_CUSOLVERMP) - list(APPEND cunumeric_CUDA_DEFS CUNUMERIC_USE_CUSOLVERMP) - target_include_directories(cunumeric PRIVATE 
${CUSOLVERMP_DIR}/include) - target_link_libraries(cunumeric PRIVATE ${CUSOLVERMP_DIR}/lib/libcusolverMp.so) -endif() - -target_compile_options(cunumeric - PRIVATE "$<$:${cunumeric_CXX_OPTIONS}>" - "$<$:${cunumeric_CUDA_OPTIONS}>") - -target_compile_definitions(cunumeric - PUBLIC "$<$:${cunumeric_CXX_DEFS}>" - "$<$:${cunumeric_CUDA_DEFS}>") - -target_include_directories(cunumeric - PUBLIC - $ - INTERFACE - $ -) - -if(Legion_USE_CUDA) - file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" -[=[ -SECTIONS -{ -.nvFatBinSegment : { *(.nvFatBinSegment) } -.nv_fatbin : { *(.nv_fatbin) } -} -]=]) - - # ensure CUDA symbols aren't relocated to the middle of the debug build binaries - target_link_options(cunumeric PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") -endif() - -############################################################################## -# - install targets----------------------------------------------------------- - -include(CPack) -include(GNUInstallDirs) -rapids_cmake_install_lib_dir(lib_dir) - -install(TARGETS cunumeric - DESTINATION ${lib_dir} - EXPORT cunumeric-exports) - -install( - FILES src/cunumeric.h - ${CMAKE_CURRENT_BINARY_DIR}/include/cunumeric/version_config.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cunumeric) - -install( - FILES src/cunumeric/cunumeric_c.h - src/cunumeric/ndarray.h - src/cunumeric/ndarray.inl - src/cunumeric/operators.h - src/cunumeric/operators.inl - src/cunumeric/runtime.h - src/cunumeric/slice.h - src/cunumeric/typedefs.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cunumeric/cunumeric) - -if(cunumeric_INSTALL_TBLIS) - install(DIRECTORY ${tblis_BINARY_DIR}/lib/ DESTINATION ${lib_dir}) - install(DIRECTORY ${tblis_BINARY_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -endif() - -############################################################################## -# - install export ----------------------------------------------------------- - -set(doc_string - [=[ -Provide targets for cuNumeric, an aspiring drop-in replacement 
for NumPy at scale. - -Imported Targets: - - cunumeric::cunumeric - -]=]) - -string(JOIN "\n" code_string - "set(Legion_USE_CUDA ${Legion_USE_CUDA})" - "set(Legion_USE_OpenMP ${Legion_USE_OpenMP})" - "set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS})" -) - -if(DEFINED Legion_USE_Python) - string(APPEND code_string "\nset(Legion_USE_Python ${Legion_USE_Python})") -endif() - -if(DEFINED Legion_NETWORKS) - string(APPEND code_string "\nset(Legion_NETWORKS ${Legion_NETWORKS})") -endif() - -rapids_export( - INSTALL cunumeric - EXPORT_SET cunumeric-exports - GLOBAL_TARGETS cunumeric - NAMESPACE cunumeric:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) - -# build export targets -rapids_export( - BUILD cunumeric - EXPORT_SET cunumeric-exports - GLOBAL_TARGETS cunumeric - NAMESPACE cunumeric:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) - -if(cunumeric_BUILD_TESTS) - include(CTest) - - add_subdirectory(tests/cpp) -endif() diff --git a/cunumeric/__init__.py b/cupynumeric/__init__.py similarity index 66% rename from cunumeric/__init__.py rename to cupynumeric/__init__.py index 4f0458cf2a..082217fb12 100644 --- a/cunumeric/__init__.py +++ b/cupynumeric/__init__.py @@ -14,7 +14,7 @@ # """ -cuNumeric +cuPyNumeric ===== Provides a distributed task-parallel implementation of the Numpy interface @@ -31,7 +31,7 @@ from ._array.util import maybe_convert_to_np_ndarray from ._module import * from ._ufunc import * -from ._utils.array import is_supported_dtype +from ._utils.array import is_supported_dtype, local_task_array from ._utils.coverage import clone_module clone_module(_np, globals(), maybe_convert_to_np_ndarray) @@ -40,6 +40,22 @@ del clone_module del _np -from . import _version -__version__ = _version.get_versions()["version"] # type: ignore [no-untyped-call] +def _fixup_version() -> str: + import os + + if (v := os.environ.get("CUPYNUMERIC_USE_VERSION")) is not None: + return v + + from . 
import _version + + if hasattr(_version, "get_versions"): + return _version.get_versions()["version"] # type: ignore [no-untyped-call] + if hasattr(_version, "__version__"): + return _version.__version__ + + raise RuntimeError("Failed to determine version") + + +__version__ = _fixup_version() +del _fixup_version diff --git a/cunumeric/_array/__init__.py b/cupynumeric/_array/__init__.py similarity index 100% rename from cunumeric/_array/__init__.py rename to cupynumeric/_array/__init__.py diff --git a/cunumeric/_array/array.py b/cupynumeric/_array/array.py similarity index 93% rename from cunumeric/_array/array.py rename to cupynumeric/_array/array.py index 5a6406057b..a985f3cbe4 100644 --- a/cunumeric/_array/array.py +++ b/cupynumeric/_array/array.py @@ -17,7 +17,7 @@ import operator import warnings from functools import reduce -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, Sequence, cast import legate.core.types as ty import numpy as np @@ -52,7 +52,7 @@ add_boilerplate, broadcast_where, check_writeable, - convert_to_cunumeric_ndarray, + convert_to_cupynumeric_ndarray, maybe_convert_to_np_ndarray, sanitize_shape, tuple_pop, @@ -106,9 +106,10 @@ def __init__( order: OrderType | None = None, thunk: NumPyThunk | None = None, inputs: Any | None = None, + force_thunk: Literal["deferred"] | Literal["eager"] | None = None, writeable: bool = True, ) -> None: - # `inputs` being a cuNumeric ndarray is definitely a bug + # `inputs` being a cuPyNumeric ndarray is definitely a bug assert not isinstance(inputs, ndarray) if thunk is None: assert shape is not None @@ -138,7 +139,7 @@ def __init__( ] core_dtype = to_core_type(dtype) self._thunk = runtime.create_empty_thunk( - sanitized_shape, core_dtype, inputs + sanitized_shape, core_dtype, inputs, force_thunk ) else: self._thunk = thunk @@ -161,7 +162,7 @@ def __legate_data_interface__(self) -> dict[str, Any]: array = LogicalArray.from_store(deferred_thunk.base) 
self._legate_data = dict() self._legate_data["version"] = 1 - field = Field("cuNumeric Array", dtype) + field = Field("cuPyNumeric Array", dtype) self._legate_data["data"] = {field: array} return self._legate_data @@ -186,7 +187,7 @@ def __legate_data_interface__(self) -> dict[str, Any]: def __array_function__( self, func: Any, types: Any, args: tuple[Any], kwargs: dict[str, Any] ) -> Any: - import cunumeric as cn + import cupynumeric as cn what = func.__name__ @@ -197,19 +198,19 @@ def __array_function__( return NotImplemented # We are wrapping all NumPy modules, so we can expect to find every - # NumPy API call in cuNumeric, even if just an "unimplemented" stub. + # NumPy API call in cuPyNumeric, even if just an "unimplemented" stub. module = reduce(getattr, func.__module__.split(".")[1:], cn) cn_func = getattr(module, func.__name__) - # We can't immediately forward to the corresponding cuNumeric + # We can't immediately forward to the corresponding cuPyNumeric # entrypoint. Say that we reached this point because the user code - # invoked `np.foo(x, bar=True)` where `x` is a `cunumeric.ndarray`. If - # our implementation of `foo` is not complete, and cannot handle + # invoked `np.foo(x, bar=True)` where `x` is a `cupynumeric.ndarray`. + # If our implementation of `foo` is not complete, and cannot handle # `bar=True`, then forwarding this call to `cn.foo` would fail. This # goes against the semantics of `__array_function__`, which shouldn't # fail if the custom implementation cannot handle the provided # arguments. Conversely, if the user calls `cn.foo(x, bar=True)` - # directly, that means they requested the cuNumeric implementation + # directly, that means they requested the cuPyNumeric implementation # specifically, and the `NotImplementedError` should not be hidden. 
if is_implemented(cn_func): try: @@ -265,6 +266,12 @@ def __array_ufunc__( except NotImplementedError: what = f"the requested combination of arguments to {what}" + # special case for @ matmul + if what == "matmul.__call__": + from .._module import matmul + + return matmul(*inputs, **kwargs) + # We cannot handle this ufunc call, so we will fall back to NumPy. warnings.warn( FALLBACK_WARNING.format(what=what), @@ -285,7 +292,7 @@ def T(self) -> ndarray: See Also -------- - cunumeric.transpose + cupynumeric.transpose ndarray.transpose """ @@ -297,8 +304,8 @@ def base(self) -> npt.NDArray[Any] | None: Base object if memory is from some other object. """ raise NotImplementedError( - "cunumeric.ndarray doesn't keep track of the array view hierarchy " - "yet" + "cupynumeric.ndarray doesn't keep track of the array view " + "hierarchy yet" ) @property @@ -313,6 +320,17 @@ def data(self) -> memoryview: """ return self.__array__().data + def __buffer__(self, flags: int, /) -> memoryview: + """ + Python buffer object pointing to the start of the array's data. + + Notes + ----- + This causes the entire (potentially distributed) array to be collected + into one memory. + """ + return self.__array__().__buffer__(flags) # type: ignore + @property def dtype(self) -> np.dtype[Any]: """ @@ -332,9 +350,9 @@ def flags(self) -> Any: """ Information about the memory layout of the array. - These flags don't reflect the properties of the cuNumeric array, but - rather the NumPy array that will be produced if the cuNumeric array is - materialized on a single node. + These flags don't reflect the properties of the cuPyNumeric array, but + rather the NumPy array that will be produced if the cuPyNumeric array + is materialized on a single node. Attributes ---------- @@ -416,7 +434,7 @@ def flat(self) -> np.flatiter[npt.NDArray[Any]]: flatten : Return a copy of the array collapsed into one dimension. 
Availability - -------- + ------------ Single CPU """ @@ -734,7 +752,7 @@ def __divmod__(self, rhs: Any) -> ndarray: """ raise NotImplementedError( - "cunumeric.ndarray doesn't support __divmod__ yet" + "cupynumeric.ndarray doesn't support __divmod__ yet" ) def __eq__(self, rhs: object) -> ndarray: # type: ignore [override] @@ -787,7 +805,7 @@ def __ge__(self, rhs: Any) -> ndarray: # __getattribute__ def _convert_key(self, key: Any, first: bool = True) -> Any: - # Convert any arrays stored in a key to a cuNumeric array + # Convert any arrays stored in a key to a cuPyNumeric array if isinstance(key, slice): key = slice( operator.index(key.start) if key.start is not None else None, @@ -804,9 +822,9 @@ def _convert_key(self, key: Any, first: bool = True) -> Any: elif isinstance(key, tuple) and first: return tuple(self._convert_key(k, first=False) for k in key) else: - # Otherwise convert it to a cuNumeric array, check types + # Otherwise convert it to a cuPyNumeric array, check types # and get the thunk - key = convert_to_cunumeric_ndarray(key) + key = convert_to_cupynumeric_ndarray(key) if key.dtype != bool and not np.issubdtype(key.dtype, np.integer): raise TypeError("index arrays should be int or bool type") if key.dtype != bool: @@ -837,7 +855,7 @@ def __gt__(self, rhs: Any) -> ndarray: return _ufunc.greater(self, rhs) def __hash__(self) -> int: - raise TypeError("unhashable type: cunumeric.ndarray") + raise TypeError("unhashable type: cupynumeric.ndarray") def __iadd__(self, rhs: Any) -> ndarray: """a.__iadd__(value, /) @@ -1154,11 +1172,11 @@ def nonzero(self) -> tuple[ndarray, ...]: Return the indices of the elements that are non-zero. - Refer to :func:`cunumeric.nonzero` for full documentation. + Refer to :func:`cupynumeric.nonzero` for full documentation. 
See Also -------- - cunumeric.nonzero : equivalent function + cupynumeric.nonzero : equivalent function Availability -------- @@ -1254,7 +1272,7 @@ def __rdivmod__(self, lhs: Any) -> ndarray: """ raise NotImplementedError( - "cunumeric.ndarray doesn't support __rdivmod__ yet" + "cupynumeric.ndarray doesn't support __rdivmod__ yet" ) def __reduce__(self, *args: Any, **kwargs: Any) -> str | tuple[str, ...]: @@ -1505,11 +1523,11 @@ def all( Returns True if all elements evaluate to True. - Refer to :func:`cunumeric.all` for full documentation. + Refer to :func:`cupynumeric.all` for full documentation. See Also -------- - cunumeric.all : equivalent function + cupynumeric.all : equivalent function Availability -------- @@ -1540,11 +1558,11 @@ def any( Returns True if any of the elements of `a` evaluate to True. - Refer to :func:`cunumeric.any` for full documentation. + Refer to :func:`cupynumeric.any` for full documentation. See Also -------- - cunumeric.any : equivalent function + cupynumeric.any : equivalent function Availability -------- @@ -1573,11 +1591,11 @@ def argmax( Return indices of the maximum values along the given axis. - Refer to :func:`cunumeric.argmax` for full documentation. + Refer to :func:`cupynumeric.argmax` for full documentation. See Also -------- - cunumeric.argmax : equivalent function + cupynumeric.argmax : equivalent function Availability -------- @@ -1608,11 +1626,11 @@ def argmin( Return indices of the minimum values along the given axis. - Refer to :func:`cunumeric.argmin` for detailed documentation. + Refer to :func:`cupynumeric.argmin` for detailed documentation. See Also -------- - cunumeric.argmin : equivalent function + cupynumeric.argmin : equivalent function Availability -------- @@ -1741,11 +1759,11 @@ def take( Take elements from an array along an axis. - Refer to :func:`cunumeric.take` for full documentation. + Refer to :func:`cupynumeric.take` for full documentation. 
See Also -------- - cunumeric.take : equivalent function + cupynumeric.take : equivalent function Availability -------- @@ -1755,7 +1773,7 @@ def take( if not np.isscalar(indices): # if indices is a tuple or list, bring sub-tuples to the same shape # and concatenate them - indices = convert_to_cunumeric_ndarray(indices) + indices = convert_to_cupynumeric_ndarray(indices) if axis is None: self = self.ravel() @@ -1821,11 +1839,11 @@ def choose( Use an index array to construct a new array from a set of choices. - Refer to :func:`cunumeric.choose` for full documentation. + Refer to :func:`cupynumeric.choose` for full documentation. See Also -------- - cunumeric.choose : equivalent function + cupynumeric.choose : equivalent function Availability -------- @@ -1843,12 +1861,12 @@ def choose( dtypes = [ch.dtype for ch in choices] ch_dtype = np.result_type(*dtypes) choices = tuple( - convert_to_cunumeric_ndarray(choices[i]).astype(ch_dtype) + convert_to_cupynumeric_ndarray(choices[i]).astype(ch_dtype) for i in range(n) ) else: - choices = convert_to_cunumeric_ndarray(choices) + choices = convert_to_cupynumeric_ndarray(choices) n = choices.shape[0] ch_dtype = choices.dtype choices = tuple(choices[i, ...] for i in range(n)) @@ -1922,11 +1940,11 @@ def compress( Return selected slices of an array along given axis. - Refer to :func:`cunumeric.compress` for full documentation. + Refer to :func:`cupynumeric.compress` for full documentation. See Also -------- - cunumeric.compress : equivalent function + cupynumeric.compress : equivalent function Availability -------- @@ -1985,11 +2003,11 @@ def clip( One of max or min must be given. - Refer to :func:`cunumeric.clip` for full documentation. + Refer to :func:`cupynumeric.clip` for full documentation. 
See Also -------- - cunumeric.clip : equivalent function + cupynumeric.clip : equivalent function Availability -------- @@ -2005,7 +2023,7 @@ def clip( ) if args[0].size != 1 or args[1].size != 1: runtime.warn( - "cuNumeric has not implemented clip with array-like " + "cuPyNumeric has not implemented clip with array-like " "arguments and is falling back to canonical numpy. You " "may notice significantly decreased performance for this " "function call.", @@ -2015,7 +2033,7 @@ def clip( self.__array__().clip(args[0], args[1], out=out.__array__()) return out else: - return convert_to_cunumeric_ndarray( + return convert_to_cupynumeric_ndarray( self.__array__().clip(args[0], args[1]) ) core_dtype = to_core_type(self.dtype) @@ -2034,7 +2052,7 @@ def round( Return a with each element rounded to the given number of decimals. - Refer to :func:`cunumeric.round` for full documentation. + Refer to :func:`cupynumeric.round` for full documentation. Availability -------- @@ -2054,11 +2072,11 @@ def conj(self) -> ndarray: Complex-conjugate all elements. - Refer to :func:`cunumeric.conjugate` for full documentation. + Refer to :func:`cupynumeric.conjugate` for full documentation. See Also -------- - cunumeric.conjugate : equivalent function + cupynumeric.conjugate : equivalent function Availability -------- @@ -2076,11 +2094,11 @@ def conjugate(self) -> ndarray: Return the complex conjugate, element-wise. - Refer to :func:`cunumeric.conjugate` for full documentation. + Refer to :func:`cupynumeric.conjugate` for full documentation. 
See Also -------- - cunumeric.conjugate : equivalent function + cupynumeric.conjugate : equivalent function Availability -------- @@ -2099,7 +2117,7 @@ def copy(self, order: OrderType = "C") -> ndarray: Multiple GPUs, Multiple CPUs """ - # We don't care about dimension order in cuNumeric + # We don't care about dimension order in cuPyNumeric return self.__copy__() @add_boilerplate() @@ -2274,9 +2292,7 @@ def _diag_helper( res_dtype = ( dtype if dtype is not None - else out.dtype - if out is not None - else a.dtype + else out.dtype if out is not None else a.dtype ) a = a._maybe_convert(res_dtype, (a,)) if out is not None and out.shape != out_shape: @@ -2306,11 +2322,11 @@ def diagonal( Return specified diagonals. - Refer to :func:`cunumeric.diagonal` for full documentation. + Refer to :func:`cupynumeric.diagonal` for full documentation. See Also -------- - cunumeric.diagonal : equivalent function + cupynumeric.diagonal : equivalent function Availability -------- @@ -2332,11 +2348,11 @@ def put( """ Replaces specified elements of the array with given values. - Refer to :func:`cunumeric.put` for full documentation. + Refer to :func:`cupynumeric.put` for full documentation. See Also -------- - cunumeric.put : equivalent function + cupynumeric.put : equivalent function Availability -------- @@ -2395,11 +2411,11 @@ def trace( Return the sum along diagonals of the array. - Refer to :func:`cunumeric.trace` for full documentation. + Refer to :func:`cupynumeric.trace` for full documentation. See Also -------- - cunumeric.trace : equivalent function + cupynumeric.trace : equivalent function Availability -------- @@ -2436,11 +2452,11 @@ def dot(self, rhs: ndarray, out: ndarray | None = None) -> ndarray: Return the dot product of this array with ``rhs``. - Refer to :func:`cunumeric.dot` for full documentation. + Refer to :func:`cupynumeric.dot` for full documentation. 
See Also -------- - cunumeric.dot : equivalent function + cupynumeric.dot : equivalent function Availability -------- @@ -2469,7 +2485,7 @@ def dump(self, file: str | Path) -> None: Dump a pickle of the array to the specified file. - The array can be read back with pickle.load or cunumeric.load. + The array can be read back with pickle.load or cupynumeric.load. Parameters ---------- @@ -2538,7 +2554,7 @@ def fft( Return the ``kind`` ``direction`` FFT of this array with normalization ``norm``. - Common entrypoint for FFT functionality in cunumeric.fft module. + Common entrypoint for FFT functionality in cupynumeric.fft module. Notes ----- @@ -2546,7 +2562,7 @@ def fft( See Also -------- - cunumeric.fft : FFT functions for different ``kind`` and + cupynumeric.fft : FFT functions for different ``kind`` and ``direction`` arguments Availability @@ -2693,7 +2709,7 @@ def flatten(self, order: OrderType = "C") -> ndarray: def getfield(self, dtype: np.dtype[Any], offset: int = 0) -> None: raise NotImplementedError( - "cuNumeric does not currently support type reinterpretation " + "cuPyNumeric does not currently support type reinterpretation " "for ndarray.getfield" ) @@ -2815,11 +2831,11 @@ def max( Return the maximum along a given axis. - Refer to :func:`cunumeric.amax` for full documentation. + Refer to :func:`cupynumeric.amax` for full documentation. See Also -------- - cunumeric.amax : equivalent function + cupynumeric.amax : equivalent function Availability -------- @@ -2906,11 +2922,11 @@ def mean( Returns the average of the array elements along given axis. - Refer to :func:`cunumeric.mean` for full documentation. + Refer to :func:`cupynumeric.mean` for full documentation. 
See Also -------- - cunumeric.mean : equivalent function + cupynumeric.mean : equivalent function Availability -------- @@ -2919,7 +2935,7 @@ def mean( """ if axis is not None and not isinstance(axis, int): raise NotImplementedError( - "cunumeric.mean only supports int types for " + "cupynumeric.mean only supports int types for " "`axis` currently" ) @@ -2994,11 +3010,11 @@ def var( Returns the variance of the array elements along given axis. - Refer to :func:`cunumeric.var` for full documentation. + Refer to :func:`cupynumeric.var` for full documentation. See Also -------- - cunumeric.var : equivalent function + cupynumeric.var : equivalent function Availability -------- @@ -3007,7 +3023,7 @@ def var( """ if axis is not None and not isinstance(axis, int): raise NotImplementedError( - "cunumeric.var only supports int types for `axis` currently" + "cupynumeric.var only supports int types for `axis` currently" ) # this could be computed as a single pass through the array @@ -3017,7 +3033,7 @@ def var( # directly as <(x-mu)^2>, which then requires two passes through the # data to first compute the mean and then compute the variance # see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - # TODO(https://github.com/nv-legate/cunumeric/issues/590) + # TODO(https://github.com/nv-legate/cupynumeric/issues/590) dtype = self._summation_dtype(dtype) # calculate the mean, but keep the dimensions so that the @@ -3044,7 +3060,7 @@ def var( args=(Scalar(mu.__array__(), to_core_type(self.dtype)),), ) else: - # TODO(https://github.com/nv-legate/cunumeric/issues/591) + # TODO(https://github.com/nv-legate/cupynumeric/issues/591) # there isn't really support for generic binary reductions # right now all of the current binary reductions are boolean # reductions like allclose. To implement this a single pass would @@ -3088,11 +3104,11 @@ def min( Return the minimum along a given axis. - Refer to :func:`cunumeric.amin` for full documentation. 
+ Refer to :func:`cupynumeric.amin` for full documentation. See Also -------- - cunumeric.amin : equivalent function + cupynumeric.amin : equivalent function Availability -------- @@ -3121,11 +3137,11 @@ def partition( Partition of an array in-place. - Refer to :func:`cunumeric.partition` for full documentation. + Refer to :func:`cupynumeric.partition` for full documentation. See Also -------- - cunumeric.partition : equivalent function + cupynumeric.partition : equivalent function Availability -------- @@ -3149,11 +3165,11 @@ def argpartition( Returns the indices that would partition this array. - Refer to :func:`cunumeric.argpartition` for full documentation. + Refer to :func:`cupynumeric.argpartition` for full documentation. See Also -------- - cunumeric.argpartition : equivalent function + cupynumeric.argpartition : equivalent function Availability -------- @@ -3186,11 +3202,11 @@ def prod( Return the product of the array elements over the given axis - Refer to :func:`cunumeric.prod` for full documentation. + Refer to :func:`cupynumeric.prod` for full documentation. See Also -------- - cunumeric.prod : equivalent function + cupynumeric.prod : equivalent function Availability -------- @@ -3213,11 +3229,11 @@ def ravel(self, order: OrderType = "C") -> ndarray: Return a flattened array. - Refer to :func:`cunumeric.ravel` for full documentation. + Refer to :func:`cupynumeric.ravel` for full documentation. See Also -------- - cunumeric.ravel : equivalent function + cupynumeric.ravel : equivalent function ndarray.flat : a flat iterator on the array. Availability @@ -3232,11 +3248,11 @@ def reshape(self, *args: Any, order: OrderType = "C") -> ndarray: Returns an array containing the same data with a new shape. - Refer to :func:`cunumeric.reshape` for full documentation. + Refer to :func:`cupynumeric.reshape` for full documentation. 
See Also -------- - cunumeric.reshape : equivalent function + cupynumeric.reshape : equivalent function Availability @@ -3307,7 +3323,7 @@ def setfield( self, val: Any, dtype: npt.DTypeLike, offset: int = 0 ) -> None: raise NotImplementedError( - "cuNumeric does not currently support type reinterpretation " + "cuPyNumeric does not currently support type reinterpretation " "for ndarray.setfield" ) @@ -3415,7 +3431,7 @@ def searchsorted( raise ValueError("Dimension mismatch: self must be a 1D array") # this is needed in case v is a scalar - v_ndarray = convert_to_cunumeric_ndarray(v) + v_ndarray = convert_to_cupynumeric_ndarray(v) a = self # in case we have different dtypes we ned to find a common type @@ -3459,11 +3475,11 @@ def sort( Sort an array in-place. - Refer to :func:`cunumeric.sort` for full documentation. + Refer to :func:`cupynumeric.sort` for full documentation. See Also -------- - cunumeric.sort : equivalent function + cupynumeric.sort : equivalent function Availability -------- @@ -3483,11 +3499,11 @@ def argsort( Returns the indices that would sort this array. - Refer to :func:`cunumeric.argsort` for full documentation. + Refer to :func:`cupynumeric.argsort` for full documentation. See Also -------- - cunumeric.argsort : equivalent function + cupynumeric.argsort : equivalent function Availability -------- @@ -3505,11 +3521,11 @@ def squeeze(self, axis: Any = None) -> ndarray: Remove axes of length one from `a`. - Refer to :func:`cunumeric.squeeze` for full documentation. + Refer to :func:`cupynumeric.squeeze` for full documentation. See Also -------- - cunumeric.squeeze : equivalent function + cupynumeric.squeeze : equivalent function Availability -------- @@ -3546,11 +3562,11 @@ def sum( Return the sum of the array elements over the given axis. - Refer to :func:`cunumeric.sum` for full documentation. + Refer to :func:`cupynumeric.sum` for full documentation. 
See Also -------- - cunumeric.sum : equivalent function + cupynumeric.sum : equivalent function Availability -------- @@ -3601,11 +3617,11 @@ def swapaxes(self, axis1: Any, axis2: Any) -> ndarray: Return a view of the array with `axis1` and `axis2` interchanged. - Refer to :func:`cunumeric.swapaxes` for full documentation. + Refer to :func:`cupynumeric.swapaxes` for full documentation. See Also -------- - cunumeric.swapaxes : equivalent function + cupynumeric.swapaxes : equivalent function Availability -------- @@ -3703,7 +3719,7 @@ def tolist(self) -> Any: Return a copy of the array data as a (nested) Python list. Data items are converted to the nearest compatible builtin Python - type, via the `~cunumeric.ndarray.item` function. + type, via the `~cupynumeric.ndarray.item` function. If ``a.ndim`` is 0, then since the depth of the nested list is 0, it will not be a list at all, but a simple Python scalar. @@ -3720,7 +3736,7 @@ def tolist(self) -> Any: Notes ----- - The array may be recreated via ``a = cunumeric.array(a.tolist())``, + The array may be recreated via ``a = cupynumeric.array(a.tolist())``, although this may sometimes lose precision. Availability @@ -3856,7 +3872,7 @@ def view( Notes ----- - cuNumeric does not currently support type reinterpretation, or + cuPyNumeric does not currently support type reinterpretation, or conversion to ndarray sub-classes; use :func:`ndarray.__array__()` to convert to `numpy.ndarray`. 
@@ -3870,11 +3886,11 @@ def view( """ if dtype is not None and dtype != self.dtype: raise NotImplementedError( - "cuNumeric does not currently support type reinterpretation" + "cuPyNumeric does not currently support type reinterpretation" ) if type is not None: raise NotImplementedError( - "cuNumeric does not currently support conversion to ndarray " + "cuPyNumeric does not currently support conversion to ndarray " "sub-classes; use __array__() to convert to numpy.ndarray" ) return ndarray( @@ -3889,11 +3905,11 @@ def unique(self) -> ndarray: Find the unique elements of an array. - Refer to :func:`cunumeric.unique` for full documentation. + Refer to :func:`cupynumeric.unique` for full documentation. See Also -------- - cunumeric.unique : equivalent function + cupynumeric.unique : equivalent function Availability -------- @@ -3939,12 +3955,12 @@ def stencil_hint( high_offsets: tuple[int, ...], ) -> None: """ - Inform cuNumeric that this array will be used in a stencil computation - in the following code. + Inform cuPyNumeric that this array will be used in a stencil + computation in the following code. - This allows cuNumeric to allocate space for the "ghost" elements ahead - of time, rather than discover the full extent of accesses incrementally, - and thus avoid intermediate copies. + This allows cuPyNumeric to allocate space for the "ghost" elements + ahead of time, rather than discovering the full extent of accesses + incrementally, and thus avoid intermediate copies. For example, let's say we have a 1-D array A of size 10 and we want to partition A across two GPUs. By default, A would be partitioned equally @@ -3953,8 +3969,8 @@ def stencil_hint( `B = A[:9] + A[1:]`. The runtime would now need to adjust the partitioning such that GPU0 has elements 0-5 and GPU1 has elements 4-9 inclusive. Since the original instance on GPU0 does not cover index 5, - cuNumeric needs to allocate a full new instance that covers 0-5, leading - to an extra copy. 
In this case, if the code calls + cuPyNumeric needs to allocate a full new instance that covers 0-5, + leading to an extra copy. In this case, if the code calls `A.stencil_hint([1], [1])` to pre-allocate instances that contain the extra elements before it uses A, the extra copies can be avoided. diff --git a/cunumeric/_array/flags.py b/cupynumeric/_array/flags.py similarity index 91% rename from cunumeric/_array/flags.py rename to cupynumeric/_array/flags.py index 0ed9c81e31..d58a5480ab 100644 --- a/cunumeric/_array/flags.py +++ b/cupynumeric/_array/flags.py @@ -24,8 +24,8 @@ class flagsobj: """ Information about the memory layout of the array. - These flags don't reflect the properties of the cuNumeric array, but - rather the NumPy array that will be produced if the cuNumeric array is + These flags don't reflect the properties of the cuPyNumeric array, but + rather the NumPy array that will be produced if the cuPyNumeric array is materialized on a single node. """ @@ -78,5 +78,5 @@ def __setitem__(self, key: str, value: Any) -> None: def _check_writeable(self, value: Any) -> None: if value and not self._array._writeable: raise ValueError( - "non-writeable cunumeric arrays cannot be made writeable" + "non-writeable cupynumeric arrays cannot be made writeable" ) diff --git a/cunumeric/_array/thunk.py b/cupynumeric/_array/thunk.py similarity index 100% rename from cunumeric/_array/thunk.py rename to cupynumeric/_array/thunk.py diff --git a/cunumeric/_array/util.py b/cupynumeric/_array/util.py similarity index 90% rename from cunumeric/_array/util.py rename to cupynumeric/_array/util.py index 6dc3f68a0e..e0096db857 100644 --- a/cunumeric/_array/util.py +++ b/cupynumeric/_array/util.py @@ -47,11 +47,11 @@ def add_boilerplate( *array_params: str, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """ - Adds required boilerplate to the wrapped cunumeric.ndarray or module-level - function. 
+ Adds required boilerplate to the wrapped cupynumeric.ndarray or + module-level function. Every time the wrapped function is called, this wrapper will convert all - specified array-like parameters to cuNumeric ndarrays. Additionally, any + specified array-like parameters to cuPyNumeric ndarrays. Additionally, any "out" or "where" arguments will also always be automatically converted. """ to_convert = set(array_params) @@ -86,11 +86,11 @@ def wrapper(*args: Any, **kwargs: Any) -> R: for idx, arg in enumerate(args): if idx in indices and arg is not None: if idx == out_idx: - arg = convert_to_cunumeric_ndarray(arg, share=True) + arg = convert_to_cupynumeric_ndarray(arg, share=True) if not arg.flags.writeable: raise ValueError("out is not writeable") else: - arg = convert_to_cunumeric_ndarray(arg) + arg = convert_to_cupynumeric_ndarray(arg) converted_args.append(arg) args = tuple(converted_args) @@ -99,11 +99,13 @@ def wrapper(*args: Any, **kwargs: Any) -> R: for k, v in kwargs.items(): if k in to_convert and v is not None: if k == "out": - kwargs[k] = convert_to_cunumeric_ndarray(v, share=True) + kwargs[k] = convert_to_cupynumeric_ndarray( + v, share=True + ) if not kwargs[k].flags.writeable: raise ValueError("out is not writeable") else: - kwargs[k] = convert_to_cunumeric_ndarray(v) + kwargs[k] = convert_to_cupynumeric_ndarray(v) return func(*args, **kwargs) @@ -120,7 +122,7 @@ def broadcast_where(where: ndarray | None, shape: NdShape) -> ndarray | None: return where -def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: +def convert_to_cupynumeric_ndarray(obj: Any, share: bool = False) -> ndarray: from .array import ndarray # If this is an instance of one of our ndarrays then we're done @@ -136,7 +138,7 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: def maybe_convert_to_np_ndarray(obj: Any) -> Any: """ - Converts cuNumeric arrays into NumPy arrays, otherwise has no effect. 
+ Converts cuPyNumeric arrays into NumPy arrays, otherwise has no effect. """ from ..ma import MaskedArray from .array import ndarray diff --git a/cunumeric/_module/__init__.py b/cupynumeric/_module/__init__.py similarity index 96% rename from cunumeric/_module/__init__.py rename to cupynumeric/_module/__init__.py index 86a4105bb0..e96566d914 100644 --- a/cunumeric/_module/__init__.py +++ b/cupynumeric/_module/__init__.py @@ -140,7 +140,7 @@ def test(*args: Any, **kw: Any) -> None: warn( - "cuNumeric cannot execute numpy.test() due to reliance " + "cuPyNumeric cannot execute numpy.test() due to reliance " "on Numpy internals. For information about running the " - "cuNumeric test suite, see: https://docs.nvidia.com/cunumeric/latest/developer/index.html" + "cuPyNumeric test suite, see: https://docs.nvidia.com/cupynumeric/latest/developer/index.html" ) diff --git a/cunumeric/_module/_unary_red_utils.py b/cupynumeric/_module/_unary_red_utils.py similarity index 100% rename from cunumeric/_module/_unary_red_utils.py rename to cupynumeric/_module/_unary_red_utils.py diff --git a/cunumeric/_module/array_basic.py b/cupynumeric/_module/array_basic.py similarity index 100% rename from cunumeric/_module/array_basic.py rename to cupynumeric/_module/array_basic.py diff --git a/cunumeric/_module/array_dimension.py b/cupynumeric/_module/array_dimension.py similarity index 96% rename from cunumeric/_module/array_dimension.py rename to cupynumeric/_module/array_dimension.py index b75bf45404..01629b2cb9 100644 --- a/cunumeric/_module/array_dimension.py +++ b/cupynumeric/_module/array_dimension.py @@ -19,7 +19,7 @@ import numpy as np from .._array.array import ndarray -from .._array.util import add_boilerplate, convert_to_cunumeric_ndarray +from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray from .._utils import is_np2 from .creation_data import array @@ -45,7 +45,7 @@ def _reshape_recur(ndim: int, arr: ndarray) -> tuple[int, ...]: def _atleast_nd(ndim: int, 
arys: Sequence[ndarray]) -> list[ndarray] | ndarray: - inputs = list(convert_to_cunumeric_ndarray(arr) for arr in arys) + inputs = list(convert_to_cupynumeric_ndarray(arr) for arr in arys) # 'reshape' change the shape of arrays # only when arr.shape != _reshape_recur(ndim,arr) result = list(arr.reshape(_reshape_recur(ndim, arr)) for arr in inputs) @@ -251,7 +251,7 @@ def broadcast_to( The shape of the desired array. A single integer i is interpreted as (i,). subok : bool, optional - This option is ignored by cuNumeric. + This option is ignored by cuPyNumeric. Returns ------- @@ -298,7 +298,7 @@ def broadcast_arrays(*args: Any, subok: bool = False) -> list[ndarray]: The arrays to broadcast. subok : bool, optional - This option is ignored by cuNumeric + This option is ignored by cuPyNumeric Returns ------- @@ -314,7 +314,7 @@ def broadcast_arrays(*args: Any, subok: bool = False) -> list[ndarray]: Multiple GPUs, Multiple CPUs """ - arrs = [convert_to_cunumeric_ndarray(arr) for arr in args] + arrs = [convert_to_cupynumeric_ndarray(arr) for arr in args] return _broadcast_arrays(arrs, subok=subok) @@ -337,7 +337,7 @@ class broadcast: """ def __init__(self, *arrays: Any) -> None: - arrs = [convert_to_cunumeric_ndarray(arr) for arr in arrays] + arrs = [convert_to_cupynumeric_ndarray(arr) for arr in arrays] broadcasted = _broadcast_arrays(arrs) self._iters = tuple(arr.flat for arr in broadcasted) self._index = 0 diff --git a/cunumeric/_module/array_joining.py b/cupynumeric/_module/array_joining.py similarity index 96% rename from cunumeric/_module/array_joining.py rename to cupynumeric/_module/array_joining.py index 13956a7aad..fbdf2adda4 100644 --- a/cunumeric/_module/array_joining.py +++ b/cupynumeric/_module/array_joining.py @@ -20,7 +20,7 @@ import numpy as np from .._array.array import ndarray -from .._array.util import convert_to_cunumeric_ndarray +from .._array.util import convert_to_cupynumeric_ndarray from .._utils import is_np2 from .array_dimension import 
_atleast_nd @@ -82,7 +82,7 @@ def check_list_depth(arr: Any, prefix: NdShape = (0,)) -> int: "List depths are mismatched. First element was at depth " f"{first_depth}, but there is an element at" f" depth {other_depth}, " - f"arrays{convert_to_array_form(prefix+(idx+1,))}" + f"arrays{convert_to_array_form(prefix + (idx + 1,))}" ) return depths[0] + 1 @@ -121,7 +121,7 @@ def check_shape_dtype_without_axis( if len(inputs) == 0: raise ValueError("need at least one array to concatenate") - inputs = list(convert_to_cunumeric_ndarray(inp) for inp in inputs) + inputs = list(convert_to_cupynumeric_ndarray(inp) for inp in inputs) ndim = inputs[0].ndim shape = inputs[0].shape @@ -184,7 +184,7 @@ def _block_collect_slices( # flatten lists of slices into a single list slices = list(chain(*updated_slices)) else: - arrays = list(convert_to_cunumeric_ndarray(inp) for inp in arr) + arrays = list(convert_to_cupynumeric_ndarray(inp) for inp in arr) common_shape = arrays[0].shape if len(arr) > 1: arrays, common_info = check_shape_dtype_without_axis( @@ -248,7 +248,7 @@ def _concatenate( shape=out_shape, dtype=common_info.dtype, inputs=inputs ) else: - out = convert_to_cunumeric_ndarray(out) + out = convert_to_cupynumeric_ndarray(out) if not isinstance(out, ndarray): raise TypeError("out should be ndarray") elif list(out.shape) != out_shape: @@ -295,8 +295,8 @@ def append(arr: ndarray, values: ndarray, axis: int | None = None) -> ndarray: Multiple GPUs, Multiple CPUs """ - # Check to see if we can build a new tuple of cuNumeric arrays - inputs = list(convert_to_cunumeric_ndarray(inp) for inp in [arr, values]) + # Check to see if we can build a new tuple of cuPyNumeric arrays + inputs = list(convert_to_cupynumeric_ndarray(inp) for inp in [arr, values]) return concatenate(inputs, axis) @@ -427,14 +427,14 @@ def concatenate( inputs = list(inp.ravel() for inp in reshaped) axis = 0 - # Check to see if we can build a new tuple of cuNumeric arrays - cunumeric_inputs, common_info = 
check_shape_dtype_without_axis( + # Check to see if we can build a new tuple of cuPyNumeric arrays + cupynumeric_inputs, common_info = check_shape_dtype_without_axis( inputs, concatenate.__name__, dtype, casting ) - check_shape_with_axis(cunumeric_inputs, concatenate.__name__, axis) + check_shape_with_axis(cupynumeric_inputs, concatenate.__name__, axis) return _concatenate( - cunumeric_inputs, + cupynumeric_inputs, common_info, axis, out, diff --git a/cunumeric/_module/array_rearrange.py b/cupynumeric/_module/array_rearrange.py similarity index 97% rename from cunumeric/_module/array_rearrange.py rename to cupynumeric/_module/array_rearrange.py index ea30e08746..7f27075835 100644 --- a/cunumeric/_module/array_rearrange.py +++ b/cupynumeric/_module/array_rearrange.py @@ -68,7 +68,7 @@ def flip(m: ndarray, axis: NdShapeLike | None = None) -> ndarray: Notes ----- - cuNumeric implementation doesn't return a view, it returns a new array + cuPyNumeric implementation doesn't return a view, it returns a new array """ return m.flip(axis=axis) @@ -101,7 +101,7 @@ def flipud(m: ndarray) -> ndarray: Notes ----- - cuNumeric implementation doesn't return a view, it returns a new array + cuPyNumeric implementation doesn't return a view, it returns a new array """ if m.ndim < 1: raise ValueError("Input must be >= 1-d.") @@ -137,7 +137,7 @@ def fliplr(m: ndarray) -> ndarray: Notes ----- - cuNumeric implementation doesn't return a view, it returns a new array + cuPyNumeric implementation doesn't return a view, it returns a new array """ if m.ndim < 2: raise ValueError("Input must be >= 2-d.") diff --git a/cunumeric/_module/array_shape.py b/cupynumeric/_module/array_shape.py similarity index 100% rename from cunumeric/_module/array_shape.py rename to cupynumeric/_module/array_shape.py diff --git a/cunumeric/_module/array_splitting.py b/cupynumeric/_module/array_splitting.py similarity index 98% rename from cunumeric/_module/array_splitting.py rename to 
cupynumeric/_module/array_splitting.py index dd4a9e2b1d..4462ee5e69 100644 --- a/cunumeric/_module/array_splitting.py +++ b/cupynumeric/_module/array_splitting.py @@ -19,7 +19,7 @@ import numpy as np from .._array.array import ndarray -from .._array.util import convert_to_cunumeric_ndarray +from .._array.util import convert_to_cupynumeric_ndarray if TYPE_CHECKING: import numpy.typing as npt @@ -99,7 +99,7 @@ def array_split( -------- Multiple GPUs, Multiple CPUs """ - array = convert_to_cunumeric_ndarray(a) + array = convert_to_cupynumeric_ndarray(a) split_pts = [] if axis >= array.ndim: raise ValueError( diff --git a/cunumeric/_module/array_tiling.py b/cupynumeric/_module/array_tiling.py similarity index 97% rename from cunumeric/_module/array_tiling.py rename to cupynumeric/_module/array_tiling.py index 72e5287bc2..6dca2939d6 100644 --- a/cunumeric/_module/array_tiling.py +++ b/cupynumeric/_module/array_tiling.py @@ -19,7 +19,7 @@ import numpy as np from .._array.array import ndarray -from .._array.util import add_boilerplate, convert_to_cunumeric_ndarray +from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray from .._utils import is_np2 from ..runtime import runtime from .creation_shape import full @@ -183,9 +183,9 @@ def repeat(a: ndarray, repeats: Any, axis: int | None = None) -> ndarray: ) # array is an array - array = convert_to_cunumeric_ndarray(a) + array = convert_to_cupynumeric_ndarray(a) if np.ndim(repeats) == 1: - repeats = convert_to_cunumeric_ndarray(repeats) + repeats = convert_to_cupynumeric_ndarray(repeats) # if no axes specified, flatten array if axis is None: diff --git a/cunumeric/_module/array_transpose.py b/cupynumeric/_module/array_transpose.py similarity index 100% rename from cunumeric/_module/array_transpose.py rename to cupynumeric/_module/array_transpose.py diff --git a/cunumeric/_module/binary_bit_packing.py b/cupynumeric/_module/binary_bit_packing.py similarity index 100% rename from 
cunumeric/_module/binary_bit_packing.py rename to cupynumeric/_module/binary_bit_packing.py diff --git a/cunumeric/_module/creation_data.py b/cupynumeric/_module/creation_data.py similarity index 100% rename from cunumeric/_module/creation_data.py rename to cupynumeric/_module/creation_data.py diff --git a/cunumeric/_module/creation_matrices.py b/cupynumeric/_module/creation_matrices.py similarity index 98% rename from cunumeric/_module/creation_matrices.py rename to cupynumeric/_module/creation_matrices.py index 7b97ef488f..540276c532 100644 --- a/cunumeric/_module/creation_matrices.py +++ b/cupynumeric/_module/creation_matrices.py @@ -30,7 +30,7 @@ def diag(v: ndarray, k: int = 0) -> ndarray: Extract a diagonal or construct a diagonal array. - See the more detailed documentation for ``cunumeric.diagonal`` if you use + See the more detailed documentation for ``cupynumeric.diagonal`` if you use this function to extract a diagonal and wish to write to the resulting array; whether it returns a copy or a view depends on what version of numpy you are using. diff --git a/cunumeric/_module/creation_ranges.py b/cupynumeric/_module/creation_ranges.py similarity index 98% rename from cunumeric/_module/creation_ranges.py rename to cupynumeric/_module/creation_ranges.py index ca72f401e4..dc09d8ad09 100644 --- a/cunumeric/_module/creation_ranges.py +++ b/cupynumeric/_module/creation_ranges.py @@ -15,7 +15,8 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Any +from types import EllipsisType +from typing import TYPE_CHECKING import numpy as np @@ -49,7 +50,7 @@ def arange( `range` function, but returns an ndarray rather than a list. When using a non-integer step, such as 0.1, the results will often not - be consistent. It is better to use `cunumeric.linspace` for these cases. + be consistent. It is better to use `cupynumeric.linspace` for these cases. 
Parameters ---------- @@ -180,7 +181,7 @@ def linspace( delta = stop - start y = arange(0, num, dtype=dt) - out: tuple[Any, ...] # EllipsisType not even in typing_extensions yet + out: tuple[int | EllipsisType | slice, ...] # Reshape these arrays into dimensions that allow them to broadcast if delta.ndim > 0: diff --git a/cunumeric/_module/creation_shape.py b/cupynumeric/_module/creation_shape.py similarity index 94% rename from cunumeric/_module/creation_shape.py rename to cupynumeric/_module/creation_shape.py index b208bc57bd..d14aa7298d 100644 --- a/cunumeric/_module/creation_shape.py +++ b/cupynumeric/_module/creation_shape.py @@ -38,7 +38,8 @@ def empty(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray: shape : int or tuple[int] Shape of the empty array. dtype : data-type, optional - Desired output data-type for the array. Default is `cunumeric.float64`. + Desired output data-type for the array. Default is + ``cupynumeric.float64``. Returns ------- @@ -189,7 +190,7 @@ def ones(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray: shape : int or tuple[int] Shape of the new array. dtype : data-type, optional - The desired data-type for the array. Default is `cunumeric.float64`. + The desired data-type for the array. Default is `cupynumeric.float64`. Returns ------- @@ -256,7 +257,7 @@ def zeros(shape: NdShapeLike, dtype: npt.DTypeLike = np.float64) -> ndarray: shape : int or tuple[int] Shape of the new array. dtype : data-type, optional - The desired data-type for the array. Default is `cunumeric.float64`. + The desired data-type for the array. Default is `cupynumeric.float64`. Returns ------- @@ -331,7 +332,7 @@ def full( Fill value. dtype : data-type, optional The desired data-type for the array The default, None, means - `cunumeric.array(fill_value).dtype`. + `cupynumeric.array(fill_value).dtype`. 
Returns ------- @@ -351,6 +352,8 @@ def full( else: dtype = np.dtype(dtype) val = np.array(value, dtype=dtype) + if np.dtype(dtype).itemsize == 1 and value > 255: + raise OverflowError(f"Value {value} out of bounds for {dtype}") result = empty(shape, dtype=val.dtype) result._thunk.fill(val) return result @@ -395,6 +398,8 @@ def full_like( dtype = np.dtype(dtype) else: dtype = a.dtype + if np.dtype(dtype).itemsize == 1 and value > 255: + raise OverflowError(f"Value {value} out of bounds for {dtype}") result = empty_like(a, dtype=dtype, shape=shape) val = np.array(value).astype(dtype) result._thunk.fill(val) diff --git a/cunumeric/_module/indexing.py b/cupynumeric/_module/indexing.py similarity index 95% rename from cunumeric/_module/indexing.py rename to cupynumeric/_module/indexing.py index 30f4c1633b..3af4622565 100644 --- a/cunumeric/_module/indexing.py +++ b/cupynumeric/_module/indexing.py @@ -22,7 +22,7 @@ from .._array.util import ( add_boilerplate, check_writeable, - convert_to_cunumeric_ndarray, + convert_to_cupynumeric_ndarray, ) from .._utils import is_np2 from .._utils.array import calculate_volume @@ -195,7 +195,7 @@ def mask_indices( Assume `mask_func` is a function that, for a square array a of size ``(n, n)`` with a possible offset argument `k`, when called as ``mask_func(a, k)`` returns a new array with zeros in certain locations - (functions like :func:`cunumeric.triu` or :func:`cunumeric.tril` + (functions like :func:`cupynumeric.triu` or :func:`cupynumeric.tril` do precisely this). Then this function returns the indices where the non-zero values would be located. @@ -205,12 +205,12 @@ def mask_indices( The returned indices will be valid to access arrays of shape (n, n). mask_func : callable A function whose call signature is similar to that of - :func:`cunumeric.triu`, :func:`cunumeric.tril`. + :func:`cupynumeric.triu`, :func:`cupynumeric.tril`. That is, ``mask_func(x, k)`` returns a boolean array, shaped like `x`. 
`k` is an optional argument to the function. k : scalar An optional argument which is passed through to `mask_func`. Functions - like :func:`cunumeric.triu`, :func:`cunumeric,tril` + like :func:`cupynumeric.triu`, :func:`cupynumeric.tril` take a second argument that is interpreted as an offset. Returns @@ -225,10 +225,10 @@ Notes ----- - WARNING: `mask_indices` expects `mask_function` to call cuNumeric functions - for good performance. In case non-cuNumeric functions are called by - `mask_function`, cuNumeric will have to materialize all data on the host - which might result in running out of system memory. + WARNING: ``mask_indices`` expects ``mask_function`` to call cuPyNumeric + functions for good performance. In case non-cuPyNumeric functions are + called by ``mask_function``, cuPyNumeric will have to materialize all data + on the host which might result in running out of system memory. Availability -------- @@ -238,7 +238,7 @@ a = ones((n, n), dtype=bool) if not is_implemented(mask_func): runtime.warn( - "Calling non-cuNumeric functions in mask_func can result in bad " + "Calling non-cuPyNumeric functions in mask_func can result in bad " "performance", category=UserWarning, ) @@ -389,7 +389,7 @@ def tril_indices( The row dimension of the arrays for which the returned indices will be valid. k : int, optional - Diagonal offset (see :func:`cunumeric.tril` for details). + Diagonal offset (see :func:`cupynumeric.tril` for details). m : int, optional The column dimension of the arrays for which the returned indices will be valid. @@ -422,7 +422,7 @@ def tril_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]: """ Return the indices for the lower-triangle of arr. - See :func:`cunumeric.tril_indices` for full details. + See :func:`cupynumeric.tril_indices` for full details. 
Parameters ---------- @@ -430,7 +430,7 @@ def tril_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]: The indices will be valid for arrays whose dimensions are the same as arr. k : int, optional - Diagonal offset (see :func:`cunumeric.tril` for details). + Diagonal offset (see :func:`cupynumeric.tril` for details). Returns ------- @@ -468,7 +468,7 @@ def triu_indices( The size of the arrays for which the returned indices will be valid. k : int, optional - Diagonal offset (see :func:`cunumeric.triu` for details). + Diagonal offset (see :func:`cupynumeric.triu` for details). m : int, optional The column dimension of the arrays for which the returned arrays will be valid. @@ -501,7 +501,7 @@ def triu_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]: """ Return the indices for the upper-triangle of arr. - See :func:`cunumeric.triu_indices` for full details. + See :func:`cupynumeric.triu_indices` for full details. Parameters ---------- @@ -509,7 +509,7 @@ def triu_indices_from(arr: ndarray, k: int = 0) -> tuple[ndarray, ...]: The indices will be valid for arrays whose dimensions are the same as arr. k : int, optional - Diagonal offset (see :func:`cunumeric.triu` for details). + Diagonal offset (see :func:`cupynumeric.triu` for details). Returns ------- @@ -674,7 +674,7 @@ def take_along_axis(a: ndarray, indices: ndarray, axis: int | None) -> ndarray: latter. These slices can be different lengths. Functions returning an index along an axis, like - :func:`cunumeric.argsort` and :func:`cunumeric.argpartition`, + :func:`cupynumeric.argsort` and :func:`cupynumeric.argpartition`, produce suitable indices for this function. Parameters @@ -688,7 +688,7 @@ def take_along_axis(a: ndarray, indices: ndarray, axis: int | None) -> ndarray: axis : int The axis to take 1d slices along. If axis is None, the input array is treated as if it had first been flattened to 1d, for consistency with - :func:`cunumeric.sort` and :func:`cunumeric.argsort`. 
+ :func:`cupynumeric.sort` and :func:`cupynumeric.argsort`. Returns ------- @@ -738,9 +738,9 @@ def put_along_axis( the index and data arrays, and uses the former to place values into the latter. These slices can be different lengths. - Functions returning an index along an axis, like :func:`cunumeric.argsort` - and :func:`cunumeric.argpartition`, produce suitable indices for - this function. + Functions returning an index along an axis, like + :func:`cupynumeric.argsort` and :func:`cupynumeric.argpartition`, produce + suitable indices for this function. Parameters ---------- @@ -924,14 +924,14 @@ def select( if len(condlist) == 0: raise ValueError("select with an empty condition list is not possible") - condlist_ = tuple(convert_to_cunumeric_ndarray(c) for c in condlist) + condlist_ = tuple(convert_to_cupynumeric_ndarray(c) for c in condlist) for i, c in enumerate(condlist_): if c.dtype != bool: raise TypeError( f"invalid entry {i} in condlist: should be boolean ndarray" ) - choicelist_ = tuple(convert_to_cunumeric_ndarray(c) for c in choicelist) + choicelist_ = tuple(convert_to_cupynumeric_ndarray(c) for c in choicelist) common_type = np.result_type(*choicelist_, default) args = condlist_ + choicelist_ choicelist_ = tuple( @@ -1065,7 +1065,7 @@ def diagonal( Notes ----- - Unlike NumPy's, the cuNumeric implementation always returns a copy + Unlike NumPy's, the cuPyNumeric implementation always returns a copy See Also -------- diff --git a/cunumeric/_module/io_numpy.py b/cupynumeric/_module/io_numpy.py similarity index 96% rename from cunumeric/_module/io_numpy.py rename to cupynumeric/_module/io_numpy.py index 67ea13c051..42d4ebdf53 100644 --- a/cunumeric/_module/io_numpy.py +++ b/cupynumeric/_module/io_numpy.py @@ -61,7 +61,7 @@ def load( Notes ----- - cuNumeric does not currently support ``.npz`` and pickled files. + cuPyNumeric does not currently support ``.npz`` and pickled files. 
Availability -------- diff --git a/cunumeric/_module/linalg_mvp.py b/cupynumeric/_module/linalg_mvp.py similarity index 97% rename from cunumeric/_module/linalg_mvp.py rename to cupynumeric/_module/linalg_mvp.py index dd764c04ec..8650b1b00c 100644 --- a/cunumeric/_module/linalg_mvp.py +++ b/cupynumeric/_module/linalg_mvp.py @@ -25,7 +25,7 @@ from .._array.array import ndarray from .._array.util import ( add_boilerplate, - convert_to_cunumeric_ndarray, + convert_to_cupynumeric_ndarray, find_common_type, ) from .._ufunc.math import multiply @@ -72,7 +72,7 @@ def inner(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray: Notes ----- - The cuNumeric implementation is a little more liberal than NumPy in terms + The cuPyNumeric implementation is a little more liberal than NumPy in terms of allowed broadcasting, e.g. ``inner(ones((1,)), ones((4,)))`` is allowed. See Also @@ -109,7 +109,7 @@ def dot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray: but using ``a @ b`` is preferred. - If either `a` or `b` is 0-D (scalar), it is equivalent to - :func:`multiply` and using ``cunumeric.multiply(a, b)`` or ``a * b`` is + :func:`multiply` and using ``cupynumeric.multiply(a, b)`` or ``a * b`` is preferred. - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over @@ -139,7 +139,7 @@ def dot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray: Notes ----- - The cuNumeric implementation is a little more liberal than NumPy in terms + The cuPyNumeric implementation is a little more liberal than NumPy in terms of allowed broadcasting, e.g. ``dot(ones((3,1)), ones((4,5)))`` is allowed. Except for the inner-product case, only floating-point types are supported. @@ -227,7 +227,7 @@ def matmul( (9, 5, 7, 3) >>> # n is 7, k is 4, m is 3 - The cuNumeric implementation is a little more liberal than NumPy in terms + The cuPyNumeric implementation is a little more liberal than NumPy in terms of allowed broadcasting, e.g. 
``matmul(ones((3,1)), ones((4,5)))`` is allowed. @@ -290,7 +290,7 @@ def vdot(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray: Notes ----- - The cuNumeric implementation is a little more liberal than NumPy in terms + The cuPyNumeric implementation is a little more liberal than NumPy in terms of allowed broadcasting, e.g. ``vdot(ones((1,)), ones((4,)))`` is allowed. See Also @@ -389,7 +389,7 @@ def tensordot( Notes ----- - The cuNumeric implementation is a little more liberal than NumPy in terms + The cuPyNumeric implementation is a little more liberal than NumPy in terms of allowed broadcasting, e.g. ``tensordot(ones((3,1)), ones((1,4)))`` is allowed. @@ -710,8 +710,9 @@ def einsum( optimize : ``{False, True, 'greedy', 'optimal'}``, optional Controls if intermediate optimization should occur. If False then arrays will be contracted in input order, one at a time. True (the - default) will use the 'greedy' algorithm. See ``cunumeric.einsum_path`` - for more information on the available optimization algorithms. + default) will use the 'greedy' algorithm. See + ``cupynumeric.einsum_path`` for more information on the available + optimization algorithms. 
Returns ------- @@ -730,10 +731,10 @@ def einsum( -------- Multiple GPUs, Multiple CPUs """ - operands_list = [convert_to_cunumeric_ndarray(op) for op in operands] + operands_list = [convert_to_cupynumeric_ndarray(op) for op in operands] if out is not None: - out = convert_to_cunumeric_ndarray(out, share=True) + out = convert_to_cupynumeric_ndarray(out, share=True) if optimize is True: optimize = "greedy" @@ -841,7 +842,7 @@ def einsum_path( -------- Multiple GPUs, Multiple CPUs """ - computed_operands = [convert_to_cunumeric_ndarray(op) for op in operands] + computed_operands = [convert_to_cupynumeric_ndarray(op) for op in operands] memory_limit = _builtin_max(op.size for op in computed_operands) if isinstance(optimize, tuple): if len(optimize) != 2: diff --git a/cunumeric/_module/logic_array_contents.py b/cupynumeric/_module/logic_array_contents.py similarity index 92% rename from cunumeric/_module/logic_array_contents.py rename to cupynumeric/_module/logic_array_contents.py index a1fe574b98..e5bb9bd9ee 100644 --- a/cunumeric/_module/logic_array_contents.py +++ b/cupynumeric/_module/logic_array_contents.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING -from .._array.util import convert_to_cunumeric_ndarray +from .._array.util import convert_to_cupynumeric_ndarray from .._ufunc.comparison import logical_and from .._ufunc.floating import isinf, signbit @@ -61,9 +61,9 @@ def isneginf(x: ndarray, out: ndarray | None = None) -> ndarray: Multiple GPUs, Multiple CPUs """ - x = convert_to_cunumeric_ndarray(x) + x = convert_to_cupynumeric_ndarray(x) if out is not None: - out = convert_to_cunumeric_ndarray(out, share=True) + out = convert_to_cupynumeric_ndarray(out, share=True) rhs1 = isinf(x) rhs2 = signbit(x) return logical_and(rhs1, rhs2, out=out) @@ -106,9 +106,9 @@ def isposinf(x: ndarray, out: ndarray | None = None) -> ndarray: Multiple GPUs, Multiple CPUs """ - x = convert_to_cunumeric_ndarray(x) + x = convert_to_cupynumeric_ndarray(x) if out is not None: - out 
= convert_to_cunumeric_ndarray(out, share=True) + out = convert_to_cupynumeric_ndarray(out, share=True) rhs1 = isinf(x) rhs2 = ~signbit(x) return logical_and(rhs1, rhs2, out=out) diff --git a/cunumeric/_module/logic_array_type.py b/cupynumeric/_module/logic_array_type.py similarity index 93% rename from cunumeric/_module/logic_array_type.py rename to cupynumeric/_module/logic_array_type.py index 2c8553078c..1e39754a7b 100644 --- a/cunumeric/_module/logic_array_type.py +++ b/cupynumeric/_module/logic_array_type.py @@ -19,7 +19,7 @@ import numpy as np from .._array.array import ndarray -from .._array.util import convert_to_cunumeric_ndarray +from .._array.util import convert_to_cupynumeric_ndarray from .creation_shape import full if TYPE_CHECKING: @@ -53,7 +53,7 @@ def iscomplex(x: ndarray | npt.NDArray[Any]) -> ndarray: Multiple GPUs, Multiple CPUs """ - x = convert_to_cunumeric_ndarray(x) + x = convert_to_cupynumeric_ndarray(x) if x.dtype.kind != "c": return full(x.shape, False, dtype=bool) else: @@ -121,7 +121,7 @@ def isreal(x: ndarray | npt.NDArray[Any]) -> ndarray: Multiple GPUs, Multiple CPUs """ - x = convert_to_cunumeric_ndarray(x) + x = convert_to_cupynumeric_ndarray(x) if x.dtype.kind != "c": return full(x.shape, True, dtype=bool) else: @@ -179,7 +179,7 @@ def isscalar(x: ndarray | npt.NDArray[Any]) -> bool: Notes ----- - This function falls back to NumPy for all object types but cuNumeric's + This function falls back to NumPy for all object types but cuPyNumeric's ndarray, which always returns `False`. Availability @@ -187,9 +187,9 @@ def isscalar(x: ndarray | npt.NDArray[Any]) -> bool: Multiple GPUs, Multiple CPUs """ - # Since the input can be any value, we can't just convert it to cunumeric - # ndarray. Instead we check if the input is cunumeric ndarray and, if not, - # fall back to Numpy + # Since the input can be any value, we can't just convert it to cupynumeric + # ndarray. 
Instead we check if the input is cupynumeric ndarray and, if + # not, fall back to Numpy if isinstance(x, ndarray): return False else: diff --git a/cunumeric/_module/logic_comparison.py b/cupynumeric/_module/logic_comparison.py similarity index 95% rename from cunumeric/_module/logic_comparison.py rename to cupynumeric/_module/logic_comparison.py index dad4782027..46c6410a4a 100644 --- a/cunumeric/_module/logic_comparison.py +++ b/cupynumeric/_module/logic_comparison.py @@ -84,7 +84,7 @@ def allclose( """ if equal_nan: raise NotImplementedError( - "cuNumeric does not support `equal_nan` yet for allclose" + "cuPyNumeric does not support `equal_nan` yet for allclose" ) args = (Scalar(rtol, ty.float64), Scalar(atol, ty.float64)) return perform_binary_reduction( @@ -145,7 +145,7 @@ def isclose( """ if equal_nan: raise NotImplementedError( - "cuNumeric does not support `equal_nan` yet for isclose" + "cuPyNumeric does not support `equal_nan` yet for isclose" ) out_shape = np.broadcast_shapes(a.shape, b.shape) @@ -191,7 +191,7 @@ def array_equal( """ if equal_nan: raise NotImplementedError( - "cuNumeric does not support `equal_nan` yet for `array_equal`" + "cuPyNumeric does not support `equal_nan` yet for `array_equal`" ) if a1.shape != a2.shape: diff --git a/cunumeric/_module/logic_truth.py b/cupynumeric/_module/logic_truth.py similarity index 100% rename from cunumeric/_module/logic_truth.py rename to cupynumeric/_module/logic_truth.py diff --git a/cunumeric/_module/math_complex.py b/cupynumeric/_module/math_complex.py similarity index 98% rename from cunumeric/_module/math_complex.py rename to cupynumeric/_module/math_complex.py index 3d05580ad2..29f3787f75 100644 --- a/cunumeric/_module/math_complex.py +++ b/cupynumeric/_module/math_complex.py @@ -20,7 +20,6 @@ from .._array.thunk import perform_unary_op from .._array.util import add_boilerplate -from .._utils.array import to_core_type from ..config import UnaryOpCode if TYPE_CHECKING: diff --git 
a/cunumeric/_module/math_extrema.py b/cupynumeric/_module/math_extrema.py similarity index 93% rename from cunumeric/_module/math_extrema.py rename to cupynumeric/_module/math_extrema.py index ad805c00f6..0b576684d7 100644 --- a/cunumeric/_module/math_extrema.py +++ b/cupynumeric/_module/math_extrema.py @@ -66,10 +66,11 @@ def amax( initial : scalar, optional The minimum value of an output element. Must be present to allow - computation on empty slice. See `~cunumeric.ufunc.reduce` for details. + computation on empty slice. See `~cupynumeric.ufunc.reduce` for + details. where : array_like[bool], optional - Elements to compare for the maximum. See `~cunumeric.ufunc.reduce` + Elements to compare for the maximum. See `~cupynumeric.ufunc.reduce` for details. Returns @@ -142,10 +143,11 @@ def amin( initial : scalar, optional The maximum value of an output element. Must be present to allow - computation on empty slice. See `~cunumeric.ufunc.reduce` for details. + computation on empty slice. See `~cupynumeric.ufunc.reduce` for + details. where : array_like[bool], optional - Elements to compare for the minimum. See `~cunumeric.ufunc.reduce` + Elements to compare for the minimum. See `~cupynumeric.ufunc.reduce` for details. 
Returns diff --git a/cunumeric/_module/math_misc.py b/cupynumeric/_module/math_misc.py similarity index 80% rename from cunumeric/_module/math_misc.py rename to cupynumeric/_module/math_misc.py index 251d92eae1..a91e3faccc 100644 --- a/cunumeric/_module/math_misc.py +++ b/cupynumeric/_module/math_misc.py @@ -18,15 +18,21 @@ from .._array.array import ndarray from .._array.util import add_boilerplate +from ..config import ConvolveMethod if TYPE_CHECKING: import numpy.typing as npt - from ..types import ConvolveMode + from ..types import ConvolveMethod as ConvolveMethodType, ConvolveMode @add_boilerplate("a", "v") -def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: +def convolve( + a: ndarray, + v: ndarray, + mode: ConvolveMode = "full", + method: ConvolveMethodType = "auto", +) -> ndarray: """ Returns the discrete, linear convolution of two ndarrays. @@ -52,6 +58,19 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: The output consists only of those elements that do not rely on the zero-padding. In 'valid' mode, either `a` or `v` must be at least as large as the other in every dimension. + method : ``{'auto', 'direct', 'fft'}``, optional + A string indicating which method to use to calculate the convolution. + + 'auto': + Automatically chooses direct or Fourier method based on an estimate of + which is faster (default) + + 'direct': + The convolution is determined directly from sums, the definition of + convolution + + 'fft': + The Fourier Transform is used to perform the convolution Returns ------- @@ -66,7 +85,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: ----- The current implementation only supports the 'same' mode. - Unlike `numpy.convolve`, `cunumeric.convolve` supports N-dimensional + Unlike `numpy.convolve`, `cupynumeric.convolve` supports N-dimensional inputs, but it follows NumPy's behavior for 1-D inputs. 
Availability @@ -74,7 +93,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: Multiple GPUs, Multiple CPUs """ if mode != "same": - raise NotImplementedError("Need to implement other convolution modes") + raise NotImplementedError("Only support mode='same'") if a.ndim != v.ndim: raise RuntimeError("Arrays should have the same dimensions") @@ -84,6 +103,11 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: if a.ndim == 1 and a.size < v.size: v, a = a, v + if not hasattr(ConvolveMethod, method.upper()): + raise ValueError( + "Acceptable method flags are 'auto', 'direct', or 'fft'." + ) + if a.dtype != v.dtype: v = v.astype(a.dtype) out = ndarray( @@ -91,7 +115,7 @@ def convolve(a: ndarray, v: ndarray, mode: ConvolveMode = "full") -> ndarray: dtype=a.dtype, inputs=(a, v), ) - out._thunk.convolve(a._thunk, v._thunk, mode) + out._thunk.convolve(a._thunk, v._thunk, mode, method) return out diff --git a/cunumeric/_module/math_rounding.py b/cupynumeric/_module/math_rounding.py similarity index 100% rename from cunumeric/_module/math_rounding.py rename to cupynumeric/_module/math_rounding.py diff --git a/cunumeric/_module/math_sum_prod_diff.py b/cupynumeric/_module/math_sum_prod_diff.py similarity index 93% rename from cunumeric/_module/math_sum_prod_diff.py rename to cupynumeric/_module/math_sum_prod_diff.py index 8a18f57bf7..6027fb1c1a 100644 --- a/cunumeric/_module/math_sum_prod_diff.py +++ b/cupynumeric/_module/math_sum_prod_diff.py @@ -26,7 +26,7 @@ from .._ufunc.math import add, multiply, subtract from .._utils import is_np2 from ..config import ScanCode, UnaryRedCode -from ..settings import settings as cunumeric_settings +from ..settings import settings as cupynumeric_settings from ._unary_red_utils import get_non_nan_unary_red_code from .array_dimension import broadcast_to from .array_joining import concatenate @@ -93,12 +93,12 @@ def prod( sub-class' method does not implement `keepdims` any exceptions 
will be raised. initial : scalar, optional - The starting value for this product. See `~cunumeric.ufunc.reduce` for - details. + The starting value for this product. See `~cupynumeric.ufunc.reduce` + for details. where : array_like[bool], optional - Elements to include in the product. See `~cunumeric.ufunc.reduce` for - details. + Elements to include in the product. See `~cupynumeric.ufunc.reduce` + for details. Returns ------- @@ -177,10 +177,11 @@ def sum( sub-class' method does not implement `keepdims` any exceptions will be raised. initial : scalar, optional - Starting value for the sum. See `~cunumeric.ufunc.reduce` for details. + Starting value for the sum. See `~cupynumeric.ufunc.reduce` for + details. where : array_like[bool], optional - Elements to include in the sum. See `~cunumeric.ufunc.reduce` for + Elements to include in the sum. See `~cupynumeric.ufunc.reduce` for details. Returns @@ -253,13 +254,14 @@ def cumprod( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - with floating point and complex types. For example, when boundary values - such as inf occur they may not propagate as expected. Consider the float32 - array ``[3e+37, 1, 100, 0.01]``. NumPy's cumprod will return a result of - ``[3e+37, 3e+37, inf, inf]``. However, cuNumeric might internally partition - the array such that partition 0 has ``[3e+37, 1]`` and partition 1 has - ``[100, 0.01]``, returning the result ``[3e+37, 3e+37, inf, 3e+37]``. + cuPyNumeric's parallel implementation may yield different results from + NumPy with floating point and complex types. For example, when boundary + values such as inf occur they may not propagate as expected. Consider the + float32 array ``[3e+37, 1, 100, 0.01]``. NumPy's cumprod will return a + result of ``[3e+37, 3e+37, inf, inf]``. 
However, cuPyNumeric might + internally partition the array such that partition 0 has ``[3e+37, 1]`` + and partition 1 has ``[100, 0.01]``, returning the result + ``[3e+37, 3e+37, inf, 3e+37]``. Availability -------- @@ -318,10 +320,10 @@ def cumsum( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - with floating point and complex types. For example, when boundary values - such as inf occur they may not propagate as expected. For more explanation - check cunumeric.cumprod. + CuPyNumeric's parallel implementation may yield different results from + NumPy with floating point and complex types. For example, when boundary + values such as inf occur they may not propagate as expected. For more + explanation check cupynumeric.cumprod. Availability -------- @@ -379,10 +381,10 @@ def nancumprod( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - with floating point and complex types. For example, when boundary values - such as inf occur they may not propagate as expected. For more explanation - check cunumeric.cumprod. + CuPyNumeric's parallel implementation may yield different results from + NumPy with floating point and complex types. For example, when boundary + values such as inf occur they may not propagate as expected. For more + explanation check cupynumeric.cumprod. Availability -------- @@ -440,10 +442,10 @@ def nancumsum( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - with floating point and complex types. For example, when boundary values - such as inf occur they may not propagate as expected. For more explanation - check cunumeric.cumprod. + CuPyNumeric's parallel implementation may yield different results from + NumPy with floating point and complex types. For example, when boundary + values such as inf occur they may not propagate as expected. For more + explanation check cupynumeric.cumprod. 
Availability -------- @@ -465,7 +467,7 @@ def nanargmax( """ Return the indices of the maximum values in the specified axis ignoring NaNs. For empty arrays, ValueError is raised. For all-NaN slices, - ValueError is raised only when CUNUMERIC_NUMPY_COMPATIBILITY + ValueError is raised only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment variable is set, otherwise identity is returned. Warning: results cannot be trusted if a slice contains only NaNs @@ -504,7 +506,7 @@ def nanargmax( if a.size == 0: raise ValueError("attempt to get nanargmax of an empty sequence") - if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": + if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f": if any(all(isnan(a), axis=axis)): raise ValueError("Array/Slice contains only NaNs") @@ -533,7 +535,7 @@ def nanargmin( """ Return the indices of the minimum values in the specified axis ignoring NaNs. For empty arrays, ValueError is raised. For all-NaN slices, - ValueError is raised only when CUNUMERIC_NUMPY_COMPATIBILITY + ValueError is raised only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment variable is set, otherwise identity is returned. Warning: results cannot be trusted if a slice contains only NaNs @@ -572,7 +574,7 @@ def nanargmin( if a.size == 0: raise ValueError("attempt to get nanargmin of an empty sequence") - if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": + if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f": if any(all(isnan(a), axis=axis)): raise ValueError("Array/Slice contains only NaNs") @@ -602,7 +604,7 @@ def nanmin( """ Return minimum of an array or minimum along an axis, ignoring any NaNs. When all-NaN slices are encountered, a NaN is returned - for that slice only when CUNUMERIC_NUMPY_COMPATIBILITY environment + for that slice only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment variable is set, otherwise identity is returned. 
Empty slices will raise a ValueError @@ -633,10 +635,11 @@ def nanmin( initial : scalar, optional The maximum value of an output element. Must be present to allow - computation on empty slice. See `~cunumeric.ufunc.reduce` for details. + computation on empty slice. See `~cupynumeric.ufunc.reduce` for + details. where : array_like[bool], optional - Elements to compare for the minimum. See `~cunumeric.ufunc.reduce` + Elements to compare for the minimum. See `~cupynumeric.ufunc.reduce` for details. Returns @@ -648,7 +651,7 @@ def nanmin( Notes ----- - CuNumeric's implementation will not raise a Runtime Warning for + CuPyNumeric's implementation will not raise a Runtime Warning for slices with all-NaNs See Also @@ -675,7 +678,7 @@ def nanmin( where=where, ) - if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": + if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f": all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where) putmask(out_array, all_nan, np.nan) # type: ignore @@ -694,7 +697,7 @@ def nanmax( """ Return the maximum of an array or maximum along an axis, ignoring any NaNs. When all-NaN slices are encountered, a NaN is returned - for that slice only when CUNUMERIC_NUMPY_COMPATIBILITY environment + for that slice only when CUPYNUMERIC_NUMPY_COMPATIBILITY environment variable is set, otherwise identity is returned. Empty slices will raise a ValueError @@ -728,10 +731,11 @@ def nanmax( initial : scalar, optional The minimum value of an output element. Must be present to allow - computation on empty slice. See `~cunumeric.ufunc.reduce` for details. + computation on empty slice. See `~cupynumeric.ufunc.reduce` for + details. where : array_like[bool], optional - Elements to compare for the maximum. See `~cunumeric.ufunc.reduce` + Elements to compare for the maximum. See `~cupynumeric.ufunc.reduce` for details. 
Returns @@ -743,7 +747,7 @@ def nanmax( Notes ----- - CuNumeric's implementation will not raise a Runtime Warning for + CuPyNumeric's implementation will not raise a Runtime Warning for slices with all-NaNs See Also @@ -770,7 +774,7 @@ def nanmax( where=where, ) - if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": + if cupynumeric_settings.numpy_compat() and a.dtype.kind == "f": all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where) putmask(out_array, all_nan, np.nan) # type: ignore @@ -825,11 +829,11 @@ def nanprod( sub-class' method does not implement `keepdims` any exceptions will be raised. initial : scalar, optional - The starting value for this product. See `~cunumeric.ufunc.reduce` for - details. + The starting value for this product. See `~cupynumeric.ufunc.reduce` + for details. where : array_like[bool], optional - Elements to include in the product. See `~cunumeric.ufunc.reduce` for - details. + Elements to include in the product. See `~cupynumeric.ufunc.reduce` + for details. Returns ------- @@ -924,11 +928,11 @@ def nansum( the result will broadcast correctly against the input array. initial : scalar, optional - Starting value for the sum. See `~cunumeric.ufunc.reduce` for + Starting value for the sum. See `~cupynumeric.ufunc.reduce` for details. where : array_like[bool], optional - Elements to include in the sum. See `~cunumeric.ufunc.reduce` for + Elements to include in the sum. See `~cupynumeric.ufunc.reduce` for details. 
Returns diff --git a/cunumeric/_module/sets_making.py b/cupynumeric/_module/sets_making.py similarity index 100% rename from cunumeric/_module/sets_making.py rename to cupynumeric/_module/sets_making.py diff --git a/cunumeric/_module/ssc_counting.py b/cupynumeric/_module/ssc_counting.py similarity index 100% rename from cunumeric/_module/ssc_counting.py rename to cupynumeric/_module/ssc_counting.py diff --git a/cunumeric/_module/ssc_searching.py b/cupynumeric/_module/ssc_searching.py similarity index 96% rename from cunumeric/_module/ssc_searching.py rename to cupynumeric/_module/ssc_searching.py index de8319ca6b..bcdba11cee 100644 --- a/cunumeric/_module/ssc_searching.py +++ b/cupynumeric/_module/ssc_searching.py @@ -107,8 +107,8 @@ def argmax( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - when the array contains NaN(s). + cuPyNumeric's parallel implementation may yield different results from + NumPy when the array contains NaN(s). Availability -------- @@ -156,8 +156,8 @@ def argmin( Notes ----- - CuNumeric's parallel implementation may yield different results from NumPy - when the array contains NaN(s). + cuPyNumeric's parallel implementation may yield different results from + NumPy when the array contains NaN(s). Availability -------- @@ -197,8 +197,9 @@ def flatnonzero(a: ndarray) -> ndarray: @overload -def where(a: npt.ArrayLike | ndarray, x: None, y: None) -> tuple[ndarray, ...]: - ... +def where( + a: npt.ArrayLike | ndarray, x: None, y: None +) -> tuple[ndarray, ...]: ... @overload @@ -206,8 +207,7 @@ def where( a: npt.ArrayLike | ndarray, x: npt.ArrayLike | ndarray, y: npt.ArrayLike | ndarray, -) -> ndarray: - ... +) -> ndarray: ... 
@add_boilerplate("a", "x", "y") # type: ignore [misc] diff --git a/cunumeric/_module/ssc_sorting.py b/cupynumeric/_module/ssc_sorting.py similarity index 98% rename from cunumeric/_module/ssc_sorting.py rename to cupynumeric/_module/ssc_sorting.py index 1ee86e0d02..4f32d0194f 100644 --- a/cunumeric/_module/ssc_sorting.py +++ b/cupynumeric/_module/ssc_sorting.py @@ -219,7 +219,7 @@ def argpartition( Notes ----- - The current implementation falls back to `cunumeric.argsort`. + The current implementation falls back to `cupynumeric.argsort`. See Also -------- @@ -274,7 +274,7 @@ def partition( Notes ----- - The current implementation falls back to `cunumeric.sort`. + The current implementation falls back to `cupynumeric.sort`. See Also -------- diff --git a/cunumeric/_module/stats_avgs_vars.py b/cupynumeric/_module/stats_avgs_vars.py similarity index 100% rename from cunumeric/_module/stats_avgs_vars.py rename to cupynumeric/_module/stats_avgs_vars.py diff --git a/cunumeric/_module/stats_correlating.py b/cupynumeric/_module/stats_correlating.py similarity index 100% rename from cunumeric/_module/stats_correlating.py rename to cupynumeric/_module/stats_correlating.py diff --git a/cunumeric/_module/stats_histograms.py b/cupynumeric/_module/stats_histograms.py similarity index 99% rename from cunumeric/_module/stats_histograms.py rename to cupynumeric/_module/stats_histograms.py index d6397760f2..05ab4e9289 100644 --- a/cunumeric/_module/stats_histograms.py +++ b/cupynumeric/_module/stats_histograms.py @@ -64,7 +64,7 @@ def bincount( ------- out : ndarray[int] The result of binning the input array. - The length of `out` is equal to ``cunumeric.amax(x)+1``. + The length of `out` is equal to ``cupynumeric.amax(x)+1``. 
Raises ------ diff --git a/cunumeric/_module/stats_order.py b/cupynumeric/_module/stats_order.py similarity index 99% rename from cunumeric/_module/stats_order.py rename to cupynumeric/_module/stats_order.py index 7c70424761..7d7564a3df 100644 --- a/cunumeric/_module/stats_order.py +++ b/cupynumeric/_module/stats_order.py @@ -720,7 +720,7 @@ def nanquantile_impl( assert qs_all[qindex].shape == remaining_shape # TODO(aschaffer): Vectorize this operation, see - # github.com/nv-legate/cunumeric/pull/1121#discussion_r1484731763 + # github.com/nv-legate/cupynumeric/pull/1121#discussion_r1484731763 gamma = None for aindex, n in np.ndenumerate(non_nan_counts): # TODO (2024-08): `n` should be an integral type, but wasn't: diff --git a/cunumeric/_module/window.py b/cupynumeric/_module/window.py similarity index 100% rename from cunumeric/_module/window.py rename to cupynumeric/_module/window.py diff --git a/cunumeric/_sphinxext/__init__.py b/cupynumeric/_sphinxext/__init__.py similarity index 100% rename from cunumeric/_sphinxext/__init__.py rename to cupynumeric/_sphinxext/__init__.py diff --git a/cunumeric/_sphinxext/_comparison_config.py b/cupynumeric/_sphinxext/_comparison_config.py similarity index 95% rename from cunumeric/_sphinxext/_comparison_config.py rename to cupynumeric/_sphinxext/_comparison_config.py index 3623a61a0c..911e487973 100644 --- a/cunumeric/_sphinxext/_comparison_config.py +++ b/cupynumeric/_sphinxext/_comparison_config.py @@ -83,12 +83,11 @@ class SectionConfig: UFUNCS = (numpy.ufunc,) NUMPY_CONFIGS = [ - SectionConfig("Module-Level", None, types=FUNCTIONS), - SectionConfig("Ufuncs", None, types=UFUNCS), - SectionConfig("Multi-Dimensional Array", "ndarray", types=METHODS), - SectionConfig("Linear Algebra", "linalg", types=FUNCTIONS), - SectionConfig("Discrete Fourier Transform", "fft", types=FUNCTIONS), - SectionConfig("Random Sampling", "random", types=FUNCTIONS), + SectionConfig("Module-Level", None), + SectionConfig("Multi-Dimensional Array", 
"ndarray"), + SectionConfig("Linear Algebra", "linalg"), + SectionConfig("Discrete Fourier Transform", "fft"), + SectionConfig("Random Sampling", "random"), ] CONVOLVE = ("convolve", "correlate") diff --git a/cunumeric/_sphinxext/_comparison_util.py b/cupynumeric/_sphinxext/_comparison_util.py similarity index 72% rename from cunumeric/_sphinxext/_comparison_util.py rename to cupynumeric/_sphinxext/_comparison_util.py index ddd9bab2b4..a7168cee47 100644 --- a/cunumeric/_sphinxext/_comparison_util.py +++ b/cupynumeric/_sphinxext/_comparison_util.py @@ -16,16 +16,16 @@ from dataclasses import dataclass from types import ModuleType -from typing import TYPE_CHECKING, Any, Iterable, Iterator, Type +from typing import TYPE_CHECKING, Any, Iterable, Iterator -from .._utils.coverage import is_implemented, is_multi, is_single +from .._utils.coverage import is_implemented, is_multi, is_single, is_wrapped from ._comparison_config import MISSING_NP_REFS, SKIP if TYPE_CHECKING: from ._comparison_config import SectionConfig YES = "\u2713" -NO = "\u274C" +NO = "\u274c" @dataclass(frozen=True) @@ -66,24 +66,38 @@ def _lgref(name: str, obj: Any, implemented: bool) -> str: if isinstance(obj, ModuleType): full_name = f"{obj.__name__}.{name}" else: - full_name = f"cunumeric.{obj.__name__}.{name}" + full_name = f"cupynumeric.{obj.__name__}.{name}" role = "meth" if "ndarray" in full_name else "obj" return f":{role}:`{full_name}`" -def filter_names( +def filter_wrapped_names( obj: Any, - types: tuple[Type[Any], ...] 
| None = None, + *, skip: Iterable[str] = (), ) -> Iterator[str]: names = (n for n in dir(obj)) # every name in the module or class + names = ( + n for n in names if is_wrapped(getattr(obj, n)) + ) # that is wrapped + names = (n for n in names if n not in skip) # except the ones we skip + names = (n for n in names if not n.startswith("_")) # or any private names + return names + + +def filter_type_names( + obj: Any, + *, + skip: Iterable[str] = (), +) -> Iterator[str]: + names = (n for n in dir(obj)) # every name in the module or class + names = ( + n for n in names if isinstance(getattr(obj, n), type) + ) # that is a type (class, dtype, etc) names = (n for n in names if n not in skip) # except the ones we skip names = (n for n in names if not n.startswith("_")) # or any private names - if types: - # optionally filtered by type - names = (n for n in names if isinstance(getattr(obj, n), types)) return names @@ -109,12 +123,12 @@ def get_item(name: str, np_obj: Any, lg_obj: Any) -> ItemDetail: def get_namespaces(attr: str | None) -> tuple[Any, Any]: import numpy - import cunumeric + import cupynumeric if attr is None: - return numpy, cunumeric + return numpy, cupynumeric - return getattr(numpy, attr), getattr(cunumeric, attr) + return getattr(numpy, attr), getattr(cupynumeric, attr) def generate_section(config: SectionConfig) -> SectionDetail: @@ -123,9 +137,14 @@ def generate_section(config: SectionConfig) -> SectionDetail: names: Iterable[str] if config.names: - names = config.names + names = set(config.names) else: - names = filter_names(np_obj, config.types, skip=SKIP) + wrapped_names = filter_wrapped_names(lg_obj, skip=SKIP) + type_names = filter_type_names(lg_obj, skip=SKIP) + names = set(wrapped_names) | set(type_names) + + # we can omit anything that isn't in np namespace to begin with + names = {n for n in names if n in dir(np_obj)} items = [get_item(name, np_obj, lg_obj) for name in names] diff --git a/cunumeric/_sphinxext/_cunumeric_directive.py 
b/cupynumeric/_sphinxext/_cupynumeric_directive.py similarity index 96% rename from cunumeric/_sphinxext/_cunumeric_directive.py rename to cupynumeric/_sphinxext/_cupynumeric_directive.py index 62b7c9672d..593d25b241 100644 --- a/cunumeric/_sphinxext/_cunumeric_directive.py +++ b/cupynumeric/_sphinxext/_cupynumeric_directive.py @@ -20,7 +20,7 @@ from sphinx.util.nodes import nested_parse_with_titles -class CunumericDirective(SphinxDirective): +class CupynumericDirective(SphinxDirective): def parse(self, rst_text: str, annotation: str) -> list[nodes.Node]: result = StringList() for line in rst_text.split("\n"): diff --git a/cunumeric/_sphinxext/_templates.py b/cupynumeric/_sphinxext/_templates.py similarity index 100% rename from cunumeric/_sphinxext/_templates.py rename to cupynumeric/_sphinxext/_templates.py diff --git a/cunumeric/_sphinxext/_templates/comparison_table.rst b/cupynumeric/_sphinxext/_templates/comparison_table.rst similarity index 69% rename from cunumeric/_sphinxext/_templates/comparison_table.rst rename to cupynumeric/_sphinxext/_templates/comparison_table.rst index 3a4211100d..55d1d583f3 100644 --- a/cunumeric/_sphinxext/_templates/comparison_table.rst +++ b/cupynumeric/_sphinxext/_templates/comparison_table.rst @@ -3,13 +3,13 @@ {{ section.title }} {{ "~" * section.title|length }} -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric .. autosummary:: :toctree: generated/ .. 
csv-table:: - :header: NumPy, cunumeric, single-GPU/CPU, multi-GPU/CPU + :header: NumPy, cupynumeric, single-GPU/CPU, multi-GPU/CPU {% for item in section.items -%} {{ item.np_ref }}, {{ item.lg_ref }}, {{ item.single }}, {{ item.multi }} @@ -19,6 +19,6 @@ Number of NumPy functions: {{ section.np_count }} -Number of functions covered by cunumeric: {{ section.lg_count }} +Number of functions covered by cupynumeric: {{ section.lg_count }} {% endfor %} \ No newline at end of file diff --git a/cunumeric/_sphinxext/comparison_table.py b/cupynumeric/_sphinxext/comparison_table.py similarity index 94% rename from cunumeric/_sphinxext/comparison_table.py rename to cupynumeric/_sphinxext/comparison_table.py index a00d4bca7a..baa62a53d8 100644 --- a/cunumeric/_sphinxext/comparison_table.py +++ b/cupynumeric/_sphinxext/comparison_table.py @@ -22,13 +22,13 @@ from . import PARALLEL_SAFE, SphinxParallelSpec from ._comparison_config import GROUPED_CONFIGS, NUMPY_CONFIGS from ._comparison_util import generate_section -from ._cunumeric_directive import CunumericDirective +from ._cupynumeric_directive import CupynumericDirective from ._templates import COMPARISON_TABLE log = getLogger(__name__) -class ComparisonTable(CunumericDirective): +class ComparisonTable(CupynumericDirective): has_content = False required_arguments = 0 optional_arguments = 1 diff --git a/cunumeric/_sphinxext/implemented_index.py b/cupynumeric/_sphinxext/implemented_index.py similarity index 90% rename from cunumeric/_sphinxext/implemented_index.py rename to cupynumeric/_sphinxext/implemented_index.py index 175e12d693..f0e9598bc7 100644 --- a/cunumeric/_sphinxext/implemented_index.py +++ b/cupynumeric/_sphinxext/implemented_index.py @@ -20,11 +20,11 @@ from sphinx.application import Sphinx from sphinx.util.logging import getLogger -import cunumeric as cn +import cupynumeric as cn from .._utils.coverage import is_implemented from . 
import PARALLEL_SAFE, SphinxParallelSpec -from ._cunumeric_directive import CunumericDirective +from ._cupynumeric_directive import CupynumericDirective log = getLogger(__name__) @@ -45,7 +45,7 @@ def _filter(x: Any) -> bool: ) -class ImplementedIndex(CunumericDirective): +class ImplementedIndex(CupynumericDirective): has_content = False required_arguments = 0 optional_arguments = 0 @@ -59,7 +59,7 @@ def run(self) -> list[nodes.Node]: if _filter(x) ] refs += [ - f"* :obj:`cunumeric.ndarray.{x.__name__}`" + f"* :obj:`cupynumeric.ndarray.{x.__name__}`" for x in cn.ndarray.__dict__.values() if _filter(x) ] diff --git a/cunumeric/_sphinxext/missing_refs.py b/cupynumeric/_sphinxext/missing_refs.py similarity index 70% rename from cunumeric/_sphinxext/missing_refs.py rename to cupynumeric/_sphinxext/missing_refs.py index bd55cb5d41..99938b80dd 100644 --- a/cunumeric/_sphinxext/missing_refs.py +++ b/cupynumeric/_sphinxext/missing_refs.py @@ -28,25 +28,25 @@ log = getLogger(__name__) SKIP = ( - "cunumeric.cast", - "cunumeric.ndarray.__array_function__", - "cunumeric.ndarray.__array_ufunc__", - "cunumeric.ndarray.__format__", - "cunumeric.ndarray.__hash__", - "cunumeric.ndarray.__iter__", - "cunumeric.ndarray.__radd__", - "cunumeric.ndarray.__rand__", - "cunumeric.ndarray.__rdivmod__", - "cunumeric.ndarray.__reduce_ex__", - "cunumeric.ndarray.__rfloordiv__", - "cunumeric.ndarray.__rmod__", - "cunumeric.ndarray.__rmul__", - "cunumeric.ndarray.__ror__", - "cunumeric.ndarray.__rpow__", - "cunumeric.ndarray.__rsub__", - "cunumeric.ndarray.__rtruediv__", - "cunumeric.ndarray.__rxor__", - "cunumeric.ndarray.__sizeof__", + "cupynumeric.cast", + "cupynumeric.ndarray.__array_function__", + "cupynumeric.ndarray.__array_ufunc__", + "cupynumeric.ndarray.__format__", + "cupynumeric.ndarray.__hash__", + "cupynumeric.ndarray.__iter__", + "cupynumeric.ndarray.__radd__", + "cupynumeric.ndarray.__rand__", + "cupynumeric.ndarray.__rdivmod__", + "cupynumeric.ndarray.__reduce_ex__", + 
"cupynumeric.ndarray.__rfloordiv__", + "cupynumeric.ndarray.__rmod__", + "cupynumeric.ndarray.__rmul__", + "cupynumeric.ndarray.__ror__", + "cupynumeric.ndarray.__rpow__", + "cupynumeric.ndarray.__rsub__", + "cupynumeric.ndarray.__rtruediv__", + "cupynumeric.ndarray.__rxor__", + "cupynumeric.ndarray.__sizeof__", ) MISSING: list[tuple[str, str]] = [] @@ -62,7 +62,7 @@ def run(self, **kwargs: Any) -> None: def _check_target(self, node: Any) -> None: target = node["reftarget"] - if not target.startswith("cunumeric.") or target in SKIP: + if not target.startswith("cupynumeric.") or target in SKIP: return domain = self.env.domains[node["refdomain"]] @@ -85,7 +85,7 @@ def _check_target(self, node: Any) -> None: if uri is None: loc = get_node_location(node) log.warning( - f"Cunumeric reference missing a target: {loc}: {target}", + f"cuPyNumeric reference missing a target: {loc}: {target}", type="ref", ) diff --git a/cunumeric/_sphinxext/ufunc_formatter.py b/cupynumeric/_sphinxext/ufunc_formatter.py similarity index 97% rename from cunumeric/_sphinxext/ufunc_formatter.py rename to cupynumeric/_sphinxext/ufunc_formatter.py index 05cac694e6..6f574d7541 100644 --- a/cunumeric/_sphinxext/ufunc_formatter.py +++ b/cupynumeric/_sphinxext/ufunc_formatter.py @@ -19,7 +19,7 @@ from sphinx.application import Sphinx from sphinx.ext.autodoc import FunctionDocumenter -from cunumeric import ufunc +from cupynumeric import ufunc from . 
import PARALLEL_SAFE, SphinxParallelSpec diff --git a/cunumeric/_thunk/__init__.py b/cupynumeric/_thunk/__init__.py similarity index 100% rename from cunumeric/_thunk/__init__.py rename to cupynumeric/_thunk/__init__.py diff --git a/cunumeric/_thunk/_sort.py b/cupynumeric/_thunk/_sort.py similarity index 98% rename from cunumeric/_thunk/_sort.py rename to cupynumeric/_thunk/_sort.py index b97a8eba0b..82ab738479 100644 --- a/cunumeric/_thunk/_sort.py +++ b/cupynumeric/_thunk/_sort.py @@ -19,7 +19,7 @@ from legate.core import get_legate_runtime, types as ty from .._utils import is_np2 -from ..config import CuNumericOpCode +from ..config import CuPyNumericOpCode from ..runtime import runtime if is_np2: @@ -92,7 +92,7 @@ def sort_task( ) -> None: legate_runtime = get_legate_runtime() task = legate_runtime.create_auto_task( - output.library, CuNumericOpCode.SORT + output.library, CuPyNumericOpCode.SORT ) uses_unbound_output = runtime.num_procs > 1 and input.ndim == 1 diff --git a/cunumeric/_thunk/deferred.py b/cupynumeric/_thunk/deferred.py similarity index 96% rename from cunumeric/_thunk/deferred.py rename to cupynumeric/_thunk/deferred.py index 0a0ae7fcbd..58349cd57d 100644 --- a/cunumeric/_thunk/deferred.py +++ b/cupynumeric/_thunk/deferred.py @@ -62,12 +62,14 @@ BitGeneratorOperation, Bitorder, ConvertCode, - CuNumericOpCode, + ConvolveMethod, + CuPyNumericOpCode, RandGenCode, UnaryOpCode, UnaryRedCode, ) from ..linalg._cholesky import cholesky_deferred +from ..linalg._eigen import eig_deferred from ..linalg._qr import qr_deferred from ..linalg._solve import solve_deferred from ..linalg._svd import svd_deferred @@ -87,6 +89,7 @@ from ..config import BitGeneratorType, FFTDirection, FFTType, WindowOpCode from ..types import ( BitOrder, + ConvolveMethod as ConvolveMethodType, ConvolveMode, NdShape, OrderType, @@ -140,9 +143,11 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]: def wrapper(*args: Any, **kwargs: Any) -> R: # Convert relevant arguments to 
DeferredArrays args = tuple( - runtime.to_deferred_array(arg, read_only=True) - if idx in indices and arg is not None - else arg + ( + runtime.to_deferred_array(arg, read_only=True) + if idx in indices and arg is not None + else arg + ) for (idx, arg) in enumerate(args) ) for k, v in kwargs.items(): @@ -429,7 +434,7 @@ def _zip_indices( # call ZIP function to combine index arrays into a singe array task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.ZIP + self.library, CuPyNumericOpCode.ZIP ) task.throws_exception(IndexError) p_out = task.add_output(output_arr.base) @@ -646,7 +651,7 @@ def _advanced_indexing_with_boolean_array( task = legate_runtime.create_auto_task( self.library, - CuNumericOpCode.ADVANCED_INDEXING, + CuPyNumericOpCode.ADVANCED_INDEXING, ) task.add_output(out.base) p_rhs = task.add_input(rhs.base) @@ -931,7 +936,7 @@ def get_item(self, key: Any) -> NumPyThunk: ) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.READ + self.library, CuPyNumericOpCode.READ ) task.add_input(input.base) task.add_output(result.base) # type: ignore @@ -1002,7 +1007,7 @@ def set_item(self, key: Any, rhs: Any) -> None: assert rhs.size == 1 task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.WRITE + self.library, CuPyNumericOpCode.WRITE ) # Since we pass the view with write discard privilege, # we should make sure that the mapper either creates a fresh @@ -1015,7 +1020,7 @@ def set_item(self, key: Any, rhs: Any) -> None: # In Python, any inplace update of form arr[key] op= value # goes through three steps: 1) __getitem__ fetching the object # for the key, 2) __iop__ for the update, and 3) __setitem__ - # to set the result back. In cuNumeric, the object we + # to set the result back. 
In cuPyNumeric, the object we # return in step (1) is actually a subview to the array arr # through which we make updates in place, so after step (2) is # done, the effect of inplace update is already reflected @@ -1040,7 +1045,7 @@ def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk: if order != "C": # If we don't have a transform then we need to make a copy runtime.warn( - "cuNumeric has not implemented reshape using Fortran-like " + "cuPyNumeric has not implemented reshape using Fortran-like " "index order and is falling back to canonical numpy. You may " "notice significantly decreased performance for this " "function call.", @@ -1269,7 +1274,7 @@ def convert( if warn: runtime.warn( - "cuNumeric performing implicit type conversion from " + "cuPyNumeric performing implicit type conversion from " + str(rhs_array.dtype) + " to " + str(lhs_array.dtype), @@ -1280,7 +1285,7 @@ def convert( rhs = rhs_array.base task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.CONVERT + self.library, CuPyNumericOpCode.CONVERT ) p_lhs = task.add_output(lhs) p_rhs = task.add_input(rhs) @@ -1291,9 +1296,18 @@ def convert( task.execute() @auto_convert("input", "filter") - def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None: + def convolve( + self, + input: Any, + filter: Any, + mode: ConvolveMode, + method: ConvolveMethodType, + ) -> None: + if method != "auto" and runtime.num_gpus == 0: + runtime.warn(f"the method {method} is ignored on CPUs") + task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.CONVOLVE + self.library, CuPyNumericOpCode.CONVOLVE ) offsets = tuple((ext + 1) // 2 for ext in filter.shape) @@ -1304,6 +1318,7 @@ def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None: p_halo = task.declare_partition() task.add_input(input.base, p_halo) task.add_scalar_arg(input.shape, (ty.int64,)) + task.add_scalar_arg(getattr(ConvolveMethod, method.upper()), ty.int32) 
task.add_constraint(align(p_out, p_in)) task.add_constraint(bloat(p_out, p_halo, offsets, offsets)) @@ -1333,7 +1348,7 @@ def fft( output = lhs.base task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.FFT + self.library, CuPyNumericOpCode.FFT ) p_output = task.add_output(output) @@ -1363,7 +1378,7 @@ def fft( task.execute() - # Fill the cuNumeric array with the value in the numpy array + # Fill the cuPyNumeric array with the value in the numpy array def _fill(self, value: LogicalStore | Scalar) -> None: assert self.base is not None @@ -1379,7 +1394,7 @@ def _fill(self, value: LogicalStore | Scalar) -> None: # If this is a fill for an arg value, make sure to pass # the value dtype so that we get it packed correctly task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.FILL + self.library, CuPyNumericOpCode.FILL ) task.add_output(self.base) task.add_input(value) @@ -1508,7 +1523,7 @@ def contract( if blas_op == BlasOperation.VV: # Vector dot product task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.DOT + self.library, CuPyNumericOpCode.DOT ) task.add_reduction(lhs, ReductionOpKind.ADD) p_rhs1 = task.add_input(rhs1) @@ -1533,7 +1548,7 @@ def contract( lhs = lhs.promote(1, n) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.MATVECMUL + self.library, CuPyNumericOpCode.MATVECMUL ) p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD) p_rhs1 = task.add_input(rhs1) @@ -1577,7 +1592,7 @@ def rounding_divide( # TODO: better heuristics def choose_2d_color_shape( - shape: tuple[int, int] + shape: tuple[int, int], ) -> tuple[int, int]: # 1M elements, we should probably even go larger MIN_MATRIX_SIZE = 1 << 20 @@ -1603,6 +1618,10 @@ def choose_2d_color_shape( def choose_batchsize( tilesize: tuple[int, int], k: int, itemsize: int ) -> int: + # don't batch in case we only have 1 proc + if runtime.num_procs == 1: + return k + # default corresponds to 128MB (to store A and B tile) from ..settings 
import settings @@ -1642,7 +1661,7 @@ def run_matmul_for_batch( i: int, ) -> None: manual_task = legate_runtime.create_manual_task( - self.library, CuNumericOpCode.MATMUL, color_shape + self.library, CuPyNumericOpCode.MATMUL, color_shape ) manual_task.add_output(tiled_lhs) @@ -1714,7 +1733,7 @@ def add_mode( # Prepare the launch task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.CONTRACT + self.library, CuPyNumericOpCode.CONTRACT ) p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD) p_rhs1 = task.add_input(rhs1) @@ -1740,7 +1759,7 @@ def choose(self, rhs: Any, *args: Any) -> None: ch_tuple = tuple(c._broadcast(tuple(out_arr.shape)) for c in ch_def) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.CHOOSE + self.library, CuPyNumericOpCode.CHOOSE ) p_out = task.add_output(out_arr) p_ind = task.add_input(index) @@ -1764,7 +1783,7 @@ def select( ) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.SELECT + self.library, CuPyNumericOpCode.SELECT ) out_arr = self.base task.add_output(out_arr) @@ -1829,7 +1848,7 @@ def _diag_helper( diag = diag.promote(0, matrix.shape[0]) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.DIAG + self.library, CuPyNumericOpCode.DIAG ) if extract: @@ -1883,7 +1902,7 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None: shape = self_tmp.shape task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.WRAP + self.library, CuPyNumericOpCode.WRAP ) p_indirect = task.add_output(indirect.base) task.add_scalar_arg(shape, (ty.int64,)) @@ -1910,7 +1929,7 @@ def putmask(self, mask: Any, values: Any) -> None: else: values_new = values.base task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.PUTMASK + self.library, CuPyNumericOpCode.PUTMASK ) p_self = task.add_input(self.base) p_mask = task.add_input(mask.base) @@ -1935,7 +1954,7 @@ def eye(self, k: int) -> None: # tells the runtime that it can throw away the 
previous contents of the # entire region. task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.EYE + self.library, CuPyNumericOpCode.EYE ) task.add_input(self.base) task.add_output(self.base) @@ -1952,7 +1971,7 @@ def arange(self, start: float, stop: float, step: float) -> None: return task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.ARANGE + self.library, CuPyNumericOpCode.ARANGE ) task.add_output(self.base) task.add_scalar_arg(start, self.base.type) @@ -1972,7 +1991,7 @@ def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None: return task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.TILE + self.library, CuPyNumericOpCode.TILE ) task.add_output(self.base) @@ -1996,7 +2015,7 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None: rhs = rhs._broadcast(lhs.shape) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.TRILU + self.library, CuPyNumericOpCode.TRILU ) p_lhs = task.add_output(lhs) @@ -2013,7 +2032,7 @@ def repeat( self, repeats: Any, axis: int, scalar_repeats: bool ) -> DeferredArray: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.REPEAT + self.library, CuPyNumericOpCode.REPEAT ) if scalar_repeats: out_shape = tuple( @@ -2068,7 +2087,7 @@ def flip(self, rhs: Any, axes: int | tuple[int, ...] 
| None) -> None: axes = normalize_axis_tuple(axes, self.ndim) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.FLIP + self.library, CuPyNumericOpCode.FLIP ) p_out = task.add_output(output) p_in = task.add_input(input) @@ -2095,7 +2114,7 @@ def bincount(self, rhs: Any, weights: NumPyThunk | None = None) -> None: dst_array.fill(np.array(0, dst_array.dtype)) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.BINCOUNT + self.library, CuPyNumericOpCode.BINCOUNT ) p_dst = task.add_reduction(dst_array.base, ReductionOpKind.ADD) p_src = task.add_input(src_array.base) @@ -2113,7 +2132,7 @@ def nonzero(self) -> tuple[NumPyThunk, ...]: ) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.NONZERO + self.library, CuPyNumericOpCode.NONZERO ) p_self = task.add_input(self.base) @@ -2134,7 +2153,7 @@ def bitgenerator_random_raw( flags: int, ) -> None: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.BITGENERATOR + self.library, CuPyNumericOpCode.BITGENERATOR ) task.add_output(self.base) @@ -2162,7 +2181,7 @@ def bitgenerator_distribution( doubleparams: tuple[float, ...], ) -> None: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.BITGENERATOR + self.library, CuPyNumericOpCode.BITGENERATOR ) task.add_output(self.base) @@ -3124,7 +3143,7 @@ def bitgenerator_negative_binomial( def random(self, gen_code: Any, args: tuple[Scalar, ...] 
= ()) -> None: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.RAND + self.library, CuPyNumericOpCode.RAND ) task.add_output(self.base) @@ -3170,7 +3189,7 @@ def unary_op( with Annotation({"OpCode": op.name}): task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.UNARY_OP + self.library, CuPyNumericOpCode.UNARY_OP ) p_lhs = task.add_output(lhs) p_rhs = task.add_input(rhs) @@ -3242,7 +3261,7 @@ def unary_reduction( with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}): task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.SCALAR_UNARY_RED + self.library, CuPyNumericOpCode.SCALAR_UNARY_RED ) task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) @@ -3288,7 +3307,7 @@ def unary_reduction( with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}): task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.UNARY_RED + self.library, CuPyNumericOpCode.UNARY_RED ) p_rhs = task.add_input(rhs_array.base) @@ -3345,7 +3364,7 @@ def binary_op( with Annotation({"OpCode": op_code.name}): # Populate the Legate launcher task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.BINARY_OP + self.library, CuPyNumericOpCode.BINARY_OP ) p_lhs = task.add_output(lhs) p_rhs1 = task.add_input(rhs1) @@ -3369,13 +3388,14 @@ def binary_reduction( args: tuple[Scalar, ...], ) -> None: lhs = self.base - rhs1 = src1.base - rhs2 = src2.base assert lhs.has_scalar_storage if broadcast is not None: - rhs1 = rhs1._broadcast(broadcast) - rhs2 = rhs2._broadcast(broadcast) + rhs1 = src1._broadcast(broadcast) + rhs2 = src2._broadcast(broadcast) + else: + rhs1 = src1.base + rhs2 = src2.base # Populate the Legate launcher if op == BinaryOpCode.NOT_EQUAL: @@ -3385,7 +3405,7 @@ def binary_reduction( redop = ReductionOpKind.MUL self.fill(np.array(True)) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.BINARY_RED + self.library, CuPyNumericOpCode.BINARY_RED ) task.add_reduction(lhs, redop) 
p_rhs1 = task.add_input(rhs1) @@ -3407,7 +3427,7 @@ def where(self, src1: Any, src2: Any, src3: Any) -> None: # Populate the Legate launcher task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.WHERE + self.library, CuPyNumericOpCode.WHERE ) p_lhs = task.add_output(lhs) p_rhs1 = task.add_input(rhs1) @@ -3424,7 +3444,7 @@ def argwhere(self) -> NumPyThunk: result = runtime.create_unbound_thunk(ty.int64, ndim=2) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.ARGWHERE + self.library, CuPyNumericOpCode.ARGWHERE ) task.add_output(result.base) @@ -3446,8 +3466,16 @@ def compute_strides(shape: NdShape) -> tuple[int, ...]: return result @auto_convert("src") - def cholesky(self, src: Any, no_tril: bool = False) -> None: - cholesky_deferred(self, src, no_tril) + def cholesky(self, src: Any) -> None: + cholesky_deferred(self, src) + + @auto_convert("ew", "ev") + def eig(self, ew: Any, ev: Any) -> None: + eig_deferred(self, ew, ev) + + @auto_convert("ew") + def eigvals(self, ew: Any) -> None: + eig_deferred(self, ew) @auto_convert("q", "r") def qr(self, q: Any, r: Any) -> None: @@ -3489,7 +3517,7 @@ def scan( output = input task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.SCAN_LOCAL + self.library, CuPyNumericOpCode.SCAN_LOCAL ) p_out = task.add_output(output.base) p_in = task.add_input(input.base) @@ -3505,7 +3533,7 @@ def scan( # NOTE: Each node will do a sum up to its index, alternatively could # do one centralized scan and broadcast (slightly less redundant work) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.SCAN_GLOBAL + self.library, CuPyNumericOpCode.SCAN_GLOBAL ) task.add_input(output.base) p_temp = task.add_input(temp.base) @@ -3526,7 +3554,7 @@ def unique(self) -> NumPyThunk: result = runtime.create_unbound_thunk(self.base.type) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.UNIQUE + self.library, CuPyNumericOpCode.UNIQUE ) task.add_output(result.base) 
@@ -3539,7 +3567,7 @@ def unique(self) -> NumPyThunk: if runtime.num_gpus == 0 and runtime.num_procs > 1: result.base = legate_runtime.tree_reduce( - self.library, CuNumericOpCode.UNIQUE_REDUCE, result.base + self.library, CuPyNumericOpCode.UNIQUE_REDUCE, result.base ) return result @@ -3547,7 +3575,7 @@ def unique(self) -> NumPyThunk: @auto_convert("rhs", "v") def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.SEARCHSORTED + self.library, CuPyNumericOpCode.SEARCHSORTED ) is_left = side == "left" @@ -3587,7 +3615,7 @@ def sort( if order is not None: raise NotImplementedError( - "cuNumeric does not support sorting with 'order' as " + "cuPyNumeric does not support sorting with 'order' as " "ndarray only supports numeric values" ) if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): @@ -3607,7 +3635,7 @@ def partition( ) -> None: if order is not None: raise NotImplementedError( - "cuNumeric does not support partitioning with 'order' as " + "cuPyNumeric does not support partitioning with 'order' as " "ndarray only supports numeric values" ) if axis is not None and (axis >= rhs.ndim or axis < -rhs.ndim): @@ -3618,7 +3646,7 @@ def partition( def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None: task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.WINDOW + self.library, CuPyNumericOpCode.WINDOW ) task.add_output(self.base) task.add_scalar_arg(op_code, ty.int32) @@ -3631,7 +3659,7 @@ def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None: def packbits(self, src: Any, axis: int | None, bitorder: BitOrder) -> None: bitorder_code = getattr(Bitorder, bitorder.upper()) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.PACKBITS + self.library, CuPyNumericOpCode.PACKBITS ) p_out = task.declare_partition() p_in = task.declare_partition() @@ -3649,7 +3677,7 @@ def unpackbits( ) -> None: 
bitorder_code = getattr(Bitorder, bitorder.upper()) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.UNPACKBITS + self.library, CuPyNumericOpCode.UNPACKBITS ) p_out = task.declare_partition() p_in = task.declare_partition() @@ -3682,7 +3710,7 @@ def _wrap(self, src: Any, new_len: int) -> None: ) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.WRAP + self.library, CuPyNumericOpCode.WRAP ) task.add_output(indirect.base) task.add_scalar_arg(src.shape, (ty.int64,)) @@ -3710,7 +3738,7 @@ def histogram(self, src: Any, bins: Any, weights: Any) -> None: dst_array.fill(np.array(0, dst_array.dtype)) task = legate_runtime.create_auto_task( - self.library, CuNumericOpCode.HISTOGRAM + self.library, CuPyNumericOpCode.HISTOGRAM ) p_dst = task.add_reduction(dst_array.base, ReductionOpKind.ADD) p_src = task.add_input(src_array.base) diff --git a/cunumeric/_thunk/eager.py b/cupynumeric/_thunk/eager.py similarity index 95% rename from cunumeric/_thunk/eager.py rename to cupynumeric/_thunk/eager.py index 868fb97bf9..4eb86df694 100644 --- a/cunumeric/_thunk/eager.py +++ b/cupynumeric/_thunk/eager.py @@ -45,6 +45,7 @@ from ..config import BitGeneratorType, FFTType from ..types import ( BitOrder, + ConvolveMethod, ConvolveMode, NdShape, OrderType, @@ -336,17 +337,30 @@ def conj(self) -> NumPyThunk: return EagerArray(self.array.conj()) - def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None: + def convolve( + self, + input: Any, + filter: Any, + mode: ConvolveMode, + method: ConvolveMethod, + ) -> None: self.check_eager_args(input, filter) if self.deferred is not None: - self.deferred.convolve(input, filter, mode) + self.deferred.convolve(input, filter, mode, method) else: if self.ndim == 1: + if method != "auto": + runtime.warn( + f"the method {method} is ignored " + "for the 1D convolution" + ) self.array[:] = np.convolve(input.array, filter.array, mode) else: from scipy.signal import convolve # type: ignore [import] - 
self.array[...] = convolve(input.array, filter.array, mode) + self.array[...] = convolve( + input.array, filter.array, mode, method + ) def fft( self, @@ -1453,17 +1467,21 @@ def unary_op( func( rhs.array, out=self.array, - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where + if not isinstance(where, EagerArray) + else where.array + ), ) else: func( rhs.array, out=(self.array, *(out.array for out in multiout)), - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where + if not isinstance(where, EagerArray) + else where.array + ), ) elif op == UnaryOpCode.CLIP: np.clip( @@ -1535,9 +1553,9 @@ def unary_reduction( out=self.array, axis=orig_axis, keepdims=keepdims, - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where if not isinstance(where, EagerArray) else where.array + ), **kws, ) elif op == UnaryRedCode.SUM_SQUARES: @@ -1546,9 +1564,9 @@ def unary_reduction( squared, out=self.array, axis=orig_axis, - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where if not isinstance(where, EagerArray) else where.array + ), keepdims=keepdims, ) elif op == UnaryRedCode.VARIANCE: @@ -1558,9 +1576,9 @@ def unary_reduction( np.sum( squares, axis=orig_axis, - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where if not isinstance(where, EagerArray) else where.array + ), keepdims=keepdims, out=self.array, ) @@ -1605,9 +1623,9 @@ def binary_op( rhs1.array, rhs2.array, out=self.array, - where=where - if not isinstance(where, EagerArray) - else where.array, + where=( + where if not isinstance(where, EagerArray) else where.array + ), ) def binary_reduction( @@ -1661,10 +1679,10 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None: else: self.array[:] = np.triu(rhs.array, k) - def cholesky(self, src: Any, no_tril: bool) -> None: + def cholesky(self, src: Any) -> None: self.check_eager_args(src) if 
self.deferred is not None: - self.deferred.cholesky(src, no_tril) + self.deferred.cholesky(src) else: try: result = np.linalg.cholesky(src.array) @@ -1672,10 +1690,40 @@ def cholesky(self, src: Any, no_tril: bool) -> None: from ..linalg import LinAlgError raise LinAlgError(e) from e - if no_tril: - result = np.triu(result.T.conj(), k=1) + result + self.array[:] = result + def eig(self, ew: Any, ev: Any) -> None: + self.check_eager_args(ew, ev) + if self.deferred is not None and ( + runtime.num_gpus == 0 or runtime.cusolver_has_geev() + ): + self.deferred.eig(ew, ev) + else: + try: + result_ew, result_ev = np.linalg.eig(self.array) + except np.linalg.LinAlgError as e: + from ..linalg import LinAlgError + + raise LinAlgError(e) from e + ew.array[:] = result_ew + ev.array[:] = result_ev + + def eigvals(self, ew: Any) -> None: + self.check_eager_args(ew) + if self.deferred is not None and ( + runtime.num_gpus == 0 or runtime.cusolver_has_geev() + ): + self.deferred.eigvals(ew) + else: + try: + result_ew = np.linalg.eigvals(self.array) + except np.linalg.LinAlgError as e: + from ..linalg import LinAlgError + + raise LinAlgError(e) from e + ew.array[:] = result_ew + def qr(self, q: Any, r: Any) -> None: self.check_eager_args(q, r) if self.deferred is not None: diff --git a/cunumeric/_thunk/thunk.py b/cupynumeric/_thunk/thunk.py similarity index 78% rename from cunumeric/_thunk/thunk.py rename to cupynumeric/_thunk/thunk.py index 5dbe09264c..06619d7dc1 100644 --- a/cunumeric/_thunk/thunk.py +++ b/cupynumeric/_thunk/thunk.py @@ -36,6 +36,7 @@ ) from ..types import ( BitOrder, + ConvolveMethod, ConvolveMode, NdShape, OrderType, @@ -48,7 +49,7 @@ class NumPyThunk(ABC): """This is the base class for NumPy computations. It has methods for all the kinds of computations and operations that can be done - on cuNumeric ndarrays. + on cuPyNumeric ndarrays. 
:meta private: """ @@ -73,28 +74,28 @@ def size(self) -> int: # Abstract methods @abstractproperty - def shape(self) -> NdShape: - ... + def shape(self) -> NdShape: ... @abstractmethod - def __numpy_array__(self) -> npt.NDArray[Any]: - ... + def __numpy_array__(self) -> npt.NDArray[Any]: ... @abstractmethod - def imag(self) -> NumPyThunk: - ... + def imag(self) -> NumPyThunk: ... @abstractmethod - def real(self) -> NumPyThunk: - ... + def real(self) -> NumPyThunk: ... @abstractmethod - def conj(self) -> NumPyThunk: - ... + def conj(self) -> NumPyThunk: ... @abstractmethod - def convolve(self, input: Any, filter: Any, mode: ConvolveMode) -> None: - ... + def convolve( + self, + input: Any, + filter: Any, + mode: ConvolveMode, + method: ConvolveMethod, + ) -> None: ... @abstractmethod def fft( @@ -103,43 +104,34 @@ def fft( axes: Sequence[int], kind: FFTType, direction: FFTDirection, - ) -> None: - ... + ) -> None: ... @abstractmethod - def copy(self, rhs: Any, deep: bool) -> None: - ... + def copy(self, rhs: Any, deep: bool) -> None: ... @abstractmethod def repeat( self, repeats: Any, axis: int, scalar_repeats: bool - ) -> NumPyThunk: - ... + ) -> NumPyThunk: ... @property @abstractmethod - def scalar(self) -> bool: - ... + def scalar(self) -> bool: ... @abstractmethod - def get_item(self, key: Any) -> NumPyThunk: - ... + def get_item(self, key: Any) -> NumPyThunk: ... @abstractmethod - def set_item(self, key: Any, value: Any) -> None: - ... + def set_item(self, key: Any, value: Any) -> None: ... @abstractmethod - def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk: - ... + def reshape(self, newshape: NdShape, order: OrderType) -> NumPyThunk: ... @abstractmethod - def squeeze(self, axis: int | tuple[int, ...] | None) -> NumPyThunk: - ... + def squeeze(self, axis: int | tuple[int, ...] | None) -> NumPyThunk: ... @abstractmethod - def swapaxes(self, axis1: int, axis2: int) -> NumPyThunk: - ... 
+ def swapaxes(self, axis1: int, axis2: int) -> NumPyThunk: ... @abstractmethod def convert( @@ -148,20 +140,16 @@ def convert( warn: bool = True, nan_op: ConvertCode = ConvertCode.NOOP, temporary: bool = False, - ) -> None: - ... + ) -> None: ... @abstractmethod - def fill(self, value: Any) -> None: - ... + def fill(self, value: Any) -> None: ... @abstractmethod - def transpose(self, axes: tuple[int, ...] | list[int]) -> NumPyThunk: - ... + def transpose(self, axes: tuple[int, ...] | list[int]) -> NumPyThunk: ... @abstractmethod - def flip(self, rhs: Any, axes: int | tuple[int, ...] | None) -> None: - ... + def flip(self, rhs: Any, axes: int | tuple[int, ...] | None) -> None: ... @abstractmethod def contract( @@ -172,12 +160,10 @@ def contract( rhs2_thunk: Any, rhs2_modes: list[str], mode2extent: dict[str, int], - ) -> None: - ... + ) -> None: ... @abstractmethod - def choose(self, rhs: Any, *args: Any) -> None: - ... + def choose(self, rhs: Any, *args: Any) -> None: ... @abstractmethod def select( @@ -185,46 +171,38 @@ def select( condlist: Iterable[Any], choicelist: Iterable[Any], default: npt.NDArray[Any], - ) -> None: - ... + ) -> None: ... @abstractmethod def _diag_helper( self, rhs: Any, offset: int, naxes: int, extract: bool, trace: bool - ) -> None: - ... + ) -> None: ... @abstractmethod - def put(self, indices: Any, values: Any, check_bounds: bool) -> None: - ... + def put(self, indices: Any, values: Any, check_bounds: bool) -> None: ... @abstractmethod - def putmask(self, mask: Any, values: Any) -> None: - ... + def putmask(self, mask: Any, values: Any) -> None: ... @abstractmethod - def eye(self, k: int) -> None: - ... + def eye(self, k: int) -> None: ... @abstractmethod - def arange(self, start: float, stop: float, step: float) -> None: - ... + def arange(self, start: float, stop: float, step: float) -> None: ... @abstractmethod - def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None: - ... 
+ def tile(self, rhs: Any, reps: Any | Sequence[int]) -> None: ... @abstractmethod - def trilu(self, rhs: Any, k: int, lower: bool) -> None: - ... + def trilu(self, rhs: Any, k: int, lower: bool) -> None: ... @abstractmethod - def bincount(self, rhs: Any, weights: NumPyThunk | None = None) -> None: - ... + def bincount( + self, rhs: Any, weights: NumPyThunk | None = None + ) -> None: ... @abstractmethod - def nonzero(self) -> tuple[NumPyThunk, ...]: - ... + def nonzero(self) -> tuple[NumPyThunk, ...]: ... @abstractmethod def bitgenerator_random_raw( @@ -233,8 +211,7 @@ def bitgenerator_random_raw( generatorType: BitGeneratorType, seed: int | None, flags: int, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_integers( @@ -245,8 +222,7 @@ def bitgenerator_integers( flags: int, low: int, high: int, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_uniform( @@ -257,8 +233,7 @@ def bitgenerator_uniform( flags: int, low: float, high: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_lognormal( @@ -269,8 +244,7 @@ def bitgenerator_lognormal( flags: int, mean: float, sigma: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_normal( @@ -281,8 +255,7 @@ def bitgenerator_normal( flags: int, mean: float, sigma: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_poisson( @@ -292,8 +265,7 @@ def bitgenerator_poisson( seed: int | None, flags: int, lam: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_exponential( @@ -303,8 +275,7 @@ def bitgenerator_exponential( seed: int | None, flags: int, scale: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_gumbel( @@ -315,8 +286,7 @@ def bitgenerator_gumbel( flags: int, mu: float, beta: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_laplace( @@ -327,8 +297,7 @@ def bitgenerator_laplace( flags: int, mu: float, beta: float, - ) -> None: - ... 
+ ) -> None: ... @abstractmethod def bitgenerator_logistic( @@ -339,8 +308,7 @@ def bitgenerator_logistic( flags: int, mu: float, beta: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_pareto( @@ -350,8 +318,7 @@ def bitgenerator_pareto( seed: int | None, flags: int, alpha: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_power( @@ -361,8 +328,7 @@ def bitgenerator_power( seed: int | None, flags: int, alpha: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_rayleigh( @@ -372,8 +338,7 @@ def bitgenerator_rayleigh( seed: int | None, flags: int, sigma: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_cauchy( @@ -384,8 +349,7 @@ def bitgenerator_cauchy( flags: int, x0: float, gamma: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_triangular( @@ -397,8 +361,7 @@ def bitgenerator_triangular( a: float, b: float, c: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_weibull( @@ -409,8 +372,7 @@ def bitgenerator_weibull( flags: int, lam: float, k: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_bytes( @@ -419,8 +381,7 @@ def bitgenerator_bytes( generatorType: BitGeneratorType, seed: int | None, flags: int, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_beta( @@ -431,8 +392,7 @@ def bitgenerator_beta( flags: int, a: float, b: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_f( @@ -443,8 +403,7 @@ def bitgenerator_f( flags: int, dfnum: float, dfden: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_logseries( @@ -454,8 +413,7 @@ def bitgenerator_logseries( seed: int | None, flags: int, p: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_noncentral_f( @@ -467,8 +425,7 @@ def bitgenerator_noncentral_f( dfnum: float, dfden: float, nonc: float, - ) -> None: - ... + ) -> None: ... 
@abstractmethod def bitgenerator_chisquare( @@ -479,8 +436,7 @@ def bitgenerator_chisquare( flags: int, df: float, nonc: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_gamma( @@ -491,8 +447,7 @@ def bitgenerator_gamma( flags: int, k: float, theta: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_standard_t( @@ -502,8 +457,7 @@ def bitgenerator_standard_t( seed: int | None, flags: int, df: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_hypergeometric( @@ -515,8 +469,7 @@ def bitgenerator_hypergeometric( ngood: int, nbad: int, nsample: int, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_vonmises( @@ -527,8 +480,7 @@ def bitgenerator_vonmises( flags: int, mu: float, kappa: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_zipf( @@ -538,8 +490,7 @@ def bitgenerator_zipf( seed: int | None, flags: int, alpha: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_geometric( @@ -549,8 +500,7 @@ def bitgenerator_geometric( seed: int | None, flags: int, p: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_wald( @@ -561,8 +511,7 @@ def bitgenerator_wald( flags: int, mean: float, scale: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_binomial( @@ -573,8 +522,7 @@ def bitgenerator_binomial( flags: int, ntrials: int, p: float, - ) -> None: - ... + ) -> None: ... @abstractmethod def bitgenerator_negative_binomial( @@ -585,12 +533,10 @@ def bitgenerator_negative_binomial( flags: int, ntrials: int, p: float, - ) -> None: - ... + ) -> None: ... @abstractmethod - def random_uniform(self) -> None: - ... + def random_uniform(self) -> None: ... @abstractmethod def partition( @@ -601,24 +547,22 @@ def partition( axis: int | None = -1, kind: SelectKind = "introselect", order: str | list[str] | None = None, - ) -> None: - ... + ) -> None: ... 
@abstractmethod - def random_normal(self) -> None: - ... + def random_normal(self) -> None: ... @abstractmethod def random_integer( self, low: int | npt.NDArray[Any], high: int | npt.NDArray[Any], - ) -> None: - ... + ) -> None: ... @abstractmethod - def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None: - ... + def searchsorted( + self, rhs: Any, v: Any, side: SortSide = "left" + ) -> None: ... @abstractmethod def sort( @@ -628,8 +572,7 @@ def sort( axis: int | None = -1, kind: SortType = "quicksort", order: str | list[str] | None = None, - ) -> None: - ... + ) -> None: ... @abstractmethod def unary_op( @@ -639,8 +582,7 @@ def unary_op( where: Any, args: tuple[Scalar, ...] = (), multiout: Any | None = None, - ) -> None: - ... + ) -> None: ... @abstractmethod def unary_reduction( @@ -653,14 +595,12 @@ def unary_reduction( keepdims: bool, args: tuple[Scalar, ...], initial: Any, - ) -> None: - ... + ) -> None: ... @abstractmethod def isclose( self, rhs1: Any, rhs2: Any, rtol: float, atol: float, equal_nan: bool - ) -> None: - ... + ) -> None: ... @abstractmethod def binary_op( @@ -670,8 +610,7 @@ def binary_op( rhs2: Any, where: Any, args: tuple[Scalar, ...], - ) -> None: - ... + ) -> None: ... @abstractmethod def binary_reduction( @@ -681,36 +620,34 @@ def binary_reduction( rhs2: Any, broadcast: NdShape | None, args: tuple[Scalar, ...], - ) -> None: - ... + ) -> None: ... + + @abstractmethod + def broadcast_to(self, shape: NdShape) -> NumPyThunk: ... @abstractmethod - def broadcast_to(self, shape: NdShape) -> NumPyThunk: - ... + def argwhere(self) -> NumPyThunk: ... @abstractmethod - def argwhere(self) -> NumPyThunk: - ... + def where(self, rhs1: Any, rhs2: Any, rhs3: Any) -> None: ... @abstractmethod - def where(self, rhs1: Any, rhs2: Any, rhs3: Any) -> None: - ... + def cholesky(self, src: Any) -> None: ... @abstractmethod - def cholesky(self, src: Any, no_tril: bool) -> None: - ... + def eig(self, ew: Any, ev: Any) -> None: ... 
@abstractmethod - def qr(self, q: Any, r: Any) -> None: - ... + def eigvals(self, ew: Any) -> None: ... @abstractmethod - def solve(self, a: Any, b: Any) -> None: - ... + def qr(self, q: Any, r: Any) -> None: ... @abstractmethod - def svd(self, u: Any, s: Any, vh: Any) -> None: - ... + def solve(self, a: Any, b: Any) -> None: ... + + @abstractmethod + def svd(self, u: Any, s: Any, vh: Any) -> None: ... @abstractmethod def scan( @@ -720,39 +657,35 @@ def scan( axis: int, dtype: npt.DTypeLike | None, nan_to_identity: bool, - ) -> None: - ... + ) -> None: ... @abstractmethod - def unique(self) -> NumPyThunk: - ... + def unique(self) -> NumPyThunk: ... @abstractmethod - def create_window(self, op_code: WindowOpCode, M: Any, *args: Any) -> None: - ... + def create_window( + self, op_code: WindowOpCode, M: Any, *args: Any + ) -> None: ... @abstractmethod - def packbits(self, src: Any, axis: int | None, bitorder: BitOrder) -> None: - ... + def packbits( + self, src: Any, axis: int | None, bitorder: BitOrder + ) -> None: ... @abstractmethod def unpackbits( self, src: Any, axis: int | None, bitorder: BitOrder - ) -> None: - ... + ) -> None: ... @abstractmethod - def _wrap(self, src: Any, new_len: int) -> None: - ... + def _wrap(self, src: Any, new_len: int) -> None: ... @abstractmethod - def histogram(self, src: Any, bins: Any, weights: Any) -> None: - ... + def histogram(self, src: Any, bins: Any, weights: Any) -> None: ... @abstractmethod def stencil_hint( self, low_offsets: tuple[int, ...], high_offsets: tuple[int, ...], - ) -> None: - ... + ) -> None: ... 
diff --git a/cunumeric/_ufunc/__init__.py b/cupynumeric/_ufunc/__init__.py similarity index 100% rename from cunumeric/_ufunc/__init__.py rename to cupynumeric/_ufunc/__init__.py diff --git a/cunumeric/_ufunc/bit_twiddling.py b/cupynumeric/_ufunc/bit_twiddling.py similarity index 100% rename from cunumeric/_ufunc/bit_twiddling.py rename to cupynumeric/_ufunc/bit_twiddling.py diff --git a/cunumeric/_ufunc/comparison.py b/cupynumeric/_ufunc/comparison.py similarity index 97% rename from cunumeric/_ufunc/comparison.py rename to cupynumeric/_ufunc/comparison.py index 089aa7f0fe..148854fad0 100644 --- a/cunumeric/_ufunc/comparison.py +++ b/cupynumeric/_ufunc/comparison.py @@ -18,7 +18,7 @@ import numpy as np -from .._array.util import convert_to_cunumeric_ndarray +from .._array.util import convert_to_cupynumeric_ndarray from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode from .ufunc import ( all_dtypes, @@ -74,7 +74,7 @@ def _post_resolution_check( if truthiness is not None: # Replace with an always-true/always-false operation - arr_x = convert_to_cunumeric_ndarray( + arr_x = convert_to_cupynumeric_ndarray( np.array(iinfo.min, dtype=arr_x.dtype) ) op_code = ( @@ -98,7 +98,7 @@ def _post_resolution_check( if truthiness is not None: # Replace with an always-true/always-false operation - arr_y = convert_to_cunumeric_ndarray( + arr_y = convert_to_cupynumeric_ndarray( np.array(iinfo.min, dtype=arr_y.dtype) ) op_code = ( diff --git a/cunumeric/_ufunc/floating.py b/cupynumeric/_ufunc/floating.py similarity index 100% rename from cunumeric/_ufunc/floating.py rename to cupynumeric/_ufunc/floating.py diff --git a/cunumeric/_ufunc/math.py b/cupynumeric/_ufunc/math.py similarity index 100% rename from cunumeric/_ufunc/math.py rename to cupynumeric/_ufunc/math.py diff --git a/cunumeric/_ufunc/trigonometric.py b/cupynumeric/_ufunc/trigonometric.py similarity index 100% rename from cunumeric/_ufunc/trigonometric.py rename to cupynumeric/_ufunc/trigonometric.py diff --git 
a/cunumeric/_ufunc/ufunc.py b/cupynumeric/_ufunc/ufunc.py similarity index 97% rename from cunumeric/_ufunc/ufunc.py rename to cupynumeric/_ufunc/ufunc.py index 74b4f8badf..6eb42a3221 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cupynumeric/_ufunc/ufunc.py @@ -19,11 +19,13 @@ import numpy as np from legate.core.utils import OrderedSet +from cupynumeric._utils import is_np2_1 + from .._array.thunk import perform_unary_reduction from .._array.util import ( add_boilerplate, check_writeable, - convert_to_cunumeric_ndarray, + convert_to_cupynumeric_ndarray, ) from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode from ..types import NdShape @@ -79,7 +81,7 @@ numpy.{} Availability --------- +------------ Multiple GPUs, Multiple CPUs """ @@ -117,7 +119,7 @@ numpy.{} Availability --------- +------------ Multiple GPUs, Multiple CPUs """ @@ -155,7 +157,7 @@ numpy.{} Availability --------- +------------ Multiple GPUs, Multiple CPUs """ @@ -322,7 +324,7 @@ def _maybe_cast_output(out: ndarray | None, result: ndarray) -> ndarray: return out @staticmethod - def _maybe_convert_output_to_cunumeric_ndarray( + def _maybe_convert_output_to_cupynumeric_ndarray( out: ndarray | npt.NDArray[Any] | None, ) -> ndarray | None: from .._array.array import ndarray @@ -332,7 +334,7 @@ def _maybe_convert_output_to_cunumeric_ndarray( if isinstance(out, ndarray): return out if isinstance(out, np.ndarray): - return convert_to_cunumeric_ndarray(out, share=True) + return convert_to_cupynumeric_ndarray(out, share=True) raise TypeError("return arrays must be of ArrayType") def _prepare_operands( @@ -354,7 +356,7 @@ def _prepare_operands( ) inputs = tuple( - convert_to_cunumeric_ndarray(arr) for arr in args[: self.nin] + convert_to_cupynumeric_ndarray(arr) for arr in args[: self.nin] ) if len(args) > self.nin: @@ -374,7 +376,7 @@ def _prepare_operands( computed_out = out outputs = tuple( - self._maybe_convert_output_to_cunumeric_ndarray(arr) + self._maybe_convert_output_to_cupynumeric_ndarray(arr) 
for arr in computed_out ) @@ -486,6 +488,14 @@ def __call__( precision_fixed = True x = self._maybe_cast_input(x, dtype, casting) + if ( + self._name in {"ceil", "floor", "trunc"} + and is_np2_1 + and np.issubdtype(x.dtype, np.integer) + ): + result = x + return self._maybe_cast_output(out, result) + # Resolve the dtype to use for the computation and cast the input # if necessary. If the dtype is already fixed by the caller, # the dtype must be one of the dtypes supported by this operation. @@ -666,9 +676,11 @@ def _resolve_dtype( else: to_dtypes = tuple(arr.dtype for arr in arrs) key = tuple( - arr.dtype.char - if type(orig) not in (int, float, complex) - else type(orig) + ( + arr.dtype.char + if type(orig) not in (int, float, complex) + else type(orig) + ) for orig, arr in zip(orig_args, arrs) ) # When all inputs are scalars, cannot use weak logic below. diff --git a/cunumeric/_utils/__init__.py b/cupynumeric/_utils/__init__.py similarity index 92% rename from cunumeric/_utils/__init__.py rename to cupynumeric/_utils/__init__.py index 626ef7aae5..d292c29016 100644 --- a/cunumeric/_utils/__init__.py +++ b/cupynumeric/_utils/__init__.py @@ -17,3 +17,4 @@ import numpy as np is_np2 = np.lib.NumpyVersion(np.__version__) >= "2.0.0b1" +is_np2_1 = np.lib.NumpyVersion(np.__version__) >= "2.1.0b1" diff --git a/cunumeric/_utils/array.py b/cupynumeric/_utils/array.py similarity index 71% rename from cunumeric/_utils/array.py rename to cupynumeric/_utils/array.py index 6e35735d30..5ad037e39b 100644 --- a/cunumeric/_utils/array.py +++ b/cupynumeric/_utils/array.py @@ -15,13 +15,17 @@ from __future__ import annotations from functools import reduce -from typing import Any +from typing import TYPE_CHECKING, Any import legate.core.types as ty import numpy as np +from legate.core import PhysicalArray, StoreTarget from ..types import NdShape +if TYPE_CHECKING: + from legate.core import PhysicalStore + SUPPORTED_DTYPES = { np.dtype(bool): ty.bool_, np.dtype(np.int8): ty.int8, @@ 
-42,7 +46,7 @@ def is_supported_dtype(dtype: str | np.dtype[Any]) -> bool: """ - Whether a NumPy dtype is supported by cuNumeric + Whether a NumPy dtype is supported by cuPyNumeric Parameters ---------- @@ -60,7 +64,7 @@ def is_supported_dtype(dtype: str | np.dtype[Any]) -> bool: def to_core_type(dtype: str | np.dtype[Any]) -> ty.Type: core_dtype = SUPPORTED_DTYPES.get(np.dtype(dtype)) if core_dtype is None: - raise TypeError(f"cuNumeric does not support dtype={dtype}") + raise TypeError(f"cuPyNumeric does not support dtype={dtype}") return core_dtype @@ -111,3 +115,32 @@ def min_identity( return True else: raise ValueError(f"Unsupported dtype: {ty}") + + +def local_task_array(obj: PhysicalArray | PhysicalStore) -> Any: + """ + Generate an appropriate local-memory ndarray object, that is backed by the + portion of a Legate array or store that was passed to a task. + + Parameters + ---------- + obj : PhysicalArray | PhysicalStore + A Legate physical array or store to adapt. + + Returns + ------- + arr : cupy.ndarray or np.ndarray + If the array or store is located on GPU, then this function will return + a CuPy array. Otherwise, a NumPy array is returned. 
+ + """ + store = obj.data() if isinstance(obj, PhysicalArray) else obj + + if store.target in {StoreTarget.FBMEM, StoreTarget.ZCMEM}: + # cupy is only a dependency for GPU packages -- but we should + # only hit this import in case the store is located on a GPU + import cupy # type: ignore [import-untyped,import-not-found] + + return cupy.asarray(store) + else: + return np.asarray(store) diff --git a/cunumeric/_utils/coverage.py b/cupynumeric/_utils/coverage.py similarity index 88% rename from cunumeric/_utils/coverage.py rename to cupynumeric/_utils/coverage.py index 3b87bb89f6..0a05f82360 100644 --- a/cunumeric/_utils/coverage.py +++ b/cupynumeric/_utils/coverage.py @@ -17,13 +17,7 @@ import warnings from dataclasses import dataclass from functools import WRAPPER_ASSIGNMENTS, wraps -from types import ( - BuiltinFunctionType, - FunctionType, - MethodDescriptorType, - MethodType, - ModuleType, -) +from types import BuiltinFunctionType, ModuleType from typing import Any, Callable, Container, Iterable, Mapping, Protocol, cast from legate.core import track_provenance @@ -37,7 +31,7 @@ __all__ = ("clone_module", "clone_class") FALLBACK_WARNING = ( - "cuNumeric has not implemented {what} " + "cuPyNumeric has not implemented {what} " + "and is falling back to canonical NumPy. " + "You may notice significantly decreased performance " + "for this function call." @@ -63,8 +57,7 @@ def filter_namespace( class AnyCallable(Protocol): - def __call__(self, *args: Any, **kwargs: Any) -> Any: - ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... 
@dataclass(frozen=True) @@ -75,7 +68,7 @@ class CuWrapperMetadata: class CuWrapped(AnyCallable, Protocol): - _cunumeric: CuWrapperMetadata + _cupynumeric_metadata: CuWrapperMetadata __wrapped__: AnyCallable __name__: str __qualname__: str @@ -122,7 +115,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: multi = "Multiple GPUs" in (getattr(func, "__doc__", None) or "") single = "Single GPU" in (getattr(func, "__doc__", None) or "") or multi - wrapper._cunumeric = CuWrapperMetadata( + wrapper._cupynumeric_metadata = CuWrapperMetadata( implemented=True, single=single, multi=multi ) @@ -147,7 +140,7 @@ def unimplemented( # all array-like arguments to `numpy.ndarray` through `__array__()` (taking # some care to skip the `__array_function__` dispatch logic, to avoid # infinite loops). However, it appears that this behavior is inconsistent - # in NumPy, so we will instead convert any `cunumeric.ndarray`s manually + # in NumPy, so we will instead convert any `cupynumeric.ndarray`s manually # before calling into NumPy. wrapper: CuWrapped @@ -185,13 +178,13 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) wrapper.__doc__ = f""" - cuNumeric has not implemented this function, and will fall back to NumPy. + cuPyNumeric has not implemented this function, and will fall back to NumPy. 
See Also -------- {name} """ - wrapper._cunumeric = CuWrapperMetadata(implemented=False) + wrapper._cupynumeric_metadata = CuWrapperMetadata(implemented=False) return wrapper @@ -248,7 +241,7 @@ def clone_module( # Only need to wrap things that are in the origin module to begin with if attr not in origin_module.__dict__: continue - if isinstance(value, (FunctionType, lgufunc)) or ( + if should_wrap(value) or ( include_builtin_function_type and isinstance(value, BuiltinFunctionType) ): @@ -279,7 +272,7 @@ def clone_module( from numpy import ufunc as npufunc for attr, value in missing.items(): - if isinstance(value, (FunctionType, npufunc)) or ( + if should_wrap(value) or ( include_builtin_function_type and isinstance(value, BuiltinFunctionType) ): @@ -306,7 +299,19 @@ def clone_module( def should_wrap(obj: object) -> bool: - return isinstance(obj, (FunctionType, MethodType, MethodDescriptorType)) + from numpy import ufunc as npufunc + + from .._ufunc.ufunc import ufunc as lgufunc + + # Custom callables, e.g. cython functions used in np2, do not inherit + # anything, so we check callable() instead (and include the __get__/__set__ + # checks to filter out classes). OTOH ufuncs need to be checked specially + # because they do not have __get__. 
+ return ( + callable(obj) + and hasattr(obj, "__get__") + and not hasattr(obj, "__set__") + ) or isinstance(obj, (lgufunc, npufunc)) def clone_class( @@ -363,13 +368,17 @@ def _clone_class(cls: type) -> type: return _clone_class +def is_wrapped(obj: Any) -> bool: + return hasattr(obj, "_cupynumeric_metadata") + + def is_implemented(obj: Any) -> bool: - return hasattr(obj, "_cunumeric") and obj._cunumeric.implemented + return is_wrapped(obj) and obj._cupynumeric_metadata.implemented def is_single(obj: Any) -> bool: - return hasattr(obj, "_cunumeric") and obj._cunumeric.single + return is_wrapped(obj) and obj._cupynumeric_metadata.single def is_multi(obj: Any) -> bool: - return hasattr(obj, "_cunumeric") and obj._cunumeric.multi + return is_wrapped(obj) and obj._cupynumeric_metadata.multi diff --git a/cunumeric/_utils/linalg.py b/cupynumeric/_utils/linalg.py similarity index 100% rename from cunumeric/_utils/linalg.py rename to cupynumeric/_utils/linalg.py diff --git a/cunumeric/_utils/stack.py b/cupynumeric/_utils/stack.py similarity index 91% rename from cunumeric/_utils/stack.py rename to cupynumeric/_utils/stack.py index 470cf77750..f5e714a3c6 100644 --- a/cunumeric/_utils/stack.py +++ b/cupynumeric/_utils/stack.py @@ -21,7 +21,7 @@ def find_last_user_stacklevel() -> int: stacklevel = 1 for frame, _ in traceback.walk_stack(None): - if not frame.f_globals["__name__"].startswith("cunumeric"): + if not frame.f_globals["__name__"].startswith("cupynumeric"): break stacklevel += 1 return stacklevel @@ -36,7 +36,7 @@ def find_last_user_frames(top_only: bool = True) -> str: if "__name__" not in last.f_globals: continue name = last.f_globals["__name__"] - if not any(name.startswith(pkg) for pkg in ("cunumeric", "legate")): + if not any(name.startswith(pkg) for pkg in ("cupynumeric", "legate")): break if top_only: diff --git a/cunumeric/_utils/structure.py b/cupynumeric/_utils/structure.py similarity index 100% rename from cunumeric/_utils/structure.py rename to 
cupynumeric/_utils/structure.py diff --git a/cunumeric/_version.py b/cupynumeric/_version.py similarity index 99% rename from cunumeric/_version.py rename to cupynumeric/_version.py index 7c006fdc15..9d05050897 100644 --- a/cunumeric/_version.py +++ b/cupynumeric/_version.py @@ -43,8 +43,8 @@ def get_config(): cfg.VCS = "git" cfg.style = "pep440" cfg.tag_prefix = "v" - cfg.parentdir_prefix = "cunumeric-" - cfg.versionfile_source = "cunumeric/_version.py" + cfg.parentdir_prefix = "cupynumeric-" + cfg.versionfile_source = "cupynumeric/_version.py" cfg.verbose = False return cfg diff --git a/cupynumeric/config.py b/cupynumeric/config.py new file mode 100644 index 0000000000..c7a351d8f5 --- /dev/null +++ b/cupynumeric/config.py @@ -0,0 +1,842 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import os +import platform +from abc import abstractmethod +from ctypes import CDLL, RTLD_GLOBAL +from enum import IntEnum, unique +from typing import TYPE_CHECKING, Any, cast + +import cffi # type: ignore +import numpy as np + +if TYPE_CHECKING: + import numpy.typing as npt + + +class _ReductionOpIds: + argmax_redop_id: int + argmin_redop_id: int + + +class _CupynumericSharedLib: + CUPYNUMERIC_ADVANCED_INDEXING: int + CUPYNUMERIC_ARANGE: int + CUPYNUMERIC_ARGWHERE: int + CUPYNUMERIC_BATCHED_CHOLESKY: int + CUPYNUMERIC_BINARY_OP: int + CUPYNUMERIC_BINARY_RED: int + CUPYNUMERIC_BINCOUNT: int + CUPYNUMERIC_BINOP_ADD: int + CUPYNUMERIC_BINOP_ARCTAN2: int + CUPYNUMERIC_BINOP_BITWISE_AND: int + CUPYNUMERIC_BINOP_BITWISE_OR: int + CUPYNUMERIC_BINOP_BITWISE_XOR: int + CUPYNUMERIC_BINOP_COPYSIGN: int + CUPYNUMERIC_BINOP_DIVIDE: int + CUPYNUMERIC_BINOP_EQUAL: int + CUPYNUMERIC_BINOP_FLOAT_POWER: int + CUPYNUMERIC_BINOP_FLOOR_DIVIDE: int + CUPYNUMERIC_BINOP_FMOD: int + CUPYNUMERIC_BINOP_GCD: int + CUPYNUMERIC_BINOP_GREATER: int + CUPYNUMERIC_BINOP_GREATER_EQUAL: int + CUPYNUMERIC_BINOP_HYPOT: int + CUPYNUMERIC_BINOP_ISCLOSE: int + CUPYNUMERIC_BINOP_LCM: int + CUPYNUMERIC_BINOP_LDEXP: int + CUPYNUMERIC_BINOP_LEFT_SHIFT: int + CUPYNUMERIC_BINOP_LESS: int + CUPYNUMERIC_BINOP_LESS_EQUAL: int + CUPYNUMERIC_BINOP_LOGADDEXP2: int + CUPYNUMERIC_BINOP_LOGADDEXP: int + CUPYNUMERIC_BINOP_LOGICAL_AND: int + CUPYNUMERIC_BINOP_LOGICAL_OR: int + CUPYNUMERIC_BINOP_LOGICAL_XOR: int + CUPYNUMERIC_BINOP_MAXIMUM: int + CUPYNUMERIC_BINOP_MINIMUM: int + CUPYNUMERIC_BINOP_MOD: int + CUPYNUMERIC_BINOP_MULTIPLY: int + CUPYNUMERIC_BINOP_NEXTAFTER: int + CUPYNUMERIC_BINOP_NOT_EQUAL: int + CUPYNUMERIC_BINOP_POWER: int + CUPYNUMERIC_BINOP_RIGHT_SHIFT: int + CUPYNUMERIC_BINOP_SUBTRACT: int + CUPYNUMERIC_BITGENERATOR: int + CUPYNUMERIC_BITGENOP_DISTRIBUTION: int + CUPYNUMERIC_BITGENTYPE_DEFAULT: int + CUPYNUMERIC_BITGENTYPE_XORWOW: int + 
CUPYNUMERIC_BITGENTYPE_MRG32K3A: int + CUPYNUMERIC_BITGENTYPE_MTGP32: int + CUPYNUMERIC_BITGENTYPE_MT19937: int + CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10: int + CUPYNUMERIC_BITGENDIST_INTEGERS_16: int + CUPYNUMERIC_BITGENDIST_INTEGERS_32: int + CUPYNUMERIC_BITGENDIST_INTEGERS_64: int + CUPYNUMERIC_BITGENDIST_UNIFORM_32: int + CUPYNUMERIC_BITGENDIST_UNIFORM_64: int + CUPYNUMERIC_BITGENDIST_LOGNORMAL_32: int + CUPYNUMERIC_BITGENDIST_LOGNORMAL_64: int + CUPYNUMERIC_BITGENDIST_NORMAL_32: int + CUPYNUMERIC_BITGENDIST_NORMAL_64: int + CUPYNUMERIC_BITGENDIST_POISSON: int + CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32: int + CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64: int + CUPYNUMERIC_BITGENDIST_GUMBEL_32: int + CUPYNUMERIC_BITGENDIST_GUMBEL_64: int + CUPYNUMERIC_BITGENDIST_LAPLACE_32: int + CUPYNUMERIC_BITGENDIST_LAPLACE_64: int + CUPYNUMERIC_BITGENDIST_LOGISTIC_32: int + CUPYNUMERIC_BITGENDIST_LOGISTIC_64: int + CUPYNUMERIC_BITGENDIST_PARETO_32: int + CUPYNUMERIC_BITGENDIST_PARETO_64: int + CUPYNUMERIC_BITGENDIST_POWER_32: int + CUPYNUMERIC_BITGENDIST_POWER_64: int + CUPYNUMERIC_BITGENDIST_RAYLEIGH_32: int + CUPYNUMERIC_BITGENDIST_RAYLEIGH_64: int + CUPYNUMERIC_BITGENDIST_CAUCHY_32: int + CUPYNUMERIC_BITGENDIST_CAUCHY_64: int + CUPYNUMERIC_BITGENDIST_TRIANGULAR_32: int + CUPYNUMERIC_BITGENDIST_TRIANGULAR_64: int + CUPYNUMERIC_BITGENDIST_WEIBULL_32: int + CUPYNUMERIC_BITGENDIST_WEIBULL_64: int + CUPYNUMERIC_BITGENDIST_BYTES: int + CUPYNUMERIC_BITGENDIST_BETA_32: int + CUPYNUMERIC_BITGENDIST_BETA_64: int + CUPYNUMERIC_BITGENDIST_F_32: int + CUPYNUMERIC_BITGENDIST_F_64: int + CUPYNUMERIC_BITGENDIST_LOGSERIES: int + CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32: int + CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64: int + CUPYNUMERIC_BITGENDIST_CHISQUARE_32: int + CUPYNUMERIC_BITGENDIST_CHISQUARE_64: int + CUPYNUMERIC_BITGENDIST_GAMMA_32: int + CUPYNUMERIC_BITGENDIST_GAMMA_64: int + CUPYNUMERIC_BITGENDIST_STANDARD_T_32: int + CUPYNUMERIC_BITGENDIST_STANDARD_T_64: int + 
CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC: int + CUPYNUMERIC_BITGENDIST_VONMISES_32: int + CUPYNUMERIC_BITGENDIST_VONMISES_64: int + CUPYNUMERIC_BITGENDIST_ZIPF: int + CUPYNUMERIC_BITGENDIST_GEOMETRIC: int + CUPYNUMERIC_BITGENDIST_WALD_32: int + CUPYNUMERIC_BITGENDIST_WALD_64: int + CUPYNUMERIC_BITGENDIST_BINOMIAL: int + CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL: int + CUPYNUMERIC_BITGENOP_CREATE: int + CUPYNUMERIC_BITGENOP_DESTROY: int + CUPYNUMERIC_BITGENOP_RAND_RAW: int + CUPYNUMERIC_BITORDER_BIG: int + CUPYNUMERIC_BITORDER_LITTLE: int + CUPYNUMERIC_CHOOSE: int + CUPYNUMERIC_CONTRACT: int + CUPYNUMERIC_CONVERT: int + CUPYNUMERIC_CONVERT_NAN_NOOP: int + CUPYNUMERIC_CONVERT_NAN_PROD: int + CUPYNUMERIC_CONVERT_NAN_SUM: int + CUPYNUMERIC_CONVOLVE: int + CUPYNUMERIC_CONVOLVE_AUTO: int + CUPYNUMERIC_CONVOLVE_DIRECT: int + CUPYNUMERIC_CONVOLVE_FFT: int + CUPYNUMERIC_DIAG: int + CUPYNUMERIC_DOT: int + CUPYNUMERIC_EYE: int + CUPYNUMERIC_FFT: int + CUPYNUMERIC_FFT_C2C: int + CUPYNUMERIC_FFT_C2R: int + CUPYNUMERIC_FFT_D2Z: int + CUPYNUMERIC_FFT_FORWARD: int + CUPYNUMERIC_FFT_INVERSE: int + CUPYNUMERIC_FFT_R2C: int + CUPYNUMERIC_FFT_Z2D: int + CUPYNUMERIC_FFT_Z2Z: int + CUPYNUMERIC_FILL: int + CUPYNUMERIC_FLIP: int + CUPYNUMERIC_GEEV: int + CUPYNUMERIC_GEMM: int + CUPYNUMERIC_HISTOGRAM: int + CUPYNUMERIC_LOAD_CUDALIBS: int + CUPYNUMERIC_MATMUL: int + CUPYNUMERIC_MATVECMUL: int + CUPYNUMERIC_MAX_MAPPERS: int + CUPYNUMERIC_MAX_REDOPS: int + CUPYNUMERIC_MAX_TASKS: int + CUPYNUMERIC_MP_POTRF: int + CUPYNUMERIC_MP_SOLVE: int + CUPYNUMERIC_NONZERO: int + CUPYNUMERIC_PACKBITS: int + CUPYNUMERIC_POTRF: int + CUPYNUMERIC_PUTMASK: int + CUPYNUMERIC_QR: int + CUPYNUMERIC_RAND: int + CUPYNUMERIC_READ: int + CUPYNUMERIC_RED_ALL: int + CUPYNUMERIC_RED_ANY: int + CUPYNUMERIC_RED_ARGMAX: int + CUPYNUMERIC_RED_ARGMIN: int + CUPYNUMERIC_RED_CONTAINS: int + CUPYNUMERIC_RED_COUNT_NONZERO: int + CUPYNUMERIC_RED_MAX: int + CUPYNUMERIC_RED_MIN: int + CUPYNUMERIC_RED_NANARGMAX: int + 
CUPYNUMERIC_RED_NANARGMIN: int + CUPYNUMERIC_RED_NANMAX: int + CUPYNUMERIC_RED_NANMIN: int + CUPYNUMERIC_RED_NANPROD: int + CUPYNUMERIC_RED_NANSUM: int + CUPYNUMERIC_RED_PROD: int + CUPYNUMERIC_RED_SUM: int + CUPYNUMERIC_RED_SUM_SQUARES: int + CUPYNUMERIC_RED_VARIANCE: int + CUPYNUMERIC_REPEAT: int + CUPYNUMERIC_SCALAR_UNARY_RED: int + CUPYNUMERIC_SCAN_GLOBAL: int + CUPYNUMERIC_SCAN_LOCAL: int + CUPYNUMERIC_SCAN_PROD: int + CUPYNUMERIC_SCAN_SUM: int + CUPYNUMERIC_SEARCHSORTED: int + CUPYNUMERIC_SELECT: int + CUPYNUMERIC_SOLVE: int + CUPYNUMERIC_SORT: int + CUPYNUMERIC_SVD: int + CUPYNUMERIC_SYRK: int + CUPYNUMERIC_TILE: int + CUPYNUMERIC_TRANSPOSE_COPY_2D: int + CUPYNUMERIC_TRILU: int + CUPYNUMERIC_TRSM: int + CUPYNUMERIC_UNARY_OP: int + CUPYNUMERIC_UNARY_RED: int + CUPYNUMERIC_UNIQUE: int + CUPYNUMERIC_UNIQUE_REDUCE: int + CUPYNUMERIC_UNLOAD_CUDALIBS: int + CUPYNUMERIC_UNPACKBITS: int + CUPYNUMERIC_UOP_ABSOLUTE: int + CUPYNUMERIC_UOP_ANGLE: int + CUPYNUMERIC_UOP_ARCCOS: int + CUPYNUMERIC_UOP_ARCCOSH: int + CUPYNUMERIC_UOP_ARCSIN: int + CUPYNUMERIC_UOP_ARCSINH: int + CUPYNUMERIC_UOP_ARCTAN: int + CUPYNUMERIC_UOP_ARCTANH: int + CUPYNUMERIC_UOP_CBRT: int + CUPYNUMERIC_UOP_CEIL: int + CUPYNUMERIC_UOP_CLIP: int + CUPYNUMERIC_UOP_CONJ: int + CUPYNUMERIC_UOP_COPY: int + CUPYNUMERIC_UOP_COS: int + CUPYNUMERIC_UOP_COSH: int + CUPYNUMERIC_UOP_DEG2RAD: int + CUPYNUMERIC_UOP_EXP2: int + CUPYNUMERIC_UOP_EXP: int + CUPYNUMERIC_UOP_EXPM1: int + CUPYNUMERIC_UOP_FLOOR: int + CUPYNUMERIC_UOP_FREXP: int + CUPYNUMERIC_UOP_GETARG: int + CUPYNUMERIC_UOP_IMAG: int + CUPYNUMERIC_UOP_INVERT: int + CUPYNUMERIC_UOP_ISFINITE: int + CUPYNUMERIC_UOP_ISINF: int + CUPYNUMERIC_UOP_ISNAN: int + CUPYNUMERIC_UOP_LOG10: int + CUPYNUMERIC_UOP_LOG1P: int + CUPYNUMERIC_UOP_LOG2: int + CUPYNUMERIC_UOP_LOG: int + CUPYNUMERIC_UOP_LOGICAL_NOT: int + CUPYNUMERIC_UOP_MODF: int + CUPYNUMERIC_UOP_NEGATIVE: int + CUPYNUMERIC_UOP_POSITIVE: int + CUPYNUMERIC_UOP_RAD2DEG: int + CUPYNUMERIC_UOP_REAL: int + 
CUPYNUMERIC_UOP_RECIPROCAL: int + CUPYNUMERIC_UOP_RINT: int + CUPYNUMERIC_UOP_ROUND: int + CUPYNUMERIC_UOP_SIGN: int + CUPYNUMERIC_UOP_SIGNBIT: int + CUPYNUMERIC_UOP_SIN: int + CUPYNUMERIC_UOP_SINH: int + CUPYNUMERIC_UOP_SQRT: int + CUPYNUMERIC_UOP_SQUARE: int + CUPYNUMERIC_UOP_TAN: int + CUPYNUMERIC_UOP_TANH: int + CUPYNUMERIC_UOP_TRUNC: int + CUPYNUMERIC_WHERE: int + CUPYNUMERIC_WINDOW: int + CUPYNUMERIC_WINDOW_BARLETT: int + CUPYNUMERIC_WINDOW_BLACKMAN: int + CUPYNUMERIC_WINDOW_HAMMING: int + CUPYNUMERIC_WINDOW_HANNING: int + CUPYNUMERIC_WINDOW_KAISER: int + CUPYNUMERIC_WRAP: int + CUPYNUMERIC_WRITE: int + CUPYNUMERIC_ZIP: int + + @abstractmethod + def cupynumeric_has_cusolvermp(self) -> bool: ... + + @abstractmethod + def cupynumeric_cusolver_has_geev(self) -> bool: ... + + @abstractmethod + def cupynumeric_max_eager_volume(self) -> int: ... + + @abstractmethod + def cupynumeric_register_reduction_ops( + self, code: int + ) -> _ReductionOpIds: ... + + +def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: + # Use an already-opened library handle, which cffi will convert to a + # regular FFI object (using the definitions previously added using + # ffi.cdef), but will not automatically dlclose() on collection. + lib = CDLL(lib_path, mode=RTLD_GLOBAL) + return ffi.dlopen(ffi.cast("void *", lib._handle)) + + +# Load the cuPyNumeric library first so we have a shard object that +# we can use to initialize all these configuration enumerations +class CuPyNumericLib: + def __init__(self, name: str) -> None: + self.name = name + + shared_lib_path = self.get_shared_library() + assert shared_lib_path is not None + header = self.get_c_header() + ffi = cffi.FFI() + if header is not None: + ffi.cdef(header) + # Don't use ffi.dlopen(), because that will call dlclose() + # automatically when the object gets collected, thus removing + # symbols that may be needed when destroying C++ objects later + # (e.g. 
vtable entries, which will be queried for virtual + # destructors), causing errors at shutdown. + shared_lib = dlopen_no_autoclose(ffi, shared_lib_path) + self.shared_object = cast(_CupynumericSharedLib, shared_lib) + + def register(self) -> None: + from legate.core import get_legate_runtime + + # We need to make sure that the runtime is started + get_legate_runtime() + + callback = getattr( + self.shared_object, "cupynumeric_perform_registration" + ) + callback() + + def get_shared_library(self) -> str: + from .install_info import libpath + + return os.path.join( + libpath, "libcupynumeric" + self.get_library_extension() + ) + + def get_c_header(self) -> str: + from .install_info import header + + return header + + @staticmethod + def get_library_extension() -> str: + os_name = platform.system() + if os_name == "Linux": + return ".so" + elif os_name == "Darwin": + return ".dylib" + raise RuntimeError(f"unknown platform {os_name!r}") + + +CUPYNUMERIC_LIB_NAME = "cupynumeric" +cupynumeric_lib = CuPyNumericLib(CUPYNUMERIC_LIB_NAME) +cupynumeric_lib.register() +_cupynumeric = cupynumeric_lib.shared_object + + +# Match these to CuPyNumericOpCode in cupynumeric_c.h +@unique +class CuPyNumericOpCode(IntEnum): + ADVANCED_INDEXING = _cupynumeric.CUPYNUMERIC_ADVANCED_INDEXING + ARANGE = _cupynumeric.CUPYNUMERIC_ARANGE + ARGWHERE = _cupynumeric.CUPYNUMERIC_ARGWHERE + BATCHED_CHOLESKY = _cupynumeric.CUPYNUMERIC_BATCHED_CHOLESKY + BINARY_OP = _cupynumeric.CUPYNUMERIC_BINARY_OP + BINARY_RED = _cupynumeric.CUPYNUMERIC_BINARY_RED + BINCOUNT = _cupynumeric.CUPYNUMERIC_BINCOUNT + BITGENERATOR = _cupynumeric.CUPYNUMERIC_BITGENERATOR + CHOOSE = _cupynumeric.CUPYNUMERIC_CHOOSE + CONTRACT = _cupynumeric.CUPYNUMERIC_CONTRACT + CONVERT = _cupynumeric.CUPYNUMERIC_CONVERT + CONVOLVE = _cupynumeric.CUPYNUMERIC_CONVOLVE + DIAG = _cupynumeric.CUPYNUMERIC_DIAG + DOT = _cupynumeric.CUPYNUMERIC_DOT + EYE = _cupynumeric.CUPYNUMERIC_EYE + FFT = _cupynumeric.CUPYNUMERIC_FFT + FILL = 
_cupynumeric.CUPYNUMERIC_FILL + FLIP = _cupynumeric.CUPYNUMERIC_FLIP + GEEV = _cupynumeric.CUPYNUMERIC_GEEV + GEMM = _cupynumeric.CUPYNUMERIC_GEMM + HISTOGRAM = _cupynumeric.CUPYNUMERIC_HISTOGRAM + LOAD_CUDALIBS = _cupynumeric.CUPYNUMERIC_LOAD_CUDALIBS + MATMUL = _cupynumeric.CUPYNUMERIC_MATMUL + MATVECMUL = _cupynumeric.CUPYNUMERIC_MATVECMUL + MP_POTRF = _cupynumeric.CUPYNUMERIC_MP_POTRF + MP_SOLVE = _cupynumeric.CUPYNUMERIC_MP_SOLVE + NONZERO = _cupynumeric.CUPYNUMERIC_NONZERO + PACKBITS = _cupynumeric.CUPYNUMERIC_PACKBITS + POTRF = _cupynumeric.CUPYNUMERIC_POTRF + PUTMASK = _cupynumeric.CUPYNUMERIC_PUTMASK + QR = _cupynumeric.CUPYNUMERIC_QR + RAND = _cupynumeric.CUPYNUMERIC_RAND + READ = _cupynumeric.CUPYNUMERIC_READ + REPEAT = _cupynumeric.CUPYNUMERIC_REPEAT + SCALAR_UNARY_RED = _cupynumeric.CUPYNUMERIC_SCALAR_UNARY_RED + SCAN_GLOBAL = _cupynumeric.CUPYNUMERIC_SCAN_GLOBAL + SCAN_LOCAL = _cupynumeric.CUPYNUMERIC_SCAN_LOCAL + SEARCHSORTED = _cupynumeric.CUPYNUMERIC_SEARCHSORTED + SELECT = _cupynumeric.CUPYNUMERIC_SELECT + SOLVE = _cupynumeric.CUPYNUMERIC_SOLVE + SORT = _cupynumeric.CUPYNUMERIC_SORT + SVD = _cupynumeric.CUPYNUMERIC_SVD + SYRK = _cupynumeric.CUPYNUMERIC_SYRK + TILE = _cupynumeric.CUPYNUMERIC_TILE + TRANSPOSE_COPY_2D = _cupynumeric.CUPYNUMERIC_TRANSPOSE_COPY_2D + TRILU = _cupynumeric.CUPYNUMERIC_TRILU + TRSM = _cupynumeric.CUPYNUMERIC_TRSM + UNARY_OP = _cupynumeric.CUPYNUMERIC_UNARY_OP + UNARY_RED = _cupynumeric.CUPYNUMERIC_UNARY_RED + UNIQUE = _cupynumeric.CUPYNUMERIC_UNIQUE + UNIQUE_REDUCE = _cupynumeric.CUPYNUMERIC_UNIQUE_REDUCE + UNLOAD_CUDALIBS = _cupynumeric.CUPYNUMERIC_UNLOAD_CUDALIBS + UNPACKBITS = _cupynumeric.CUPYNUMERIC_UNPACKBITS + WHERE = _cupynumeric.CUPYNUMERIC_WHERE + WINDOW = _cupynumeric.CUPYNUMERIC_WINDOW + WRAP = _cupynumeric.CUPYNUMERIC_WRAP + WRITE = _cupynumeric.CUPYNUMERIC_WRITE + ZIP = _cupynumeric.CUPYNUMERIC_ZIP + + +# Match these to CuPyNumericUnaryOpCode in cupynumeric_c.h +@unique +class UnaryOpCode(IntEnum): + ABSOLUTE 
= _cupynumeric.CUPYNUMERIC_UOP_ABSOLUTE + ANGLE = _cupynumeric.CUPYNUMERIC_UOP_ANGLE + ARCCOS = _cupynumeric.CUPYNUMERIC_UOP_ARCCOS + ARCCOSH = _cupynumeric.CUPYNUMERIC_UOP_ARCCOSH + ARCSIN = _cupynumeric.CUPYNUMERIC_UOP_ARCSIN + ARCSINH = _cupynumeric.CUPYNUMERIC_UOP_ARCSINH + ARCTAN = _cupynumeric.CUPYNUMERIC_UOP_ARCTAN + ARCTANH = _cupynumeric.CUPYNUMERIC_UOP_ARCTANH + CBRT = _cupynumeric.CUPYNUMERIC_UOP_CBRT + CEIL = _cupynumeric.CUPYNUMERIC_UOP_CEIL + CLIP = _cupynumeric.CUPYNUMERIC_UOP_CLIP + CONJ = _cupynumeric.CUPYNUMERIC_UOP_CONJ + COPY = _cupynumeric.CUPYNUMERIC_UOP_COPY + COS = _cupynumeric.CUPYNUMERIC_UOP_COS + COSH = _cupynumeric.CUPYNUMERIC_UOP_COSH + DEG2RAD = _cupynumeric.CUPYNUMERIC_UOP_DEG2RAD + EXP = _cupynumeric.CUPYNUMERIC_UOP_EXP + EXP2 = _cupynumeric.CUPYNUMERIC_UOP_EXP2 + EXPM1 = _cupynumeric.CUPYNUMERIC_UOP_EXPM1 + FLOOR = _cupynumeric.CUPYNUMERIC_UOP_FLOOR + FREXP = _cupynumeric.CUPYNUMERIC_UOP_FREXP + GETARG = _cupynumeric.CUPYNUMERIC_UOP_GETARG + IMAG = _cupynumeric.CUPYNUMERIC_UOP_IMAG + INVERT = _cupynumeric.CUPYNUMERIC_UOP_INVERT + ISFINITE = _cupynumeric.CUPYNUMERIC_UOP_ISFINITE + ISINF = _cupynumeric.CUPYNUMERIC_UOP_ISINF + ISNAN = _cupynumeric.CUPYNUMERIC_UOP_ISNAN + LOG = _cupynumeric.CUPYNUMERIC_UOP_LOG + LOG10 = _cupynumeric.CUPYNUMERIC_UOP_LOG10 + LOG1P = _cupynumeric.CUPYNUMERIC_UOP_LOG1P + LOG2 = _cupynumeric.CUPYNUMERIC_UOP_LOG2 + LOGICAL_NOT = _cupynumeric.CUPYNUMERIC_UOP_LOGICAL_NOT + MODF = _cupynumeric.CUPYNUMERIC_UOP_MODF + NEGATIVE = _cupynumeric.CUPYNUMERIC_UOP_NEGATIVE + POSITIVE = _cupynumeric.CUPYNUMERIC_UOP_POSITIVE + RAD2DEG = _cupynumeric.CUPYNUMERIC_UOP_RAD2DEG + REAL = _cupynumeric.CUPYNUMERIC_UOP_REAL + RECIPROCAL = _cupynumeric.CUPYNUMERIC_UOP_RECIPROCAL + RINT = _cupynumeric.CUPYNUMERIC_UOP_RINT + ROUND = _cupynumeric.CUPYNUMERIC_UOP_ROUND + SIGN = _cupynumeric.CUPYNUMERIC_UOP_SIGN + SIGNBIT = _cupynumeric.CUPYNUMERIC_UOP_SIGNBIT + SIN = _cupynumeric.CUPYNUMERIC_UOP_SIN + SINH = 
_cupynumeric.CUPYNUMERIC_UOP_SINH + SQRT = _cupynumeric.CUPYNUMERIC_UOP_SQRT + SQUARE = _cupynumeric.CUPYNUMERIC_UOP_SQUARE + TAN = _cupynumeric.CUPYNUMERIC_UOP_TAN + TANH = _cupynumeric.CUPYNUMERIC_UOP_TANH + TRUNC = _cupynumeric.CUPYNUMERIC_UOP_TRUNC + + +# Match these to CuPyNumericUnaryRedCode in cupynumeric_c.h +@unique +class UnaryRedCode(IntEnum): + ALL = _cupynumeric.CUPYNUMERIC_RED_ALL + ANY = _cupynumeric.CUPYNUMERIC_RED_ANY + ARGMAX = _cupynumeric.CUPYNUMERIC_RED_ARGMAX + ARGMIN = _cupynumeric.CUPYNUMERIC_RED_ARGMIN + CONTAINS = _cupynumeric.CUPYNUMERIC_RED_CONTAINS + COUNT_NONZERO = _cupynumeric.CUPYNUMERIC_RED_COUNT_NONZERO + MAX = _cupynumeric.CUPYNUMERIC_RED_MAX + MIN = _cupynumeric.CUPYNUMERIC_RED_MIN + NANARGMAX = _cupynumeric.CUPYNUMERIC_RED_NANARGMAX + NANARGMIN = _cupynumeric.CUPYNUMERIC_RED_NANARGMIN + NANMAX = _cupynumeric.CUPYNUMERIC_RED_NANMAX + NANMIN = _cupynumeric.CUPYNUMERIC_RED_NANMIN + NANPROD = _cupynumeric.CUPYNUMERIC_RED_NANPROD + NANSUM = _cupynumeric.CUPYNUMERIC_RED_NANSUM + PROD = _cupynumeric.CUPYNUMERIC_RED_PROD + SUM = _cupynumeric.CUPYNUMERIC_RED_SUM + SUM_SQUARES = _cupynumeric.CUPYNUMERIC_RED_SUM_SQUARES + VARIANCE = _cupynumeric.CUPYNUMERIC_RED_VARIANCE + + +# Match these to CuPyNumericBinaryOpCode in cupynumeric_c.h +@unique +class BinaryOpCode(IntEnum): + ADD = _cupynumeric.CUPYNUMERIC_BINOP_ADD + ARCTAN2 = _cupynumeric.CUPYNUMERIC_BINOP_ARCTAN2 + BITWISE_AND = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_AND + BITWISE_OR = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_OR + BITWISE_XOR = _cupynumeric.CUPYNUMERIC_BINOP_BITWISE_XOR + COPYSIGN = _cupynumeric.CUPYNUMERIC_BINOP_COPYSIGN + DIVIDE = _cupynumeric.CUPYNUMERIC_BINOP_DIVIDE + EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_EQUAL + FLOAT_POWER = _cupynumeric.CUPYNUMERIC_BINOP_FLOAT_POWER + FLOOR_DIVIDE = _cupynumeric.CUPYNUMERIC_BINOP_FLOOR_DIVIDE + FMOD = _cupynumeric.CUPYNUMERIC_BINOP_FMOD + GCD = _cupynumeric.CUPYNUMERIC_BINOP_GCD + GREATER = _cupynumeric.CUPYNUMERIC_BINOP_GREATER + 
GREATER_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_GREATER_EQUAL + HYPOT = _cupynumeric.CUPYNUMERIC_BINOP_HYPOT + ISCLOSE = _cupynumeric.CUPYNUMERIC_BINOP_ISCLOSE + LCM = _cupynumeric.CUPYNUMERIC_BINOP_LCM + LDEXP = _cupynumeric.CUPYNUMERIC_BINOP_LDEXP + LEFT_SHIFT = _cupynumeric.CUPYNUMERIC_BINOP_LEFT_SHIFT + LESS = _cupynumeric.CUPYNUMERIC_BINOP_LESS + LESS_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_LESS_EQUAL + LOGADDEXP = _cupynumeric.CUPYNUMERIC_BINOP_LOGADDEXP + LOGADDEXP2 = _cupynumeric.CUPYNUMERIC_BINOP_LOGADDEXP2 + LOGICAL_AND = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_AND + LOGICAL_OR = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_OR + LOGICAL_XOR = _cupynumeric.CUPYNUMERIC_BINOP_LOGICAL_XOR + MAXIMUM = _cupynumeric.CUPYNUMERIC_BINOP_MAXIMUM + MINIMUM = _cupynumeric.CUPYNUMERIC_BINOP_MINIMUM + MOD = _cupynumeric.CUPYNUMERIC_BINOP_MOD + MULTIPLY = _cupynumeric.CUPYNUMERIC_BINOP_MULTIPLY + NEXTAFTER = _cupynumeric.CUPYNUMERIC_BINOP_NEXTAFTER + NOT_EQUAL = _cupynumeric.CUPYNUMERIC_BINOP_NOT_EQUAL + POWER = _cupynumeric.CUPYNUMERIC_BINOP_POWER + RIGHT_SHIFT = _cupynumeric.CUPYNUMERIC_BINOP_RIGHT_SHIFT + SUBTRACT = _cupynumeric.CUPYNUMERIC_BINOP_SUBTRACT + + +@unique +class WindowOpCode(IntEnum): + BARLETT = _cupynumeric.CUPYNUMERIC_WINDOW_BARLETT + BLACKMAN = _cupynumeric.CUPYNUMERIC_WINDOW_BLACKMAN + HAMMING = _cupynumeric.CUPYNUMERIC_WINDOW_HAMMING + HANNING = _cupynumeric.CUPYNUMERIC_WINDOW_HANNING + KAISER = _cupynumeric.CUPYNUMERIC_WINDOW_KAISER + + +# Match these to RandGenCode in rand_util.h +@unique +class RandGenCode(IntEnum): + UNIFORM = 1 + NORMAL = 2 + INTEGER = 3 + + +# Match these to CuPyNumericScanCode in cupynumeric_c.h +@unique +class ScanCode(IntEnum): + PROD = _cupynumeric.CUPYNUMERIC_SCAN_PROD + SUM = _cupynumeric.CUPYNUMERIC_SCAN_SUM + + +# Match these to CuPyNumericConvertCode in cupynumeric_c.h +@unique +class ConvertCode(IntEnum): + NOOP = _cupynumeric.CUPYNUMERIC_CONVERT_NAN_NOOP + PROD = _cupynumeric.CUPYNUMERIC_CONVERT_NAN_PROD + SUM = 
_cupynumeric.CUPYNUMERIC_CONVERT_NAN_SUM + + +# Match these to BitGeneratorOperation in cupynumeric_c.h +@unique +class BitGeneratorOperation(IntEnum): + CREATE = _cupynumeric.CUPYNUMERIC_BITGENOP_CREATE + DESTROY = _cupynumeric.CUPYNUMERIC_BITGENOP_DESTROY + RAND_RAW = _cupynumeric.CUPYNUMERIC_BITGENOP_RAND_RAW + DISTRIBUTION = _cupynumeric.CUPYNUMERIC_BITGENOP_DISTRIBUTION + + +# Match these to BitGeneratorType in cupynumeric_c.h +@unique +class BitGeneratorType(IntEnum): + DEFAULT = _cupynumeric.CUPYNUMERIC_BITGENTYPE_DEFAULT + XORWOW = _cupynumeric.CUPYNUMERIC_BITGENTYPE_XORWOW + MRG32K3A = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MRG32K3A + MTGP32 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MTGP32 + MT19937 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_MT19937 + PHILOX4_32_10 = _cupynumeric.CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10 + + +# Match these to BitGeneratorDistribution in cupynumeric_c.h +@unique +class BitGeneratorDistribution(IntEnum): + INTEGERS_16 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_16 + INTEGERS_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_32 + INTEGERS_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_INTEGERS_64 + UNIFORM_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_UNIFORM_32 + UNIFORM_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_UNIFORM_64 + LOGNORMAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGNORMAL_32 + LOGNORMAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGNORMAL_64 + NORMAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NORMAL_32 + NORMAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NORMAL_64 + POISSON = _cupynumeric.CUPYNUMERIC_BITGENDIST_POISSON + EXPONENTIAL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32 + EXPONENTIAL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64 + GUMBEL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GUMBEL_32 + GUMBEL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GUMBEL_64 + LAPLACE_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LAPLACE_32 + LAPLACE_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LAPLACE_64 + LOGISTIC_32 = 
_cupynumeric.CUPYNUMERIC_BITGENDIST_LOGISTIC_32 + LOGISTIC_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGISTIC_64 + PARETO_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_PARETO_32 + PARETO_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_PARETO_64 + POWER_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_POWER_32 + POWER_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_POWER_64 + RAYLEIGH_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_RAYLEIGH_32 + RAYLEIGH_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_RAYLEIGH_64 + CAUCHY_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CAUCHY_32 + CAUCHY_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CAUCHY_64 + TRIANGULAR_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_TRIANGULAR_32 + TRIANGULAR_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_TRIANGULAR_64 + WEIBULL_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WEIBULL_32 + WEIBULL_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WEIBULL_64 + BYTES = _cupynumeric.CUPYNUMERIC_BITGENDIST_BYTES + BETA_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_BETA_32 + BETA_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_BETA_64 + F_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_F_32 + F_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_F_64 + LOGSERIES = _cupynumeric.CUPYNUMERIC_BITGENDIST_LOGSERIES + NONCENTRAL_F_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32 + NONCENTRAL_F_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64 + CHISQUARE_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CHISQUARE_32 + CHISQUARE_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_CHISQUARE_64 + GAMMA_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GAMMA_32 + GAMMA_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_GAMMA_64 + STANDARD_T_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_STANDARD_T_32 + STANDARD_T_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_STANDARD_T_64 + HYPERGEOMETRIC = _cupynumeric.CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC + VONMISES_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_VONMISES_32 + VONMISES_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_VONMISES_64 + ZIPF = _cupynumeric.CUPYNUMERIC_BITGENDIST_ZIPF + GEOMETRIC = 
_cupynumeric.CUPYNUMERIC_BITGENDIST_GEOMETRIC + WALD_32 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WALD_32 + WALD_64 = _cupynumeric.CUPYNUMERIC_BITGENDIST_WALD_64 + BINOMIAL = _cupynumeric.CUPYNUMERIC_BITGENDIST_BINOMIAL + NEGATIVE_BINOMIAL = _cupynumeric.CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL + + +# Match these to CuPyNumericConvolveMethod in cupynumeric_c.h +@unique +class ConvolveMethod(IntEnum): + AUTO = _cupynumeric.CUPYNUMERIC_CONVOLVE_AUTO + DIRECT = _cupynumeric.CUPYNUMERIC_CONVOLVE_DIRECT + FFT = _cupynumeric.CUPYNUMERIC_CONVOLVE_FFT + + +@unique +class TransferType(IntEnum): + DONATE = 0 + MAKE_COPY = 1 + SHARE = 2 + + +# Match these to fftType in fft_util.h +class FFTType: + def __init__( + self, + name: str, + type_id: int, + input_dtype: npt.DTypeLike, + output_dtype: npt.DTypeLike, + single_precision: bool, + complex_type: FFTType | None = None, + ) -> None: + self._name = name + self._type_id = type_id + self._complex_type = self if complex_type is None else complex_type + self._input_dtype = input_dtype + self._output_dtype = output_dtype + self._single_precision = single_precision + + def __str__(self) -> str: + return self._name + + def __repr__(self) -> str: + return str(self) + + @property + def type_id(self) -> int: + return self._type_id + + @property + def complex(self) -> FFTType: + return self._complex_type + + @property + def input_dtype(self) -> npt.DTypeLike: + return self._input_dtype + + @property + def output_dtype(self) -> npt.DTypeLike: + return self._output_dtype + + @property + def is_single_precision(self) -> bool: + return self._single_precision + + +FFT_C2C = FFTType( + "C2C", + _cupynumeric.CUPYNUMERIC_FFT_C2C, + np.complex64, + np.complex64, + True, +) + +FFT_Z2Z = FFTType( + "Z2Z", + _cupynumeric.CUPYNUMERIC_FFT_Z2Z, + np.complex128, + np.complex128, + False, +) + +FFT_R2C = FFTType( + "R2C", + _cupynumeric.CUPYNUMERIC_FFT_R2C, + np.float32, + np.complex64, + True, + FFT_C2C, +) + +FFT_C2R = FFTType( + "C2R", + 
_cupynumeric.CUPYNUMERIC_FFT_C2R, + np.complex64, + np.float32, + True, + FFT_C2C, +) + +FFT_D2Z = FFTType( + "D2Z", + _cupynumeric.CUPYNUMERIC_FFT_D2Z, + np.float64, + np.complex128, + False, + FFT_Z2Z, +) + +FFT_Z2D = FFTType( + "Z2D", + _cupynumeric.CUPYNUMERIC_FFT_Z2D, + np.complex128, + np.float64, + False, + FFT_Z2Z, +) + + +class FFTCode: + @staticmethod + def real_to_complex_code(dtype: npt.DTypeLike) -> FFTType: + if dtype == np.float64: + return FFT_D2Z + elif dtype == np.float32: + return FFT_R2C + else: + raise TypeError( + ( + "Data type for FFT not supported " + "(supported types are float32 and float64)" + ) + ) + + @staticmethod + def complex_to_real_code(dtype: npt.DTypeLike) -> FFTType: + if dtype == np.complex128: + return FFT_Z2D + elif dtype == np.complex64: + return FFT_C2R + else: + raise TypeError( + ( + "Data type for FFT not supported " + "(supported types are complex64 and complex128)" + ) + ) + + +@unique +class FFTDirection(IntEnum): + FORWARD = _cupynumeric.CUPYNUMERIC_FFT_FORWARD + INVERSE = _cupynumeric.CUPYNUMERIC_FFT_INVERSE + + +# Match these to CuPyNumericBitorder in cupynumeric_c.h +@unique +class Bitorder(IntEnum): + BIG = _cupynumeric.CUPYNUMERIC_BITORDER_BIG + LITTLE = _cupynumeric.CUPYNUMERIC_BITORDER_LITTLE + + +@unique +class FFTNormalization(IntEnum): + FORWARD = 1 + INVERSE = 2 + ORTHOGONAL = 3 + + @staticmethod + def from_string(in_string: str) -> FFTNormalization | None: + if in_string == "forward": + return FFTNormalization.FORWARD + elif in_string == "ortho": + return FFTNormalization.ORTHOGONAL + elif in_string == "backward" or in_string is None: + return FFTNormalization.INVERSE + else: + raise ValueError( + f'Invalid norm value {in_string}; should be "backward",' + '"ortho" or "forward".' 
+ ) + + @staticmethod + def reverse(in_string: str | None) -> str: + if in_string == "forward": + return "backward" + elif in_string == "backward" or in_string is None: + return "forward" + else: + return in_string diff --git a/cunumeric/fft/__init__.py b/cupynumeric/fft/__init__.py similarity index 100% rename from cunumeric/fft/__init__.py rename to cupynumeric/fft/__init__.py diff --git a/cunumeric/fft/fft.py b/cupynumeric/fft/fft.py similarity index 99% rename from cunumeric/fft/fft.py rename to cupynumeric/fft/fft.py index ad6b7caafd..7576f3dd40 100644 --- a/cunumeric/fft/fft.py +++ b/cupynumeric/fft/fft.py @@ -20,7 +20,6 @@ from .._array.util import add_boilerplate from .._module.array_rearrange import roll -from .._module.creation_data import asarray from ..config import FFT_C2C, FFT_Z2Z, FFTCode, FFTDirection, FFTNormalization if TYPE_CHECKING: @@ -105,7 +104,7 @@ def fft( numpy.fft.fft Availability - -------- + ------------ Multiple GPUs """ s = (n,) if n is not None else None diff --git a/cunumeric/install_info.py.in b/cupynumeric/install_info.py.in similarity index 66% rename from cunumeric/install_info.py.in rename to cupynumeric/install_info.py.in index cc683b2252..9175f52a37 100644 --- a/cunumeric/install_info.py.in +++ b/cupynumeric/install_info.py.in @@ -30,17 +30,22 @@ def get_libpath(): "Windows": ".dll" }[platform.system()] - def find_libcunumeric(libdir): - if exists(join(libdir, f"libcunumeric{so_ext}")): + def find_libcupynumeric(libdir): + if exists(join(libdir, f"libcupynumeric{so_ext}")): return libdir return None - return ( - find_libcunumeric(join(cn_path, "build", "lib")) or - find_libcunumeric(join(dirname(dirname(dirname(cn_path))), "lib")) or - find_libcunumeric(join(dirname(dirname(sys.executable)), "lib")) or - "" - ) + for libdir in ("lib", "lib64"): + if ret := find_libcupynumeric(join(cn_path, "build", libdir)): + return ret + if ret := find_libcupynumeric(join(cn_path, "cupynumeric", libdir)): + return ret + if ret := 
find_libcupynumeric(join(dirname(dirname(dirname(cn_path))), libdir)): + return ret + if ret := find_libcupynumeric(join(dirname(dirname(sys.executable)), libdir)): + return ret + + return "" libpath: str = get_libpath() diff --git a/cunumeric/linalg/__init__.py b/cupynumeric/linalg/__init__.py similarity index 100% rename from cunumeric/linalg/__init__.py rename to cupynumeric/linalg/__init__.py diff --git a/cunumeric/linalg/_cholesky.py b/cupynumeric/linalg/_cholesky.py similarity index 85% rename from cunumeric/linalg/_cholesky.py rename to cupynumeric/linalg/_cholesky.py index 3775951dcd..a99ae68117 100644 --- a/cunumeric/linalg/_cholesky.py +++ b/cupynumeric/linalg/_cholesky.py @@ -25,7 +25,7 @@ ) from legate.settings import settings -from ..config import CuNumericOpCode +from ..config import CuPyNumericOpCode from ..runtime import runtime from ._exception import LinAlgError @@ -42,7 +42,7 @@ def transpose_copy_single( library: Library, input: LogicalStore, output: LogicalStore ) -> None: task = legate_runtime.create_auto_task( - library, CuNumericOpCode.TRANSPOSE_COPY_2D + library, CuPyNumericOpCode.TRANSPOSE_COPY_2D ) p_out = task.add_output(output) p_in = task.add_input(input) @@ -63,7 +63,7 @@ def transpose_copy( ) -> None: task = legate_runtime.create_manual_task( library, - CuNumericOpCode.TRANSPOSE_COPY_2D, + CuPyNumericOpCode.TRANSPOSE_COPY_2D, launch_domain, ) task.add_output(p_output) @@ -75,7 +75,7 @@ def transpose_copy( def potrf_single(library: Library, output: LogicalStore) -> None: - task = legate_runtime.create_auto_task(library, CuNumericOpCode.POTRF) + task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.POTRF) task.throws_exception(LinAlgError) task.add_output(output) task.add_input(output) @@ -89,7 +89,7 @@ def mp_potrf( input: LogicalStore, output: LogicalStore, ) -> None: - task = legate_runtime.create_auto_task(library, CuNumericOpCode.MP_POTRF) + task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.MP_POTRF) 
task.throws_exception(LinAlgError) task.add_input(input) task.add_output(output) @@ -103,7 +103,7 @@ def mp_potrf( def potrf(library: Library, p_output: LogicalStorePartition, i: int) -> None: task = legate_runtime.create_manual_task( - library, CuNumericOpCode.POTRF, (i + 1, i + 1), lower_bounds=(i, i) + library, CuPyNumericOpCode.POTRF, (i + 1, i + 1), lower_bounds=(i, i) ) task.throws_exception(LinAlgError) task.add_output(p_output) @@ -121,7 +121,7 @@ def trsm( lhs = p_output task = legate_runtime.create_manual_task( - library, CuNumericOpCode.TRSM, (hi, i + 1), lower_bounds=(lo, i) + library, CuPyNumericOpCode.TRSM, (hi, i + 1), lower_bounds=(lo, i) ) task.add_output(lhs) task.add_input(rhs) @@ -136,7 +136,7 @@ def syrk( lhs = p_output task = legate_runtime.create_manual_task( - library, CuNumericOpCode.SYRK, (k + 1, k + 1), lower_bounds=(k, k) + library, CuPyNumericOpCode.SYRK, (k + 1, k + 1), lower_bounds=(k, k) ) task.add_output(lhs) task.add_input(rhs) @@ -160,7 +160,7 @@ def gemm( rhs1 = p_output task = legate_runtime.create_manual_task( - library, CuNumericOpCode.GEMM, (hi, k + 1), lower_bounds=(lo, k) + library, CuPyNumericOpCode.GEMM, (hi, k + 1), lower_bounds=(lo, k) ) task.add_output(lhs) task.add_input(rhs1, (dimension(0), constant(i))) @@ -169,19 +169,16 @@ def gemm( task.execute() -MIN_CHOLESKY_TILE_SIZE = 2048 -MIN_CHOLESKY_MATRIX_SIZE = 8192 +MIN_CHOLESKY_TILE_SIZE = 2 if settings.test() else 2048 +MIN_CHOLESKY_MATRIX_SIZE = 4 if settings.test() else 8192 # TODO: We need a better cost model def choose_color_shape( runtime: Runtime, shape: tuple[int, ...] 
) -> tuple[int, ...]: - if settings.test(): - num_tiles = runtime.num_procs * 2 - return (num_tiles, num_tiles) - extent = shape[0] + # If there's only one processor or the matrix is too small, # don't even bother to partition it at all if runtime.num_procs == 1 or extent <= MIN_CHOLESKY_MATRIX_SIZE: @@ -201,7 +198,7 @@ def choose_color_shape( def tril_single(library: Library, output: LogicalStore) -> None: - task = legate_runtime.create_auto_task(library, CuNumericOpCode.TRILU) + task = legate_runtime.create_auto_task(library, CuPyNumericOpCode.TRILU) task.add_output(output) task.add_input(output) task.add_scalar_arg(True, ty.bool_) @@ -214,7 +211,7 @@ def tril_single(library: Library, output: LogicalStore) -> None: def tril(library: Library, p_output: LogicalStorePartition, n: int) -> None: task = legate_runtime.create_manual_task( - library, CuNumericOpCode.TRILU, (n, n) + library, CuPyNumericOpCode.TRILU, (n, n) ) task.add_output(p_output) @@ -242,7 +239,7 @@ def _batched_cholesky( # Just use a fixed cutoff to provide some sensible warning. 
# TODO: find a better way to inform the user dims are too big task = legate_runtime.create_auto_task( - library, CuNumericOpCode.BATCHED_CHOLESKY + library, CuPyNumericOpCode.BATCHED_CHOLESKY ) task.add_input(input.base) task.add_output(output.base) @@ -254,16 +251,9 @@ def _batched_cholesky( task.execute() -def cholesky_deferred( - output: DeferredArray, input: DeferredArray, no_tril: bool -) -> None: +def cholesky_deferred(output: DeferredArray, input: DeferredArray) -> None: library = runtime.library if len(input.base.shape) > 2: - if no_tril: - raise NotImplementedError( - "batched cholesky expects to only " - "produce the lower triangular matrix" - ) size = input.base.shape[-1] # Choose 32768 as dimension cutoff for warning # so that for float64 anything larger than @@ -280,8 +270,7 @@ def cholesky_deferred( if runtime.num_procs == 1: transpose_copy_single(library, input.base, output.base) potrf_single(library, output.base) - if not no_tril: - tril_single(library, output.base) + tril_single(library, output.base) return shape = tuple(output.base.shape) @@ -295,8 +284,7 @@ def cholesky_deferred( library, shape[0], MIN_CHOLESKY_TILE_SIZE, input.base, output.base ) - if not no_tril: - tril_single(library, output.base) + tril_single(library, output.base) else: initial_color_shape = choose_color_shape(runtime, shape) tile_shape = _rounding_divide(shape, initial_color_shape) @@ -314,5 +302,4 @@ def cholesky_deferred( syrk(library, p_output, k, i) gemm(library, p_output, k, i, k + 1, n) - if not no_tril: - tril(library, p_output, n) + tril(library, p_output, n) diff --git a/cupynumeric/linalg/_eigen.py b/cupynumeric/linalg/_eigen.py new file mode 100644 index 0000000000..cb6b0dd057 --- /dev/null +++ b/cupynumeric/linalg/_eigen.py @@ -0,0 +1,87 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +from legate.core import dimension, get_legate_runtime + +from cupynumeric.config import CuPyNumericOpCode + +from ..runtime import runtime +from ._exception import LinAlgError + +if TYPE_CHECKING: + from .._thunk.deferred import DeferredArray + + +def eig_deferred( + a: DeferredArray, ew: DeferredArray, ev: Optional[DeferredArray] = None +) -> None: + library = a.library + + m = a.shape[-1] + + if m == 0: + raise ValueError("Input shape dimension 0 not allowed!") + + def choose_nd_color_shape(shape: tuple[int, ...]) -> tuple[int, ...]: + # start with 1D and re-balance by powers of 2 + # (don't worry about other primes) + color_shape = [1 for i in shape] + if len(shape) > 2: + color_shape[0] = runtime.num_procs + + done = False + while not done and color_shape[0] % 2 == 0: + # find max idx + # if large enough --> switch + weight_per_dim = list( + map(lambda x, y: x / y, list(shape), color_shape) + )[:-2] + + max_weight = max(weight_per_dim) + idx = weight_per_dim.index(max_weight) + + if weight_per_dim[idx] > 2 * weight_per_dim[0]: + color_shape[0] = color_shape[0] // 2 + color_shape[idx] = color_shape[idx] * 2 + else: + done = True + + return tuple(color_shape) + + # coloring via num_procs to get utilization + initial_color_shape = choose_nd_color_shape(a.shape) + tilesize = tuple( + map(lambda x, y: (x + y - 1) // y, a.shape, initial_color_shape) + ) + color_shape = tuple(map(lambda x, y: (x + y - 1) // y, a.shape, tilesize)) + + # partition defined by local batchsize + 
tiled_a = a.base.partition_by_tiling(tilesize) + tiled_ew = ew.base.partition_by_tiling(tilesize[:-1]) + + task = get_legate_runtime().create_manual_task( + library, CuPyNumericOpCode.GEEV, color_shape + ) + task.throws_exception(LinAlgError) + partition = tuple(dimension(i) for i in range(len(color_shape))) + task.add_input(tiled_a, partition) + task.add_output(tiled_ew, partition[:-1]) + if ev is not None: + tiled_ev = ev.base.partition_by_tiling(tilesize) + task.add_output(tiled_ev, partition) + task.execute() diff --git a/cunumeric/linalg/_exception.py b/cupynumeric/linalg/_exception.py similarity index 100% rename from cunumeric/linalg/_exception.py rename to cupynumeric/linalg/_exception.py diff --git a/cunumeric/linalg/_qr.py b/cupynumeric/linalg/_qr.py similarity index 91% rename from cunumeric/linalg/_qr.py rename to cupynumeric/linalg/_qr.py index aa2c38e1cb..4b20d5fe62 100644 --- a/cunumeric/linalg/_qr.py +++ b/cupynumeric/linalg/_qr.py @@ -18,7 +18,7 @@ from legate.core import get_legate_runtime -from cunumeric.config import CuNumericOpCode +from cupynumeric.config import CuPyNumericOpCode from ._exception import LinAlgError @@ -31,7 +31,7 @@ def qr_single( library: Library, a: LogicalStore, q: LogicalStore, r: LogicalStore ) -> None: - task = get_legate_runtime().create_auto_task(library, CuNumericOpCode.QR) + task = get_legate_runtime().create_auto_task(library, CuPyNumericOpCode.QR) task.throws_exception(LinAlgError) task.add_input(a) task.add_output(q) diff --git a/cunumeric/linalg/_solve.py b/cupynumeric/linalg/_solve.py similarity index 95% rename from cunumeric/linalg/_solve.py rename to cupynumeric/linalg/_solve.py index 7681444ac3..325fe301de 100644 --- a/cunumeric/linalg/_solve.py +++ b/cupynumeric/linalg/_solve.py @@ -19,7 +19,7 @@ import legate.core.types as ty from legate.core import broadcast, get_legate_runtime -from ..config import CuNumericOpCode +from ..config import CuPyNumericOpCode from ..runtime import runtime from ._cholesky 
import transpose_copy_single from ._exception import LinAlgError @@ -32,7 +32,7 @@ def solve_single(library: Library, a: LogicalStore, b: LogicalStore) -> None: task = get_legate_runtime().create_auto_task( - library, CuNumericOpCode.SOLVE + library, CuPyNumericOpCode.SOLVE ) task.throws_exception(LinAlgError) p_a = task.add_input(a) @@ -60,7 +60,7 @@ def mp_solve( output: LogicalStore, ) -> None: task = get_legate_runtime().create_auto_task( - library, CuNumericOpCode.MP_SOLVE + library, CuPyNumericOpCode.MP_SOLVE ) task.throws_exception(LinAlgError) task.add_input(a) diff --git a/cunumeric/linalg/_svd.py b/cupynumeric/linalg/_svd.py similarity index 90% rename from cunumeric/linalg/_svd.py rename to cupynumeric/linalg/_svd.py index 9579f06849..a9be94924d 100644 --- a/cunumeric/linalg/_svd.py +++ b/cupynumeric/linalg/_svd.py @@ -18,7 +18,7 @@ from legate.core import get_legate_runtime -from cunumeric.config import CuNumericOpCode +from cupynumeric.config import CuPyNumericOpCode from ._exception import LinAlgError @@ -35,7 +35,9 @@ def svd_single( s: LogicalStore, vh: LogicalStore, ) -> None: - task = get_legate_runtime().create_auto_task(library, CuNumericOpCode.SVD) + task = get_legate_runtime().create_auto_task( + library, CuPyNumericOpCode.SVD + ) task.throws_exception(LinAlgError) task.add_input(a) task.add_output(u) diff --git a/cunumeric/linalg/linalg.py b/cupynumeric/linalg/linalg.py similarity index 62% rename from cunumeric/linalg/linalg.py rename to cupynumeric/linalg/linalg.py index 31f64eca0a..39b04adc5c 100644 --- a/cunumeric/linalg/linalg.py +++ b/cupynumeric/linalg/linalg.py @@ -14,11 +14,12 @@ # from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING, Any, Sequence import numpy as np from .._utils import is_np2 +from ..runtime import runtime if is_np2: from numpy.lib.array_utils import normalize_axis_index # type: ignore @@ -31,8 +32,11 @@ normalize_axis_tuple, ) -from .._array.util import 
add_boilerplate, convert_to_cunumeric_ndarray +from legate.core import get_machine + +from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray from .._module import dot, empty_like, eye, matmul, ndarray +from .._module.creation_shape import zeros, zeros_like from .._ufunc.math import add, sqrt as _sqrt from ._exception import LinAlgError @@ -89,6 +93,108 @@ def cholesky(a: ndarray) -> ndarray: return _thunk_cholesky(a) +@add_boilerplate("a") +def eig(a: ndarray) -> tuple[ndarray, ...]: + """ + Compute the eigenvalues and right eigenvectors of a square array. + + Parameters + ---------- + a : (..., M, M) array_like + Matrices for which the eigenvalues and right eigenvectors will be + computed, at least dimension 2. + + Returns + ------- + eigenvalues : (…, M) array_like + The eigenvalues, each repeated according to its multiplicity. + eigenvectors : (…, M, M) array + The normalized (unit “length”) eigenvectors, such that the column + eigenvectors[:,i] is the eigenvector corresponding to the eigenvalue + eigenvalues[i]. + + Raises + ------ + LinAlgError + If the eigenvalue computation does not converge. + + Notes + ----- + Unlike NumPy, cuPyNumeric always returns complex-dtype results, even if the + imaginary part is zero. + + Multi-GPU/CPU usage is limited to data parallel matrix-wise batching. + + See Also + -------- + numpy.linalg.eig + + Availability + -------- + Multiple GPU, Multiple CPU + """ + shape = a.shape + if len(shape) < 2: + raise LinAlgError( + f"{len(shape)}-dimensional array given. " + "Array must be at least two-dimensional" + ) + if shape[-2] != shape[-1]: + raise LinAlgError("Last 2 dimensions of the array must be square") + if np.dtype("e") == a.dtype: + raise TypeError("array type float16 is unsupported in linalg") + return _thunk_eig(a) + + +@add_boilerplate("a") +def eigvals(a: ndarray) -> ndarray: + """ + Compute the eigenvalues of a square array. 
+ + Parameters + ---------- + a : (..., M, M) array_like + Matrices for which the eigenvalues will be computed, at least + dimension 2. + + Returns + ------- + w : (…, M) array_like + The eigenvalues, each repeated according to its multiplicity. + + Raises + ------ + LinAlgError + If the eigenvalue computation does not converge. + + Notes + ----- + Unlike NumPy, cuPyNumeric always returns complex-dtype results, even if the + imaginary part is zero. + + Multi-GPU/CPU usage is limited to data parallel matrix-wise batching. + + See Also + -------- + numpy.linalg.eigvals + + Availability + -------- + Multiple GPU, Multiple CPU + """ + shape = a.shape + if len(shape) < 2: + raise LinAlgError( + f"{len(shape)}-dimensional array given. " + "Array must be at least two-dimensional" + ) + if shape[-2] != shape[-1]: + raise LinAlgError("Last 2 dimensions of the array must be square") + if np.dtype("e") == a.dtype: + raise TypeError("array type float16 is unsupported in linalg") + return _thunk_eigvals(a) + + @add_boilerplate("a") def qr(a: ndarray) -> tuple[ndarray, ...]: """ @@ -134,7 +240,7 @@ def qr(a: ndarray) -> tuple[ndarray, ...]: ) if len(shape) > 2: raise NotImplementedError( - "cuNumeric does not yet support stacked 2d arrays" + "cuPyNumeric does not yet support stacked 2d arrays" ) if np.dtype("e") == a.dtype: raise TypeError("array type float16 is unsupported in linalg") @@ -194,7 +300,7 @@ def solve(a: ndarray, b: ndarray, out: ndarray | None = None) -> ndarray: raise TypeError("array type float16 is unsupported in linalg") if a.ndim > 2 or b.ndim > 2: raise NotImplementedError( - "cuNumeric does not yet support stacked 2d arrays" + "cuPyNumeric does not yet support stacked 2d arrays" ) if a.shape[-2] != a.shape[-1]: raise LinAlgError("Last 2 dimensions of the array must be square") @@ -246,8 +352,7 @@ def svd(a: ndarray, full_matrices: bool = True) -> tuple[ndarray, ...]: Notes ----- - Currently does not support the parameters 'full_matrices', 'compute_uv', - 
and 'hermitian'. + Currently does not support the parameters 'compute_uv' and 'hermitian'. See Also -------- @@ -265,10 +370,10 @@ def svd(a: ndarray, full_matrices: bool = True) -> tuple[ndarray, ...]: ) if len(shape) > 2: raise NotImplementedError( - "cuNumeric does not yet support stacked 2d arrays" + "cuPyNumeric does not yet support stacked 2d arrays" ) if shape[0] < shape[1]: - raise NotImplementedError("cuNumeric only supports M >= N") + raise NotImplementedError("cuPyNumeric only supports M >= N") if np.dtype("e") == a.dtype: raise TypeError("array type float16 is unsupported in linalg") return _thunk_svd(a, full_matrices) @@ -323,7 +428,7 @@ def matrix_power(a: ndarray, n: int) -> ndarray: # Invert if necessary if n < 0: - # TODO: Add this once cunumeric.inv is implemented + # TODO: Add this once cupynumeric.inv is implemented # a = inv(a) # n = abs(n) raise NotImplementedError("Negative exponent in matrix_power") @@ -385,9 +490,9 @@ def multi_dot( -------- Multiple GPUs, Multiple CPUs """ - arrays = [convert_to_cunumeric_ndarray(x) for x in arrays] + arrays = [convert_to_cupynumeric_ndarray(x) for x in arrays] if out is not None: - out = convert_to_cunumeric_ndarray(out, share=True) + out = convert_to_cupynumeric_ndarray(out, share=True) n = len(arrays) # optimization only makes sense for len(arrays) > 2 @@ -700,7 +805,7 @@ def norm( raise ValueError("Improper number of dimensions to norm") -def _thunk_cholesky(a: ndarray, no_tril: bool = False) -> ndarray: +def _thunk_cholesky(a: ndarray) -> ndarray: """Cholesky decomposition. 
Return the Cholesky decomposition, `L * L.H`, of the square matrix `a`, @@ -744,10 +849,84 @@ def _thunk_cholesky(a: ndarray, no_tril: bool = False) -> ndarray: dtype=input.dtype, inputs=(input,), ) - output._thunk.cholesky(input._thunk, no_tril=no_tril) + output._thunk.cholesky(input._thunk) return output +def _thunk_eig(a: ndarray) -> tuple[ndarray, ...]: + if a.dtype.kind not in ("f", "c"): + a = a.astype("float64") + + if a.dtype == np.float32: + complex_dtype = np.dtype(np.complex64) + elif a.dtype == np.float64: + complex_dtype = np.dtype(np.complex128) # type: ignore + elif a.dtype.kind in ("c"): + complex_dtype = a.dtype + else: + raise TypeError("Eig input not supported (missing a conversion?)") + + if runtime.num_gpus > 0 and not runtime.cusolver_has_geev(): + a = ndarray(a.shape, a.dtype, thunk=runtime.to_eager_array(a._thunk)) + out_ew = ndarray( + shape=a.shape[:-1], + dtype=complex_dtype, + force_thunk="eager", + ) + out_ev = ndarray( + shape=a.shape, + dtype=complex_dtype, + force_thunk="eager", + ) + else: + out_ew = ndarray( + shape=a.shape[:-1], + dtype=complex_dtype, + inputs=(a,), + ) + out_ev = ndarray( + shape=a.shape, + dtype=complex_dtype, + inputs=(a,), + ) + + if a.shape[-1] > 0: + a._thunk.eig(out_ew._thunk, out_ev._thunk) + return out_ew, out_ev + + +def _thunk_eigvals(a: ndarray) -> ndarray: + if a.dtype.kind not in ("f", "c"): + a = a.astype("float64") + + if a.dtype == np.float32: + complex_dtype = np.dtype(np.complex64) + elif a.dtype == np.float64: + complex_dtype = np.dtype(np.complex128) # type: ignore + elif a.dtype.kind in ("c"): + complex_dtype = a.dtype + else: + raise TypeError("Eigvals input not supported (missing a conversion?)") + + if runtime.num_gpus > 0 and not runtime.cusolver_has_geev(): + a = ndarray(a.shape, a.dtype, thunk=runtime.to_eager_array(a._thunk)) + out_ew = ndarray( + shape=a.shape[:-1], + dtype=complex_dtype, + force_thunk="eager", + ) + else: + out_ew = ndarray( + shape=a.shape[:-1], + 
dtype=complex_dtype, + inputs=(a,), + ) + + if a.shape[-1] > 0: + a._thunk.eigvals(out_ew._thunk) + return out_ew + + def _thunk_qr(a: ndarray) -> tuple[ndarray, ...]: if a.dtype.kind not in ("f", "c"): a = a.astype("float64") @@ -833,3 +1012,369 @@ def _thunk_svd(a: ndarray, full_matrices: bool) -> tuple[ndarray, ...]: a._thunk.svd(out_u._thunk, out_s._thunk, out_vh._thunk) return out_u, out_s, out_vh + + +# helper function to construct rational Pade +# numerator / denominator for expm(A): +# +def make_uv(A: ndarray, b: Any, m: int) -> tuple[ndarray, ndarray]: + # 1 + floor(m/2): + # + k = 1 + m // 2 + n = A.shape[0] + + U = zeros((n, n), dtype=A.dtype) + V = zeros((n, n), dtype=A.dtype) + + # U := A * ∑_{j=0, k} b_{2j+1} * A^{2j}; + # V := ∑_{j=0, k} b_{2j} * A^{2j}; + # + A2 = matmul(A, A) + A2k = eye(n, dtype=A.dtype) + for j in range(k): + U = U + b[2 * j + 1] * A2k + V = V + b[2 * j] * A2k + A2k = matmul(A2k, A2) + + U = matmul(A, U) + + return (U, V) + + +class ExpmConstants: + """ + Aggregates all the necessary expm(A) constants. 
+ """ + + # Pade `b` coefficient generators + # for both numerator `p(x)` and + # denominator `q(x)` coefficients + # + # dictionary key := `m`, degree of + # both `p(x)` and `q(x)` for + # diagonal Pade implementation; + # + b_coeff = { + 3: np.array([120, 60, 12, 1], dtype=np.float64), + 5: np.array([30240, 15120, 3360, 420, 30, 1], dtype=np.float64), + 7: np.array( + [17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1], + dtype=np.float64, + ), + 9: np.array( + [ + 17643225600, + 8821612800, + 2075673600, + 302702400, + 30270240, + 2162160, + 110880, + 3960, + 90, + 1, + ], + dtype=np.float64, + ), + 13: np.array( + [ + 64764752532480000, + 32382376266240000, + 7771770303897600, + 1187353796428800, + 129060195264000, + 10559470521600, + 670442572800, + 33522128640, + 1323241920, + 40840800, + 960960, + 16380, + 182, + 1, + ], + dtype=np.float64, + ), + } + + # Pade error control: absolute error tolerance + # parameter `theta`, also degree `m` dependent: + # + theta = { + 3: 1.5e-2, + 5: 2.5e-1, + 7: 9.5e-1, + 9: 2.1, + 13: 5.4, + } + + # Taylor-18 coefficients + # + a01 = 0 + a11 = -0.10036558103014462001 + a21 = -0.00802924648241156960 + a31 = -0.00089213849804572995 + + b01 = 0 + b11 = 0.39784974949964507614 + b21 = 1.36783778460411719922 + b31 = 0.49828962252538267755 + b61 = -0.00063789819459472330 + b02 = -10.9676396052962062593 + b12 = 1.68015813878906197182 + b22 = 0.05717798464788655127 + b32 = -0.00698210122488052084 + b62 = 0.00003349750170860705 + b03 = -0.09043168323908105619 + b13 = -0.06764045190713819075 + b23 = 0.06759613017704596460 + b33 = 0.02955525704293155274 + b63 = -0.00001391802575160607 + b04 = 0 + b14 = 0 + b24 = -0.09233646193671185927 + b34 = -0.01693649390020817171 + b64 = -0.00001400867981820361 + + # Taylor-18 error control (squaring and scalling decision): + # + theta_m = 1.09 + + +def expm_impl(a: ndarray, output: ndarray) -> tuple[int, int]: + """ + Implements Pade rational aproximant of + Algorithm 10.20, p.246-247 in + 
"Functions of Matrices - Theory and Computation", + Nicholas J. Higham, SIAM 2008. + """ + + lst_keys = list(ExpmConstants.theta.keys()) + + # maximum polynomial degree for [p(x)/q(x)]: + max_deg = lst_keys[-1] + + # L1 norm of matrix input: + l1_norm_a = norm(a, 1) + + # loop decides which Pade degree, `m`, to + # use, starting with the lowest degree + # up to the one before last degree; + # + # if neither satisfies the theta tolerance + # then exit the loop and proceed by using + # m=max_deg degree + scaling (to achieve + # desired tolerance); + # + requires_scaling = True + s = 0 + a_scaled = a + + for m in lst_keys[0:-1]: + tol_m = ExpmConstants.theta[m] + b_arr = ExpmConstants.b_coeff[m] + if l1_norm_a <= tol_m: + requires_scaling = False + break + + # at this point scaling + squaring with [max_deg/max_deg] + # Pade rational approximation is done; + # + # using [max_deg/max_deg] Pade with scaling A/(2^s) + # until || A / (2^s) ||_1 <= tol_13; + # i.e., s = ceil(log_2(|| A / (2^s) ||_1)): + # + if requires_scaling: + m = max_deg + tol_m = ExpmConstants.theta[m] + b_arr = ExpmConstants.b_coeff[m] + + s = np.maximum(1, int(np.ceil(np.log2(l1_norm_a / tol_m)))) + # + # scale `a` by sfactor = 1.0/2^s = 2^(-s): + # + sfactor = np.power(2.0, s) + # + # A' <- A / sfactor + # + a_scaled = a / sfactor + + # evaluate U, V matrices, via Eq. 10.33 of [1] + # k = 1 + floor(m/2): + # U := A * ∑_{j=0, k} b_{2j+1} * A^{2j}; + # V := ∑_{j=0, k} b_{2j} * A^{2j}; + # + (U, V) = make_uv(a_scaled, b_arr, m) + A = V - U + B = V + U + + # independently solve for each column: + # TODO: can more parallelism be harvested here? 
+ # at the very least avoid oversolving by + # doing LU / QR factorization once, followed + # by `n` backward-forward substitutions; + # + output[:] = solve(A, B) + + # if scaling by 1/2^s was done then + # squaring s times is necessary: + # + if requires_scaling: + for j in range(s): + output[:] = matmul(output, output) + + return (m, s) + + +def expm_expl(a: ndarray, output: ndarray) -> tuple[int, int]: + """ + Implements Taylor expansion, algorithm T_18 + in "Computing the Matrix Exponential with an + Optimized Taylor Polynomial Approximation", + Philipp Bader et. al., + which minimizes the number of matrix products + for given number of terms in the expansion. + """ + + tol_m = ExpmConstants.theta_m # may vary w/ degree, m, in future impls. + + # L1 norm of matrix input: + l1_norm_a = norm(a, 1) + + requires_scaling = l1_norm_a > tol_m + + s = 0 + A = a + m = 18 + + if requires_scaling: + s = np.maximum(1, int(np.ceil(np.log2(l1_norm_a / tol_m)))) + # + # scale `a` by sfactor = 1.0/2^s = 2^(-s): + # + sfactor = np.power(2.0, s) + # + # A' <- A / sfactor + # + A = a / sfactor + + EYE = eye(A.shape[0], dtype=A.dtype) + A2 = matmul(A, A) + A3 = matmul(A2, A) + A6 = matmul(A3, A3) + B1 = ( + ExpmConstants.a11 * A + ExpmConstants.a21 * A2 + ExpmConstants.a31 * A3 + ) + B2 = ( + ExpmConstants.b11 * A + + ExpmConstants.b21 * A2 + + ExpmConstants.b31 * A3 + + ExpmConstants.b61 * A6 + ) + B3 = ( + ExpmConstants.b02 * EYE + + ExpmConstants.b12 * A + + ExpmConstants.b22 * A2 + + ExpmConstants.b32 * A3 + + ExpmConstants.b62 * A6 + ) + B4 = ( + ExpmConstants.b03 * EYE + + ExpmConstants.b13 * A + + ExpmConstants.b23 * A2 + + ExpmConstants.b33 * A3 + + ExpmConstants.b63 * A6 + ) + B5 = ( + ExpmConstants.b24 * A2 + + ExpmConstants.b34 * A3 + + ExpmConstants.b64 * A6 + ) + + A9 = B4 + matmul(B1, B5) + B39 = B3 + A9 + + output[:] = B2 + matmul(B39, A9) + + # if scaling by 1/2^s was done then + # squaring s times is necessary: + # + if requires_scaling: + for j in range(s): + 
output[:] = matmul(output, output)
+
+    return (m, s)
+
+
+@add_boilerplate("a")
+def expm(a: ndarray, method: str = "pade") -> ndarray:
+    """
+    Matrix exponential.
+
+    Returns exp(A) for each (M x M) slice into a multi-dimensional
+    array, assumed to be of shape (..., M, M);
+
+    By default the Pade (implicit) implementation is used.
+    However, the explicit Taylor(deg = 18) implementation can be
+    selected by passing ``method='taylor'``.
+
+    Parameters
+    ----------
+    a : (..., M, M) array_like
+        Input matrix or multi-dimensional array of shape (..., M, M).
+
+    method : String method selector to use explicit ('taylor')
+        or implicit ('pade'); default = 'pade'.
+
+    Returns
+    -------
+    exp(A): matrix exponential of input, or a matrix exponential
+    for each slice in the input.
+
+    Notes
+    -----
+    Implicit Pade implementation is more stable but more computationally
+    intensive than explicit Taylor, which is less stable when matrix norm is
+    big enough. Also, Taylor can be slightly more performant for matrices of
+    small enough norms, but more memory consuming. 
+ + See Also + -------- + scipy.linalg.expm + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + + if a.ndim < 2 or a.shape[-1] != a.shape[-2] or a.size <= 1: + raise ValueError(f"Invalid input shape for expm: {a.shape}") + + output = zeros_like(a) + + m_info = get_machine() + num_PEs = m_info.count() + + # run implicit (Pade) method by default: + # + if method == "pade": + expm_func = expm_impl + elif method == "taylor": + expm_func = expm_expl + else: + raise ValueError(f"Method {method} not supported.") + + if num_PEs < 2: + for idx in np.ndindex(a.shape[:-2]): + mdeg, s = expm_func(a[idx], output[idx]) + else: + for idx in np.ndindex(a.shape[:-2]): + flat_index = np.ravel_multi_index(idx, a.shape[:-2]) + + # assign work to multiple GPUs in round-robin way: + # + findx = int(flat_index) + with m_info[findx % num_PEs]: + mdeg, s = expm_func(a[idx], output[idx]) + + return output diff --git a/cunumeric/ma/__init__.py b/cupynumeric/ma/__init__.py similarity index 100% rename from cunumeric/ma/__init__.py rename to cupynumeric/ma/__init__.py diff --git a/cunumeric/ma/_masked_array.py b/cupynumeric/ma/_masked_array.py similarity index 100% rename from cunumeric/ma/_masked_array.py rename to cupynumeric/ma/_masked_array.py diff --git a/cunumeric/patch.py b/cupynumeric/patch.py similarity index 76% rename from cunumeric/patch.py rename to cupynumeric/patch.py index 2cc72266e1..569499fc29 100644 --- a/cunumeric/patch.py +++ b/cupynumeric/patch.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -""" This module may be imported in order to globably replace NumPy with -CuNumeric. +"""This module may be imported in order to globably replace NumPy with +cuPyNumeric. In order to function properly, this module must be imported early (ideally at the very start of a script). 
The ``numpy`` module in ``sys.modules`` -will be replaced with ``cunumeric`` so that any subsequent use of the -``numpy`` module will use ``cunumeric`` instead. +will be replaced with ``cupynumeric`` so that any subsequent use of the +``numpy`` module will use ``cupynumeric`` instead. This module is primarily intended for quick demonstrations or proofs of concept. @@ -28,6 +28,6 @@ import sys -import cunumeric +import cupynumeric -sys.modules["numpy"] = cunumeric +sys.modules["numpy"] = cupynumeric diff --git a/cunumeric/py.typed b/cupynumeric/py.typed similarity index 100% rename from cunumeric/py.typed rename to cupynumeric/py.typed diff --git a/cunumeric/random/__init__.py b/cupynumeric/random/__init__.py similarity index 100% rename from cunumeric/random/__init__.py rename to cupynumeric/random/__init__.py diff --git a/cunumeric/random/_bitgenerator.py b/cupynumeric/random/_bitgenerator.py similarity index 99% rename from cunumeric/random/_bitgenerator.py rename to cupynumeric/random/_bitgenerator.py index 2dbd41a29b..c4f62691b1 100644 --- a/cunumeric/random/_bitgenerator.py +++ b/cupynumeric/random/_bitgenerator.py @@ -53,7 +53,7 @@ def __init__( numpy.random.BitGenerator Availability - -------- + ------------ Multiple GPUs, Multiple CPUs """ if type(self) is BitGenerator: @@ -68,8 +68,7 @@ def __init__( ) @abstractproperty - def generatorType(self) -> BitGeneratorType: - ... + def generatorType(self) -> BitGeneratorType: ... def __del__(self) -> None: if self.handle != 0: diff --git a/cunumeric/random/_generator.py b/cupynumeric/random/_generator.py similarity index 98% rename from cunumeric/random/_generator.py rename to cupynumeric/random/_generator.py index c84cce39de..4736bd8981 100644 --- a/cunumeric/random/_generator.py +++ b/cupynumeric/random/_generator.py @@ -43,8 +43,8 @@ def __init__(self, bit_generator: BitGenerator) -> None: then an array with that shape is filled and returned. 
- The function :func:`cunumeric.random.default_rng` will instantiate - a `Generator` with cuNumeric's default `BitGenerator`. + The function :func:`cupynumeric.random.default_rng` will instantiate + a `Generator` with cuPyNumeric's default `BitGenerator`. Parameters ---------- @@ -57,7 +57,7 @@ def __init__(self, bit_generator: BitGenerator) -> None: default_rng : Recommended constructor for `Generator`. Availability - -------- + ------------ Multiple GPUs, Multiple CPUs """ diff --git a/cunumeric/random/_random.py b/cupynumeric/random/_random.py similarity index 99% rename from cunumeric/random/_random.py rename to cupynumeric/random/_random.py index 8299da0608..6879e9053b 100644 --- a/cunumeric/random/_random.py +++ b/cupynumeric/random/_random.py @@ -1713,7 +1713,7 @@ def _random_state_fallback(obj: Any) -> Any: # wrapped vanilla NumPy RandomState if isinstance(obj, RandomState): return obj._np_random_state - # eagerly convert any cuNumeric ndarrays to NumPy + # eagerly convert any cuPyNumeric ndarrays to NumPy if isinstance(obj, ndarray): return obj.__array__() return obj diff --git a/cunumeric/runtime.py b/cupynumeric/runtime.py similarity index 92% rename from cunumeric/runtime.py rename to cupynumeric/runtime.py index 85cc7d9548..ca7a32bf43 100644 --- a/cunumeric/runtime.py +++ b/cupynumeric/runtime.py @@ -17,7 +17,7 @@ import math import warnings from functools import lru_cache, reduce -from typing import TYPE_CHECKING, Any, Sequence, TypeGuard +from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeGuard import legate.core.types as ty import numpy as np @@ -28,12 +28,12 @@ from ._utils.stack import find_last_user_stacklevel from .config import ( BitGeneratorOperation, - CuNumericOpCode, + CuPyNumericOpCode, TransferType, - cunumeric_lib, + cupynumeric_lib, ) -# We need to be careful about importing from other cunumeric modules here. The +# We need to be careful about importing from other cupynumeric modules. 
The # runtime is global and used in many places, but also depends on many of the # other modules. Things like config and utils are OK, but imports for thunks, # array types, etc. need to be deferred in order to avoid circular imports. @@ -75,7 +75,7 @@ def cached_thunk_from_scalar( class Runtime(object): def __init__(self) -> None: - self.library = legate_runtime.find_library(cunumeric_lib.name) + self.library = legate_runtime.find_library(cupynumeric_lib.name) self.current_random_epoch = 0 self.current_random_bitgenid = 0 self.current_random_bitgen_zombies: tuple[Any, ...] = () @@ -83,14 +83,14 @@ def __init__(self) -> None: self.api_calls: list[tuple[str, str, bool]] = [] max_eager_volume = ( - cunumeric_lib.shared_object.cunumeric_max_eager_volume() + cupynumeric_lib.shared_object.cupynumeric_max_eager_volume() ) self.max_eager_volume = int(np.asarray(max_eager_volume)) - assert cunumeric_lib.shared_object is not None - self.cunumeric_lib = cunumeric_lib.shared_object + assert cupynumeric_lib.shared_object is not None + self.cupynumeric_lib = cupynumeric_lib.shared_object self.has_cusolvermp = ( - cunumeric_lib.shared_object.cunumeric_has_cusolvermp() + cupynumeric_lib.shared_object.cupynumeric_has_cusolvermp() ) from .settings import settings @@ -103,6 +103,13 @@ def __init__(self) -> None: # Maps value types to struct types used in argmin/argmax self._cached_argred_types: dict[ty.Type, ty.Type] = dict() + def cusolver_has_geev(self) -> bool: + if not hasattr(self, "cusolver_has_geev_"): + self.cusolver_has_geev_ = ( + cupynumeric_lib.shared_object.cupynumeric_cusolver_has_geev() + ) + return self.cusolver_has_geev_ + @property def num_procs(self) -> int: return len(legate_runtime.machine) @@ -122,7 +129,7 @@ def record_api_call( def _load_cudalibs(self) -> None: task = legate_runtime.create_manual_task( self.library, - CuNumericOpCode.LOAD_CUDALIBS, + CuPyNumericOpCode.LOAD_CUDALIBS, [self.num_gpus], ) task.execute() @@ -134,7 +141,7 @@ def 
get_argred_type(self, value_dtype: ty.Type) -> ty.Type: return cached argred_dtype = ty.struct_type([ty.int64, value_dtype], True) self._cached_argred_types[value_dtype] = argred_dtype - ids = self.cunumeric_lib.cunumeric_register_reduction_ops( + ids = self.cupynumeric_lib.cupynumeric_register_reduction_ops( value_dtype.code ) argred_dtype.record_reduction_op( @@ -150,10 +157,10 @@ def _report_coverage(self) -> None: implemented = sum(int(impl) for (_, _, impl) in self.api_calls) if total == 0: - print("cuNumeric API coverage: 0/0") + print("cuPyNumeric API coverage: 0/0") else: print( - f"cuNumeric API coverage: {implemented}/{total} " + f"cuPyNumeric API coverage: {implemented}/{total} " f"({implemented / total * 100}%)" ) @@ -199,7 +206,7 @@ def bitgenerator_create( if forceCreate: task = legate_runtime.create_manual_task( self.library, - CuNumericOpCode.BITGENERATOR, + CuPyNumericOpCode.BITGENERATOR, (self.num_procs,), ) self.bitgenerator_populate_task( @@ -229,7 +236,7 @@ def bitgenerator_destroy( legate_runtime.issue_execution_fence() task = legate_runtime.create_manual_task( self.library, - CuNumericOpCode.BITGENERATOR, + CuPyNumericOpCode.BITGENERATOR, (self.num_procs,), ) self.bitgenerator_populate_task( @@ -395,7 +402,9 @@ def find_or_create_array_thunk( assert isinstance(array, np.ndarray) if not is_supported_dtype(array.dtype): - raise TypeError(f"cuNumeric does not support dtype={array.dtype}") + raise TypeError( + f"cuPyNumeric does not support dtype={array.dtype}" + ) # We have to be really careful here to handle the case of # aliased numpy arrays that are passed in from the application @@ -412,7 +421,7 @@ def find_or_create_array_thunk( if key is None: # This base array wasn't made with a view raise NotImplementedError( - "cuNumeric does not currently know " + "cuPyNumeric does not currently know " + "how to attach to array views that are not affine " + "transforms of their parent array." 
) @@ -471,10 +480,16 @@ def create_empty_thunk( shape: NdShape, dtype: ty.Type, inputs: Sequence[NumPyThunk] | None = None, + force_thunk: Literal["deferred"] | Literal["eager"] | None = None, ) -> NumPyThunk: from ._thunk.deferred import DeferredArray - if self.is_eager_shape(shape) and self.are_all_eager_inputs(inputs): + assert inputs is None or force_thunk is None + if force_thunk == "eager" or ( + force_thunk is None + and self.is_eager_shape(shape) + and self.are_all_eager_inputs(inputs) + ): return self.create_eager_thunk(shape, dtype.to_numpy_dtype()) store = legate_runtime.create_store( @@ -514,7 +529,7 @@ def is_eager_shape(self, shape: NdShape) -> bool: from .settings import settings - # CUNUMERIC_FORCE_THUNK == "eager" + # CUPYNUMERIC_FORCE_THUNK == "eager" if settings.force_thunk() == "eager": return True diff --git a/cunumeric/settings.py b/cupynumeric/settings.py similarity index 84% rename from cunumeric/settings.py rename to cupynumeric/settings.py index 292699d260..d73eee2616 100644 --- a/cunumeric/settings.py +++ b/cupynumeric/settings.py @@ -25,21 +25,21 @@ __all__ = ("settings",) -class CunumericRuntimeSettings(Settings): +class CupynumericRuntimeSettings(Settings): preload_cudalibs: PrioritizedSetting[bool] = PrioritizedSetting( "preload_cudalibs", - "CUNUMERIC_PRELOAD_CUDALIBS", + "CUPYNUMERIC_PRELOAD_CUDALIBS", default=False, convert=convert_bool, help=""" Preload and initialize handles of all CUDA libraries (cuBLAS, cuSOLVER, - etc.) used in cuNumeric. + etc.) used in cuPyNumeric. """, ) warn: PrioritizedSetting[bool] = PrioritizedSetting( "warn", - "CUNUMERIC_WARN", + "CUPYNUMERIC_WARN", default=False, convert=convert_bool, help=""" @@ -49,27 +49,27 @@ class CunumericRuntimeSettings(Settings): report_coverage: PrioritizedSetting[bool] = PrioritizedSetting( "report_coverage", - "CUNUMERIC_REPORT_COVERAGE", + "CUPYNUMERIC_REPORT_COVERAGE", default=False, convert=convert_bool, help=""" - Print an overall percentage of cunumeric coverage. 
+ Print an overall percentage of cupynumeric coverage. """, ) report_dump_callstack: PrioritizedSetting[bool] = PrioritizedSetting( "report_dump_callstack", - "CUNUMERIC_REPORT_DUMP_CALLSTACK", + "CUPYNUMERIC_REPORT_DUMP_CALLSTACK", default=False, convert=convert_bool, help=""" - Print an overall percentage of cunumeric coverage with call stack info. + Print an overall percentage of cupynumeric coverage with a call stack. """, ) report_dump_csv: PrioritizedSetting[str | None] = PrioritizedSetting( "report_dump_csv", - "CUNUMERIC_REPORT_DUMP_CSV", + "CUPYNUMERIC_REPORT_DUMP_CSV", default=None, help=""" Save a coverage report to a specified CSV file. @@ -78,11 +78,11 @@ class CunumericRuntimeSettings(Settings): numpy_compat: PrioritizedSetting[bool] = PrioritizedSetting( "numpy_compat", - "CUNUMERIC_NUMPY_COMPATIBILITY", + "CUPYNUMERIC_NUMPY_COMPATIBILITY", default=False, convert=convert_bool, help=""" - cuNumeric will issue additional tasks to match numpy's results + cuPyNumeric will issue additional tasks to match numpy's results and behavior. 
This is currently used in the following APIs: nanmin, nanmax, nanargmin, nanargmax """, @@ -90,7 +90,7 @@ class CunumericRuntimeSettings(Settings): fast_math: EnvOnlySetting[int] = EnvOnlySetting( "fast_math", - "CUNUMERIC_FAST_MATH", + "CUPYNUMERIC_FAST_MATH", default=False, convert=convert_bool, help=""" @@ -105,7 +105,7 @@ class CunumericRuntimeSettings(Settings): min_gpu_chunk: EnvOnlySetting[int] = EnvOnlySetting( "min_gpu_chunk", - "CUNUMERIC_MIN_GPU_CHUNK", + "CUPYNUMERIC_MIN_GPU_CHUNK", default=65536, # 1 << 16 test_default=2, convert=convert_int, @@ -121,7 +121,7 @@ class CunumericRuntimeSettings(Settings): min_cpu_chunk: EnvOnlySetting[int] = EnvOnlySetting( "min_cpu_chunk", - "CUNUMERIC_MIN_CPU_CHUNK", + "CUPYNUMERIC_MIN_CPU_CHUNK", default=1024, # 1 << 10 test_default=2, convert=convert_int, @@ -137,7 +137,7 @@ class CunumericRuntimeSettings(Settings): min_omp_chunk: EnvOnlySetting[int] = EnvOnlySetting( "min_omp_chunk", - "CUNUMERIC_MIN_OMP_CHUNK", + "CUPYNUMERIC_MIN_OMP_CHUNK", default=8192, # 1 << 13 test_default=2, convert=convert_int, @@ -153,15 +153,15 @@ class CunumericRuntimeSettings(Settings): force_thunk: EnvOnlySetting[str | None] = EnvOnlySetting( "force_thunk", - "CUNUMERIC_FORCE_THUNK", + "CUPYNUMERIC_FORCE_THUNK", default=None, test_default="deferred", help=""" - Force cuNumeric to always use a specific strategy for backing + Force cuPyNumeric to always use a specific strategy for backing ndarrays: "deferred", i.e. managed by the Legate runtime, which enables distribution and accelerated operations, but has some up-front offloading overhead, or "eager", i.e. falling back to - using a vanilla NumPy array. By default cuNumeric will decide + using a vanilla NumPy array. By default cuPyNumeric will decide this on a per-array basis, based on the size of the array and the accelerator in use. 
@@ -171,12 +171,12 @@ class CunumericRuntimeSettings(Settings): matmul_cache_size: EnvOnlySetting[int] = EnvOnlySetting( "matmul_cache_size", - "CUNUMERIC_MATMUL_CACHE_SIZE", + "CUPYNUMERIC_MATMUL_CACHE_SIZE", default=134217728, # 128MB test_default=4096, # 4KB convert=convert_int, help=""" - Force cuNumeric to keep temporary task slices during matmul + Force cuPyNumeric to keep temporary task slices during matmul computations smaller than this threshold. Whenever the temporary space needed during computation would exceed this value the task will be batched over 'k' to fulfill the requirement. @@ -186,4 +186,4 @@ class CunumericRuntimeSettings(Settings): ) -settings = CunumericRuntimeSettings() +settings = CupynumericRuntimeSettings() diff --git a/cunumeric/types.py b/cupynumeric/types.py similarity index 95% rename from cunumeric/types.py rename to cupynumeric/types.py index 35f2e012f5..f2fbf83114 100644 --- a/cunumeric/types.py +++ b/cupynumeric/types.py @@ -34,4 +34,6 @@ ConvolveMode: TypeAlias = Literal["full", "valid", "same"] +ConvolveMethod: TypeAlias = Literal["auto", "direct", "fft"] + SelectKind: TypeAlias = Literal["introselect"] diff --git a/cupynumeric_cpp.cmake b/cupynumeric_cpp.cmake new file mode 100644 index 0000000000..2a56ccbc0a --- /dev/null +++ b/cupynumeric_cpp.cmake @@ -0,0 +1,539 @@ +#============================================================================= +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +############################################################################## +# - User Options ------------------------------------------------------------ + +option(BUILD_SHARED_LIBS "Build cuPyNumeric shared libraries" ON) +option(cupynumeric_EXCLUDE_TBLIS_FROM_ALL "Exclude tblis targets from cuPyNumeric's 'all' target" OFF) +option(cupynumeric_EXCLUDE_OPENBLAS_FROM_ALL "Exclude OpenBLAS targets from cuPyNumeric's 'all' target" OFF) +option(cupynumeric_EXCLUDE_LEGATE_FROM_ALL "Exclude legate targets from cuPyNumeric's 'all' target" OFF) + +############################################################################## +# - Project definition ------------------------------------------------------- + +# Write the version header +rapids_cmake_write_version_file(include/cupynumeric/version_config.hpp) + +# Needed to integrate with LLVM/clang tooling +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +############################################################################## +# - Build Type --------------------------------------------------------------- + +# Set a default build type if none was specified +rapids_cmake_build_type(Release) + +############################################################################## +# - conda environment -------------------------------------------------------- + +rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) + +# We're building python extension libraries, which must always be installed +# under lib/, even if the system normally uses lib64/. 
Rapids-cmake currently +# doesn't realize this when we're going through scikit-build, see +# https://github.com/rapidsai/rapids-cmake/issues/426 +if(TARGET conda_env) + set(CMAKE_INSTALL_LIBDIR "lib") +endif() + +############################################################################## +# - Dependencies ------------------------------------------------------------- + +# add third party dependencies using CPM +rapids_cpm_init(OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/versions.json) + +rapids_find_package(OpenMP GLOBAL_TARGETS OpenMP::OpenMP_CXX) + +option(Legion_USE_CUDA "Use CUDA" ON) +option(Legion_USE_OpenMP "Use OpenMP" ${OpenMP_FOUND}) +option(Legion_BOUNDS_CHECKS "Build cuPyNumeric with bounds checks (expensive)" OFF) + +### +# If we find legate already configured on the system, it will report +# whether it was compiled with bounds checking (Legion_BOUNDS_CHECKS), +# CUDA (Legion_USE_CUDA), and OpenMP (Legion_USE_OpenMP). +# +# We use the same variables as legate because we want to enable/disable +# each of these features based on how legate was configured (it doesn't +# make sense to build cuPyNumeric's CUDA bindings if legate wasn't built +# with CUDA support). +### +include(thirdparty/get_legate) + +# Use of DEFINED is deliberate. CMAKE_CUDA_ARCHITECTURES may be OFF which we want to leave +# in place. Legion_CUDA_ARCH is defined by Legate. 
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "${Legion_CUDA_ARCH}") +endif() + +if(Legion_USE_CUDA) + include(Modules/cuda_arch_helpers) + # Needs to run before `rapids_cuda_init_architectures` + set_cuda_arch_from_names() + # Needs to run before `enable_language(CUDA)` + rapids_cuda_init_architectures(cupynumeric) + message(STATUS "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + enable_language(CUDA) + # Since cupynumeric only enables CUDA optionally we need to manually include + # the file that rapids_cuda_init_architectures relies on `project` calling + if(CMAKE_PROJECT_cupynumeric_INCLUDE) + include("${CMAKE_PROJECT_cupynumeric_INCLUDE}") + endif() + + # Must come after enable_language(CUDA) + # Use `-isystem ` instead of `-isystem=` + # because the former works with clangd intellisense + set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-isystem ") + + rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports + ) + + include(thirdparty/get_nccl) + include(thirdparty/get_cutensor) +endif() + +include(thirdparty/get_openblas) + +include(thirdparty/get_tblis) + +############################################################################## +# - cuPyNumeric ---------------------------------------------------------------- + +add_library(cupynumeric) +add_library(cupynumeric::cupynumeric ALIAS cupynumeric) + +set(cupynumeric_CXX_OPTIONS "") +set(cupynumeric_CUDA_OPTIONS "") + +include(Modules/set_cpu_arch_flags) +set_cpu_arch_flags(cupynumeric_CXX_OPTIONS) + +# Add `src/cupynumeric.mk` sources +target_sources(cupynumeric PRIVATE + src/cupynumeric/ternary/where.cc + src/cupynumeric/scan/scan_global.cc + src/cupynumeric/scan/scan_local.cc + src/cupynumeric/binary/binary_op.cc + src/cupynumeric/binary/binary_op_util.cc + src/cupynumeric/binary/binary_red.cc + src/cupynumeric/bits/packbits.cc + src/cupynumeric/bits/unpackbits.cc + src/cupynumeric/unary/scalar_unary_red.cc + 
src/cupynumeric/unary/unary_op.cc + src/cupynumeric/unary/unary_red.cc + src/cupynumeric/unary/convert.cc + src/cupynumeric/nullary/arange.cc + src/cupynumeric/nullary/eye.cc + src/cupynumeric/nullary/fill.cc + src/cupynumeric/nullary/window.cc + src/cupynumeric/index/advanced_indexing.cc + src/cupynumeric/index/choose.cc + src/cupynumeric/index/putmask.cc + src/cupynumeric/index/repeat.cc + src/cupynumeric/index/select.cc + src/cupynumeric/index/wrap.cc + src/cupynumeric/index/zip.cc + src/cupynumeric/item/read.cc + src/cupynumeric/item/write.cc + src/cupynumeric/matrix/batched_cholesky.cc + src/cupynumeric/matrix/contract.cc + src/cupynumeric/matrix/diag.cc + src/cupynumeric/matrix/geev.cc + src/cupynumeric/matrix/gemm.cc + src/cupynumeric/matrix/matmul.cc + src/cupynumeric/matrix/matvecmul.cc + src/cupynumeric/matrix/dot.cc + src/cupynumeric/matrix/potrf.cc + src/cupynumeric/matrix/qr.cc + src/cupynumeric/matrix/solve.cc + src/cupynumeric/matrix/svd.cc + src/cupynumeric/matrix/syrk.cc + src/cupynumeric/matrix/tile.cc + src/cupynumeric/matrix/transpose.cc + src/cupynumeric/matrix/trilu.cc + src/cupynumeric/matrix/trsm.cc + src/cupynumeric/matrix/util.cc + src/cupynumeric/random/bitgenerator.cc + src/cupynumeric/random/randutil/generator_host.cc + src/cupynumeric/random/randutil/generator_host_straightforward.cc + src/cupynumeric/random/randutil/generator_host_advanced.cc + src/cupynumeric/random/rand.cc + src/cupynumeric/search/argwhere.cc + src/cupynumeric/search/nonzero.cc + src/cupynumeric/set/unique.cc + src/cupynumeric/set/unique_reduce.cc + src/cupynumeric/stat/bincount.cc + src/cupynumeric/convolution/convolve.cc + src/cupynumeric/transform/flip.cc + src/cupynumeric/utilities/repartition.cc + src/cupynumeric/arg_redop_register.cc + src/cupynumeric/mapper.cc + src/cupynumeric/ndarray.cc + src/cupynumeric/operators.cc + src/cupynumeric/runtime.cc + src/cupynumeric/cephes/chbevl.cc + src/cupynumeric/cephes/i0.cc + src/cupynumeric/stat/histogram.cc +) + 
+if(Legion_USE_OpenMP) + target_sources(cupynumeric PRIVATE + src/cupynumeric/ternary/where_omp.cc + src/cupynumeric/scan/scan_global_omp.cc + src/cupynumeric/scan/scan_local_omp.cc + src/cupynumeric/binary/binary_op_omp.cc + src/cupynumeric/binary/binary_red_omp.cc + src/cupynumeric/bits/packbits_omp.cc + src/cupynumeric/bits/unpackbits_omp.cc + src/cupynumeric/unary/unary_op_omp.cc + src/cupynumeric/unary/scalar_unary_red_omp.cc + src/cupynumeric/unary/unary_red_omp.cc + src/cupynumeric/unary/convert_omp.cc + src/cupynumeric/nullary/arange_omp.cc + src/cupynumeric/nullary/eye_omp.cc + src/cupynumeric/nullary/fill_omp.cc + src/cupynumeric/nullary/window_omp.cc + src/cupynumeric/index/advanced_indexing_omp.cc + src/cupynumeric/index/choose_omp.cc + src/cupynumeric/index/putmask_omp.cc + src/cupynumeric/index/repeat_omp.cc + src/cupynumeric/index/select_omp.cc + src/cupynumeric/index/wrap_omp.cc + src/cupynumeric/index/zip_omp.cc + src/cupynumeric/matrix/batched_cholesky_omp.cc + src/cupynumeric/matrix/contract_omp.cc + src/cupynumeric/matrix/diag_omp.cc + src/cupynumeric/matrix/geev_omp.cc + src/cupynumeric/matrix/gemm_omp.cc + src/cupynumeric/matrix/matmul_omp.cc + src/cupynumeric/matrix/matvecmul_omp.cc + src/cupynumeric/matrix/dot_omp.cc + src/cupynumeric/matrix/potrf_omp.cc + src/cupynumeric/matrix/qr_omp.cc + src/cupynumeric/matrix/solve_omp.cc + src/cupynumeric/matrix/svd_omp.cc + src/cupynumeric/matrix/syrk_omp.cc + src/cupynumeric/matrix/tile_omp.cc + src/cupynumeric/matrix/transpose_omp.cc + src/cupynumeric/matrix/trilu_omp.cc + src/cupynumeric/matrix/trsm_omp.cc + src/cupynumeric/random/rand_omp.cc + src/cupynumeric/search/argwhere_omp.cc + src/cupynumeric/search/nonzero_omp.cc + src/cupynumeric/set/unique_omp.cc + src/cupynumeric/set/unique_reduce_omp.cc + src/cupynumeric/stat/bincount_omp.cc + src/cupynumeric/convolution/convolve_omp.cc + src/cupynumeric/transform/flip_omp.cc + src/cupynumeric/stat/histogram_omp.cc + ) +endif() + +if(Legion_USE_CUDA) + 
target_sources(cupynumeric PRIVATE + src/cupynumeric/ternary/where.cu + src/cupynumeric/scan/scan_global.cu + src/cupynumeric/scan/scan_local.cu + src/cupynumeric/binary/binary_op.cu + src/cupynumeric/binary/binary_red.cu + src/cupynumeric/bits/packbits.cu + src/cupynumeric/bits/unpackbits.cu + src/cupynumeric/unary/scalar_unary_red.cu + src/cupynumeric/unary/unary_red.cu + src/cupynumeric/unary/unary_op.cu + src/cupynumeric/unary/convert.cu + src/cupynumeric/nullary/arange.cu + src/cupynumeric/nullary/eye.cu + src/cupynumeric/nullary/fill.cu + src/cupynumeric/nullary/window.cu + src/cupynumeric/index/advanced_indexing.cu + src/cupynumeric/index/choose.cu + src/cupynumeric/index/putmask.cu + src/cupynumeric/index/repeat.cu + src/cupynumeric/index/select.cu + src/cupynumeric/index/wrap.cu + src/cupynumeric/index/zip.cu + src/cupynumeric/item/read.cu + src/cupynumeric/item/write.cu + src/cupynumeric/matrix/batched_cholesky.cu + src/cupynumeric/matrix/contract.cu + src/cupynumeric/matrix/diag.cu + src/cupynumeric/matrix/geev.cu + src/cupynumeric/matrix/gemm.cu + src/cupynumeric/matrix/matmul.cu + src/cupynumeric/matrix/matvecmul.cu + src/cupynumeric/matrix/dot.cu + src/cupynumeric/matrix/potrf.cu + src/cupynumeric/matrix/qr.cu + src/cupynumeric/matrix/solve.cu + src/cupynumeric/matrix/svd.cu + src/cupynumeric/matrix/syrk.cu + src/cupynumeric/matrix/tile.cu + src/cupynumeric/matrix/transpose.cu + src/cupynumeric/matrix/trilu.cu + src/cupynumeric/matrix/trsm.cu + src/cupynumeric/random/rand.cu + src/cupynumeric/search/argwhere.cu + src/cupynumeric/search/nonzero.cu + src/cupynumeric/set/unique.cu + src/cupynumeric/stat/bincount.cu + src/cupynumeric/convolution/convolve.cu + src/cupynumeric/fft/fft.cu + src/cupynumeric/transform/flip.cu + src/cupynumeric/utilities/repartition.cu + src/cupynumeric/arg_redop_register.cu + src/cupynumeric/cudalibs.cu + src/cupynumeric/stat/histogram.cu + ) +endif() + +# Add `src/cupynumeric/sort/sort.mk` sources +target_sources(cupynumeric 
PRIVATE + src/cupynumeric/sort/sort.cc + src/cupynumeric/sort/searchsorted.cc +) + +if(Legion_USE_OpenMP) + target_sources(cupynumeric PRIVATE + src/cupynumeric/sort/sort_omp.cc + src/cupynumeric/sort/searchsorted_omp.cc + ) +endif() + +if(Legion_USE_CUDA) + target_sources(cupynumeric PRIVATE + src/cupynumeric/sort/sort.cu + src/cupynumeric/sort/searchsorted.cu + src/cupynumeric/sort/cub_sort_bool.cu + src/cupynumeric/sort/cub_sort_int8.cu + src/cupynumeric/sort/cub_sort_int16.cu + src/cupynumeric/sort/cub_sort_int32.cu + src/cupynumeric/sort/cub_sort_int64.cu + src/cupynumeric/sort/cub_sort_uint8.cu + src/cupynumeric/sort/cub_sort_uint16.cu + src/cupynumeric/sort/cub_sort_uint32.cu + src/cupynumeric/sort/cub_sort_uint64.cu + src/cupynumeric/sort/cub_sort_half.cu + src/cupynumeric/sort/cub_sort_float.cu + src/cupynumeric/sort/cub_sort_double.cu + src/cupynumeric/sort/thrust_sort_bool.cu + src/cupynumeric/sort/thrust_sort_int8.cu + src/cupynumeric/sort/thrust_sort_int16.cu + src/cupynumeric/sort/thrust_sort_int32.cu + src/cupynumeric/sort/thrust_sort_int64.cu + src/cupynumeric/sort/thrust_sort_uint8.cu + src/cupynumeric/sort/thrust_sort_uint16.cu + src/cupynumeric/sort/thrust_sort_uint32.cu + src/cupynumeric/sort/thrust_sort_uint64.cu + src/cupynumeric/sort/thrust_sort_half.cu + src/cupynumeric/sort/thrust_sort_float.cu + src/cupynumeric/sort/thrust_sort_double.cu + src/cupynumeric/sort/thrust_sort_complex64.cu + src/cupynumeric/sort/thrust_sort_complex128.cu + ) +endif() + +# Add `src/cupynumeric/random/random.mk` sources +if(Legion_USE_CUDA) + target_sources(cupynumeric PRIVATE + src/cupynumeric/random/bitgenerator.cu + src/cupynumeric/random/randutil/generator_device.cu + src/cupynumeric/random/randutil/generator_device_straightforward.cu + src/cupynumeric/random/randutil/generator_device_advanced.cu + ) +endif() + +# add sources for cusolverMp +if(Legion_USE_CUDA AND CUSOLVERMP_DIR) + target_sources(cupynumeric PRIVATE + src/cupynumeric/matrix/mp_potrf.cu + 
src/cupynumeric/matrix/mp_solve.cu + ) +endif() + +target_sources(cupynumeric PRIVATE + # This must always be the last file! + # It guarantees we do our registration callback + # only after all task variants are recorded + src/cupynumeric/cupynumeric.cc +) + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_definitions(cupynumeric PUBLIC "$<$:DEBUG_CUPYNUMERIC>") +endif() + +if(Legion_BOUNDS_CHECKS) + target_compile_definitions(cupynumeric PUBLIC "$<$:BOUNDS_CHECKS>") +endif() + +list(APPEND cupynumeric_CUDA_OPTIONS -Xfatbin=-compress-all) +list(APPEND cupynumeric_CUDA_OPTIONS --expt-extended-lambda) +list(APPEND cupynumeric_CUDA_OPTIONS --expt-relaxed-constexpr) +list(APPEND cupynumeric_CXX_OPTIONS -Wno-deprecated-declarations) +list(APPEND cupynumeric_CUDA_OPTIONS -Wno-deprecated-declarations) + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(platform_rpath_origin "\$ORIGIN") +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(platform_rpath_origin "@loader_path") +endif () + +set_target_properties(cupynumeric + PROPERTIES BUILD_RPATH "${platform_rpath_origin}" + INSTALL_RPATH "${platform_rpath_origin}" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + LIBRARY_OUTPUT_DIRECTORY lib) + +target_link_libraries(cupynumeric + PUBLIC legate::legate + $ + PRIVATE BLAS::BLAS + tblis::tblis + # Add Conda library and include paths + $ + $ + $ + $ + $ + $) + +if(NOT Legion_USE_CUDA AND cupynumeric_cuRAND_INCLUDE_DIR) + target_compile_definitions(cupynumeric + PUBLIC "$<$:CUPYNUMERIC_CURAND_FOR_CPU_BUILD>") + target_include_directories(cupynumeric PRIVATE ${cupynumeric_cuRAND_INCLUDE_DIR}) +endif() + +if(Legion_USE_CUDA AND CUSOLVERMP_DIR) + message(VERBOSE "cupynumeric: CUSOLVERMP_DIR ${CUSOLVERMP_DIR}") + target_compile_definitions(cupynumeric PUBLIC "$<$:CUPYNUMERIC_USE_CUSOLVERMP>") + target_include_directories(cupynumeric PRIVATE 
${CUSOLVERMP_DIR}/include) + target_link_libraries(cupynumeric PRIVATE ${CUSOLVERMP_DIR}/lib/libcusolverMp.so) +endif() + +target_compile_options(cupynumeric + PRIVATE "$<$:${cupynumeric_CXX_OPTIONS}>" + "$<$:${cupynumeric_CUDA_OPTIONS}>") + +target_include_directories(cupynumeric + PUBLIC + $ + INTERFACE + $ +) + +if(Legion_USE_CUDA) + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" +[=[ +SECTIONS +{ +.nvFatBinSegment : { *(.nvFatBinSegment) } +.nv_fatbin : { *(.nv_fatbin) } +} +]=]) + + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(cupynumeric PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") +endif() + +############################################################################## +# - install targets----------------------------------------------------------- + +include(CPack) +include(GNUInstallDirs) +rapids_cmake_install_lib_dir(lib_dir) + +install(TARGETS cupynumeric + DESTINATION ${lib_dir} + EXPORT cupynumeric-exports) + +install( + FILES src/cupynumeric.h + ${CMAKE_CURRENT_BINARY_DIR}/include/cupynumeric/version_config.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cupynumeric) + +install( + FILES src/cupynumeric/cupynumeric_c.h + src/cupynumeric/ndarray.h + src/cupynumeric/ndarray.inl + src/cupynumeric/operators.h + src/cupynumeric/operators.inl + src/cupynumeric/runtime.h + src/cupynumeric/slice.h + src/cupynumeric/typedefs.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cupynumeric/cupynumeric) + +if(cupynumeric_INSTALL_TBLIS) + install(DIRECTORY ${tblis_BINARY_DIR}/lib/ DESTINATION ${lib_dir}) + install(DIRECTORY ${tblis_BINARY_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +endif() + +############################################################################## +# - install export ----------------------------------------------------------- + +set(doc_string + [=[ +Provide targets for cuPyNumeric, an aspiring drop-in replacement for NumPy at scale. 
+ +Imported Targets: + - cupynumeric::cupynumeric + +]=]) + +string(JOIN "\n" code_string + "set(Legion_USE_CUDA ${Legion_USE_CUDA})" + "set(Legion_USE_OpenMP ${Legion_USE_OpenMP})" + "set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS})" +) + +if(DEFINED Legion_USE_Python) + string(APPEND code_string "\nset(Legion_USE_Python ${Legion_USE_Python})") +endif() + +if(DEFINED Legion_NETWORKS) + string(APPEND code_string "\nset(Legion_NETWORKS ${Legion_NETWORKS})") +endif() + +rapids_export( + INSTALL cupynumeric + EXPORT_SET cupynumeric-exports + GLOBAL_TARGETS cupynumeric + NAMESPACE cupynumeric:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string) + +# build export targets +rapids_export( + BUILD cupynumeric + EXPORT_SET cupynumeric-exports + GLOBAL_TARGETS cupynumeric + NAMESPACE cupynumeric:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK code_string) + +if(cupynumeric_BUILD_TESTS) + include(CTest) + + add_subdirectory(tests/cpp) +endif() diff --git a/cunumeric_python.cmake b/cupynumeric_python.cmake similarity index 69% rename from cunumeric_python.cmake rename to cupynumeric_python.cmake index 3c4b891cfd..1be5b35c62 100644 --- a/cunumeric_python.cmake +++ b/cupynumeric_python.cmake @@ -17,25 +17,25 @@ ############################################################################## # - User Options ------------------------------------------------------------ -option(FIND_CUNUMERIC_CPP "Search for existing cuNumeric C++ installations before defaulting to local files" +option(FIND_CUPYNUMERIC_CPP "Search for existing cuPyNumeric C++ installations before defaulting to local files" OFF) ############################################################################## # - Dependencies ------------------------------------------------------------- -# If the user requested it we attempt to find cunumeric. -if(FIND_CUNUMERIC_CPP) +# If the user requested it we attempt to find cupynumeric. 
+if(FIND_CUPYNUMERIC_CPP) include("${rapids-cmake-dir}/export/detail/parse_version.cmake") - rapids_export_parse_version(${cunumeric_version} cunumeric parsed_ver) - rapids_find_package(cunumeric ${parsed_ver} EXACT CONFIG - GLOBAL_TARGETS cunumeric::cunumeric - BUILD_EXPORT_SET cunumeric-python-exports - INSTALL_EXPORT_SET cunumeric-python-exports) + rapids_export_parse_version(${cupynumeric_version} cupynumeric parsed_ver) + rapids_find_package(cupynumeric ${parsed_ver} EXACT CONFIG + GLOBAL_TARGETS cupynumeric::cupynumeric + BUILD_EXPORT_SET cupynumeric-python-exports + INSTALL_EXPORT_SET cupynumeric-python-exports) else() - set(cunumeric_FOUND OFF) + set(cupynumeric_FOUND OFF) endif() -if(NOT cunumeric_FOUND) +if(NOT cupynumeric_FOUND) set(SKBUILD OFF) set(Legion_USE_Python ON) set(Legion_BUILD_BINDINGS ON) @@ -51,9 +51,9 @@ add_custom_target("generate_install_info_py" ALL VERBATIM ) -add_library(cunumeric_python INTERFACE) -add_library(cunumeric::cunumeric_python ALIAS cunumeric_python) -target_link_libraries(cunumeric_python INTERFACE legate::legate) +add_library(cupynumeric_python INTERFACE) +add_library(cupynumeric::cupynumeric_python ALIAS cupynumeric_python) +target_link_libraries(cupynumeric_python INTERFACE legate::legate) # ############################################################################ # - conda environment -------------------------------------------------------- @@ -75,37 +75,37 @@ include(CPack) include(GNUInstallDirs) rapids_cmake_install_lib_dir(lib_dir) -install(TARGETS cunumeric_python +install(TARGETS cupynumeric_python DESTINATION ${lib_dir} - EXPORT cunumeric-python-exports) + EXPORT cupynumeric-python-exports) ############################################################################## # - install export ----------------------------------------------------------- set(doc_string [=[ -Provide Python targets for cuNumeric, an aspiring drop-in replacement for NumPy at scale. 
+Provide Python targets for cuPyNumeric, an aspiring drop-in replacement for NumPy at scale. Imported Targets: - - cunumeric::cunumeric_python + - cupynumeric::cupynumeric_python ]=]) set(code_string "") rapids_export( - INSTALL cunumeric_python - EXPORT_SET cunumeric-python-exports - GLOBAL_TARGETS cunumeric_python - NAMESPACE cunumeric:: + INSTALL cupynumeric_python + EXPORT_SET cupynumeric-python-exports + GLOBAL_TARGETS cupynumeric_python + NAMESPACE cupynumeric:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) # build export targets rapids_export( - BUILD cunumeric_python - EXPORT_SET cunumeric-python-exports - GLOBAL_TARGETS cunumeric_python - NAMESPACE cunumeric:: + BUILD cupynumeric_python + EXPORT_SET cupynumeric-python-exports + GLOBAL_TARGETS cupynumeric_python + NAMESPACE cupynumeric:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) diff --git a/docs/cunumeric/source/_templates/layout.html b/docs/cunumeric/source/_templates/layout.html deleted file mode 100644 index 2f473f38ee..0000000000 --- a/docs/cunumeric/source/_templates/layout.html +++ /dev/null @@ -1,13 +0,0 @@ -{% extends "!layout.html" %} - -{% block extrahead %} - - - -{% endblock %} - -{% block footer %} - - - -{% endblock %} diff --git a/docs/cunumeric/source/api/broadcast.rst b/docs/cunumeric/source/api/broadcast.rst deleted file mode 100644 index 50d329a2e8..0000000000 --- a/docs/cunumeric/source/api/broadcast.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. currentmodule:: cunumeric - -cunumeric.broadcast -=================== - -.. autoclass:: broadcast - :members: \ No newline at end of file diff --git a/docs/cunumeric/source/api/comparison.rst b/docs/cunumeric/source/api/comparison.rst deleted file mode 100644 index 139a02d76e..0000000000 --- a/docs/cunumeric/source/api/comparison.rst +++ /dev/null @@ -1,12 +0,0 @@ -Project comparisons -=================== - -Here is a list of NumPy APIs and corresponding cuNumeric implementations. 
- -A dot in the cunumeric column denotes that cuNumeric implementation -is not provided yet. We welcome contributions for these functions. - -NumPy vs cuNumeric APIs ------------------------ - -.. comparison-table:: diff --git a/docs/cunumeric/source/api/settings.rst b/docs/cunumeric/source/api/settings.rst deleted file mode 100644 index abc807f0b4..0000000000 --- a/docs/cunumeric/source/api/settings.rst +++ /dev/null @@ -1,8 +0,0 @@ -Settings -======== - -cuNumeric has a number of runtime settings that can be configured through -environment variables. - -.. settings:: settings - :module: cunumeric.settings \ No newline at end of file diff --git a/docs/cunumeric/source/developer/CONTRIBUTING.md b/docs/cunumeric/source/developer/CONTRIBUTING.md deleted file mode 120000 index 069558fad2..0000000000 --- a/docs/cunumeric/source/developer/CONTRIBUTING.md +++ /dev/null @@ -1 +0,0 @@ -../../../../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/cunumeric/source/developer/building.rst b/docs/cunumeric/source/developer/building.rst deleted file mode 100644 index b4ba151e99..0000000000 --- a/docs/cunumeric/source/developer/building.rst +++ /dev/null @@ -1,71 +0,0 @@ -.. _building cunumeric from source: - -Building from source -==================== - -Basic build ------------ - -Users must have a working installation of the `Legate`_ library prior to -installing cuNumeric. -**Installing cuNumeric by itself will not automatically install Legate.** - -As for other dependencies, the Dependencies section on the -`Legate build instructions`_ also covers cuNumeric, so no additional -packages are required. - -Once Legate is installed, you can simply invoke ``./install.py`` from the -cuNumeric top-level directory. The build will automatically pick up the -configuration used when building Legate (e.g. the CUDA Toolkit directory). 
- -Advanced topics ---------------- - -Building through pip & cmake -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -cuNumeric uses the same cmake/scikit-build-based build workflow as Legate. -See the `Legate build instructions`_ for an overview. - -There are several examples in the ``scripts`` folder. We walk through the steps in -``build-with-legate-separately-no-install.sh`` here. - -We assume a pre-existing Legate build. For details on building Legate, -consult the `Legate repository`_. - -First, the CMake build needs to be configured: - -.. code:: sh - - $ cmake -S . -B build -GNinja -D legate_ROOT:STRING=path/to/legate/build - -We point cuNumeric to the Legate *build* tree, not an installation. -This generates all build-dependent headers and Python files. - -Once configured, we can build the C++ libraries: - -.. code:: sh - - $ cmake --build build - -This will invoke Ninja (or make) to execute the build. -Once the C++ libraries are available, we can do an editable (development) pip installation. - -.. code:: sh - - $ SKBUILD_BUILD_OPTIONS="-D FIND_CUNUMERIC_CPP=ON -D cunumeric_ROOT=$(pwd)/build" \ - python3 -m pip install \ - --root / --no-deps --no-build-isolation - --editable . - -The Python source tree and CMake build tree are now available with the environment Python -for running cuNumeric programs. The diagram below illustrates the -complete workflow for building both Legate and cuNumeric. - -.. image:: /_images/developer-build.png - :width: 600 - :alt: "notional diagram of cunumeric build process" - -.. _Legate: https://github.com/nv-legate/legate.core -.. _Legate build instructions: https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md -.. 
_Legate repository: https://github.com/nv-legate/legate.core diff --git a/docs/cunumeric/source/index.rst b/docs/cunumeric/source/index.rst deleted file mode 100644 index afd32f6530..0000000000 --- a/docs/cunumeric/source/index.rst +++ /dev/null @@ -1,41 +0,0 @@ -:html_theme.sidebar_secondary.remove: - -NVIDIA cuNumeric -================ - -cuNumeric is a `Legate`_ library that aims to provide a distributed and -accelerated drop-in replacement for the `NumPy API`_ on top of the `Legion`_ -runtime. - -Using cuNumeric you do things like run the final example of the -`Python CFD course`_ completely unmodified on 2048 A100 GPUs in a -`DGX SuperPOD`_ and achieve good weak scaling. - -.. toctree:: - :maxdepth: 1 - :caption: Contents: - - installation - user/index - examples/index - api/index - faqs - developer/index - -.. toctree:: - :maxdepth: 1 - - versions - - -Indices and tables ------------------- - -* :ref:`genindex` -* :ref:`search` - -.. _DGX SuperPOD: https://www.nvidia.com/en-us/data-center/dgx-superpod/ -.. _Legate: https://github.com/nv-legate/legate.core -.. _Legion: https://legion.stanford.edu/ -.. _Numpy API: https://numpy.org/doc/stable/reference/ -.. _Python CFD course: https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb \ No newline at end of file diff --git a/docs/cunumeric/source/installation.rst b/docs/cunumeric/source/installation.rst deleted file mode 100644 index 7e4a3d4720..0000000000 --- a/docs/cunumeric/source/installation.rst +++ /dev/null @@ -1,58 +0,0 @@ -Installation -============ - -Default conda install ---------------------- - -cuNumeric is available from -`conda `_ -on the `legate channel `_. -Please make sure you have at least conda version 24.1 installed, then create -a new environment containing cuNumeric: - -.. code-block:: sh - - conda create -n myenv -c conda-forge -c legate cunumeric - -or install it into an existing environment: - -.. 
code-block:: sh - - conda install -c conda-forge -c legate cunumeric - -Packages with GPU support are available, and will be chosen automatically by -``conda install`` on systems with GPUs. - -In an environment without GPUs available, ``conda install`` will by default -choose a CPU-only package. To install a version with GPU support in such an -environment, use environment variable ``CONDA_OVERRIDE_CUDA``: - -.. code-block:: sh - - CONDA_OVERRIDE_CUDA="12.2" \ - conda install -c conda-forge -c legate cunumeric - -Once installed, you can verify the installation by running one of the examples -from the cuNumeric repository, for instance: - -.. code-block:: sh - - $ legate examples/black_scholes.py - Running black scholes on 10K options... - Elapsed Time: 129.017 ms - -Building from source ---------------------- - -See :ref:`building cunumeric from source` for instructions on building -cuNumeric manually. - -Licenses --------- - -This project will download and install additional third-party open source -software projects at install time. Review the license terms of these open -source projects before use. - -For license information regarding projects bundled directly, see -:ref:`thirdparty`. \ No newline at end of file diff --git a/docs/cunumeric/source/user/howtos/jupyter.rst b/docs/cunumeric/source/user/howtos/jupyter.rst deleted file mode 100644 index c0c3f8ffdf..0000000000 --- a/docs/cunumeric/source/user/howtos/jupyter.rst +++ /dev/null @@ -1,107 +0,0 @@ -Configuring Jupyter kernels -=========================== - -Legate supports single-node execution of programs using Jupyter Notebooks. -Please use the instructions given below to set up IPython kernels that -will be used in the notebooks. - -Setup ------ - -IPython Kernel -~~~~~~~~~~~~~~ - -Inputs that are passed to the Legate launcher will now be passed to the -notebook through IPython kernels. By default, ``LEGATE_SM_GPU`` kernel will -be available and set to use one GPU. 
- -For each set of inputs to legate, a new kernel will have to be created using -``legate-jupyter`` and then selected from the drop-down menu for -"Select Kernel" from your notebook. - -Use the following to list all the installed kernels. By default, -``LEGATE_SM_GPU`` should be available. - -.. code-block:: sh - - jupyter kernelspec list - -To create a new kernel that corresponds to a particular set of inputs to -``legate``, say, to run on 2 CPUs with 10GB of memory and 10% of memory -reserved for eager allocations, run the following: - -.. code-block:: sh - - legate-jupyter --name "legate_cpus_2" --cpus 2 --sysmem 10000 --eager-alloc-percentage 10 - - jupyter kernelspec list - -This should create a new kernel named ``legate_cpus_2``. The installed kernel -can then be selected from the notebook to run on two CPUs. - -You can also see input arguments that were passed to Legate by the kernel by -using magic commands from a cell in the notebook (including the % character), -like below: - -.. code-block:: text - - %load_ext legate.info - %legate_info - -A sample output from a custom kernel is given below: - -.. code-block:: text - - Kernel 'legate_cpus_2' configured for 1 node(s) - - Cores: - CPUs to use per rank : 2 - GPUs to use per rank : 0 - OpenMP groups to use per rank : 0 - Threads per OpenMP group : 4 - Utility processors per rank : 2 - - Memory: - DRAM memory per rank (in MBs) : 10000 - DRAM memory per NUMA domain per rank (in MBs) : 0 - Framebuffer memory per GPU (in MBs) : 4000 - Zero-copy memory per rank (in MBs) : 32 - Registered CPU-side pinned memory per rank (in MBs) : 0 - -Running on a remote server -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you intend to run the notebook on a remote server or a laptop, you will -have to create a tunnel from your localhost to the remote server. Substitute -remote-server-hostname with the hostname of the remote server you plan to use, - -.. 
code-block:: sh - - ssh -4 -t -L 8888:localhost:8002 username@remote-server-hostname ssh -t -L 8002:localhost:8888 remote-server-hostname - -and then run on your local machine: - -.. code-block:: sh - - jupyter notebook --port=8888 --no-browser - -This should give a URL where the Jupyter server is running and will look like -this: - -.. code-block:: text - - http://localhost:8888/tree?token= - -Where ```` will be different each time you launch jupyter. Launch -the URL from your browser and choose the ``Legate_SM_GPU`` kernel. This ensures -that the underlying computations can be run using the resources specified -in the ``Legate_SM_GPU`` kernel. - -For more information on how this works with the runtime, we refer the readers -to respective sections in Legion and Legate documentation. - -Running Jupyter Notebooks -------------------------- - -You are now set up to run the notebooks using Jupyter with your configured -options. Check out the notebooks in the `examples` section. diff --git a/docs/cunumeric/source/user/usage.rst b/docs/cunumeric/source/user/usage.rst deleted file mode 100644 index 384e8d74ab..0000000000 --- a/docs/cunumeric/source/user/usage.rst +++ /dev/null @@ -1,148 +0,0 @@ -.. _usage: - -Usage -===== - -Running cuNumeric programs --------------------------- - -Using cuNumeric as a replacement for NumPy is simple. Replace your NumPy import -statement with cuNumeric: - -.. code-block:: python - - import numpy as np - -becomes - -.. code-block:: python - - import cunumeric as np - -Then, run the application like you usually do. For example, if you had a script -``main.py`` written in NumPy that adds two vectors, - -.. code-block:: python - - import numpy as np - x = np.array([1.0, 2.0, 3.0, 4.0]) - y = np.array([4.0, 3.0, 2.0, 1.0]) - z = x + y - print(z) - -change the import statement to use cuNumeric like below, - -.. 
code-block:: python - - import cunumeric as np - x = np.array([1.0, 2.0, 3.0, 4.0]) - y = np.array([4.0, 3.0, 2.0, 1.0]) - z = x + y - print(z) - -And run the program, like this - -.. code-block:: sh - - python main.py - -By default, this command will use 4 CPUs to run the program, but is -configurable through the LEGATE_CONFIG environment variable. For -example, to use 2 GPUs instead, run the following - -.. code-block:: sh - - LEGATE_CONFIG="--gpus 2" python main.py - -For execution with multiple nodes (assuming Legate is installed -with networking support) users can supply the `--nodes` option. - - -For more information on how resources can be allocated using this -environment variable, see `Using LEGATE_CONFIG`_. - -.. note:: - - Usage of standard Python is intended as a quick on-ramp for users to try - out cuNumeric more easily. Several legate command line configuration - options, especially for multi-node execution, are not available when - running programs with standard Python. See the output of ``legate --help`` - for more details. - -To fully utilize the power of cuNumeric and overcome these restrictions, we -recommend requesting resource allocation using Legate. - -Resource allocation -------------------- - -Legate allows you to prescribe the resources required to successfully execute -your application. Applications can be run on three different types of -processors, also known as task variants: CPU, OMP, and GPU. The OMP variant -will use OpenMP threads to parallelize your application while the CPU variant -will use individual processes per processor. In addition to the number or -processors, you can also specify the amount of memory required for your -application on each of these processors. - -Check the relevant command line arguments to legate and their default values -before using them. 
In summary, if you want to change the number of processors, -make sure to check out the following arguments in the documentation for legate: -``--cpus``, ``--omps``, ``--ompthreads``, and ``--gpus``. Similarly, if you -need to change the amount of memory required for your application, check the -following arguments: ``--sysmem``, ``--numamem``, and ``--fbmem``. - -Legate reserves a fraction of the requested memory, denoted by -``--eager-alloc-percentage``, to be used eagerly, with the rest used for -deferred allocations. Reducing this typically helps you run larger problems. - -If you encounter errors related to resource allocation, check out our -:ref:`faqs` to debug them. - -Using legate launcher -~~~~~~~~~~~~~~~~~~~~~ - -To run the above program using four OpenMP threads using the Legate launcher, -run the following command - -.. code-block:: sh - - legate --omps 1 --ompthreads 4 --sysmem 40000 --eager-alloc-percentage 10 ./main.py - -This will use one OpenMP group and two OpenMP threads to parallelize the -application. We defer discussions on changing the OpenMP group to a later -section. - -To run on 8 CPUs and use 40GB of system memory with 10% of that memory reserved -for eager allocations, use the following command: - -.. code-block:: sh - - legate --cpus 8 --sysmem 40000 --eager-alloc-percentage 10 ./main.py - -To run on multiple GPUs and use 40GB of framebuffer memory per GPU with 10% -of that memory reserved for eager allocations, use the following command: - -.. code-block:: sh - - legate --gpus 2 --fbmem 40000 --eager-alloc-percentage 10 ./main.py - -Using LEGATE_CONFIG -~~~~~~~~~~~~~~~~~~~ - -All of the above commands can also be passed through the environment variable -``LEGATE_CONFIG`` as shown below: - -.. code-block:: sh - - LEGATE_CONFIG="--omps 1 --ompthreads 4 --sysmem 40000 --eager-alloc-percentage 10" legate main.py - -.. code-block:: sh - - LEGATE_CONFIG="--cpus 8 --sysmem 40000 --eager-alloc-percentage 10" legate main.py - -.. 
code-block:: sh - - LEGATE_CONFIG="--gpus 2 --fbmem 40000 --eager-alloc-percentage 10" legate main.py - -Using the environment variable might be useful for users using the same set of -resources for their runs where they can just set the environment variable once -and use ``legate main.py`` for all subsequent runs. diff --git a/docs/cunumeric/source/versions.rst b/docs/cunumeric/source/versions.rst deleted file mode 100644 index 1760786d8e..0000000000 --- a/docs/cunumeric/source/versions.rst +++ /dev/null @@ -1,14 +0,0 @@ -Versions -======== - -.. toctree:: - :caption: Versions: - - 22.05 - 22.08 - 22.10 - 23.01 - 23.03 - 23.07 - 23.09 - 23.11 diff --git a/docs/cunumeric/switcher.json b/docs/cunumeric/switcher.json deleted file mode 100644 index e62a26d440..0000000000 --- a/docs/cunumeric/switcher.json +++ /dev/null @@ -1,7 +0,0 @@ -[ - { - "name": "24.06", - "version": "24.06", - "url": "https://docs.nvidia.com/cunumeric/24.06/" - } -] \ No newline at end of file diff --git a/docs/cunumeric/Makefile b/docs/cupynumeric/Makefile similarity index 100% rename from docs/cunumeric/Makefile rename to docs/cupynumeric/Makefile diff --git a/docs/cunumeric/make.bat b/docs/cupynumeric/make.bat similarity index 100% rename from docs/cunumeric/make.bat rename to docs/cupynumeric/make.bat diff --git a/docs/cunumeric/source/_images/developer-build.png b/docs/cupynumeric/source/_images/developer-build.png similarity index 100% rename from docs/cunumeric/source/_images/developer-build.png rename to docs/cupynumeric/source/_images/developer-build.png diff --git a/docs/cunumeric/source/_implemented.rst b/docs/cupynumeric/source/_implemented.rst similarity index 73% rename from docs/cunumeric/source/_implemented.rst rename to docs/cupynumeric/source/_implemented.rst index f3a76189da..03181433ed 100644 --- a/docs/cunumeric/source/_implemented.rst +++ b/docs/cupynumeric/source/_implemented.rst @@ -1,4 +1,4 @@ -.. This page exists to collect references to all cunumeric functions and +.. 
This page exists to collect references to all cupynumeric functions and .. methods that are "implemented". Doing so, any implemented functions or .. methods that are not present in the docs (but should be) will result in .. docs build errors diff --git a/docs/cunumeric/source/_static/.keep b/docs/cupynumeric/source/_static/.keep similarity index 100% rename from docs/cunumeric/source/_static/.keep rename to docs/cupynumeric/source/_static/.keep diff --git a/docs/cupynumeric/source/_templates/layout.html b/docs/cupynumeric/source/_templates/layout.html new file mode 100644 index 0000000000..c84d8e5e56 --- /dev/null +++ b/docs/cupynumeric/source/_templates/layout.html @@ -0,0 +1,7 @@ +{% extends "!layout.html" %} + +{% block extrahead %} + + + +{% endblock %} \ No newline at end of file diff --git a/docs/cunumeric/source/api/_bitgenerator.rst b/docs/cupynumeric/source/api/_bitgenerator.rst similarity index 50% rename from docs/cunumeric/source/api/_bitgenerator.rst rename to docs/cupynumeric/source/api/_bitgenerator.rst index 32854eff96..0ad24527d2 100644 --- a/docs/cunumeric/source/api/_bitgenerator.rst +++ b/docs/cupynumeric/source/api/_bitgenerator.rst @@ -1,7 +1,7 @@ -cunumeric.random.BitGenerator -============================= +cupynumeric.random.BitGenerator +=============================== -.. currentmodule:: cunumeric.random +.. currentmodule:: cupynumeric.random .. autoclass:: BitGenerator diff --git a/docs/cunumeric/source/api/_generator.rst b/docs/cupynumeric/source/api/_generator.rst similarity index 51% rename from docs/cunumeric/source/api/_generator.rst rename to docs/cupynumeric/source/api/_generator.rst index 539a3c0014..5bffd5501b 100644 --- a/docs/cunumeric/source/api/_generator.rst +++ b/docs/cupynumeric/source/api/_generator.rst @@ -1,7 +1,7 @@ -cunumeric.random.Generator -========================== +cupynumeric.random.Generator +============================ -.. currentmodule:: cunumeric.random +.. currentmodule:: cupynumeric.random .. 
autoclass:: Generator diff --git a/docs/cunumeric/source/api/_grouped.rst b/docs/cupynumeric/source/api/_grouped.rst similarity index 100% rename from docs/cunumeric/source/api/_grouped.rst rename to docs/cupynumeric/source/api/_grouped.rst diff --git a/docs/cunumeric/source/api/_ndarray.rst b/docs/cupynumeric/source/api/_ndarray.rst similarity index 95% rename from docs/cunumeric/source/api/_ndarray.rst rename to docs/cupynumeric/source/api/_ndarray.rst index 8e3f03de7d..5dfff107f7 100644 --- a/docs/cunumeric/source/api/_ndarray.rst +++ b/docs/cupynumeric/source/api/_ndarray.rst @@ -1,7 +1,7 @@ -cunumeric.ndarray -================= +cupynumeric.ndarray +=================== -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric .. autoclass:: ndarray diff --git a/docs/cunumeric/source/api/binary.rst b/docs/cupynumeric/source/api/binary.rst similarity index 90% rename from docs/cunumeric/source/api/binary.rst rename to docs/cupynumeric/source/api/binary.rst index 237fdc071c..38b0260ab8 100644 --- a/docs/cunumeric/source/api/binary.rst +++ b/docs/cupynumeric/source/api/binary.rst @@ -1,7 +1,7 @@ Binary operations ================= -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Elementwise bit operations -------------------------- diff --git a/docs/cupynumeric/source/api/broadcast.rst b/docs/cupynumeric/source/api/broadcast.rst new file mode 100644 index 0000000000..df9197044c --- /dev/null +++ b/docs/cupynumeric/source/api/broadcast.rst @@ -0,0 +1,7 @@ +.. currentmodule:: cupynumeric + +cupynumeric.broadcast +===================== + +.. 
autoclass:: broadcast + :members: \ No newline at end of file diff --git a/docs/cunumeric/source/api/classes.rst b/docs/cupynumeric/source/api/classes.rst similarity index 100% rename from docs/cunumeric/source/api/classes.rst rename to docs/cupynumeric/source/api/classes.rst diff --git a/docs/cupynumeric/source/api/comparison.rst b/docs/cupynumeric/source/api/comparison.rst new file mode 100644 index 0000000000..eda6dddecb --- /dev/null +++ b/docs/cupynumeric/source/api/comparison.rst @@ -0,0 +1,12 @@ +Project comparisons +=================== + +Here is a list of NumPy APIs and corresponding cuPyNumeric implementations. + +A dot in the cupynumeric column denotes that cuPyNumeric implementation +is not provided yet. We welcome contributions for these functions. + +NumPy vs cuPyNumeric APIs +------------------------- + +.. comparison-table:: diff --git a/docs/cunumeric/source/api/creation.rst b/docs/cupynumeric/source/api/creation.rst similarity index 94% rename from docs/cunumeric/source/api/creation.rst rename to docs/cupynumeric/source/api/creation.rst index 153db24475..e35f6ab4cb 100644 --- a/docs/cunumeric/source/api/creation.rst +++ b/docs/cupynumeric/source/api/creation.rst @@ -1,7 +1,7 @@ Array creation routines ======================= -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric From shape or value ------------------- diff --git a/docs/cunumeric/source/api/datatype.rst b/docs/cupynumeric/source/api/datatype.rst similarity index 81% rename from docs/cunumeric/source/api/datatype.rst rename to docs/cupynumeric/source/api/datatype.rst index 1e4d521e95..bb5667fd05 100644 --- a/docs/cunumeric/source/api/datatype.rst +++ b/docs/cupynumeric/source/api/datatype.rst @@ -1,7 +1,7 @@ Data type routines ================== -.. currentmodule:: cunumeric +.. 
currentmodule:: cupynumeric Data type testing ----------------- diff --git a/docs/cunumeric/source/api/fft.rst b/docs/cupynumeric/source/api/fft.rst similarity index 76% rename from docs/cunumeric/source/api/fft.rst rename to docs/cupynumeric/source/api/fft.rst index 4dce08d136..6ffe039d9f 100644 --- a/docs/cunumeric/source/api/fft.rst +++ b/docs/cupynumeric/source/api/fft.rst @@ -1,7 +1,7 @@ -.. module:: cunumeric.fft +.. module:: cupynumeric.fft -Discrete Fourier Transform (:mod:`cunumeric.fft`) -================================================== +Discrete Fourier Transform (:mod:`cupynumeric.fft`) +=================================================== Standard FFTs --------------- diff --git a/docs/cunumeric/source/api/index.rst b/docs/cupynumeric/source/api/index.rst similarity index 77% rename from docs/cunumeric/source/api/index.rst rename to docs/cupynumeric/source/api/index.rst index ea740628ec..d57ccba21e 100644 --- a/docs/cunumeric/source/api/index.rst +++ b/docs/cupynumeric/source/api/index.rst @@ -1,7 +1,7 @@ API Reference ============= -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric .. toctree:: :maxdepth: 2 diff --git a/docs/cunumeric/source/api/indexing.rst b/docs/cupynumeric/source/api/indexing.rst similarity index 95% rename from docs/cunumeric/source/api/indexing.rst rename to docs/cupynumeric/source/api/indexing.rst index 2723a2d317..3468e893ee 100644 --- a/docs/cunumeric/source/api/indexing.rst +++ b/docs/cupynumeric/source/api/indexing.rst @@ -1,7 +1,7 @@ Indexing routines ================= -.. currentmodule:: cunumeric +.. 
currentmodule:: cupynumeric Generating index arrays ----------------------- diff --git a/docs/cunumeric/source/api/io.rst b/docs/cupynumeric/source/api/io.rst similarity index 82% rename from docs/cunumeric/source/api/io.rst rename to docs/cupynumeric/source/api/io.rst index 0fd4ee4b3a..a5ba6f6709 100644 --- a/docs/cunumeric/source/api/io.rst +++ b/docs/cupynumeric/source/api/io.rst @@ -1,7 +1,7 @@ Input and output ================ -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric NumPy binary files (npy, npz) ----------------------------- diff --git a/docs/cunumeric/source/api/linalg.rst b/docs/cupynumeric/source/api/linalg.rst similarity index 68% rename from docs/cunumeric/source/api/linalg.rst rename to docs/cupynumeric/source/api/linalg.rst index 5d94889803..c3beaf9c61 100644 --- a/docs/cunumeric/source/api/linalg.rst +++ b/docs/cupynumeric/source/api/linalg.rst @@ -1,9 +1,9 @@ -.. module:: cunumeric.linalg +.. module:: cupynumeric.linalg -Linear algebra (:mod:`cunumeric.linalg`) -======================================== +Linear algebra (:mod:`cupynumeric.linalg`) +========================================== -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Matrix and vector products -------------------------- @@ -29,6 +29,8 @@ Decompositions :toctree: generated/ linalg.cholesky + linalg.eig + linalg.eigvals linalg.qr linalg.svd @@ -49,3 +51,12 @@ Solving equations and inverting matrices :toctree: generated/ linalg.solve + + +Matrix Functions +---------------- + +.. autosummary:: + :toctree: generated/ + + linalg.expm diff --git a/docs/cunumeric/source/api/logic.rst b/docs/cupynumeric/source/api/logic.rst similarity index 95% rename from docs/cunumeric/source/api/logic.rst rename to docs/cupynumeric/source/api/logic.rst index abc016c653..1ab6c7873c 100644 --- a/docs/cunumeric/source/api/logic.rst +++ b/docs/cupynumeric/source/api/logic.rst @@ -1,7 +1,7 @@ Logic functions =============== -.. currentmodule:: cunumeric +.. 
currentmodule:: cupynumeric Truth value testing ------------------- diff --git a/docs/cunumeric/source/api/manipulation.rst b/docs/cupynumeric/source/api/manipulation.rst similarity index 93% rename from docs/cunumeric/source/api/manipulation.rst rename to docs/cupynumeric/source/api/manipulation.rst index 6f8bf6f33f..b1d3f54c32 100644 --- a/docs/cunumeric/source/api/manipulation.rst +++ b/docs/cupynumeric/source/api/manipulation.rst @@ -1,7 +1,7 @@ Array manipulation routines =========================== -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Basic operations ---------------- @@ -32,7 +32,7 @@ Transpose-like operations swapaxes transpose -See also :attr:`cunumeric.ndarray.T` property. +See also :attr:`cupynumeric.ndarray.T` property. Changing number of dimensions ----------------------------- diff --git a/docs/cunumeric/source/api/math.rst b/docs/cupynumeric/source/api/math.rst similarity index 98% rename from docs/cunumeric/source/api/math.rst rename to docs/cupynumeric/source/api/math.rst index ef40212852..5764a93727 100644 --- a/docs/cunumeric/source/api/math.rst +++ b/docs/cupynumeric/source/api/math.rst @@ -1,7 +1,7 @@ Mathematical functions ====================== -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Trigonometric functions ----------------------- diff --git a/docs/cunumeric/source/api/ndarray.rst b/docs/cupynumeric/source/api/ndarray.rst similarity index 97% rename from docs/cunumeric/source/api/ndarray.rst rename to docs/cupynumeric/source/api/ndarray.rst index 4efec7e0a1..50bfd1a2a3 100644 --- a/docs/cunumeric/source/api/ndarray.rst +++ b/docs/cupynumeric/source/api/ndarray.rst @@ -1,7 +1,7 @@ -.. currentmodule:: cunumeric +.. 
currentmodule:: cupynumeric -The N-Dimensional array (:class:`cunumeric.ndarray`) -==================================================== +The N-Dimensional array (:class:`cupynumeric.ndarray`) +====================================================== Constructing arrays ------------------- diff --git a/docs/cunumeric/source/api/random.rst b/docs/cupynumeric/source/api/random.rst similarity index 89% rename from docs/cunumeric/source/api/random.rst rename to docs/cupynumeric/source/api/random.rst index 0cf5a61a99..79a0f2adbd 100644 --- a/docs/cunumeric/source/api/random.rst +++ b/docs/cupynumeric/source/api/random.rst @@ -1,7 +1,7 @@ -.. module:: cunumeric.random +.. module:: cupynumeric.random -Random sampling (:mod:`cunumeric.random`) -========================================= +Random sampling (:mod:`cupynumeric.random`) +=========================================== Random Generator ----------------- diff --git a/docs/cunumeric/source/api/routines.rst b/docs/cupynumeric/source/api/routines.rst similarity index 100% rename from docs/cunumeric/source/api/routines.rst rename to docs/cupynumeric/source/api/routines.rst diff --git a/docs/cunumeric/source/api/set.rst b/docs/cupynumeric/source/api/set.rst similarity index 79% rename from docs/cunumeric/source/api/set.rst rename to docs/cupynumeric/source/api/set.rst index c4299e870d..e797379d13 100644 --- a/docs/cunumeric/source/api/set.rst +++ b/docs/cupynumeric/source/api/set.rst @@ -1,7 +1,7 @@ Set routines ============ -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Making proper sets ------------------ diff --git a/docs/cupynumeric/source/api/settings.rst b/docs/cupynumeric/source/api/settings.rst new file mode 100644 index 0000000000..6a424f0fbc --- /dev/null +++ b/docs/cupynumeric/source/api/settings.rst @@ -0,0 +1,8 @@ +Settings +======== + +cuPyNumeric has a number of runtime settings that can be configured through +environment variables. + +.. 
settings:: settings + :module: cupynumeric.settings \ No newline at end of file diff --git a/docs/cunumeric/source/api/sorting.rst b/docs/cupynumeric/source/api/sorting.rst similarity index 93% rename from docs/cunumeric/source/api/sorting.rst rename to docs/cupynumeric/source/api/sorting.rst index 86d8e65dc0..ab5570cfde 100644 --- a/docs/cunumeric/source/api/sorting.rst +++ b/docs/cupynumeric/source/api/sorting.rst @@ -1,7 +1,7 @@ Sorting, searching, and counting ================================ -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Sorting ------- diff --git a/docs/cunumeric/source/api/statistics.rst b/docs/cupynumeric/source/api/statistics.rst similarity index 94% rename from docs/cunumeric/source/api/statistics.rst rename to docs/cupynumeric/source/api/statistics.rst index 5fb0cdc95f..9430ea3240 100644 --- a/docs/cunumeric/source/api/statistics.rst +++ b/docs/cupynumeric/source/api/statistics.rst @@ -1,7 +1,7 @@ Statistics ========== -.. currentmodule:: cunumeric +.. currentmodule:: cupynumeric Order statistics ---------------- diff --git a/docs/cunumeric/source/api/window.rst b/docs/cupynumeric/source/api/window.rst similarity index 85% rename from docs/cunumeric/source/api/window.rst rename to docs/cupynumeric/source/api/window.rst index 28058d21fd..e50dc58984 100644 --- a/docs/cunumeric/source/api/window.rst +++ b/docs/cupynumeric/source/api/window.rst @@ -1,7 +1,7 @@ Window functions ====================== -.. currentmodule:: cunumeric +.. 
currentmodule:: cupynumeric Various windows ----------------------- diff --git a/docs/cunumeric/source/conf.py b/docs/cupynumeric/source/conf.py similarity index 54% rename from docs/cunumeric/source/conf.py rename to docs/cupynumeric/source/conf.py index 592defb55c..832a83ebac 100644 --- a/docs/cunumeric/source/conf.py +++ b/docs/cupynumeric/source/conf.py @@ -15,20 +15,37 @@ from os import getenv -from cunumeric import __version__ +import cupynumeric -SWITCHER_PROD = "https://docs.nvidia.com/cunumeric/switcher.json" +SWITCHER_PROD = "https://docs.nvidia.com/cupynumeric/switcher.json" SWITCHER_DEV = "http://localhost:8000/switcher.json" JSON_URL = SWITCHER_DEV if getenv("SWITCHER_DEV") == "1" else SWITCHER_PROD +ANNOTATE = getenv("LEGATE_ANNOTATION_DOCS") == "1" + +# This is the "YY.MM" version string that we want users to see +BASE_VERSION = ".".join(cupynumeric.__version__.split(".", 2)[:2]) + +# make sure BASE VERSION is formatted as expected +_yy, _mm = BASE_VERSION.split(".") +assert _yy.isdigit() +assert _mm.isdigit() + # -- Project information ----------------------------------------------------- -project = "NVIDIA cuNumeric" -if "dev" in __version__: - project += f" ({__version__})" +project = "NVIDIA cuPyNumeric" copyright = "2024, NVIDIA" author = "NVIDIA Corporation" +if "dev" in cupynumeric.__version__ or "rc" in cupynumeric.__version__: + # for dev/rc versions just use the entire version with everything, and + # add it to the page title as well, for easy recognition + version = release = cupynumeric.__version__ + project += f" ({cupynumeric.__version__})" +else: + # otherwise, we actually only want the YY.MM to be visible for releases + version = release = BASE_VERSION + # -- General configuration --------------------------------------------------- extensions = [ @@ -42,10 +59,10 @@ "myst_parser", "nbsphinx", "legate._sphinxext.settings", - "cunumeric._sphinxext.comparison_table", - "cunumeric._sphinxext.implemented_index", - 
"cunumeric._sphinxext.missing_refs", - "cunumeric._sphinxext.ufunc_formatter", + "cupynumeric._sphinxext.comparison_table", + "cupynumeric._sphinxext.implemented_index", + "cupynumeric._sphinxext.missing_refs", + "cupynumeric._sphinxext.ufunc_formatter", ] source_suffix = {".rst": "restructuredtext", ".md": "markdown"} @@ -55,43 +72,25 @@ html_context = { # "default_mode": "light", "AUTHOR": author, - "DESCRIPTION": "cuNumeric documentation site.", + "DESCRIPTION": "cuPyNumeric documentation site.", } html_static_path = ["_static"] -# This is pretty kludgy but the nv theme is not publicly available to -# install on CI, etc. We will use the pydata theme in those situations -if getenv("NV_THEME") == "1": - html_theme = "nvidia_sphinx_theme" - - html_theme_options = { - "switcher": { - "json_url": JSON_URL, - "navbar_start": ["navbar-logo", "version-switcher"], - "version_match": ".".join(__version__.split(".", 2)[:2]), - } - } - -else: - html_theme = "pydata_sphinx_theme" - - html_theme_options = { - "footer_start": ["copyright"], - "github_url": "https://github.com/nv-legate/cunumeric", - # https://github.com/pydata/pydata-sphinx-theme/issues/1220 - "icon_links": [], - "logo": { - "text": project, - "link": "https://nv-legate.github.io/cunumeric", - }, - "navbar_align": "left", - "navbar_end": ["navbar-icon-links", "theme-switcher"], - "primary_sidebar_end": ["indices.html"], - "secondary_sidebar_items": ["page-toc"], - "show_nav_level": 2, - "show_toc_level": 2, - } +html_theme = "nvidia_sphinx_theme" + +html_theme_options = { + "switcher": { + "json_url": JSON_URL, + "navbar_start": ["navbar-logo", "version-switcher"], + "version_match": BASE_VERSION, + }, + "extra_footer": [ + "This project, i.e., cuPyNumeric, is separate and independent of the CuPy project. 
CuPy is a registered trademark of Preferred Networks.", # NOQA + '', # NOQA + ], + "show_version_warning_banner": True, +} templates_path = ["_templates"] @@ -116,4 +115,6 @@ def setup(app): + if ANNOTATE: + app.add_js_file("https://hypothes.is/embed.js", kind="hypothesis") app.add_css_file("params.css") diff --git a/docs/cupynumeric/source/developer/CONTRIBUTING.md b/docs/cupynumeric/source/developer/CONTRIBUTING.md new file mode 100644 index 0000000000..8dacfa72c3 --- /dev/null +++ b/docs/cupynumeric/source/developer/CONTRIBUTING.md @@ -0,0 +1,72 @@ +# Contributing to cuPyNumeric + +cuPyNumeric is an open-source project released under the [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). We welcome any and all contributions, and we hope that you can help us develop a strong community. + +## How to begin + +Most of the time, the best thing is to begin by [opening an issue](https://github.com/nv-legate/cupynumeric/issues). This gives us a chance to discuss the contribution and to define the problem or feature that it addresses. Often, opening of the issue first may help prevent you from doing unnecessary work or to enhance and further develop your idea. + +Once you are ready to start development, we ask you to work on a [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) of our repository. The next step is to create a [pull request](https://help.github.com/en/articles/about-pull-requests). Feel free to open the pull request as soon as you begin your development (just mark it [as a draft](https://github.blog/2019-02-14-introducing-draft-pull-requests/)) or when you are ready to have your contribution merged. + +## The Legalese: Developer Certificate of Origin + +cuPyNumeric is released under the open-source [Apache license, version 2.0](https://www.apache.org/licenses/LICENSE-2.0), and is free to use, modify, and redistribute.
To ensure that the license can be exercised without encumbrance, we ask you that you only contribute your own work or work to which you have the intellectual rights. To that end, we employ the Developer's Certificate of Origin (DCO), which is the lightweight mechanism for you to certify that you are legally able to make your contribution. Here is the full text of the certificate (also available at [DeveloperCertificate.org](https://developercertificate.org/)): + +```` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +```` + +### How Do I Sign the DCO? + +Fortunately, it does not take much work to sign the DCO.
The only thing that you have to do is to mark all your commits with a `Signed-off-by` line that looks like that: + +```` +Signed-off-by: Your Name +```` + +Please use your real name and a valid email address at which you can be reached. For legal reasons, we will not be able to accept contributions that use pseudonyms in the signature. You can simply add this line at the end of all your commits manually, or you can use the `-s` or the `--signoff` options provided by Git to automatically tack on the signature. + +## Review Process + +We are really grateful that you are thinking of contributing to cuPyNumeric. We will make every effort to review your contributions as soon as possible. + +As we suggested at the beginning of this document, it will be really helpful to start with an issue unless your proposed change is really trivial. An issue will help to save work in the review process (e.g., maybe somebody is already working on exactly the same thing you want to work on). After you open your pull request (PR), there usually will be a community feedback that often will require further changes to your contribution (the usual open-source process). Usually, this will conclude in the PR being merged by a maintainer, but on rare occasions a PR may be rejected. This may happen, for example, if the PR appears abandoned (no response to the community feedback) or if the PR does not seem to be approaching community acceptance in a reasonable time frame. In any case, an explanation will always be given why a PR is closed. Even if a PR is closed for some reason, it may always be reopened if the situation evolves (feel free to comment on closed PRs to discuss reopening them). + +## Code Formatting Requirements + +cuPyNumeric has a set of coding standards that are expected from all the code merged into the project. The coding standards are defined by the set of tools we use to format our code. We use the [pre-commit](https://pre-commit.com/) framework to run our formatting tools. 
The easiest way to meet the coding standards is to simply use the pre-commit framework to run all the checks for you. Please visit the [pre-commit project page](https://pre-commit.com/) for pre-commit installation and usage instructions. Once pre-commit is installed in the cuPyNumeric repo, all the checks and formatting will be run on every commit, but one can also run the checks explicitly as detailed in pre-commit documentation. + +We hope that the automation of our formatting checks will make it easy to comply with our coding standards. If you encounter problems with code formatting, however, please let us know in a comment on your PR, and we will do our best to help. diff --git a/docs/cupynumeric/source/developer/building.rst b/docs/cupynumeric/source/developer/building.rst new file mode 100644 index 0000000000..25e61cc940 --- /dev/null +++ b/docs/cupynumeric/source/developer/building.rst @@ -0,0 +1,108 @@ +.. _building cupynumeric from source: + +Building from source +==================== + +Basic build +----------- + +Users must have a working installation of the `Legate`_ library prior to +installing cuPyNumeric. +**Installing cuPyNumeric by itself will not automatically install Legate.** + +See below for a list of cuPyNumeric's dependencies. The easiest way to set up a +build environment that includes all of cuPyNumeric dependencies is to use the +``scripts/generate-conda-envs.py`` script from the `Legate build instructions`_, +passing the ``--cupynumeric`` flag. + +Once all dependencies are installed, you can simply invoke ``./install.py`` from +the cuPyNumeric top-level directory. The build will automatically pick up the +configuration used when building Legate (e.g. the CUDA Toolkit directory). + +Dependencies +------------ + +OpenBLAS +~~~~~~~~ + +Used for implementing linear algebra routines on CPUs. + +If you want to use a custom build of OpenBLAS, you will need to get a +Fortran compiler, e.g. by pulling ``fortran-compiler`` from conda-forge. 
+ +If using a build of Legate that includes OpenMP support, then you need a build +of OpenBLAS configured with the following options: + +* ``USE_THREAD=1`` +* ``USE_OPENMP=1`` +* ``NUM_PARALLEL=32`` (or at least as many as the NUMA domains on the target + machine) -- The ``NUM_PARALLEL`` flag defines how many instances of OpenBLAS's + calculation API can run in parallel. Legate will typically instantiate a + separate OpenMP group per NUMA domain, and each group can launch independent + BLAS work. If ``NUM_PARALLEL`` is not high enough, some of this parallel work + will be serialized. + +TBLIS +~~~~~ + +Used for implementing tensor contraction routines on CPUs. + +This library will be automatically downloaded and built during cuPyNumeric +installation. + +cuPyNumeric requires a build of TBLIS configured as follows: + +.. code-block:: none + + --with-label-type=int32_t --with-length-type=int64_t --with-stride-type=int64_t + +and additionally ``--enable-thread-model=openmp`` if using a build of Legate +that includes OpenMP support. + +Advanced topics +--------------- + +Building through pip & cmake +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +cuPyNumeric uses a cmake/scikit-build-based build workflow. There are several +examples in the ``scripts`` directory, showing how to build different +configurations of cuPyNumeric. We walk through the steps in +``build-with-legate-separately-no-install.sh`` here. We assume a pre-existing +Legate build. + +First, the CMake build needs to be configured: + +.. code:: sh + + $ cmake -S . -B build -GNinja -D legate_ROOT:STRING=path/to/legate/build + +We point cuPyNumeric to the Legate *build* tree, not an installation. +This generates all build-dependent headers and Python files. + +Once configured, we can build the C++ libraries: + +.. code:: sh + + $ cmake --build build + +This will invoke Ninja (or make) to execute the build. +Once the C++ libraries are available, we can do an editable (development) pip installation. + +.. 
code:: sh + + $ SKBUILD_BUILD_OPTIONS="-D FIND_CUPYNUMERIC_CPP=ON -D cupynumeric_ROOT=$(pwd)/build" \ + python3 -m pip install \ + --root / --no-deps --no-build-isolation + --editable . + +The Python source tree and CMake build tree are now available with the environment Python +for running cuPyNumeric programs. The diagram below illustrates the +complete workflow for building both Legate and cuPyNumeric. + +.. image:: /_images/developer-build.png + :width: 600 + :alt: "notional diagram of cupynumeric build process" + +.. _Legate: https://github.com/nv-legate/legate +.. _Legate build instructions: https://docs.nvidia.com/legate/latest/BUILD.html#dependencies diff --git a/docs/cunumeric/source/developer/index.rst b/docs/cupynumeric/source/developer/index.rst similarity index 100% rename from docs/cunumeric/source/developer/index.rst rename to docs/cupynumeric/source/developer/index.rst diff --git a/docs/cunumeric/source/developer/testing.rst b/docs/cupynumeric/source/developer/testing.rst similarity index 97% rename from docs/cunumeric/source/developer/testing.rst rename to docs/cupynumeric/source/developer/testing.rst index f5485b7874..55aa39e366 100644 --- a/docs/cunumeric/source/developer/testing.rst +++ b/docs/cupynumeric/source/developer/testing.rst @@ -4,7 +4,7 @@ Running tests Basic usage ----------- -The simplest way to run the cuNumeric test suite is to use the ``test.py`` +The simplest way to run the cuPyNumeric test suite is to use the ``test.py`` test driver script. .. 
code-block:: sh diff --git a/docs/cunumeric/source/examples/black_scholes.ipynb b/docs/cupynumeric/source/examples/black_scholes.ipynb similarity index 99% rename from docs/cunumeric/source/examples/black_scholes.ipynb rename to docs/cupynumeric/source/examples/black_scholes.ipynb index a091201f63..e5868463a1 100644 --- a/docs/cunumeric/source/examples/black_scholes.ipynb +++ b/docs/cupynumeric/source/examples/black_scholes.ipynb @@ -41,7 +41,7 @@ "id": "5b787e94-e440-4e1c-bd66-29faf9b59041", "metadata": {}, "source": [ - "To get started, `import cunumeric as np` (just the same way we would import `numpy`)" + "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)" ] }, { @@ -51,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "import cunumeric as np # instead of numpy" + "import cupynumeric as np # instead of numpy" ] }, { @@ -162,7 +162,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/bryan/work/legate.core/legate/core/context.py:280: RuntimeWarning: cuNumeric has not implemented numpy.result_type and is falling back to canonical numpy. You may notice significantly decreased performance for this function call.\n", + "/home/bryan/work/legate.core/legate/core/context.py:280: RuntimeWarning: cuPyNumeric has not implemented numpy.result_type and is falling back to canonical numpy. 
You may notice significantly decreased performance for this function call.\n", " result = func(*args, **kwargs)\n", "Elapsed Time: 45.659 ms\n" ] diff --git a/docs/cunumeric/source/examples/cholesky.ipynb b/docs/cupynumeric/source/examples/cholesky.ipynb similarity index 87% rename from docs/cunumeric/source/examples/cholesky.ipynb rename to docs/cupynumeric/source/examples/cholesky.ipynb index 0e82d20ee2..ee39c6ec07 100644 --- a/docs/cunumeric/source/examples/cholesky.ipynb +++ b/docs/cupynumeric/source/examples/cholesky.ipynb @@ -9,7 +9,7 @@ "\n", "A [Cholesky decomposition](https://en.wikipedia.org/wiki/Cholesky_decomposition) is a useful factorization of Hermitian, positive-definite matrices into the product of a lower triangular matrix $L$ with its conjugate transpose $L^{*}$.\n", "\n", - "Numpy has a function [numpy.linalg.cholesky](https://numpy.org/doc/stable/reference/generated/numpy.linalg.cholesky.html) built-in for computing Cholesky decompositions. Cunumeric also implements this function, and it can be used as an immediate drop-in replacement.\n", + "Numpy has a function [numpy.linalg.cholesky](https://numpy.org/doc/stable/reference/generated/numpy.linalg.cholesky.html) built-in for computing Cholesky decompositions. cuPyNumeric also implements this function, and it can be used as an immediate drop-in replacement.\n", "\n", "
\n", "License\n", @@ -37,7 +37,7 @@ "id": "389cd191-ccda-4597-8e08-8d01ac226bee", "metadata": {}, "source": [ - "To get started, `import cunumeric as np` (just the same way we would import `numpy`)\n" + "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)\n" ] }, { @@ -49,7 +49,7 @@ }, "outputs": [], "source": [ - "import cunumeric as np # instead of numpy" + "import cupynumeric as np # instead of numpy" ] }, { @@ -57,7 +57,7 @@ "id": "9ef2bc57-e703-40ce-8aaa-d45408259c7a", "metadata": {}, "source": [ - "At this point we can call `np.linalg.cholesky`, exactly how we would with Numpy, but will get the result computed by Cunumeric's `cholesky` function. Let's quickly try it out with a simple identitity matrix:" + "At this point we can call `np.linalg.cholesky`, exactly how we would with Numpy, but will get the result computed by cuPyNumeric's `cholesky` function. Let's quickly try it out with a simple identitity matrix:" ] }, { @@ -96,7 +96,7 @@ "tags": [] }, "source": [ - "We'd like to get some information about how well Cunumeric's `cholesky` function performs. In order to obtain accurate timings, we need to use the `time` function from `legate.timing`. Let's define a helper function `cholesky_timed` that calls the `time` function for us, and prints out the results as well:" + "We'd like to get some information about how well cuPyNumeric's `cholesky` function performs. In order to obtain accurate timings, we need to use the `time` function from `legate.timing`. 
Let's define a helper function `cholesky_timed` that calls the `time` function for us, and prints out the results as well:" ] }, { diff --git a/docs/cunumeric/source/examples/compact_finite_difference.ipynb b/docs/cupynumeric/source/examples/compact_finite_difference.ipynb similarity index 100% rename from docs/cunumeric/source/examples/compact_finite_difference.ipynb rename to docs/cupynumeric/source/examples/compact_finite_difference.ipynb diff --git a/docs/cunumeric/source/examples/edge_detection.ipynb b/docs/cupynumeric/source/examples/edge_detection.ipynb similarity index 99% rename from docs/cunumeric/source/examples/edge_detection.ipynb rename to docs/cupynumeric/source/examples/edge_detection.ipynb index 6836020ee9..c83093f02c 100644 --- a/docs/cunumeric/source/examples/edge_detection.ipynb +++ b/docs/cupynumeric/source/examples/edge_detection.ipynb @@ -16,7 +16,7 @@ "## Learning Outcomes\n", "This example identifies edges in an image using Sobol edge detection algorithm and is implemented using NumPy and SciPy. An edge is defined as an abrupt change in intensity of the image. The Sobol edge detection algorithm uses a kernel in each direction to compute derivative of intensity of the image. The gradient of the intensity will help us determine the locations where changes in intensity are abrupt, which can then be used to detect edges in an image.\n", "\n", - "This example uses the following packages in addition to NumPy/cuNumeric: Scipy, Matplotlib, PIL" + "This example uses the following packages in addition to NumPy/cuPyNumeric: Scipy, Matplotlib, PIL" ] }, { @@ -68,7 +68,7 @@ "id": "78273013-cea0-4c28-a376-c3c40e681276", "metadata": {}, "source": [ - "Since NumPy's `convolve` API does not allow two-dimensional arrays and our image is represented in an two-dimensional array, we will use the `convolve` API from SciPy for this example. 
cuNumeric's implementation of `convolve` permits two-dimensional array and will be used if `cuNumeric` is imported instead of `NumPy`. Try changing the import statement from \"import numpy as np\" to \"import cunumeric as np\"!" + "Since NumPy's `convolve` API does not allow two-dimensional arrays and our image is represented in an two-dimensional array, we will use the `convolve` API from SciPy for this example. cuPyNumeric's implementation of `convolve` permits two-dimensional array and will be used if `cuPyNumeric` is imported instead of `NumPy`. Try changing the import statement from \"import numpy as np\" to \"import cupynumeric as np\"!" ] }, { @@ -85,7 +85,7 @@ " kernel: ndarray\n", " Kernel to compute the gradient in x or y as per Sobel Edge Detector\n", " mode: str\n", - " The default convolution mode. Note that cuNumeric only\n", + " The default convolution mode. Note that cuPyNumeric only\n", " supports the convolution mode \"same\".\n", "\n", " Notes:\n", @@ -95,7 +95,7 @@ " The image was taken from:\n", " https://docs.nvidia.com/vpi/algo_canny_edge_detector.html\n", " \"\"\"\n", - " if np.__name__ == \"cunumeric\":\n", + " if np.__name__ == \"cupynumeric\":\n", " return np.convolve(array, kernel, mode)\n", " return convolve(array, kernel, mode)" ] diff --git a/docs/cunumeric/source/examples/image.png b/docs/cupynumeric/source/examples/image.png similarity index 100% rename from docs/cunumeric/source/examples/image.png rename to docs/cupynumeric/source/examples/image.png diff --git a/docs/cunumeric/source/examples/index.rst b/docs/cupynumeric/source/examples/index.rst similarity index 93% rename from docs/cunumeric/source/examples/index.rst rename to docs/cupynumeric/source/examples/index.rst index 6c2f0cfba9..bd1adf4e2a 100644 --- a/docs/cunumeric/source/examples/index.rst +++ b/docs/cupynumeric/source/examples/index.rst @@ -11,3 +11,4 @@ Examples edge_detection newton_raphson_2d compact_finite_difference + torchswe diff --git 
a/docs/cunumeric/source/examples/kmeans.ipynb b/docs/cupynumeric/source/examples/kmeans.ipynb similarity index 99% rename from docs/cunumeric/source/examples/kmeans.ipynb rename to docs/cupynumeric/source/examples/kmeans.ipynb index 5118b4ef18..29e78003c0 100644 --- a/docs/cunumeric/source/examples/kmeans.ipynb +++ b/docs/cupynumeric/source/examples/kmeans.ipynb @@ -14,7 +14,7 @@ "metadata": {}, "source": [ "## Learning Outcomes\n", - "This example teaches how to implement k-means clustering algorithm using NumPy and is based on the k-means example in cuNumeric. \n", + "This example teaches how to implement k-means clustering algorithm using NumPy and is based on the k-means example in cuPyNumeric. \n", "\n", "In this example, you will learn:\n", "* how to compute pairwise distances using `newaxis`\n", diff --git a/docs/cunumeric/source/examples/newton_raphson_2d.ipynb b/docs/cupynumeric/source/examples/newton_raphson_2d.ipynb similarity index 91% rename from docs/cunumeric/source/examples/newton_raphson_2d.ipynb rename to docs/cupynumeric/source/examples/newton_raphson_2d.ipynb index 3ab628a284..43a7edf747 100644 --- a/docs/cunumeric/source/examples/newton_raphson_2d.ipynb +++ b/docs/cupynumeric/source/examples/newton_raphson_2d.ipynb @@ -16,7 +16,7 @@ "## Learning Outcomes\n", "This example teaches how to compute the solution for systems of equations in two variables using NumPy. There are two equations, $f_{1}(x,y)$ and $f_{2}(x, y)$, with two variables each, $x$ and $y$. We seek to find a solution that satisfies these two equations using Newton's method. 
To understand Newton's method in multiple dimensions, please see [this](https://wiki.math.ntnu.no/_media/tma4125/2017v/newton.pdf) note by Markus Grasmair.\n", "\n", - "The example also teaches how to interpret a warning from cuNumeric when the import statement is changed from importing numpy to importing cuNumeric.\n", + "The example also teaches how to interpret a warning from cuPyNumeric when the import statement is changed from importing numpy to importing cuPyNumeric.\n", "\n", "---" ] @@ -106,15 +106,15 @@ "id": "a91752f1-5ca8-44dd-9a26-525cdf87ab51", "metadata": {}, "source": [ - "When you switch the import statement from importing to importing cunumeric, you might see a warning like this:\n", + "When you switch the import statement from importing to importing cupynumeric, you might see a warning like this:\n", "\n", "---\n", "\n", - "*RuntimeWarning: cuNumeric has not implemented inv and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.*\n", + "*RuntimeWarning: cuPyNumeric has not implemented inv and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call.*\n", "\n", "---\n", "\n", - "This means that cuNumeric has not implemented the `linalg.inv` API and is falling back to NumPy's implementation. This means that the API would be *eagerly* executed using NumPy's single-threaded implementation. If the API was intended to be invoked from a GPU, the data will get transferred from the GPU to the CPU before the API is executed. This can have performance implications, as indicated by the warning." + "This means that cuPyNumeric has not implemented the `linalg.inv` API and is falling back to NumPy's implementation. This means that the API would be *eagerly* executed using NumPy's single-threaded implementation. If the API was intended to be invoked from a GPU, the data will get transferred from the GPU to the CPU before the API is executed. 
This can have performance implications, as indicated by the warning." ] }, { diff --git a/docs/cunumeric/source/examples/stencil.ipynb b/docs/cupynumeric/source/examples/stencil.ipynb similarity index 99% rename from docs/cunumeric/source/examples/stencil.ipynb rename to docs/cupynumeric/source/examples/stencil.ipynb index 95b91744c6..72a635efae 100644 --- a/docs/cunumeric/source/examples/stencil.ipynb +++ b/docs/cupynumeric/source/examples/stencil.ipynb @@ -33,7 +33,7 @@ "id": "35c48e6f-1bde-4aac-af55-b7218cc22491", "metadata": {}, "source": [ - "To get started, `import cunumeric as np` (just the same way we would import `numpy`)\n" + "To get started, `import cupynumeric as np` (just the same way we would import `numpy`)\n" ] }, { @@ -45,7 +45,7 @@ }, "outputs": [], "source": [ - "import cunumeric as np # instead of numpy" + "import cupynumeric as np # instead of numpy" ] }, { diff --git a/docs/cupynumeric/source/examples/torchswe.ipynb b/docs/cupynumeric/source/examples/torchswe.ipynb new file mode 100644 index 0000000000..c4b6173b9e --- /dev/null +++ b/docs/cupynumeric/source/examples/torchswe.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5be6c57b-7cae-4fc1-b78f-899becabc6ee", + "metadata": {}, + "source": [ + "# TorchSWE case study\n", + "\n", + "\n", + "[TorchSWE](https://github.com/piyueh/TorchSWE) is a shallow-water solver created by Dr. Pi-Yueh Chuang and Prof. Lorena Barba that solves the vertically averaged Navier-Stokes equations using MPI and CuPy. It can simulate free-surface water flow in rivers, channels, and coastal areas, as well as model flood inundation. Given a topography, TorchSWE can predict flood-prone areas and the height of water inundation, making it a valuable tool for risk mapping.\n", + "\n", + "High-resolution numerical simulations—such as those on real topographies requiring hundreds of millions of data points—demand distributed computation across multiple GPUs. 
Although scalability is achievable with MPI4Py and CuPy, this approach requires manually partitioning the problem and managing inter-GPU data communication, which are complex and error-prone tasks.\n", + "\n", + "cuPyNumeric enables a distributed implementation of TorchSWE using only NumPy operations, without the complexities of MPI+CuPy. After porting TorchSWE to cuPyNumeric by removing all domain decomposition logic, it scaled effortlessly across multiple GPUs and nodes without further code modifications. This scalability enabled high-fidelity simulations exceeding 1.2 billion data points using 32 GPUs, allowing researchers to tackle critical scientific problems in flood inundation modeling without needing specialized distributed computing expertise. Overall, the cuPyNumeric implementation reduced the lines of code by over 20%, and simplified development and maintenance by eliminating complex logic for managing distribution and communication.\n" + ] + }, + { + "cell_type": "markdown", + "id": "0402fb01-748b-48d9-9caa-80e7510ade80", + "metadata": {}, + "source": [ + "\n", + "

Deep dive into the TorchSWE code implementation

\n", + "\n", + "

Original code details

\n", + "\n", + "TorchSWE uses stencil operations to model shallow-water equations on a 2D grid, where each point is updated based on neighboring values, simulating water flow dynamics. The stencil computations are structured to update each grid cell iteratively, based on data from surrounding cells, mimicking fluid behavior over time. Below is an example that mimics the basic structure of the stencil logic from the TorchSWE repository:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "640f0b62-f70f-4d8a-86c5-7b4739e60a33", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + " \n", + "# Example dimensions for the grid\n", + "nx, ny = 128, 128\n", + "grid = np.ones((nx, ny)) # Initialize the grid with \"1\"\n", + "\n", + "# Stencil operation \n", + "for i in range(1, nx - 1):\n", + " for j in range(1, ny - 1):\n", + " grid[i, j] = (grid[i + 1, j] + grid[i - 1, j] + grid[i, j + 1] + grid[i, j - 1]) / 4\n" + ] + }, + { + "cell_type": "markdown", + "id": "0281b3f4-5a48-40cc-9ec8-0fc9d7fd760c", + "metadata": {}, + "source": [ + "This code iteratively updates cell `h[i, j]` using adjacent cells, representing a basic averaging stencil operation that can be extended to various boundary conditions and flow dynamics in the shallow-water model. For full context, refer to [TorchSWE on GitHub](https://github.com/piyueh/TorchSWE).\n", + "\n", + "Parallelizing stencil operations for multi-GPU systems is challenging. When arrays are partitioned across multiple GPUs, any update to a cell requires the updated values to be shared between GPUs to maintain consistency across boundaries. 
This communication overhead and synchronization make parallelizing stencil code complex and difficult to implement efficiently on multi-GPU architectures.\n", + "\n", + "Below, we outline TorchSWE’s MPI4Py logic in more detail to highlight the complexity involved in this implementation.\n", + "Here’s an example code snippet that mirrors the TorchSWE MPI logic, implementing a simple MPI stencil operation from above:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0d7db631-3ae9-41ca-a0f1-07390349fbd0", + "metadata": {}, + "outputs": [], + "source": [ + "from mpi4py import MPI\n", + "import cupy as cp\n", + "\n", + "num_timesteps=10\n", + "\n", + "def set_device(comm: MPI.Comm):\n", + " # Device selection for each rank on multi-GPU nodes (TorchSWE-specific)\n", + " n_gpus = cp.cuda.runtime.getDeviceCount()\n", + " local_rank = comm.Get_rank() % n_gpus\n", + " cp.cuda.runtime.setDevice(local_rank)\n", + "\n", + "comm = MPI.COMM_WORLD\n", + "rank = comm.Get_rank()\n", + "size = comm.Get_size()\n", + "\n", + "# Determine grid size and decompose domain\n", + "gnx, gny = 126,126 # global grid dimensions\n", + "local_nx, local_ny = gnx // size, gny # local grid dimensions per rank\n", + "local_grid = cp.ones((local_nx + 2, local_ny + 2)) # with halo boundaries\n", + "\n", + "# Set up MPI data types and boundaries\n", + "send_type, recv_type = MPI.DOUBLE.Create_subarray((local_nx + 2, local_ny + 2), (local_nx, local_ny), (1, 1)), MPI.DOUBLE.Create_subarray((local_nx + 2, local_ny + 2), (local_nx, local_ny), (1, 1))\n", + "send_type.Commit()\n", + "recv_type.Commit()\n", + "\n", + "# Stencil computation loop\n", + "for timestep in range(num_timesteps):\n", + " # Boundary exchange with non-blocking sends/receives\n", + " reqs = []\n", + " if rank > 0:\n", + " reqs.append(comm.Isend(local_grid[1, :], dest=rank - 1))\n", + " reqs.append(comm.Irecv(local_grid[0, :], source=rank - 1))\n", + " if rank < size - 1:\n", + " 
reqs.append(comm.Isend(local_grid[local_nx, :], dest=rank + 1))\n", + " reqs.append(comm.Irecv(local_grid[local_nx + 1, :], source=rank + 1))\n", + "\n", + " # Ensure all sends/receives are complete\n", + " MPI.Request.Waitall(reqs)\n", + "\n", + " # Perform stencil operation\n", + " for i in range(1, local_nx + 1):\n", + " for j in range(1, local_ny + 1):\n", + " local_grid[i, j] = 0.25 * (local_grid[i - 1, j] + local_grid[i + 1, j] +\n", + " local_grid[i, j - 1] + local_grid[i, j + 1])\n", + "\n", + "# Clean up MPI data types\n", + "send_type.Free()\n", + "recv_type.Free()\n", + "MPI.Finalize()\n" + ] + }, + { + "cell_type": "markdown", + "id": "660621f9-2bc9-49a3-be59-cde1ce87df65", + "metadata": {}, + "source": [ + "This example follows TorchSWE's approach to domain decomposition and parallelization as in the original implementation. It starts with MPI initialization and sets up logic to manage GPU assignment per rank, dividing the global grid into subdomains. Each rank is responsible for a local subgrid with added halo rows to hold neighboring data. Once the domain is decomposed, the user must ensure proper communication of data at processor boundaries, accounting for datatype differences between CuPy and MPI4Py. For optimal performance, the appropriate type of point-to-point communication, such as non-blocking send/recv, must be selected, as incorrect implementation can cause deadlock. Users must also handle varying numbers of neighboring ranks on domain boundaries and ensure data exchange across mesh, topography, and solution variables. Non-blocking `Isend` and `Irecv` functions handle boundary data exchanges, allowing each rank to receive necessary data for stencil computations. After a `Waitall` synchronization step, each rank performs computations on its subdomain. 
Finally, custom MPI data types are freed, and `MPI_Finalize()` concludes the environment.\n", + "\n", + "The actual TorchSWE code has additional complexities specific to its use of multiple arrays, GPU memory management, one-sided communications etc.\n", + "For the complete implementation, you can refer to the [TorchSWE repository](https://github.com/piyueh/TorchSWE).\n", + "\n", + "Explicit distributed logic, like that in TorchSWE, is difficult to debug and maintain throughout the lifespan of simulation codes. Most applications, including TorchSWE, require specialized validation tests to ensure correct outputs. This results in significant programming effort and further complicates development. \n" + ] + }, + { + "cell_type": "markdown", + "id": "e93aa24e-fc18-4f69-819d-59b5997aa087", + "metadata": {}, + "source": [ + "

cuPyNumeric Implementation

\n", + "\n", + "In the [cuPyNumeric version of TorchSWE](https://github.com/shriram-jagan/TorchSWE), stencil operations are implemented using distributed array handling from cuPyNumeric, simplifying the code and removing the need for manual partitioning or boundary synchronization. The code operates similarly to NumPy slicing but scales across multiple GPUs. For example, the stencil computation in this version would typically involve using simple array slices like below (instead of the nested loops with integrated MPI logic as in the original implementation).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6e15757-a681-4a09-9f82-6304adf82fb4", + "metadata": {}, + "outputs": [], + "source": [ + "import cupynumeric as np\n", + " \n", + "# Example dimensions\n", + "nx, ny = 128, 128\n", + "\n", + "# Initialize the array h\n", + "grid = np.ones((nx, ny))\n", + "\n", + "# Stencil operation using slicing\n", + "grid[1:-1, 1:-1] = (\n", + " grid[2:, 1:-1] + # Below\n", + " grid[:-2, 1:-1] + # Above\n", + " grid[1:-1, 2:] + # Right\n", + " grid[1:-1, :-2] # Left\n", + ") / 4\n" + ] + }, + { + "cell_type": "markdown", + "id": "f29f5387-3408-4bff-948d-55519412de31", + "metadata": {}, + "source": [ + "This operation is automatically managed across nodes and GPUs without needing MPI-specific code. More details can be found in the [cuPyNumeric port of TorchSWE](https://github.com/shriram-jagan/TorchSWE).\n", + "\n", + "The cuPyNumeric version of TorchSWE eliminates 600 lines of code related to domain decomposition, communication, synchronization, and validation that would otherwise be needed when using MPI4Py with CuPy. These 600 lines require substantial knowledge of distributed computing from domain scientists. By using cuPyNumeric, the simplified NumPy code scales efficiently to 1024 GPUs, making high-fidelity flood modeling accessible without requiring specialized expertise in distributed systems." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7e5d6565-ceda-4b61-8826-b6ae5aff3c83", + "metadata": {}, + "source": [ + "

Conclusion

\n", + "\n", + "cuPyNumeric significantly simplifies the development and maintenance of distributed simulations, such as TorchSWE, by abstracting complex parallelization, synchronization, and communication logic. This eliminates the need for specialized HPC knowledge and reduces the risk of errors, allowing domain scientists to focus on their research. With cuPyNumeric, large-scale simulations can scale efficiently across large HPC systems, enhancing productivity, reducing programming effort, and lowering development costs. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb3a186a-3ea7-4150-8ec0-7760ad2adf1f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/cunumeric/source/faqs.rst b/docs/cupynumeric/source/faqs.rst similarity index 72% rename from docs/cunumeric/source/faqs.rst rename to docs/cupynumeric/source/faqs.rst index 553bc16710..7d542437f1 100644 --- a/docs/cunumeric/source/faqs.rst +++ b/docs/cupynumeric/source/faqs.rst @@ -10,20 +10,20 @@ What are the different task variants available in Legate? Legate offers three different task variants: CPU, OMP, and GPU. A task variant determines the type of processor Legate chooses to perform the computations. -What is the difference between Legate and cuNumeric? ----------------------------------------------------- +What is the difference between Legate and cuPyNumeric? 
+------------------------------------------------------ Legate is a task-based runtime software stack that enables development of scalable and composable libraries for distributed and accelerated computing. -cuNumeric is one of the foundational libraries built using Legate and aspires +cuPyNumeric is one of the foundational libraries built using Legate and aspires to be a distributed and accelerated drop-in replacement library for NumPy, an -array programming library widely used in scientific computing. cuNumeric scales +array programming library widely used in scientific computing. cuPyNumeric scales idiomatic NumPy programs to multiple GPUs and CPUs and seamlessly interoperates with other Legate libraries. -Check out this `blog post `_ -to learn more about cuNumeric. +Check out this `blog post `_ +to learn more about cuPyNumeric. When to use python vs legate? ----------------------------- @@ -45,9 +45,9 @@ What does this warning mean? .. code-block:: text - RuntimeWarning: cuNumeric has not implemented and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call. + RuntimeWarning: cuPyNumeric has not implemented and is falling back to canonical NumPy. You may notice significantly decreased performance for this function call. -This means that the NumPy has not been implemented in cuNumeric and that +This means that the NumPy has not been implemented in cuPyNumeric and that the Legate runtime is falling back to using NumPy’s implementation which will be single-threaded execution and can lead to decreased performance for that function call. @@ -101,14 +101,13 @@ How to handle Out-Of-Memory errors? .. code-block:: text - [0 - 7fb9fc426000] 0.985000 {5}{cunumeric.mapper}: Mapper cunumeric on Node 0 failed to allocate 144000000 bytes on memory 1e00000000000000 (of kind SYSTEM_MEM: Visible to all processors on a node) for region requirement 1 of Task cunumeric::WhereTask[./script.py:90] (UID 39). 
+ [0 - 7fda18f26000] 0.805182 {5}{cunumeric.mapper}: Failed to allocate 8388608 bytes on memory 1e00000000000000 (of kind SYSTEM_MEM) for region requirement(s) 1 of Task cupynumeric::BinaryOpTask[oom.py:24] (UID 18) The above error indicates that the application ran out of memory during execution. More granular details on the type of memory, the task that triggered -the error are provided in the error message, but this usually indicates that -resources (add more cores/threads/ GPUs, or increase the amount of system -memory or framebuffer memory) or decrease the problem size and confirm that you -are able to run the program to completion. +the error, and what was using up the available memory are provided in the error +message. If possible, try increasing the amount of system memory or framebuffer +memory allocated to the program, or decrease the problem size. Reducing the ``--eager-alloc-percentage`` to, say, 10 or less can also help since this reduces the amount of available memory available to the eager memory @@ -121,12 +120,12 @@ Why are the results different from NumPy? While a majority of the APIs will give the same result as NumPy, some APIs might be implemented differently from that of NumPy which might lead to differences in results. One such example is, :ref:`reshape`, which returns a -copy of the array in cuNumeric but returns a view in NumPy. Another example +copy of the array in cuPyNumeric but returns a view in NumPy. Another example is :ref:`astype` which does *not* return a copy by default, where NumPy does. Such differences in implementation are noted in the documentation of the -cuNumeric APIs, please review them before opening an issue on the -`cuNumeric issue tracker `_. +cuPyNumeric APIs, please review them before opening an issue on the +`cuPyNumeric issue tracker `_. Why doesn’t Legate use my GPU? ------------------------------ @@ -148,20 +147,20 @@ How do I time the execution of my application? 
---------------------------------------------- Check out the :ref:`benchmarking` section for information on how to accurately -measure cuNumeric execution. +measure cuPyNumeric execution. -Why is cuNumeric slower than NumPy on my laptop? ------------------------------------------------- +Why is cuPyNumeric slower than NumPy on my laptop? +-------------------------------------------------- -For small problem sizes, cuNumeric might be slower than NumPy. We suggest you +For small problem sizes, cuPyNumeric might be slower than NumPy. We suggest you increase the problem size and correspondingly increase the resources needed for the problem size as described in the Usage section. Take a look at our :ref:`practices` on how to do that. -Why is cuNumeric slower than cuPy on my laptop? ------------------------------------------------ +Why is cuPyNumeric slower than CuPy on my laptop? +------------------------------------------------- -For small problem sizes, cuNumeric might be slower than cuPy. We suggest you +For small problem sizes, cuPyNumeric might be slower than CuPy. We suggest you increase the problem size and correspondingly increase the resources needed for the problem size as described in the :ref:`Usage` section. Take a look at performance :ref:`practices`. @@ -169,7 +168,7 @@ performance :ref:`practices`. How do I use Jupyter Notebooks? ------------------------------- -Notebooks are useful for experimentation and evaluation on a single node. +See https://docs.nvidia.com/legate/latest/jupyter.html. How to pass Legion and Realm arguments? --------------------------------------- @@ -191,19 +190,17 @@ What are the defaults? The default values for several input arguments to Legate are mentioned in Legate's documentation. -Are there resources where I can read more about Legate? -------------------------------------------------------- +Where can I read more about cuPyNumeric?
+---------------------------------------- -Check out this `blog post `_ -to learn more about cuNumeric. +Check out this `blog post `_ +or this `tutorial `_ +to learn more about cuPyNumeric. -Technical questions? --------------------- +Questions? +---------- -For technical questions about Cunumeric and Legate-based tools, please visit +For technical questions about cuPyNumeric and Legate-based tools, please visit the `community discussion forum `_. -Other questions? ----------------- - -Follow us on `GitHub `_ or reach out to us there. +If you have other questions, please contact us at *legate@nvidia.com*. diff --git a/docs/cupynumeric/source/index.rst b/docs/cupynumeric/source/index.rst new file mode 100644 index 0000000000..43ca3f8347 --- /dev/null +++ b/docs/cupynumeric/source/index.rst @@ -0,0 +1,38 @@ +:html_theme.sidebar_secondary.remove: + +NVIDIA cuPyNumeric +================== + +cuPyNumeric is a library that aims to provide a distributed and accelerated +drop-in replacement for `NumPy`_ built on top of the `Legate`_ framework. + +With cuPyNumeric you can write code productively in Python, using the familiar +NumPy API, and have your program scale with no code changes from single-CPU +computers to multi-node-multi-GPU clusters. + +For example, you can run `the final example of the Python CFD course`_ +completely unmodified on 2048 A100 GPUs in a `DGX SuperPOD`_ and achieve +good weak scaling. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + installation + user/index + examples/index + api/index + faqs + developer/index + + +Indices and tables +------------------ + +* :ref:`genindex` +* :ref:`search` + +.. _NumPy: https://numpy.org/ +.. _Legate: https://github.com/nv-legate/legate +.. _DGX SuperPOD: https://www.nvidia.com/en-us/data-center/dgx-superpod/ +.. 
_the final example of the Python CFD course: https://github.com/barbagroup/CFDPython/blob/master/lessons/15_Step_12.ipynb \ No newline at end of file diff --git a/docs/cupynumeric/source/installation.rst b/docs/cupynumeric/source/installation.rst new file mode 100644 index 0000000000..1f1e88dafc --- /dev/null +++ b/docs/cupynumeric/source/installation.rst @@ -0,0 +1,96 @@ +Installation +============ + +Installing Conda Packages +------------------------- + +cuPyNumeric supports the +`same platforms as Legate `_. + +cuPyNumeric is available from +`conda `_ +on the `legate channel `_. + +.. note:: + conda version >= 24.1 required + +.. code-block:: bash + + # with a new environment + $ conda create -n myenv -c conda-forge -c legate cupynumeric + + # =========== OR =========== # + + # into an existing environment + $ conda install -c conda-forge -c legate cupynumeric + +Installing PyPI Packages +------------------------ + +cuPyNumeric is also available from `PyPI +`_. To install, run the following +command: + +.. code-block:: bash + + # into existing environment + $ pip install nvidia-cupynumeric + + # =========== OR =========== # + + # into new environment + $ python -m venv myenv + $ source myenv/bin/activate + $ pip install nvidia-cupynumeric + +This will install the latest version of cuPyNumeric and the corresponding +version of `Legate `_. + +The cuPyNumeric package on PyPI is multi-node and multi-rank capable. Please +check `Legate `_ documentation to find more +details about running on multiple nodes. + +Verify your Installation +------------------------ + +You can verify the installation by running one of the +`examples `_. + +For instance: + +.. code-block:: sh + + $ legate examples/black_scholes.py + Running black scholes on 10K options... 
+ Elapsed Time: 129.017 ms + +Conda and GPU / CPU Variants +---------------------------- + +``conda`` automatically installs the right variant for the system: +* CPU variant if no NVIDIA GPU is detected +* GPU variant if an NVIDIA GPU is detected + +To override this behavior and force install a version with GPU support, use the +following (with the desired CUDA version): + +.. code-block:: sh + + $ CONDA_OVERRIDE_CUDA="12.2" conda install -c conda-forge -c legate cupynumeric + + +Building from source +--------------------- + +See :ref:`building cupynumeric from source` for instructions on building +cuPyNumeric manually. + +Licenses +-------- + +This project will download and install additional third-party open source +software projects at install time. Review the license terms of these open +source projects before use. + +For license information regarding projects bundled directly, see +:ref:`thirdparty`. \ No newline at end of file diff --git a/docs/cunumeric/source/oss-licenses.rst b/docs/cupynumeric/source/oss-licenses.rst similarity index 77% rename from docs/cunumeric/source/oss-licenses.rst rename to docs/cupynumeric/source/oss-licenses.rst index a6a9b0226b..84c0d96456 100644 --- a/docs/cunumeric/source/oss-licenses.rst +++ b/docs/cupynumeric/source/oss-licenses.rst @@ -5,6 +5,42 @@ Third-party notices =================== +NumPy +----- + +.. code-block:: none + + Copyright (c) 2005-2025, NumPy Developers. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + TBLIS ----- diff --git a/docs/cunumeric/source/user/advanced.rst b/docs/cupynumeric/source/user/advanced.rst similarity index 92% rename from docs/cunumeric/source/user/advanced.rst rename to docs/cupynumeric/source/user/advanced.rst index 2fdd96d974..b6bbc31fc6 100644 --- a/docs/cunumeric/source/user/advanced.rst +++ b/docs/cupynumeric/source/user/advanced.rst @@ -9,7 +9,7 @@ Multi-node execution Using ``legate`` ~~~~~~~~~~~~~~~~ -Cunumeric programs can be run in parallel by using the ``--nodes`` option to +cuPyNumeric programs can be run in parallel by using the ``--nodes`` option to the ``legate`` driver, followed by the number of nodes to be used. When running on 2+ nodes, a task launcher must be specified. 
diff --git a/docs/cunumeric/source/user/differences.rst b/docs/cupynumeric/source/user/differences.rst similarity index 77% rename from docs/cunumeric/source/user/differences.rst rename to docs/cupynumeric/source/user/differences.rst index 5195ccdd37..efab90df11 100644 --- a/docs/cunumeric/source/user/differences.rst +++ b/docs/cupynumeric/source/user/differences.rst @@ -3,10 +3,10 @@ Differences with Numpy Supported shapes and datatypes ------------------------------ -cuNumeric natively supports arrays of dimensionality only up to the maximum +cuPyNumeric natively supports arrays of dimensionality only up to the maximum number of dimensions supported by the linked build of Legate. -cuNumeric natively supports only numerical datatypes, and doesn't support +cuPyNumeric natively supports only numerical datatypes, and doesn't support extended-precision floats (e.g. `np.float128`). Trying to use an unsupported number of dimensions or datatype will trigger a @@ -15,7 +15,7 @@ fallback to base NumPy. Returning a copy instead of a view ---------------------------------- -Some functions that return a view in Numpy return a copy in cuNumeric. These +Some functions that return a view in Numpy return a copy in cuPyNumeric. These include: * ``np.diag`` @@ -46,21 +46,21 @@ Scalar return values -------------------- NumPy will occasionally convert a 0d array to a python-level scalar, but -cuNumeric avoids doing that, because in our system an array value can +cuPyNumeric avoids doing that, because in our system an array value can potentially represent an asynchronous computation. As a result, sometimes -cuNumeric will return 0d arrays (possibly deferred), in cases where NumPy +cuPyNumeric will return 0d arrays (possibly deferred), in cases where NumPy returns a scalar. Indexing behavior ----------------- -``x[:,True]`` works differently from NumPy. cuNumeric broadcasts it up to the +``x[:,True]`` works differently from NumPy. 
cuPyNumeric broadcasts it up to the corresponding dimension, whereas NumPy adds a dimension. Additionally ``[]`` does not work for advanced indexing since ``[]`` is ``float64`` by default. -cuNumeric doesn't support non-unit steps on index expressions, e.g. `arr[::2]`. +cuPyNumeric doesn't support non-unit steps on index expressions, e.g. `arr[::2]`. Duplicate indices on advanced indexing expressions produce undefined behavior. This is also the case in NumPy but the current NumPy implementation happens diff --git a/docs/cunumeric/source/user/howtos/benchmarking.rst b/docs/cupynumeric/source/user/howtos/benchmarking.rst similarity index 94% rename from docs/cunumeric/source/user/howtos/benchmarking.rst rename to docs/cupynumeric/source/user/howtos/benchmarking.rst index f744e10683..2be87f8483 100644 --- a/docs/cunumeric/source/user/howtos/benchmarking.rst +++ b/docs/cupynumeric/source/user/howtos/benchmarking.rst @@ -7,7 +7,7 @@ Using Legate timing tools ------------------------- Use legate's timing API to measure elapsed time, rather than standard Python -timers. cuNumeric executes work asynchronously when possible, and a standard +timers. cuPyNumeric executes work asynchronously when possible, and a standard Python timer will only measure the time taken to launch the work, not the time spent in actual computation. @@ -18,7 +18,7 @@ Here is an example of how to measure elapsed time in milliseconds: .. 
code-block:: python - import cunumeric as np + import cupynumeric as np from legate.timing import time init() # Initialization step diff --git a/docs/cunumeric/source/user/howtos/index.rst b/docs/cupynumeric/source/user/howtos/index.rst similarity index 89% rename from docs/cunumeric/source/user/howtos/index.rst rename to docs/cupynumeric/source/user/howtos/index.rst index 1e07c8f0b2..72140ffd72 100644 --- a/docs/cunumeric/source/user/howtos/index.rst +++ b/docs/cupynumeric/source/user/howtos/index.rst @@ -6,5 +6,4 @@ Howtos measuring benchmarking - jupyter patching diff --git a/docs/cunumeric/source/user/howtos/measuring.rst b/docs/cupynumeric/source/user/howtos/measuring.rst similarity index 56% rename from docs/cunumeric/source/user/howtos/measuring.rst rename to docs/cupynumeric/source/user/howtos/measuring.rst index 3513a86287..4e146a3868 100644 --- a/docs/cunumeric/source/user/howtos/measuring.rst +++ b/docs/cupynumeric/source/user/howtos/measuring.rst @@ -3,42 +3,42 @@ Measure API coverage ==================== -cuNumeric does not currently implment all of NumPy's APIs. If necessary, -cuNumeric will fall back to using NumPy directly to complete a compuation. -When running applications that use cuNumeric, the command line options below +cuPyNumeric does not currently implement all of NumPy's APIs. If necessary, +cuPyNumeric will fall back to using NumPy directly to complete a computation. +When running applications that use cuPyNumeric, the command line options below may be used to generate coverage reports that show which APIs are implemented -and optimized by cuNumeric and which APIs required falling back to NumPy. +and optimized by cuPyNumeric and which APIs required falling back to NumPy.
Overall coverage report ~~~~~~~~~~~~~~~~~~~~~~~ -The environment variable ``CUNUMERIC_REPORT_COVERAGE`` may be used to print an -overall percentage of cunumeric coverage: +The environment variable ``CUPYNUMERIC_REPORT_COVERAGE`` may be used to print an +overall percentage of cupynumeric coverage: .. code-block:: sh - CUNUMERIC_REPORT_COVERAGE=1 legate test.py + CUPYNUMERIC_REPORT_COVERAGE=1 legate test.py After execution completes, the percentage of NumPy API calls that were handled -by cunumeric is printed: +by cupynumeric is printed: .. code-block:: - cuNumeric API coverage: 26/26 (100.0%) + cuPyNumeric API coverage: 26/26 (100.0%) Detailed coverage report ~~~~~~~~~~~~~~~~~~~~~~~~ -The environment variable ``CUNUMERIC_REPORT_DUMP_CSV`` may be used to save a +The environment variable ``CUPYNUMERIC_REPORT_DUMP_CSV`` may be used to save a detailed coverage report: .. code-block:: sh - CUNUMERIC_REPORT_COVERAGE=1 CUNUMERIC_REPORT_DUMP_CSV="out.csv" legate test.py + CUPYNUMERIC_REPORT_COVERAGE=1 CUPYNUMERIC_REPORT_DUMP_CSV="out.csv" legate test.py After execution completes, a CSV file will be saved to the specified location (in this case ``out.csv``). The file shows exactly what NumPy API functions -were called, whether the are implemented by cunumeric, and the location of +were called, whether they are implemented by cupynumeric, and the location of the call site: .. code-block:: @@ -56,12 +56,12 @@ the call site: Call stack reporting ~~~~~~~~~~~~~~~~~~~~ -The environment variable ``CUNUMERIC_REPORT_DUMP_CALLSTACK`` may be added to +The environment variable ``CUPYNUMERIC_REPORT_DUMP_CALLSTACK`` may be added to include full call stack information in a CSV report: ..
code-block:: sh - CUNUMERIC_REPORT_COVERAGE=1 CUNUMERIC_REPORT_DUMP_CALLSTACK=1 CUNUMERIC_REPORT_DUMP_CALLSTACK=1 legate test.py + CUPYNUMERIC_REPORT_COVERAGE=1 CUPYNUMERIC_REPORT_DUMP_CALLSTACK=1 legate test.py After execution completes, the CSV output file have full call stack information in the location column, with individual stack frames separated diff --git a/docs/cunumeric/source/user/howtos/patching.rst b/docs/cupynumeric/source/user/howtos/patching.rst similarity index 64% rename from docs/cunumeric/source/user/howtos/patching.rst rename to docs/cupynumeric/source/user/howtos/patching.rst index cdac9223cf..576e9396c1 100644 --- a/docs/cunumeric/source/user/howtos/patching.rst +++ b/docs/cupynumeric/source/user/howtos/patching.rst @@ -2,7 +2,7 @@ Trying Numpy code without changes ================================= The ``lgpatch`` script (in the same location as the ``legate`` executable) can -help facilitate quick demonstrations of ``cunumeric`` on existing codebases +help facilitate quick demonstrations of ``cupynumeric`` on existing codebases that make use of ``numpy``. To use this tool, invoke it as shown below, with the name of the program to @@ -23,13 +23,13 @@ For example, here is a small ``test.py`` program that imports and uses various input = np.eye(10, dtype=np.float32) np.linalg.cholesky(input) -You can invoke ``lgpatch`` to run ``test.py`` using ``cunumeric`` functions +You can invoke ``lgpatch`` to run ``test.py`` using ``cupynumeric`` functions instead, without any changes to the original source code. Any standard -``cunumeric`` runtime options (e.g. for :ref:`measuring api coverage`) may +``cupynumeric`` runtime options (e.g. for :ref:`measuring api coverage`) may also be used: ..
code-block:: sh - $ CUNUMERIC_REPORT_COVERAGE=1 LEGATE_CONFIG="--cpus 4" lgpatch test.py -patch numpy - cuNumeric API coverage: 4/4 (100.0%) + $ CUPYNUMERIC_REPORT_COVERAGE=1 LEGATE_CONFIG="--cpus 4" lgpatch test.py -patch numpy + cuPyNumeric API coverage: 4/4 (100.0%) diff --git a/docs/cunumeric/source/user/index.rst b/docs/cupynumeric/source/user/index.rst similarity index 100% rename from docs/cunumeric/source/user/index.rst rename to docs/cupynumeric/source/user/index.rst diff --git a/docs/cunumeric/source/user/practices.rst b/docs/cupynumeric/source/user/practices.rst similarity index 91% rename from docs/cunumeric/source/user/practices.rst rename to docs/cupynumeric/source/user/practices.rst index c064fe8f6e..063a7a0fb7 100644 --- a/docs/cunumeric/source/user/practices.rst +++ b/docs/cupynumeric/source/user/practices.rst @@ -8,7 +8,7 @@ General Recommendations Following the basics of numpy as documented `here `_ is highly recommended. -Here we highlight some of the anti-patterns and best practices for cuNumeric +Here we highlight some of the anti-patterns and best practices for cuPyNumeric to avoid commonly encountered problems related to performance. In general, array-based computations are recommended. @@ -16,14 +16,14 @@ Availability of each API (e.g., single CPU or Multiple GPUs/Multiple CPUs, etc.) is noted in the docstring of the API. This would be useful to know while designing the application since it can impact the scalability. 
-Guidelines on using cuNumeric APIs ----------------------------------- +Guidelines on using cuPyNumeric APIs +------------------------------------ -Use cuNumeric or NumPy arrays, AVOID native lists -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use cuPyNumeric or NumPy arrays, AVOID native lists +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Create a cuNumeric array from data structures native to Python like lists, -tuples, etc., and operate on the cuNumeric array, as shown in the example +Create a cuPyNumeric array from data structures native to Python like lists, +tuples, etc., and operate on the cuPyNumeric array, as shown in the example below. Find more details on this here: .. https://numpy.org/doc/stable/user/basics.creation.html @@ -37,7 +37,7 @@ below. Find more details on this here: for val in x: y.append(val + 2) - # Recommended: Create a cuNumeric array and use array-based operations + # Recommended: Create a cuPyNumeric array and use array-based operations y = np.array(x) y = x + 2 @@ -48,7 +48,7 @@ thus performing an array-based operation. .. code-block:: python - import cunumeric as np + import cupynumeric as np def transform(input): return (input + 3) * 4 @@ -121,7 +121,7 @@ performance. .. code-block:: python - import cunumeric as np + import cupynumeric as np # Not recommended: don't use nonzero to get indices indices = np.nonzero(h < 0) @@ -141,7 +141,7 @@ condition is met, which can be described using the ``putmask`` API. .. 
code-block:: python - import cunumeric as np + import cupynumeric as np # We need to update elements of x from y based on a condition cond = y < tol @@ -177,12 +177,12 @@ Use mathematical functions, AVOID element-wise loops ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When there are nested element-wise operations, it is recommended that they -are translated to array-based operations using equivalent cuNumeric APIs, if +are translated to array-based operations using equivalent cuPyNumeric APIs, if possible. Here is an example: .. code-block:: python - import cunumeric as np + import cupynumeric as np # Not recommended: Naive element-wise implementation for i in range(ny): @@ -208,14 +208,14 @@ can also make it run slower, so we recommend using it as sparingly as possible. .. code-block:: python - import cunumeric as np + import cupynumeric as np x = np.ones((3,4)) y = x.reshape((12,)) y[0] = 42 - assert x[0,0] == 42 # succeeds in NumPy, fails in cuNumeric + assert x[0,0] == 42 # succeeds in NumPy, fails in cuPyNumeric Stack results in a performance penalty ...................................... @@ -231,8 +231,8 @@ Faster I/O Routines As of 23.07, we recommend using `h5py `_ to perform I/O. -Guidelines on designing cuNumeric applications ----------------------------------------------- +Guidelines on designing cuPyNumeric applications +------------------------------------------------ Use output arguments to reduce memory allocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -242,7 +242,7 @@ intermediate array in our implementation. .. code-block:: python - import cunumeric as np + import cupynumeric as np # Acceptable x = x + y @@ -338,10 +338,10 @@ here. .. code-block:: python - import cunumeric as np + import cupynumeric as np # compute() does some computations and returns a multi-dimensional - # cuNumeric array. The application stops after the iterative computation + # cuPyNumeric array. 
The application stops after the iterative computation # is converged # Acceptable: Performing convergence checks every iteration diff --git a/docs/cupynumeric/source/user/usage.rst b/docs/cupynumeric/source/user/usage.rst new file mode 100644 index 0000000000..aebdad2763 --- /dev/null +++ b/docs/cupynumeric/source/user/usage.rst @@ -0,0 +1,50 @@ +.. _usage: + +Usage +===== + +Using cuPyNumeric as a replacement for NumPy is simple. Replace your NumPy import +statement with cuPyNumeric: + +.. code-block:: python + + import numpy as np + +becomes + +.. code-block:: python + + import cupynumeric as np + +Then, run the application like you usually do. For example, if you had a script +``main.py`` written in NumPy that adds two vectors, + +.. code-block:: python + + import numpy as np + x = np.array([1.0, 2.0, 3.0, 4.0]) + y = np.array([4.0, 3.0, 2.0, 1.0]) + z = x + y + print(z) + +change the import statement to use cuPyNumeric like below, + +.. code-block:: python + + import cupynumeric as np + x = np.array([1.0, 2.0, 3.0, 4.0]) + y = np.array([4.0, 3.0, 2.0, 1.0]) + z = x + y + print(z) + +And run the program, like this + +.. code-block:: sh + + python main.py + +By default this invocation will use all the hardware resources (e.g. CPU cores, +RAM, GPUs) available on the current machine. + +For more information on controlling the resource allocation, running on multiple +nodes etc. see https://docs.nvidia.com/legate/latest/usage.html. 
diff --git a/docs/cupynumeric/switcher.json b/docs/cupynumeric/switcher.json new file mode 100644 index 0000000000..7d049e3dd7 --- /dev/null +++ b/docs/cupynumeric/switcher.json @@ -0,0 +1,23 @@ +[ + { + "name": "24.11", + "version": "24.11", + "url": "https://docs.nvidia.com/cupynumeric/24.11/" + }, + { + "name": "25.01", + "version": "25.01", + "url": "https://docs.nvidia.com/cupynumeric/25.01/" + }, + { + "name": "25.03", + "version": "25.03", + "url": "https://docs.nvidia.com/cupynumeric/25.03/" + }, + { + "name": "25.05", + "version": "25.05", + "preferred": true, + "url": "https://docs.nvidia.com/cupynumeric/25.05/" + } +] diff --git a/examples/benchmark.py b/examples/benchmark.py index d882e120fb..29a7f4a451 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -21,8 +21,7 @@ class Timer(Protocol): - def start(self): - ... + def start(self): ... def stop(self): """ @@ -32,7 +31,7 @@ def stop(self): ... -class CuNumericTimer(Timer): +class CuPyNumericTimer(Timer): def __init__(self): self._start_time = None @@ -112,9 +111,9 @@ def parse_args(parser): ) args, _ = parser.parse_known_args() if args.package == "legate": - import cunumeric as np + import cupynumeric as np - timer = CuNumericTimer() + timer = CuPyNumericTimer() elif args.package == "cupy": import cupy as np diff --git a/examples/cpp/gemm/CMakeLists.txt b/examples/cpp/gemm/CMakeLists.txt new file mode 100644 index 0000000000..91c1ff2723 --- /dev/null +++ b/examples/cpp/gemm/CMakeLists.txt @@ -0,0 +1,31 @@ +#============================================================================= +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR) + +project(stencil VERSION 0.1 LANGUAGES C CXX) + +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +find_package(cupynumeric REQUIRED) + +add_executable(gemm gemm.cc) + +target_link_libraries(gemm PRIVATE cupynumeric::cupynumeric) + +install(TARGETS gemm DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/cmake-install") diff --git a/examples/cpp/gemm/build.sh b/examples/cpp/gemm/build.sh new file mode 100755 index 0000000000..53ed6d6c09 --- /dev/null +++ b/examples/cpp/gemm/build.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +legate_root=`python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` +echo "Using Legate at $legate_root" +cupynumeric_root=`python -c 'import cupynumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` +echo "Using cuPyNumeric at $cupynumeric_root" +cmake -S . -B build -D legate_ROOT="$legate_root" -D cupynumeric_ROOT="$cupynumeric_root" -D CMAKE_BUILD_TYPE=RelWithDebInfo +cmake --build build --parallel 8 diff --git a/examples/cpp/gemm/gemm.cc b/examples/cpp/gemm/gemm.cc new file mode 100644 index 0000000000..7ddc290522 --- /dev/null +++ b/examples/cpp/gemm/gemm.cc @@ -0,0 +1,111 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include + +#include +#include +#include + +namespace gemm { + +struct Config { + bool timing{false}; + std::int32_t iter{100}; + std::int32_t warmup{5}; + std::uint64_t N{100}; +}; + +[[nodiscard]] std::tuple +initialize(std::uint64_t N, const legate::Type& ft) +{ + auto A = cupynumeric::random({N, N}).as_type(ft); + auto B = cupynumeric::random({N, N}).as_type(ft); + auto C = cupynumeric::zeros({N, N}, ft); + return {A, B, C}; +} + +[[nodiscard]] std::size_t total_flops(std::uint64_t M, std::uint64_t N, std::uint64_t K) +{ + return M * N * (2 * K - 1); +} + +[[nodiscard]] std::size_t total_space(std::uint64_t M, + std::uint64_t N, + std::uint64_t K, + const legate::Type& ft) +{ + return (M * N + M * K + K * N) * ft.size(); +} + +void run_gemm(const Config& config) +{ + const auto ft = legate::float32(); + const auto N = config.N; + std::printf("Problem Size: M=%lu N=%lu K=%lu\n", N, N, N); + std::printf("Total Iterations: %d\n", config.iter); + const auto flops = total_flops(N, N, N); + std::printf("Total Flops: %lf GFLOPS/iter\n", flops / 1e9); + const auto space = total_space(N, N, N, ft); + std::printf("Total Size: %lf MB\n", space / 1e6); + auto [A, B, C] = initialize(config.N, legate::float32()); + + auto start = legate::timing::measure_microseconds(); + auto max_iter = config.iter + config.warmup; + for (int32_t iter = 0; iter < max_iter; ++iter) { + if (iter == config.warmup) { + start = legate::timing::measure_microseconds(); + } + C.dot(A, B); + // We need to rotate the matrices to keep Legate honest + // about moving data so it can't just duplicate A and B + // on the first iteration and reuse them, this means + // that A, B, C all need to be square + A, B, C = B, C, A; + } + auto stop = legate::timing::measure_microseconds(); + + const auto total = (stop.value() - start.value()) / 1e3; + std::printf("Elapsed Time: %lf ms\n", total); + const auto average = total / config.iter; + std::printf("Average GEMM: %lf 
ms\n", average); + std::printf("FLOPS/s: %lf GFLOPS/s\n", flops / (average * 1e6)); +} + +} // namespace gemm + +int main(int argc, char** argv) +{ + legate::start(); + + cupynumeric::initialize(argc, argv); + + gemm::Config config{}; + + Realm::CommandLineParser cp; + cp.add_option_int("--iter", config.iter) + .add_option_int("--warmup", config.warmup) + .add_option_int("--num", config.N) + .add_option_bool("--time", config.timing) + .parse_command_line(argc, argv); + + gemm::run_gemm(config); + + return legate::finish(); +} diff --git a/examples/cpp/stencil/CMakeLists.txt b/examples/cpp/stencil/CMakeLists.txt index d17920c4a3..3def9488f9 100644 --- a/examples/cpp/stencil/CMakeLists.txt +++ b/examples/cpp/stencil/CMakeLists.txt @@ -22,10 +22,10 @@ if (NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -find_package(cunumeric REQUIRED) +find_package(cupynumeric REQUIRED) add_executable(stencil stencil.cc) -target_link_libraries(stencil PRIVATE cunumeric::cunumeric) +target_link_libraries(stencil PRIVATE cupynumeric::cupynumeric) install(TARGETS stencil DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/cmake-install") diff --git a/examples/cpp/stencil/build.sh b/examples/cpp/stencil/build.sh index 485365ae3c..1eac0fe8d9 100755 --- a/examples/cpp/stencil/build.sh +++ b/examples/cpp/stencil/build.sh @@ -16,7 +16,7 @@ legate_root=`python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` echo "Using Legate at $legate_root" -cunumeric_root=`python -c 'import cunumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` -echo "Using cuNumeric at $cunumeric_root" -cmake -S . -B build -D legate_ROOT="$legate_root" -D cunumeric_ROOT="$cunumeric_root" -D CMAKE_BUILD_TYPE=Debug +cupynumeric_root=`python -c 'import cupynumeric.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` +echo "Using cuPyNumeric at $cupynumeric_root" +cmake -S . 
-B build -D legate_ROOT="$legate_root" -D cupynumeric_ROOT="$cupynumeric_root" -D CMAKE_BUILD_TYPE=Debug cmake --build build --parallel 8 diff --git a/examples/cpp/stencil/stencil.cc b/examples/cpp/stencil/stencil.cc index 600535123c..022b3f222c 100644 --- a/examples/cpp/stencil/stencil.cc +++ b/examples/cpp/stencil/stencil.cc @@ -15,15 +15,15 @@ */ #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "realm/cmdline.h" #include namespace stencil { -using cunumeric::open; -using cunumeric::slice; +using cupynumeric::open; +using cupynumeric::slice; struct Config { bool timing{false}; @@ -32,7 +32,7 @@ struct Config { uint64_t N{100}; }; -void print_array(cunumeric::NDArray array) +void print_array(cupynumeric::NDArray array) { auto acc = array.get_read_accessor(); auto& shape = array.shape(); @@ -49,9 +49,9 @@ void print_array(cunumeric::NDArray array) std::cerr << std::move(ss).str(); } -cunumeric::NDArray initialize(uint64_t N) +cupynumeric::NDArray initialize(uint64_t N) { - auto grid = cunumeric::zeros({N + 2, N + 2}); + auto grid = cupynumeric::zeros({N + 2, N + 2}); grid[{slice(), slice(0, 1)}].assign(legate::Scalar{-273.15}); grid[{slice(), slice(-1, open)}].assign(legate::Scalar{-273.15}); grid[{slice(-1, open), slice()}].assign(legate::Scalar{-273.15}); @@ -84,7 +84,7 @@ int main(int argc, char** argv) auto result = legate::start(argc, argv); assert(result == 0); - cunumeric::initialize(argc, argv); + cupynumeric::initialize(argc, argv); stencil::Config config{}; diff --git a/examples/gemm.py b/examples/gemm.py index 183f65a7b2..3830459be7 100644 --- a/examples/gemm.py +++ b/examples/gemm.py @@ -21,8 +21,8 @@ def initialize(M, N, K, ft): - A = np.random.rand(N, N).astype(ft) - B = np.random.rand(N, N).astype(ft) + A = np.random.uniform(size=(N, N), dtype=ft) + B = np.random.uniform(size=(N, N), dtype=ft) C = np.zeros((N, N), dtype=ft) return A, B, C diff --git a/examples/richardson_lucy.py b/examples/richardson_lucy.py index 
b024d46e75..25ed2154ee 100644 --- a/examples/richardson_lucy.py +++ b/examples/richardson_lucy.py @@ -22,7 +22,9 @@ # A simplified implementation of Richardson-Lucy deconvolution -def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): +def run_richardson_lucy( + shape, filter_shape, num_iter, warmup, timing, conv_method +): image = np.random.rand(*shape).astype(float_type) psf = np.random.rand(*filter_shape).astype(float_type) im_deconv = np.full(image.shape, 0.5, dtype=float_type) @@ -33,13 +35,16 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): for idx in range(num_iter + warmup): if idx == warmup: timer.start() - conv = np.convolve(im_deconv, psf, mode="same") + conv = np.convolve(im_deconv, psf, mode="same", method=conv_method) relative_blur = image / conv - im_deconv *= np.convolve(relative_blur, psf_mirror, mode="same") + im_deconv *= np.convolve( + relative_blur, psf_mirror, mode="same", method=conv_method + ) total = timer.stop() if timing: print("Elapsed Time: " + str(total) + " ms") + return total if __name__ == "__main__": @@ -109,6 +114,13 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): action="store_true", help="perform timing", ) + parser.add_argument( + "--conv-method", + dest="conv_method", + type=str, + default="auto", + help="convolution method (auto by default)", + ) args, np, timer = parse_args(parser) @@ -122,5 +134,6 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): args.I, args.warmup, args.timing, + args.conv_method, ), ) diff --git a/examples/scan.py b/examples/scan.py index 09acad0608..00907a60af 100644 --- a/examples/scan.py +++ b/examples/scan.py @@ -62,7 +62,7 @@ def check_scan(OP, A, B, ax): else: print("FAIL!") print(f"INPUT : {A}") - print(f"CUNUMERIC: {B}") + print(f"CUPYNUMERIC: {B}") print(f"NUMPY : {C}") assert False diff --git a/install.py b/install.py index f58e19ec05..ae0639a7dd 100755 --- a/install.py +++ b/install.py @@ -108,13 
+108,15 @@ def find_cmake_val(pattern, filepath): def was_previously_built_with_different_build_isolation( - isolated, cunumeric_build_dir + isolated, cupynumeric_build_dir ): if ( - cunumeric_build_dir is not None - and os.path.exists(cunumeric_build_dir) + cupynumeric_build_dir is not None + and os.path.exists(cupynumeric_build_dir) and os.path.exists( - cmake_cache := os.path.join(cunumeric_build_dir, "CMakeCache.txt") + cmake_cache := os.path.join( + cupynumeric_build_dir, "CMakeCache.txt" + ) ) ): try: @@ -154,8 +156,8 @@ def find_legate_cmake_dir() -> Path: # conda env. return path - # Possibly installed in an editable installation, in which case legate-config.cmake - # and friends will live in the root binary directory. + # Possibly installed in an editable installation, in which case legate + # config.cmake and friends will live in the root binary directory. root_path = path.root assert isinstance(root_path, str) while not any(p.name == "legate-config.cmake" for p in path.iterdir()): @@ -167,7 +169,7 @@ def find_legate_cmake_dir() -> Path: return path -def install_cunumeric( +def install_cupynumeric( arch, build_isolation, with_tests, @@ -251,7 +253,7 @@ def install_cunumeric( dirname = os.path.dirname realpath = os.path.realpath - cunumeric_dir = dirname(realpath(__file__)) + cupynumeric_dir = dirname(realpath(__file__)) if thread_count is None: thread_count = multiprocessing.cpu_count() @@ -260,7 +262,7 @@ def validate_path(path): if path is None or (path := str(path)) == "": return None if not os.path.isabs(path): - path = join(cunumeric_dir, path) + path = join(cupynumeric_dir, path) if not exists(path := realpath(path)): print(f"Error: path does not exist: {path}") sys.exit(1) @@ -288,20 +290,20 @@ def validate_path(path): print("cutensor_dir: ", cutensor_dir) print("openblas_dir: ", openblas_dir) - skbuild_dir = join(cunumeric_dir, "_skbuild") - cunumeric_build_dir = scikit_build_cmake_build_dir(skbuild_dir) + skbuild_dir = join(cupynumeric_dir, 
"_skbuild") + cupynumeric_build_dir = scikit_build_cmake_build_dir(skbuild_dir) if was_previously_built_with_different_build_isolation( - build_isolation and not editable, cunumeric_build_dir + build_isolation and not editable, cupynumeric_build_dir ): print("Performing a clean build to accommodate build isolation.") clean_first = True cmd_env = dict(os.environ.items()) - # Explicitly uninstall cunumeric if doing a clean/isolated build. + # Explicitly uninstall cupynumeric if doing a clean/isolated build. # - # A prior installation may have built and installed cunumeric C++ + # A prior installation may have built and installed cupynumeric C++ # dependencies (like BLAS or tblis). # # CMake will find and use them for the current build, which would normally @@ -313,23 +315,23 @@ def validate_path(path): # these dependencies, triggering CMake to build and install them again. if clean_first or (build_isolation and not editable): execute_command( - [sys.executable, "-m", "pip", "uninstall", "-y", "cunumeric"], + [sys.executable, "-m", "pip", "uninstall", "-y", "cupynumeric"], verbose, ignore_errors=True, - cwd=cunumeric_dir, + cwd=cupynumeric_dir, env=cmd_env, ) if clean_first: shutil.rmtree(skbuild_dir, ignore_errors=True) - shutil.rmtree(join(cunumeric_dir, "dist"), ignore_errors=True) - shutil.rmtree(join(cunumeric_dir, "build"), ignore_errors=True) + shutil.rmtree(join(cupynumeric_dir, "dist"), ignore_errors=True) + shutil.rmtree(join(cupynumeric_dir, "build"), ignore_errors=True) shutil.rmtree( - join(cunumeric_dir, "cunumeric.egg-info"), + join(cupynumeric_dir, "cupynumeric.egg-info"), ignore_errors=True, ) - # Configure and build cuNumeric via setup.py + # Configure and build cuPyNumeric via setup.py pip_install_cmd = [sys.executable, "-m", "pip", "install"] install_dir = None @@ -376,8 +378,8 @@ def validate_path(path): cmake_flags += f"""\ -DCMAKE_BUILD_TYPE={( - "Debug" if debug else "RelWithDebInfo" if debug_release else "Release" -)} + "Debug" if debug else 
"RelWithDebInfo" if debug_release else "Release" + )} -DBUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES={str(arch)} -DLegion_MAX_DIM={str(maxdim)} @@ -389,7 +391,7 @@ def validate_path(path): -DLegion_USE_LLVM={("ON" if llvm else "OFF")} -DLegion_NETWORKS={";".join(networks)} -DLegion_USE_HDF5={("ON" if hdf else "OFF")} --Dcunumeric_BUILD_TESTS={("ON" if with_tests else "OFF")} +-Dcupynumeric_BUILD_TESTS={("ON" if with_tests else "OFF")} """.splitlines() if march: @@ -412,7 +414,7 @@ def validate_path(path): cmake_flags += ["-Dcutensor_DIR=%s" % cutensor_dir] # A custom path to cuRAND is ignored when CUDA support is available if cuda and curand_dir is not None: - cmake_flags += ["-Dcunumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir] + cmake_flags += ["-Dcupynumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir] cmake_flags += ["-Dlegate_ROOT=%s" % str(legate_dir)] cmake_flags += ["-DCMAKE_BUILD_PARALLEL_LEVEL=%s" % thread_count] @@ -433,18 +435,18 @@ def validate_path(path): } ) - execute_command(pip_install_cmd, verbose, cwd=cunumeric_dir, env=cmd_env) + execute_command(pip_install_cmd, verbose, cwd=cupynumeric_dir, env=cmd_env) def driver(): - parser = argparse.ArgumentParser(description="Install cuNumeric.") + parser = argparse.ArgumentParser(description="Install cuPyNumeric.") parser.add_argument( "--debug", dest="debug", action="store_true", required=False, default=os.environ.get("DEBUG", "0") == "1", - help="Build cuNumeric with no optimizations.", + help="Build cuPyNumeric with no optimizations.", ) parser.add_argument( "--debug-release", @@ -452,7 +454,7 @@ def driver(): action="store_true", required=False, default=os.environ.get("DEBUG_RELEASE", "0") == "1", - help="Build cuNumeric with optimizations, but include debugging " + help="Build cuPyNumeric with optimizations, but include debugging " "symbols.", ) parser.add_argument( @@ -461,7 +463,7 @@ def driver(): action="store_true", required=False, default=False, - help="Build cuNumeric tests.", + help="Build cuPyNumeric 
tests.", ) parser.add_argument( "--check-bounds", @@ -469,21 +471,21 @@ def driver(): action="store_true", required=False, default=False, - help="Build cuNumeric with bounds checks.", + help="Build cuPyNumeric with bounds checks.", ) parser.add_argument( "--max-dim", dest="maxdim", type=int, default=int(os.environ.get("LEGION_MAX_DIM", 4)), - help="Maximum number of dimensions that cuNumeric will support", + help="Maximum number of dimensions that cuPyNumeric will support", ) parser.add_argument( "--max-fields", dest="maxfields", type=int, default=int(os.environ.get("LEGION_MAX_FIELDS", 256)), - help="Maximum number of fields that cuNumeric will support", + help="Maximum number of fields that cuPyNumeric will support", ) parser.add_argument( "--network", @@ -510,7 +512,7 @@ def driver(): default=os.environ.get("OPENBLAS_PATH"), help="Path to OpenBLAS installation directory. Note that providing a " "user-defined BLAS library may lead to dynamic library conflicts with " - "BLAS loaded by Python's Numpy. When using cuNumeric's BLAS, this " + "BLAS loaded by Python's Numpy. 
When using cuPyNumeric's BLAS, this " "issue is prevented by a custom library name.", ) parser.add_argument( @@ -579,7 +581,7 @@ def driver(): "--cuda", action=BooleanFlag, default=os.environ.get("USE_CUDA", "0") == "1", - help="Build cuNumeric with CUDA support.", + help="Build cuPyNumeric with CUDA support.", ) parser.add_argument( "--with-cuda", @@ -601,7 +603,7 @@ def driver(): "--openmp", action=BooleanFlag, default=os.environ.get("USE_OPENMP", "0") == "1", - help="Build cuNumeric with OpenMP support.", + help="Build cuPyNumeric with OpenMP support.", ) parser.add_argument( "--march", @@ -616,7 +618,7 @@ def driver(): action="store_true", required=False, default=os.environ.get("USE_LLVM", "0") == "1", - help="Build cuNumeric with LLVM support.", + help="Build cuPyNumeric with LLVM support.", ) parser.add_argument( "--hdf5", @@ -625,7 +627,7 @@ def driver(): action="store_true", required=False, default=os.environ.get("USE_HDF", "0") == "1", - help="Build cuNumeric with HDF support.", + help="Build cuPyNumeric with HDF support.", ) parser.add_argument( "--spy", @@ -633,7 +635,7 @@ def driver(): action="store_true", required=False, default=os.environ.get("USE_SPY", "0") == "1", - help="Build cuNumeric with detailed Legion Spy enabled.", + help="Build cuPyNumeric with detailed Legion Spy enabled.", ) parser.add_argument( "--conduit", @@ -645,7 +647,7 @@ def driver(): # See https://github.com/nv-legate/legate.core/issues/294. 
choices=["ibv", "ucx", "aries", "mpi"], default=os.environ.get("CONDUIT"), - help="Build cuNumeric with specified GASNet conduit.", + help="Build cuPyNumeric with specified GASNet conduit.", ) parser.add_argument( "--clean", @@ -701,7 +703,7 @@ def driver(): ) args, unknown = parser.parse_known_args() - install_cunumeric(unknown=unknown, **vars(args)) + install_cupynumeric(unknown=unknown, **vars(args)) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 022a0f0a97..cc807dbb1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,8 +90,8 @@ warn_unused_configs = true # legate files need to be listed here for now # since they are included in the type check module = [ - "cunumeric.install_info", - "cunumeric._version", + "cupynumeric.install_info", + "cupynumeric._version", "legate._version", "legate.__main__", "legate.install_info", diff --git a/scripts/api_compare.py b/scripts/api_compare.py index 37923157e2..7f3561fa22 100644 --- a/scripts/api_compare.py +++ b/scripts/api_compare.py @@ -18,9 +18,9 @@ import sys from dataclasses import astuple, dataclass -from cunumeric._sphinxext._comparison_config import GROUPED_CONFIGS -from cunumeric._sphinxext._comparison_util import filter_names -from cunumeric.coverage import is_implemented +from cupynumeric._sphinxext._comparison_config import GROUPED_CONFIGS +from cupynumeric._sphinxext._comparison_util import filter_names +from cupynumeric.coverage import is_implemented @dataclass @@ -35,16 +35,20 @@ def get_namespaces(attr): import cupy import numpy - import cunumeric + import cupynumeric if attr is None: - return numpy, cunumeric, cupy + return numpy, cupynumeric, cupy - return getattr(numpy, attr), getattr(cunumeric, attr), getattr(cupy, attr) + return ( + getattr(numpy, attr), + getattr(cupynumeric, attr), + getattr(cupy, attr), + ) def write_rows(rows): - headers = ("group", "numpy", "cunumeric", "cupy") + headers = ("group", "numpy", "cupynumeric", "cupy") writer = csv.writer(sys.stdout) 
writer.writerow(headers) for row in rows: diff --git a/scripts/build/python/cupynumeric/CMakeLists.txt b/scripts/build/python/cupynumeric/CMakeLists.txt new file mode 100644 index 0000000000..f0fa381c3a --- /dev/null +++ b/scripts/build/python/cupynumeric/CMakeLists.txt @@ -0,0 +1,39 @@ +#============================================================================= +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +cmake_minimum_required(VERSION 3.26.4) + +project(cupynumeric-python VERSION 25.05.00 LANGUAGES CXX) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set(CUPYNUMERIC_BUILD_PIP_WHEELS ON) + +add_subdirectory(../../../.. cupynumeric-all) + +set(rpaths + "$ORIGIN/../../legate/lib64" + "$ORIGIN/../../cutensor/lib" + "$ORIGIN/../../nvidia/cublas/lib" + "$ORIGIN/../../nvidia/cufft/lib" + "$ORIGIN/../../nvidia/cusolver/lib" + "$ORIGIN/../../nvidia/cusparse/lib" + "$ORIGIN/../../nvidia/nvjitlink/lib" +) +set_property( + TARGET cupynumeric + PROPERTY INSTALL_RPATH ${rpaths} + APPEND +) diff --git a/scripts/build/python/cupynumeric/pyproject.toml b/scripts/build/python/cupynumeric/pyproject.toml new file mode 100644 index 0000000000..ae17766497 --- /dev/null +++ b/scripts/build/python/cupynumeric/pyproject.toml @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +[build-system] +requires = [ + "scikit-build-core", + "cython>=3.0.1", + "rich", +] +build-backend = "scikit_build_core.build" +python-requires = ">=3.10" + +[project] +name = "nvidia-cupynumeric" +authors = [{name = "NVIDIA Corporation"}] +license = {text = "Apache-2.0"} +description = "cupynumeric - drop in replacement for numpy" +classifiers = [ + "Intended Audience :: Developers", + "Topic :: Database", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] +dependencies = [ + "numpy!=2.1.0", + "cffi", + "opt_einsum", + "legate==25.5.*,>=0.0.0a0", + "cutensor-cu12", + "nvidia-cublas-cu12", + "nvidia-cufft-cu12", + "nvidia-cusolver-cu12", + "nvidia-cusparse-cu12", + "nvidia-nvjitlink-cu12", +] +dynamic = ["version"] + +[project.urls] +homepage = "https://github.com/nv-legate/cupynumeric" + +[project.entry-points."cmake.prefix"] +cupynumeric = "cupynumeric" + +[tool.scikit-build.cmake] +version = ">=3.26.4" + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.setuptools_scm" + +[tool.scikit-build.sdist] +include = [ + "../../../../cupynumeric/_version.py", +] + +[tool.setuptools_scm] +write_to = "cupynumeric/_version.py" +root = "../../../../" + +[tool.scikit-build.build] +verbose = true + +[tool.scikit-build.logging] +level = "DEBUG" + +[tool.scikit-build.wheel] +exclude = 
["**.pyx", "**CMakeLists.txt", "**.pxd"] +install-dir = "cupynumeric" + +[tool.scikit-build] +build-dir = "buildwheel" + +[tool.scikit-build.wheel.packages] +"cupynumeric" = "../../../../cupynumeric" diff --git a/scripts/conda-build.sh b/scripts/conda-build.sh index 47a4528274..a01c27ef29 100755 --- a/scripts/conda-build.sh +++ b/scripts/conda-build.sh @@ -1,11 +1,11 @@ #! /usr/bin/env bash -# mamba create -n cunumeric_build python=$PYTHON_VERSION boa git +# mamba create -n cupynumeric_build python=$PYTHON_VERSION boa git cd $(dirname "$(realpath "$0")")/.. -mkdir -p /tmp/conda-build/cunumeric -rm -rf /tmp/conda-build/cunumeric/* +mkdir -p /tmp/conda-build/cupynumeric +rm -rf /tmp/conda-build/cupynumeric/* PYTHON_VERSION="${PYTHON_VERSION:-3.10}" @@ -15,7 +15,7 @@ conda mambabuild \ --override-channels \ -c conda-forge -c https://github.com/nv-legate/ucx-package/raw/main \ -c file:///tmp/conda-build/legate_core \ - --croot /tmp/conda-build/cunumeric \ + --croot /tmp/conda-build/cupynumeric \ --no-test \ --no-verify \ --no-build-id \ diff --git a/setup.cfg b/setup.cfg index fb6cf969a2..fd1da9c82a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,10 +3,10 @@ [versioneer] VCS = git style = pep440 -versionfile_source = cunumeric/_version.py -versionfile_build = cunumeric/_version.py +versionfile_source = cupynumeric/_version.py +versionfile_build = cupynumeric/_version.py tag_prefix = v -parentdir_prefix = cunumeric- +parentdir_prefix = cupynumeric- [flake8] exclude = __init__.py @@ -31,7 +31,7 @@ known_legion= legion_cffi legion_top known_first_party= - cunumeric + cupynumeric default_section=THIRDPARTY sections=FUTURE,STDLIB,THIRDPARTY,LEGION,FIRSTPARTY,LOCALFOLDER skip= diff --git a/setup.py b/setup.py index 530216c86b..bc9b8918c1 100644 --- a/setup.py +++ b/setup.py @@ -21,10 +21,10 @@ import versioneer setup( - name="cunumeric", + name="cupynumeric", version=versioneer.get_version(), description="An Aspiring Drop-In Replacement for NumPy at Scale", - 
url="https://github.com/nv-legate/cunumeric", + url="https://github.com/nv-legate/cupynumeric", author="NVIDIA Corporation", license="Apache 2.0", classifiers=[ @@ -39,11 +39,11 @@ ], packages=find_packages( where=".", - include=["cunumeric*"], + include=["cupynumeric*"], ), - package_data={"cunumeric": ["_sphinxext/_templates/*.rst"]}, + package_data={"cupynumeric": ["_sphinxext/_templates/*.rst"]}, include_package_data=True, cmdclass=versioneer.get_cmdclass(), - install_requires=["numpy>=1.22,<2"], + install_requires=["cffi", "numpy>=1.22,<2", "opt_einsum>=3.3"], zip_safe=False, ) diff --git a/src/cunumeric/cunumeric.cc b/src/cunumeric/cunumeric.cc deleted file mode 100644 index 2f62991118..0000000000 --- a/src/cunumeric/cunumeric.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#include "cunumeric/cunumeric_c.h" -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/mapper.h" -#include "cunumeric/runtime.h" -#include "cunumeric/unary/unary_red_util.h" - -using namespace legate; - -namespace cunumeric { - -static const char* const cunumeric_library_name = "cunumeric"; - -/*static*/ TaskRegistrar& CuNumericRegistrar::get_registrar() -{ - static TaskRegistrar registrar; - return registrar; -} - -void unload_cudalibs() noexcept -{ - auto machine = legate::get_machine(); - - auto num_gpus = machine.count(legate::mapping::TaskTarget::GPU); - if (0 == num_gpus) { - return; - } - - auto runtime = legate::Runtime::get_runtime(); - auto library = runtime->find_library(cunumeric_library_name); - - // Issue an execution fence so all outstanding tasks are done before we start destroying handles - runtime->issue_execution_fence(); - - runtime->submit( - runtime->create_task(library, - legate::LocalTaskID{CuNumericOpCode::CUNUMERIC_UNLOAD_CUDALIBS}, - legate::tuple{num_gpus})); -} - -void registration_callback() -{ - ResourceConfig config; - config.max_tasks = CUNUMERIC_MAX_TASKS; - config.max_reduction_ops = CUNUMERIC_MAX_REDOPS; - - auto runtime = legate::Runtime::get_runtime(); - auto library = - runtime->create_library(cunumeric_library_name, config, std::make_unique()); - - CuNumericRegistrar::get_registrar().register_all_tasks(library); - CuNumericRuntime::initialize(runtime, library); - - legate::register_shutdown_callback(unload_cudalibs); -} - -} // namespace cunumeric - -extern "C" { - -void cunumeric_perform_registration(void) { cunumeric::registration_callback(); } - -bool cunumeric_has_cusolvermp() -{ - return LEGATE_DEFINED(LEGATE_USE_CUDA) && LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP); -} -} diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h deleted file mode 100644 index c569f786e1..0000000000 --- a/src/cunumeric/cunumeric_c.h +++ /dev/null @@ -1,349 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * 
Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#ifndef __CUNUMERIC_C_H__ -#define __CUNUMERIC_C_H__ - -// Match these to CuNumericOpCode in config.py -// Also, sort these alphabetically except the first one for easy lookup later -enum CuNumericOpCode { - _CUNUMERIC_OP_CODE_BASE = 0, - CUNUMERIC_ADVANCED_INDEXING, - CUNUMERIC_ARANGE, - CUNUMERIC_ARGWHERE, - CUNUMERIC_BATCHED_CHOLESKY, - CUNUMERIC_BINARY_OP, - CUNUMERIC_BINARY_RED, - CUNUMERIC_BINCOUNT, - CUNUMERIC_BITGENERATOR, - CUNUMERIC_CHOOSE, - CUNUMERIC_CONTRACT, - CUNUMERIC_CONVERT, - CUNUMERIC_CONVOLVE, - CUNUMERIC_SCAN_GLOBAL, - CUNUMERIC_SCAN_LOCAL, - CUNUMERIC_DIAG, - CUNUMERIC_DOT, - CUNUMERIC_EYE, - CUNUMERIC_FFT, - CUNUMERIC_FILL, - CUNUMERIC_FLIP, - CUNUMERIC_GEMM, - CUNUMERIC_HISTOGRAM, - CUNUMERIC_LOAD_CUDALIBS, - CUNUMERIC_MATMUL, - CUNUMERIC_MATVECMUL, - CUNUMERIC_MP_POTRF, - CUNUMERIC_MP_SOLVE, - CUNUMERIC_NONZERO, - CUNUMERIC_PACKBITS, - CUNUMERIC_POTRF, - CUNUMERIC_PUTMASK, - CUNUMERIC_QR, - CUNUMERIC_RAND, - CUNUMERIC_READ, - CUNUMERIC_REPEAT, - CUNUMERIC_SCALAR_UNARY_RED, - CUNUMERIC_SEARCHSORTED, - CUNUMERIC_SELECT, - CUNUMERIC_SOLVE, - CUNUMERIC_SORT, - CUNUMERIC_SVD, - CUNUMERIC_SYRK, - CUNUMERIC_TILE, - CUNUMERIC_TRANSPOSE_COPY_2D, - CUNUMERIC_TRILU, - CUNUMERIC_TRSM, - CUNUMERIC_UNARY_OP, - CUNUMERIC_UNARY_RED, - CUNUMERIC_UNIQUE, - CUNUMERIC_UNIQUE_REDUCE, - CUNUMERIC_UNLOAD_CUDALIBS, - CUNUMERIC_UNPACKBITS, - CUNUMERIC_WHERE, - CUNUMERIC_WINDOW, - CUNUMERIC_WRAP, - 
CUNUMERIC_WRITE, - CUNUMERIC_ZIP, -}; - -// Match these to UnaryOpCode in config.py -// Also, sort these alphabetically for easy lookup later -enum CuNumericUnaryOpCode { - CUNUMERIC_UOP_ABSOLUTE = 1, - CUNUMERIC_UOP_ANGLE, - CUNUMERIC_UOP_ARCCOS, - CUNUMERIC_UOP_ARCCOSH, - CUNUMERIC_UOP_ARCSIN, - CUNUMERIC_UOP_ARCSINH, - CUNUMERIC_UOP_ARCTAN, - CUNUMERIC_UOP_ARCTANH, - CUNUMERIC_UOP_CBRT, - CUNUMERIC_UOP_CEIL, - CUNUMERIC_UOP_CLIP, - CUNUMERIC_UOP_CONJ, - CUNUMERIC_UOP_COPY, - CUNUMERIC_UOP_COS, - CUNUMERIC_UOP_COSH, - CUNUMERIC_UOP_DEG2RAD, - CUNUMERIC_UOP_EXP, - CUNUMERIC_UOP_EXP2, - CUNUMERIC_UOP_EXPM1, - CUNUMERIC_UOP_FLOOR, - CUNUMERIC_UOP_FREXP, - CUNUMERIC_UOP_GETARG, - CUNUMERIC_UOP_IMAG, - CUNUMERIC_UOP_INVERT, - CUNUMERIC_UOP_ISFINITE, - CUNUMERIC_UOP_ISINF, - CUNUMERIC_UOP_ISNAN, - CUNUMERIC_UOP_LOG, - CUNUMERIC_UOP_LOG10, - CUNUMERIC_UOP_LOG1P, - CUNUMERIC_UOP_LOG2, - CUNUMERIC_UOP_LOGICAL_NOT, - CUNUMERIC_UOP_MODF, - CUNUMERIC_UOP_NEGATIVE, - CUNUMERIC_UOP_POSITIVE, - CUNUMERIC_UOP_RAD2DEG, - CUNUMERIC_UOP_REAL, - CUNUMERIC_UOP_RECIPROCAL, - CUNUMERIC_UOP_RINT, - CUNUMERIC_UOP_ROUND, - CUNUMERIC_UOP_SIGN, - CUNUMERIC_UOP_SIGNBIT, - CUNUMERIC_UOP_SIN, - CUNUMERIC_UOP_SINH, - CUNUMERIC_UOP_SQRT, - CUNUMERIC_UOP_SQUARE, - CUNUMERIC_UOP_TAN, - CUNUMERIC_UOP_TANH, - CUNUMERIC_UOP_TRUNC, -}; - -// Match these to UnaryRedCode in config.py -// Also, sort these alphabetically for easy lookup later -enum CuNumericUnaryRedCode { - CUNUMERIC_RED_ALL = 1, - CUNUMERIC_RED_ANY, - CUNUMERIC_RED_ARGMAX, - CUNUMERIC_RED_ARGMIN, - CUNUMERIC_RED_CONTAINS, - CUNUMERIC_RED_COUNT_NONZERO, - CUNUMERIC_RED_MAX, - CUNUMERIC_RED_MIN, - CUNUMERIC_RED_NANARGMAX, - CUNUMERIC_RED_NANARGMIN, - CUNUMERIC_RED_NANMAX, - CUNUMERIC_RED_NANMIN, - CUNUMERIC_RED_NANPROD, - CUNUMERIC_RED_NANSUM, - CUNUMERIC_RED_PROD, - CUNUMERIC_RED_SUM, - CUNUMERIC_RED_SUM_SQUARES, - CUNUMERIC_RED_VARIANCE -}; - -// Match these to BinaryOpCode in config.py -// Also, sort these alphabetically for easy lookup 
later -enum CuNumericBinaryOpCode { - CUNUMERIC_BINOP_ADD = 1, - CUNUMERIC_BINOP_ARCTAN2, - CUNUMERIC_BINOP_BITWISE_AND, - CUNUMERIC_BINOP_BITWISE_OR, - CUNUMERIC_BINOP_BITWISE_XOR, - CUNUMERIC_BINOP_COPYSIGN, - CUNUMERIC_BINOP_DIVIDE, - CUNUMERIC_BINOP_EQUAL, - CUNUMERIC_BINOP_FLOAT_POWER, - CUNUMERIC_BINOP_FLOOR_DIVIDE, - CUNUMERIC_BINOP_FMOD, - CUNUMERIC_BINOP_GCD, - CUNUMERIC_BINOP_GREATER, - CUNUMERIC_BINOP_GREATER_EQUAL, - CUNUMERIC_BINOP_HYPOT, - CUNUMERIC_BINOP_ISCLOSE, - CUNUMERIC_BINOP_LCM, - CUNUMERIC_BINOP_LDEXP, - CUNUMERIC_BINOP_LEFT_SHIFT, - CUNUMERIC_BINOP_LESS, - CUNUMERIC_BINOP_LESS_EQUAL, - CUNUMERIC_BINOP_LOGADDEXP, - CUNUMERIC_BINOP_LOGADDEXP2, - CUNUMERIC_BINOP_LOGICAL_AND, - CUNUMERIC_BINOP_LOGICAL_OR, - CUNUMERIC_BINOP_LOGICAL_XOR, - CUNUMERIC_BINOP_MAXIMUM, - CUNUMERIC_BINOP_MINIMUM, - CUNUMERIC_BINOP_MOD, - CUNUMERIC_BINOP_MULTIPLY, - CUNUMERIC_BINOP_NEXTAFTER, - CUNUMERIC_BINOP_NOT_EQUAL, - CUNUMERIC_BINOP_POWER, - CUNUMERIC_BINOP_RIGHT_SHIFT, - CUNUMERIC_BINOP_SUBTRACT, -}; - -// Match these to WindowOpCode in config.py -// Also, sort these alphabetically for easy lookup later -enum CuNumericWindowOpCode { - CUNUMERIC_WINDOW_BARLETT = 1, - CUNUMERIC_WINDOW_BLACKMAN, - CUNUMERIC_WINDOW_HAMMING, - CUNUMERIC_WINDOW_HANNING, - CUNUMERIC_WINDOW_KAISER, -}; - -// Match these to CuNumericRedopCode in config.py -enum CuNumericRedopID { - CUNUMERIC_ARGMAX_REDOP = 1, - CUNUMERIC_ARGMIN_REDOP = 2, -}; - -enum CuNumericBounds { - CUNUMERIC_MAX_REDOPS = 1024, - CUNUMERIC_MAX_TASKS = 1048576, -}; - -// Match these to ScanCode in config.py -// Also, sort these alphabetically for easy lookup later -enum CuNumericScanCode { - CUNUMERIC_SCAN_PROD = 1, - CUNUMERIC_SCAN_SUM, -}; - -// Match these to ConvertCode in config.py -// Also, sort these alphabetically for easy lookup later -enum CuNumericConvertCode { - CUNUMERIC_CONVERT_NAN_NOOP = 1, - CUNUMERIC_CONVERT_NAN_PROD, - CUNUMERIC_CONVERT_NAN_SUM, -}; - -// Match these to BitGeneratorOperation in 
config.py -enum CuNumericBitGeneratorOperation { - CUNUMERIC_BITGENOP_CREATE = 1, - CUNUMERIC_BITGENOP_DESTROY = 2, - CUNUMERIC_BITGENOP_RAND_RAW = 3, - CUNUMERIC_BITGENOP_DISTRIBUTION = 4, -}; - -// Match these to BitGeneratorType in config.py -enum CuNumericBitGeneratorType { - CUNUMERIC_BITGENTYPE_DEFAULT = 0, - CUNUMERIC_BITGENTYPE_XORWOW = 1, - CUNUMERIC_BITGENTYPE_MRG32K3A = 2, - CUNUMERIC_BITGENTYPE_MTGP32 = 3, - CUNUMERIC_BITGENTYPE_MT19937 = 4, - CUNUMERIC_BITGENTYPE_PHILOX4_32_10 = 5, -}; - -// Match these to BitGeneratorDistribution in config.py -enum CuNumericBitGeneratorDistribution { - CUNUMERIC_BITGENDIST_INTEGERS_16 = 1, - CUNUMERIC_BITGENDIST_INTEGERS_32, - CUNUMERIC_BITGENDIST_INTEGERS_64, - CUNUMERIC_BITGENDIST_UNIFORM_32, - CUNUMERIC_BITGENDIST_UNIFORM_64, - CUNUMERIC_BITGENDIST_LOGNORMAL_32, - CUNUMERIC_BITGENDIST_LOGNORMAL_64, - CUNUMERIC_BITGENDIST_NORMAL_32, - CUNUMERIC_BITGENDIST_NORMAL_64, - CUNUMERIC_BITGENDIST_POISSON, - CUNUMERIC_BITGENDIST_EXPONENTIAL_32, - CUNUMERIC_BITGENDIST_EXPONENTIAL_64, - CUNUMERIC_BITGENDIST_GUMBEL_32, - CUNUMERIC_BITGENDIST_GUMBEL_64, - CUNUMERIC_BITGENDIST_LAPLACE_32, - CUNUMERIC_BITGENDIST_LAPLACE_64, - CUNUMERIC_BITGENDIST_LOGISTIC_32, - CUNUMERIC_BITGENDIST_LOGISTIC_64, - CUNUMERIC_BITGENDIST_PARETO_32, - CUNUMERIC_BITGENDIST_PARETO_64, - CUNUMERIC_BITGENDIST_POWER_32, - CUNUMERIC_BITGENDIST_POWER_64, - CUNUMERIC_BITGENDIST_RAYLEIGH_32, - CUNUMERIC_BITGENDIST_RAYLEIGH_64, - CUNUMERIC_BITGENDIST_CAUCHY_32, - CUNUMERIC_BITGENDIST_CAUCHY_64, - CUNUMERIC_BITGENDIST_TRIANGULAR_32, - CUNUMERIC_BITGENDIST_TRIANGULAR_64, - CUNUMERIC_BITGENDIST_WEIBULL_32, - CUNUMERIC_BITGENDIST_WEIBULL_64, - CUNUMERIC_BITGENDIST_BYTES, - CUNUMERIC_BITGENDIST_BETA_32, - CUNUMERIC_BITGENDIST_BETA_64, - CUNUMERIC_BITGENDIST_F_32, - CUNUMERIC_BITGENDIST_F_64, - CUNUMERIC_BITGENDIST_LOGSERIES, - CUNUMERIC_BITGENDIST_NONCENTRAL_F_32, - CUNUMERIC_BITGENDIST_NONCENTRAL_F_64, - CUNUMERIC_BITGENDIST_CHISQUARE_32, - 
CUNUMERIC_BITGENDIST_CHISQUARE_64, - CUNUMERIC_BITGENDIST_GAMMA_32, - CUNUMERIC_BITGENDIST_GAMMA_64, - CUNUMERIC_BITGENDIST_STANDARD_T_32, - CUNUMERIC_BITGENDIST_STANDARD_T_64, - CUNUMERIC_BITGENDIST_HYPERGEOMETRIC, - CUNUMERIC_BITGENDIST_VONMISES_32, - CUNUMERIC_BITGENDIST_VONMISES_64, - CUNUMERIC_BITGENDIST_ZIPF, - CUNUMERIC_BITGENDIST_GEOMETRIC, - CUNUMERIC_BITGENDIST_WALD_32, - CUNUMERIC_BITGENDIST_WALD_64, - CUNUMERIC_BITGENDIST_BINOMIAL, - CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL, -}; - -// These fft types match CuNumericFFTType in config.py and cufftType -enum CuNumericFFTType { - CUNUMERIC_FFT_R2C = 0x2a, // Real to complex (interleaved) - CUNUMERIC_FFT_C2R = 0x2c, // Complex (interleaved) to real - CUNUMERIC_FFT_C2C = 0x29, // Complex to complex (interleaved) - CUNUMERIC_FFT_D2Z = 0x6a, // Double to double-complex (interleaved) - CUNUMERIC_FFT_Z2D = 0x6c, // Double-complex (interleaved) to double - CUNUMERIC_FFT_Z2Z = 0x69 // Double-complex to double-complex (interleaved) -}; - -// These fft types match CuNumericFFTDirection in config.py and cufftDirection -enum CuNumericFFTDirection { CUNUMERIC_FFT_FORWARD = -1, CUNUMERIC_FFT_INVERSE = 1 }; - -// Match these to Bitorder in config.py -enum CuNumericBitorder { CUNUMERIC_BITORDER_BIG = 0, CUNUMERIC_BITORDER_LITTLE = 1 }; - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct ReductionOpIds { - int argmax_redop_id; - int argmin_redop_id; -} ReductionOpIds; - -void cunumeric_perform_registration(); -bool cunumeric_has_cusolvermp(); - -unsigned cunumeric_max_eager_volume(); - -unsigned cunumeric_matmul_cache_size(); - -struct ReductionOpIds cunumeric_register_reduction_ops(int code); - -#ifdef __cplusplus -} -#endif - -#endif // __CUNUMERIC_C_H__ diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc deleted file mode 100644 index 711ee0363e..0000000000 --- a/src/cunumeric/mapper.cc +++ /dev/null @@ -1,229 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include "cunumeric/mapper.h" - -using namespace legate; -using namespace legate::mapping; - -namespace cunumeric { - -TaskTarget CuNumericMapper::task_target(const legate::mapping::Task& task, - const std::vector& options) -{ - return *options.begin(); -} - -Scalar CuNumericMapper::tunable_value(TunableID tunable_id) -{ - LEGATE_ABORT("cuNumeric does not use any tunable values"); -} - -std::vector CuNumericMapper::store_mappings( - const mapping::Task& task, const std::vector& options) -{ - switch (static_cast(task.task_id())) { - case CUNUMERIC_CONVOLVE: { - std::vector mappings; - auto inputs = task.inputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front())); - mappings.push_back(StoreMapping::default_mapping(inputs[1].data(), options.front())); - auto& input_mapping = mappings.back(); - for (uint32_t idx = 2; idx < inputs.size(); ++idx) { - input_mapping.add_store(inputs[idx].data()); - } - return mappings; - } - case CUNUMERIC_FFT: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front())); - mappings.push_back( - StoreMapping::default_mapping(outputs[0].data(), options.front(), true /*exact*/)); - return mappings; - } - case CUNUMERIC_TRANSPOSE_COPY_2D: { - std::vector mappings; - auto output = task.output(0); - mappings.push_back(StoreMapping::default_mapping(output.data(), 
options.front())); - mappings.back().policy().ordering.set_fortran_order(); - mappings.back().policy().exact = true; - return std::move(mappings); - } - case CUNUMERIC_MATMUL: { - std::vector mappings; - auto inputA = task.input(1); - auto inputB = task.input(2); - - mappings.push_back( - StoreMapping::default_mapping(inputA.data(), options.front(), true /*exact*/)); - mappings.back().policy().redundant = true; - mappings.push_back( - StoreMapping::default_mapping(inputB.data(), options.front(), true /*exact*/)); - mappings.back().policy().redundant = true; - - auto outputC = task.output(0); - mappings.push_back( - StoreMapping::default_mapping(outputC.data(), options.front(), true /*exact*/)); - - return mappings; - } - case CUNUMERIC_MATVECMUL: - case CUNUMERIC_UNIQUE_REDUCE: { - // TODO: Our actual requirements are a little less strict than this; we require each array or - // vector to have a stride of 1 on at least one dimension. - std::vector mappings; - auto inputs = task.inputs(); - auto reductions = task.reductions(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - } - for (auto& reduction : reductions) { - mappings.push_back( - StoreMapping::default_mapping(reduction.data(), options.front(), true /*exact*/)); - } - return mappings; - } - case CUNUMERIC_POTRF: - case CUNUMERIC_QR: - case CUNUMERIC_TRSM: - case CUNUMERIC_SOLVE: - case CUNUMERIC_SVD: - case CUNUMERIC_SYRK: - case CUNUMERIC_GEMM: - case CUNUMERIC_MP_POTRF: - case CUNUMERIC_MP_SOLVE: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - mappings.back().policy().ordering.set_fortran_order(); - } - for (auto& output : outputs) { - mappings.push_back( - StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); - 
mappings.back().policy().ordering.set_fortran_order(); - } - return mappings; - } - // CHANGE: If this code is changed, make sure all layouts are - // consistent with those assumed in batched_cholesky.cu, etc - case CUNUMERIC_BATCHED_CHOLESKY: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - mappings.reserve(inputs.size() + outputs.size()); - for (auto& input : inputs) { - mappings.push_back(StoreMapping::default_mapping(input.data(), options.front())); - mappings.back().policy().exact = true; - mappings.back().policy().ordering.set_c_order(); - } - for (auto& output : outputs) { - mappings.push_back(StoreMapping::default_mapping(output.data(), options.front())); - mappings.back().policy().exact = true; - mappings.back().policy().ordering.set_c_order(); - } - return std::move(mappings); - } - case CUNUMERIC_TRILU: { - if (task.scalars().size() == 2) { - return {}; - } - // If we're here, this task was the post-processing for Cholesky. - // So we will request fortran ordering - std::vector mappings; - auto input = task.input(0); - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - mappings.back().policy().ordering.set_fortran_order(); - return mappings; - } - case CUNUMERIC_SEARCHSORTED: { - std::vector mappings; - auto inputs = task.inputs(); - mappings.push_back( - StoreMapping::default_mapping(inputs[0].data(), options.front(), true /*exact*/)); - return mappings; - } - case CUNUMERIC_SORT: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - } - for (auto& output : outputs) { - mappings.push_back( - StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); - } - return mappings; - } - case CUNUMERIC_SCAN_LOCAL: { - std::vector mappings; - auto inputs = task.inputs(); - auto 
outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - } - for (auto& output : outputs) { - mappings.push_back( - StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); - } - return mappings; - } - case CUNUMERIC_SCAN_GLOBAL: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - } - for (auto& output : outputs) { - mappings.push_back( - StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); - } - return mappings; - } - case CUNUMERIC_BITGENERATOR: { - std::vector mappings; - auto inputs = task.inputs(); - auto outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back( - StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); - } - for (auto& output : outputs) { - mappings.push_back( - StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); - } - return mappings; - } - default: { - return {}; - } - } - assert(false); - return {}; -} - -} // namespace cunumeric diff --git a/src/cunumeric/matrix/batched_cholesky.h b/src/cunumeric/matrix/batched_cholesky.h deleted file mode 100644 index 94713beffe..0000000000 --- a/src/cunumeric/matrix/batched_cholesky.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#pragma once - -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/cunumeric_c.h" - -namespace cunumeric { - -class BatchedCholeskyTask : public CuNumericTask { - public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BATCHED_CHOLESKY}; - - public: - static void cpu_variant(legate::TaskContext context); -#if LEGATE_DEFINED(LEGATE_USE_OPENMP) - static void omp_variant(legate::TaskContext context); -#endif -#if LEGATE_DEFINED(LEGATE_USE_CUDA) - static void gpu_variant(legate::TaskContext context); -#endif -}; - -} // namespace cunumeric diff --git a/src/cunumeric/matrix/potrf.h b/src/cunumeric/matrix/potrf.h deleted file mode 100644 index d2928df9fc..0000000000 --- a/src/cunumeric/matrix/potrf.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#pragma once - -#include "cunumeric/cunumeric_task.h" - -namespace cunumeric { - -class PotrfTask : public CuNumericTask { - public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_POTRF}; - - public: - static void cpu_variant(legate::TaskContext context); -#if LEGATE_DEFINED(LEGATE_USE_OPENMP) - static void omp_variant(legate::TaskContext context); -#endif -#if LEGATE_DEFINED(LEGATE_USE_CUDA) - static void gpu_variant(legate::TaskContext context); -#endif -}; - -} // namespace cunumeric diff --git a/src/cunumeric/random/bitgenerator_util.h b/src/cunumeric/random/bitgenerator_util.h deleted file mode 100644 index 0a726a9f08..0000000000 --- a/src/cunumeric/random/bitgenerator_util.h +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#pragma once - -#include "cunumeric/cunumeric_task.h" - -namespace cunumeric { - -// Match these to BitGeneratorOperation in config.py -enum class BitGeneratorOperation : int32_t { - CREATE = CUNUMERIC_BITGENOP_CREATE, - DESTROY = CUNUMERIC_BITGENOP_DESTROY, - RAND_RAW = CUNUMERIC_BITGENOP_RAND_RAW, - DISTRIBUTION = CUNUMERIC_BITGENOP_DISTRIBUTION, -}; - -// Match these to BitGeneratorType in config.py -enum class BitGeneratorType : uint32_t { - DEFAULT = CUNUMERIC_BITGENTYPE_DEFAULT, - XORWOW = CUNUMERIC_BITGENTYPE_XORWOW, - MRG32K3A = CUNUMERIC_BITGENTYPE_MRG32K3A, - MTGP32 = CUNUMERIC_BITGENTYPE_MTGP32, - MT19937 = CUNUMERIC_BITGENTYPE_MT19937, - PHILOX4_32_10 = CUNUMERIC_BITGENTYPE_PHILOX4_32_10, -}; - -// Match these to BitGeneratorDistribution in config.py -enum class BitGeneratorDistribution : int32_t { - INTEGERS_16 = CUNUMERIC_BITGENDIST_INTEGERS_16, - INTEGERS_32 = CUNUMERIC_BITGENDIST_INTEGERS_32, - INTEGERS_64 = CUNUMERIC_BITGENDIST_INTEGERS_64, - UNIFORM_32 = CUNUMERIC_BITGENDIST_UNIFORM_32, - UNIFORM_64 = CUNUMERIC_BITGENDIST_UNIFORM_64, - LOGNORMAL_32 = CUNUMERIC_BITGENDIST_LOGNORMAL_32, - LOGNORMAL_64 = CUNUMERIC_BITGENDIST_LOGNORMAL_64, - NORMAL_32 = CUNUMERIC_BITGENDIST_NORMAL_32, - NORMAL_64 = CUNUMERIC_BITGENDIST_NORMAL_64, - POISSON = CUNUMERIC_BITGENDIST_POISSON, - EXPONENTIAL_32 = CUNUMERIC_BITGENDIST_EXPONENTIAL_32, - EXPONENTIAL_64 = CUNUMERIC_BITGENDIST_EXPONENTIAL_64, - GUMBEL_32 = CUNUMERIC_BITGENDIST_GUMBEL_32, - GUMBEL_64 = CUNUMERIC_BITGENDIST_GUMBEL_64, - LAPLACE_32 = CUNUMERIC_BITGENDIST_LAPLACE_32, - LAPLACE_64 = CUNUMERIC_BITGENDIST_LAPLACE_64, - LOGISTIC_32 = CUNUMERIC_BITGENDIST_LOGISTIC_32, - LOGISTIC_64 = CUNUMERIC_BITGENDIST_LOGISTIC_64, - PARETO_32 = CUNUMERIC_BITGENDIST_PARETO_32, - PARETO_64 = CUNUMERIC_BITGENDIST_PARETO_64, - POWER_32 = CUNUMERIC_BITGENDIST_POWER_32, - POWER_64 = CUNUMERIC_BITGENDIST_POWER_64, - RAYLEIGH_32 = CUNUMERIC_BITGENDIST_RAYLEIGH_32, - RAYLEIGH_64 = CUNUMERIC_BITGENDIST_RAYLEIGH_64, - 
CAUCHY_32 = CUNUMERIC_BITGENDIST_CAUCHY_32, - CAUCHY_64 = CUNUMERIC_BITGENDIST_CAUCHY_64, - TRIANGULAR_32 = CUNUMERIC_BITGENDIST_TRIANGULAR_32, - TRIANGULAR_64 = CUNUMERIC_BITGENDIST_TRIANGULAR_64, - WEIBULL_32 = CUNUMERIC_BITGENDIST_WEIBULL_32, - WEIBULL_64 = CUNUMERIC_BITGENDIST_WEIBULL_64, - BYTES = CUNUMERIC_BITGENDIST_BYTES, - BETA_32 = CUNUMERIC_BITGENDIST_BETA_32, - BETA_64 = CUNUMERIC_BITGENDIST_BETA_64, - F_32 = CUNUMERIC_BITGENDIST_F_32, - F_64 = CUNUMERIC_BITGENDIST_F_64, - LOGSERIES = CUNUMERIC_BITGENDIST_LOGSERIES, - NONCENTRAL_F_32 = CUNUMERIC_BITGENDIST_NONCENTRAL_F_32, - NONCENTRAL_F_64 = CUNUMERIC_BITGENDIST_NONCENTRAL_F_64, - CHISQUARE_32 = CUNUMERIC_BITGENDIST_CHISQUARE_32, - CHISQUARE_64 = CUNUMERIC_BITGENDIST_CHISQUARE_64, - GAMMA_32 = CUNUMERIC_BITGENDIST_GAMMA_32, - GAMMA_64 = CUNUMERIC_BITGENDIST_GAMMA_64, - STANDARD_T_32 = CUNUMERIC_BITGENDIST_STANDARD_T_32, - STANDARD_T_64 = CUNUMERIC_BITGENDIST_STANDARD_T_64, - HYPERGEOMETRIC = CUNUMERIC_BITGENDIST_HYPERGEOMETRIC, - VONMISES_32 = CUNUMERIC_BITGENDIST_VONMISES_32, - VONMISES_64 = CUNUMERIC_BITGENDIST_VONMISES_64, - ZIPF = CUNUMERIC_BITGENDIST_ZIPF, - GEOMETRIC = CUNUMERIC_BITGENDIST_GEOMETRIC, - WALD_32 = CUNUMERIC_BITGENDIST_WALD_32, - WALD_64 = CUNUMERIC_BITGENDIST_WALD_64, - BINOMIAL = CUNUMERIC_BITGENDIST_BINOMIAL, - NEGATIVE_BINOMIAL = CUNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL, -}; - -} // namespace cunumeric diff --git a/src/cunumeric/runtime.cc b/src/cunumeric/runtime.cc deleted file mode 100644 index ff6afc92fc..0000000000 --- a/src/cunumeric/runtime.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include "env_defaults.h" -#include "cunumeric/runtime.h" - -#include "cunumeric/ndarray.h" -#include "cunumeric/unary/unary_red_util.h" - -#include -#include -#include - -namespace cunumeric { - -/*static*/ CuNumericRuntime* CuNumericRuntime::runtime_; - -extern void bootstrapping_callback(Legion::Machine machine, - Legion::Runtime* runtime, - const std::set& local_procs); - -void initialize(int32_t argc, char** argv) { cunumeric_perform_registration(); } - -CuNumericRuntime::CuNumericRuntime(legate::Runtime* legate_runtime, legate::Library library) - : legate_runtime_(legate_runtime), library_(library) -{ -} - -NDArray CuNumericRuntime::create_array(const legate::Type& type) -{ - auto store = legate_runtime_->create_store(type); - return NDArray(std::move(store)); -} - -NDArray CuNumericRuntime::create_array(std::vector shape, - const legate::Type& type, - bool optimize_scalar) -{ - auto store = legate_runtime_->create_store(legate::Shape{shape}, type, optimize_scalar); - return NDArray(std::move(store)); -} - -NDArray CuNumericRuntime::create_array(legate::LogicalStore&& store) -{ - return NDArray(std::move(store)); -} - -NDArray CuNumericRuntime::create_array(const legate::Type& type, int32_t dim) -{ - auto store = legate_runtime_->create_store(type, dim); - return NDArray(std::move(store)); -} - -legate::LogicalStore CuNumericRuntime::create_scalar_store(const Scalar& value) -{ - return legate_runtime_->create_store(value); -} - -legate::Type CuNumericRuntime::get_argred_type(const legate::Type& value_type) -{ - auto finder = 
argred_types_.find(value_type.code()); - if (finder != argred_types_.end()) { - return finder->second; - } - - auto argred_type = legate::struct_type({legate::int64(), value_type}, true /*align*/); - argred_types_.insert({value_type.code(), argred_type}); - return argred_type; -} - -legate::AutoTask CuNumericRuntime::create_task(CuNumericOpCode op_code) -{ - return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code}); -} - -legate::ManualTask CuNumericRuntime::create_task(CuNumericOpCode op_code, - const legate::tuple& launch_shape) -{ - return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code}, launch_shape); -} - -void CuNumericRuntime::submit(legate::AutoTask&& task) { legate_runtime_->submit(std::move(task)); } - -void CuNumericRuntime::submit(legate::ManualTask&& task) -{ - legate_runtime_->submit(std::move(task)); -} - -uint32_t CuNumericRuntime::get_next_random_epoch() { return next_epoch_++; } - -/*static*/ CuNumericRuntime* CuNumericRuntime::get_runtime() { return runtime_; } - -/*static*/ void CuNumericRuntime::initialize(legate::Runtime* legate_runtime, - legate::Library library) -{ - runtime_ = new CuNumericRuntime(legate_runtime, library); -} - -namespace { - -std::uint32_t extract_env(const char* env_name, - std::uint32_t default_value, - std::uint32_t test_value) -{ - auto parse_value = [](const char* value_char) { - auto value_sv = std::string_view{value_char}; - - std::uint32_t result{}; - if (auto&& [_, ec] = std::from_chars(value_sv.begin(), value_sv.end(), result); - ec != std::errc{}) { - throw std::runtime_error{std::make_error_code(ec).message()}; - } - - return result; - }; - - if (const auto* env_value = std::getenv(env_name); env_value) { - return parse_value(env_value); - } - - if (const auto* is_in_test_mode = std::getenv("LEGATE_TEST"); - is_in_test_mode && parse_value(is_in_test_mode)) { - return test_value; - } - - return default_value; -} - -} // namespace - -} // namespace cunumeric - -extern "C" { - 
-unsigned cunumeric_max_eager_volume() -{ - static const auto min_gpu_chunk = - cunumeric::extract_env("CUNUMERIC_MIN_GPU_CHUNK", MIN_GPU_CHUNK_DEFAULT, MIN_GPU_CHUNK_TEST); - static const auto min_cpu_chunk = - cunumeric::extract_env("CUNUMERIC_MIN_CPU_CHUNK", MIN_CPU_CHUNK_DEFAULT, MIN_CPU_CHUNK_TEST); - static const auto min_omp_chunk = - cunumeric::extract_env("CUNUMERIC_MIN_OMP_CHUNK", MIN_OMP_CHUNK_DEFAULT, MIN_OMP_CHUNK_TEST); - - auto machine = legate::get_machine(); - - if (machine.count(legate::mapping::TaskTarget::GPU) > 0) { - return min_gpu_chunk; - } - if (machine.count(legate::mapping::TaskTarget::OMP) > 0) { - return min_omp_chunk; - } - return min_cpu_chunk; -} - -unsigned cunumeric_matmul_cache_size() -{ - static const auto max_cache_size = cunumeric::extract_env( - "CUNUMERIC_MATMUL_CACHE_SIZE", MATMUL_CACHE_SIZE_DEFAULT, MATMUL_CACHE_SIZE_TEST); - return max_cache_size; -} - -} // extern "C" diff --git a/src/cunumeric/set/unique.h b/src/cunumeric/set/unique.h deleted file mode 100644 index ab6ed17cd7..0000000000 --- a/src/cunumeric/set/unique.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#pragma once - -#include "cunumeric/cunumeric_task.h" - -namespace cunumeric { - -class UniqueTask : public CuNumericTask { - public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNIQUE}; - - public: - static void cpu_variant(legate::TaskContext context); -#if LEGATE_DEFINED(LEGATE_USE_OPENMP) - static void omp_variant(legate::TaskContext context); -#endif -#if LEGATE_DEFINED(LEGATE_USE_CUDA) - static void gpu_variant(legate::TaskContext context); -#endif -}; - -} // namespace cunumeric diff --git a/src/cunumeric.h b/src/cupynumeric.h similarity index 85% rename from src/cunumeric.h rename to src/cupynumeric.h index dfd752c834..fe598bd438 100644 --- a/src/cunumeric.h +++ b/src/cupynumeric.h @@ -14,6 +14,6 @@ * */ -#include "cunumeric/ndarray.h" -#include "cunumeric/operators.h" -#include "cunumeric/slice.h" +#include "cupynumeric/ndarray.h" +#include "cupynumeric/operators.h" +#include "cupynumeric/slice.h" diff --git a/src/cunumeric/arg.h b/src/cupynumeric/arg.h similarity index 96% rename from src/cunumeric/arg.h rename to src/cupynumeric/arg.h index 1dd91b12b1..70803223d4 100644 --- a/src/cunumeric/arg.h +++ b/src/cupynumeric/arg.h @@ -18,7 +18,7 @@ #include "legate.h" -namespace cunumeric { +namespace cupynumeric { template class Argval { @@ -95,6 +95,6 @@ class ArgminReduction { } }; -} // namespace cunumeric +} // namespace cupynumeric -#include "cunumeric/arg.inl" +#include "cupynumeric/arg.inl" diff --git a/src/cunumeric/arg.inl b/src/cupynumeric/arg.inl similarity index 98% rename from src/cunumeric/arg.inl rename to src/cupynumeric/arg.inl index 5c0ba9b689..995b516486 100644 --- a/src/cunumeric/arg.inl +++ b/src/cupynumeric/arg.inl @@ -19,7 +19,7 @@ // Useful for IDEs #include "arg.h" -namespace cunumeric { +namespace cupynumeric { template __CUDA_HD__ Argval::Argval(T v) : arg(LLONG_MIN), arg_value(v) @@ -143,4 +143,4 @@ DECLARE_IDENTITIES(uint64_t) #undef DECLARE_ARGMIN_IDENTITY #undef DECLARE_ARGMAX_IDENTITY -} // 
namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/arg_redop_register.cc b/src/cupynumeric/arg_redop_register.cc similarity index 89% rename from src/cunumeric/arg_redop_register.cc rename to src/cupynumeric/arg_redop_register.cc index 7fdf450ac3..2e7372bb06 100644 --- a/src/cunumeric/arg_redop_register.cc +++ b/src/cupynumeric/arg_redop_register.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/arg_redop_register.h" +#include "cupynumeric/arg_redop_register.h" #include -namespace cunumeric { +namespace cupynumeric { #define DEFINE_ARGMAX_IDENTITY(TYPE) \ template <> \ @@ -58,15 +58,15 @@ register_reduction_op_fn::register_reduction_op_fn::next_reduction_operator_id() return legate::LocalRedopID{next_redop_id++}; } -} // namespace cunumeric +} // namespace cupynumeric #if !LEGATE_DEFINED(LEGATE_USE_CUDA) extern "C" { -ReductionOpIds cunumeric_register_reduction_ops(int code) +ReductionOpIds cupynumeric_register_reduction_ops(int code) { return legate::type_dispatch(static_cast(code), - cunumeric::register_reduction_op_fn{}); + cupynumeric::register_reduction_op_fn{}); } } #endif diff --git a/src/cunumeric/arg_redop_register.cu b/src/cupynumeric/arg_redop_register.cu similarity index 79% rename from src/cunumeric/arg_redop_register.cu rename to src/cupynumeric/arg_redop_register.cu index 076d02a029..c48ed286a3 100644 --- a/src/cunumeric/arg_redop_register.cu +++ b/src/cupynumeric/arg_redop_register.cu @@ -14,13 +14,13 @@ * */ -#include "cunumeric/arg_redop_register.h" +#include "cupynumeric/arg_redop_register.h" extern "C" { -ReductionOpIds cunumeric_register_reduction_ops(int code) +ReductionOpIds cupynumeric_register_reduction_ops(int code) { return legate::type_dispatch(static_cast(code), - cunumeric::register_reduction_op_fn{}); + cupynumeric::register_reduction_op_fn{}); } } diff --git a/src/cunumeric/arg_redop_register.h b/src/cupynumeric/arg_redop_register.h similarity index 89% rename from src/cunumeric/arg_redop_register.h rename to 
src/cupynumeric/arg_redop_register.h index 05b764c8e0..68e6e65a63 100644 --- a/src/cunumeric/arg_redop_register.h +++ b/src/cupynumeric/arg_redop_register.h @@ -17,10 +17,10 @@ #pragma once #include "legate.h" -#include "cunumeric/cunumeric_c.h" -#include "cunumeric/arg.h" +#include "cupynumeric/cupynumeric_c.h" +#include "cupynumeric/arg.h" -namespace cunumeric { +namespace cupynumeric { struct register_reduction_op_fn { template ::value>* = nullptr> @@ -29,7 +29,7 @@ struct register_reduction_op_fn { using VAL = legate::type_of; ReductionOpIds result; auto runtime = legate::Runtime::get_runtime(); - auto context = runtime->find_library("cunumeric"); + auto context = runtime->find_library("cupynumeric"); result.argmax_redop_id = static_cast( context.register_reduction_operator>(next_reduction_operator_id())); result.argmin_redop_id = static_cast( @@ -47,4 +47,4 @@ struct register_reduction_op_fn { static legate::LocalRedopID next_reduction_operator_id(); }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op.cc b/src/cupynumeric/binary/binary_op.cc similarity index 87% rename from src/cunumeric/binary/binary_op.cc rename to src/cupynumeric/binary/binary_op.cc index 64a810d981..e8a271b729 100644 --- a/src/cunumeric/binary/binary_op.cc +++ b/src/cupynumeric/binary/binary_op.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/binary/binary_op.h" -#include "cunumeric/binary/binary_op_template.inl" +#include "cupynumeric/binary/binary_op.h" +#include "cupynumeric/binary/binary_op_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -60,7 +60,10 @@ struct BinaryOpImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { BinaryOpTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + BinaryOpTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git 
a/src/cunumeric/binary/binary_op.cu b/src/cupynumeric/binary/binary_op.cu similarity index 92% rename from src/cunumeric/binary/binary_op.cu rename to src/cupynumeric/binary/binary_op.cu index d00fa66e7d..ea7f68f4c9 100644 --- a/src/cunumeric/binary/binary_op.cu +++ b/src/cupynumeric/binary/binary_op.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/binary/binary_op.h" -#include "cunumeric/binary/binary_op_template.inl" +#include "cupynumeric/binary/binary_op.h" +#include "cupynumeric/binary/binary_op_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -82,7 +82,7 @@ struct BinaryOpImplBody { generic_kernel<<>>( volume, func, out, in1, in2, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -91,4 +91,4 @@ struct BinaryOpImplBody { binary_op_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op.h b/src/cupynumeric/binary/binary_op.h similarity index 77% rename from src/cunumeric/binary/binary_op.h rename to src/cupynumeric/binary/binary_op.h index 8bdf29d7d5..34ac087835 100644 --- a/src/cunumeric/binary/binary_op.h +++ b/src/cupynumeric/binary/binary_op.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/binary/binary_op_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/binary/binary_op_util.h" -namespace cunumeric { +namespace cupynumeric { struct BinaryOpArgs { legate::PhysicalStore in1; @@ -29,9 +29,10 @@ struct BinaryOpArgs { std::vector args; }; -class BinaryOpTask : public CuNumericTask { +class BinaryOpTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BINARY_OP}; + static inline const auto TASK_CONFIG = + 
legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BINARY_OP}}; public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +44,4 @@ class BinaryOpTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op_omp.cc b/src/cupynumeric/binary/binary_op_omp.cc similarity index 92% rename from src/cunumeric/binary/binary_op_omp.cc rename to src/cupynumeric/binary/binary_op_omp.cc index 684296a53a..9d4542d5f9 100644 --- a/src/cunumeric/binary/binary_op_omp.cc +++ b/src/cupynumeric/binary/binary_op_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/binary/binary_op.h" -#include "cunumeric/binary/binary_op_template.inl" +#include "cupynumeric/binary/binary_op.h" +#include "cupynumeric/binary/binary_op_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -60,4 +60,4 @@ struct BinaryOpImplBody { binary_op_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op_template.inl b/src/cupynumeric/binary/binary_op_template.inl similarity index 94% rename from src/cunumeric/binary/binary_op_template.inl rename to src/cupynumeric/binary/binary_op_template.inl index e3f5acbf44..01869a1922 100644 --- a/src/cunumeric/binary/binary_op_template.inl +++ b/src/cupynumeric/binary/binary_op_template.inl @@ -17,11 +17,11 @@ #pragma once // Useful for IDEs -#include "cunumeric/binary/binary_op.h" -#include "cunumeric/binary/binary_op_util.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/binary/binary_op.h" +#include "cupynumeric/binary/binary_op_util.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -94,4 +94,4 @@ static void binary_op_template(TaskContext& context) op_dispatch(args.op_code, BinaryOpDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op_util.cc 
b/src/cupynumeric/binary/binary_op_util.cc similarity index 90% rename from src/cunumeric/binary/binary_op_util.cc rename to src/cupynumeric/binary/binary_op_util.cc index 180c9d9c02..0f90b40b12 100644 --- a/src/cunumeric/binary/binary_op_util.cc +++ b/src/cupynumeric/binary/binary_op_util.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/binary/binary_op_util.h" +#include "cupynumeric/binary/binary_op_util.h" -namespace cunumeric { +namespace cupynumeric { std::vector broadcast_shapes(std::vector arrays) { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(!arrays.empty()); #endif int32_t dim = 0; @@ -46,4 +46,4 @@ std::vector broadcast_shapes(std::vector arrays) return result; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_op_util.h b/src/cupynumeric/binary/binary_op_util.h similarity index 94% rename from src/cunumeric/binary/binary_op_util.h rename to src/cupynumeric/binary/binary_op_util.h index 55189409ea..84c8a88cdb 100644 --- a/src/cunumeric/binary/binary_op_util.h +++ b/src/cupynumeric/binary/binary_op_util.h @@ -16,47 +16,47 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/ndarray.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/ndarray.h" -namespace cunumeric { +namespace cupynumeric { enum class BinaryOpCode : int { - ADD = CUNUMERIC_BINOP_ADD, - ARCTAN2 = CUNUMERIC_BINOP_ARCTAN2, - BITWISE_AND = CUNUMERIC_BINOP_BITWISE_AND, - BITWISE_OR = CUNUMERIC_BINOP_BITWISE_OR, - BITWISE_XOR = CUNUMERIC_BINOP_BITWISE_XOR, - COPYSIGN = CUNUMERIC_BINOP_COPYSIGN, - DIVIDE = CUNUMERIC_BINOP_DIVIDE, - EQUAL = CUNUMERIC_BINOP_EQUAL, - FLOAT_POWER = CUNUMERIC_BINOP_FLOAT_POWER, - FLOOR_DIVIDE = CUNUMERIC_BINOP_FLOOR_DIVIDE, - FMOD = CUNUMERIC_BINOP_FMOD, - GCD = CUNUMERIC_BINOP_GCD, - GREATER = CUNUMERIC_BINOP_GREATER, - GREATER_EQUAL = CUNUMERIC_BINOP_GREATER_EQUAL, - HYPOT = CUNUMERIC_BINOP_HYPOT, - ISCLOSE = CUNUMERIC_BINOP_ISCLOSE, - LCM = CUNUMERIC_BINOP_LCM, - 
LDEXP = CUNUMERIC_BINOP_LDEXP, - LEFT_SHIFT = CUNUMERIC_BINOP_LEFT_SHIFT, - LESS = CUNUMERIC_BINOP_LESS, - LESS_EQUAL = CUNUMERIC_BINOP_LESS_EQUAL, - LOGADDEXP = CUNUMERIC_BINOP_LOGADDEXP, - LOGADDEXP2 = CUNUMERIC_BINOP_LOGADDEXP2, - LOGICAL_AND = CUNUMERIC_BINOP_LOGICAL_AND, - LOGICAL_OR = CUNUMERIC_BINOP_LOGICAL_OR, - LOGICAL_XOR = CUNUMERIC_BINOP_LOGICAL_XOR, - MAXIMUM = CUNUMERIC_BINOP_MAXIMUM, - MINIMUM = CUNUMERIC_BINOP_MINIMUM, - MOD = CUNUMERIC_BINOP_MOD, - MULTIPLY = CUNUMERIC_BINOP_MULTIPLY, - NEXTAFTER = CUNUMERIC_BINOP_NEXTAFTER, - NOT_EQUAL = CUNUMERIC_BINOP_NOT_EQUAL, - POWER = CUNUMERIC_BINOP_POWER, - RIGHT_SHIFT = CUNUMERIC_BINOP_RIGHT_SHIFT, - SUBTRACT = CUNUMERIC_BINOP_SUBTRACT, + ADD = CUPYNUMERIC_BINOP_ADD, + ARCTAN2 = CUPYNUMERIC_BINOP_ARCTAN2, + BITWISE_AND = CUPYNUMERIC_BINOP_BITWISE_AND, + BITWISE_OR = CUPYNUMERIC_BINOP_BITWISE_OR, + BITWISE_XOR = CUPYNUMERIC_BINOP_BITWISE_XOR, + COPYSIGN = CUPYNUMERIC_BINOP_COPYSIGN, + DIVIDE = CUPYNUMERIC_BINOP_DIVIDE, + EQUAL = CUPYNUMERIC_BINOP_EQUAL, + FLOAT_POWER = CUPYNUMERIC_BINOP_FLOAT_POWER, + FLOOR_DIVIDE = CUPYNUMERIC_BINOP_FLOOR_DIVIDE, + FMOD = CUPYNUMERIC_BINOP_FMOD, + GCD = CUPYNUMERIC_BINOP_GCD, + GREATER = CUPYNUMERIC_BINOP_GREATER, + GREATER_EQUAL = CUPYNUMERIC_BINOP_GREATER_EQUAL, + HYPOT = CUPYNUMERIC_BINOP_HYPOT, + ISCLOSE = CUPYNUMERIC_BINOP_ISCLOSE, + LCM = CUPYNUMERIC_BINOP_LCM, + LDEXP = CUPYNUMERIC_BINOP_LDEXP, + LEFT_SHIFT = CUPYNUMERIC_BINOP_LEFT_SHIFT, + LESS = CUPYNUMERIC_BINOP_LESS, + LESS_EQUAL = CUPYNUMERIC_BINOP_LESS_EQUAL, + LOGADDEXP = CUPYNUMERIC_BINOP_LOGADDEXP, + LOGADDEXP2 = CUPYNUMERIC_BINOP_LOGADDEXP2, + LOGICAL_AND = CUPYNUMERIC_BINOP_LOGICAL_AND, + LOGICAL_OR = CUPYNUMERIC_BINOP_LOGICAL_OR, + LOGICAL_XOR = CUPYNUMERIC_BINOP_LOGICAL_XOR, + MAXIMUM = CUPYNUMERIC_BINOP_MAXIMUM, + MINIMUM = CUPYNUMERIC_BINOP_MINIMUM, + MOD = CUPYNUMERIC_BINOP_MOD, + MULTIPLY = CUPYNUMERIC_BINOP_MULTIPLY, + NEXTAFTER = CUPYNUMERIC_BINOP_NEXTAFTER, + NOT_EQUAL = 
CUPYNUMERIC_BINOP_NOT_EQUAL, + POWER = CUPYNUMERIC_BINOP_POWER, + RIGHT_SHIFT = CUPYNUMERIC_BINOP_RIGHT_SHIFT, + SUBTRACT = CUPYNUMERIC_BINOP_SUBTRACT, }; template @@ -913,4 +913,4 @@ using rhs2_of_binary_op = typename RHS2OfBinaryOp::type; std::vector broadcast_shapes(std::vector arrays); -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_red.cc b/src/cupynumeric/binary/binary_red.cc similarity index 89% rename from src/cunumeric/binary/binary_red.cc rename to src/cupynumeric/binary/binary_red.cc index 576347b37d..89ad585ccf 100644 --- a/src/cunumeric/binary/binary_red.cc +++ b/src/cupynumeric/binary/binary_red.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/binary/binary_red.h" -#include "cunumeric/binary/binary_red_template.inl" +#include "cupynumeric/binary/binary_red.h" +#include "cupynumeric/binary/binary_red_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -66,10 +66,10 @@ struct BinaryRedImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { BinaryRedTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_red.cu b/src/cupynumeric/binary/binary_red.cu similarity index 92% rename from src/cunumeric/binary/binary_red.cu rename to src/cupynumeric/binary/binary_red.cu index 4623e43bdc..47544a5ab4 100644 --- a/src/cunumeric/binary/binary_red.cu +++ b/src/cupynumeric/binary/binary_red.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/binary/binary_red.h" -#include "cunumeric/binary/binary_red_template.inl" +#include "cupynumeric/binary/binary_red.h" +#include "cupynumeric/binary/binary_red_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, 
MIN_CTAS_PER_SM) @@ -82,7 +82,7 @@ struct BinaryRedImplBody { } copy_kernel<<<1, 1, 0, stream>>>(result, out); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -91,4 +91,4 @@ struct BinaryRedImplBody { binary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_red.h b/src/cupynumeric/binary/binary_red.h similarity index 72% rename from src/cunumeric/binary/binary_red.h rename to src/cupynumeric/binary/binary_red.h index 28ca9f030f..906300e95b 100644 --- a/src/cunumeric/binary/binary_red.h +++ b/src/cupynumeric/binary/binary_red.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/binary/binary_op_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/binary/binary_op_util.h" -namespace cunumeric { +namespace cupynumeric { struct BinaryRedArgs { legate::PhysicalStore out; @@ -29,9 +29,12 @@ struct BinaryRedArgs { std::vector args; }; -class BinaryRedTask : public CuNumericTask { +class BinaryRedTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BINARY_RED}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BINARY_RED}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +46,4 @@ class BinaryRedTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_red_omp.cc b/src/cupynumeric/binary/binary_red_omp.cc similarity index 92% rename from src/cunumeric/binary/binary_red_omp.cc rename to src/cupynumeric/binary/binary_red_omp.cc index f3823c2031..021f99943b 100644 --- a/src/cunumeric/binary/binary_red_omp.cc +++ b/src/cupynumeric/binary/binary_red_omp.cc @@ -14,10 +14,10 @@ * */ -#include 
"cunumeric/binary/binary_red.h" -#include "cunumeric/binary/binary_red_template.inl" +#include "cupynumeric/binary/binary_red.h" +#include "cupynumeric/binary/binary_red_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -65,4 +65,4 @@ struct BinaryRedImplBody { binary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/binary/binary_red_template.inl b/src/cupynumeric/binary/binary_red_template.inl similarity index 94% rename from src/cunumeric/binary/binary_red_template.inl rename to src/cupynumeric/binary/binary_red_template.inl index e1971f5b45..15bdf9201f 100644 --- a/src/cunumeric/binary/binary_red_template.inl +++ b/src/cupynumeric/binary/binary_red_template.inl @@ -17,11 +17,11 @@ #pragma once // Useful for IDEs -#include "cunumeric/binary/binary_red.h" -#include "cunumeric/binary/binary_op_util.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/binary/binary_red.h" +#include "cupynumeric/binary/binary_op_util.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -98,4 +98,4 @@ static void binary_red_template(TaskContext& context) reduce_op_dispatch(args.op_code, BinaryRedDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/bits_util.h b/src/cupynumeric/bits/bits_util.h similarity index 79% rename from src/cunumeric/bits/bits_util.h rename to src/cupynumeric/bits/bits_util.h index bd3294f19f..3e8cd6d077 100644 --- a/src/cunumeric/bits/bits_util.h +++ b/src/cupynumeric/bits/bits_util.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_c.h" +#include "cupynumeric/cupynumeric_c.h" -namespace cunumeric { +namespace cupynumeric { enum class Bitorder { - BIG = CUNUMERIC_BITORDER_BIG, - LITTLE = CUNUMERIC_BITORDER_LITTLE, + BIG = CUPYNUMERIC_BITORDER_BIG, + LITTLE = CUPYNUMERIC_BITORDER_LITTLE, }; -} // namespace cunumeric +} // namespace 
cupynumeric diff --git a/src/cunumeric/bits/packbits.cc b/src/cupynumeric/bits/packbits.cc similarity index 88% rename from src/cunumeric/bits/packbits.cc rename to src/cupynumeric/bits/packbits.cc index 41b056c1d8..f563a5d4fe 100644 --- a/src/cunumeric/bits/packbits.cc +++ b/src/cupynumeric/bits/packbits.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/bits/packbits.h" -#include "cunumeric/bits/packbits_template.inl" +#include "cupynumeric/bits/packbits.h" +#include "cupynumeric/bits/packbits_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,7 +57,10 @@ struct PackbitsImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { PackbitsTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + PackbitsTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/packbits.cu b/src/cupynumeric/bits/packbits.cu similarity index 93% rename from src/cunumeric/bits/packbits.cu rename to src/cupynumeric/bits/packbits.cu index 541bed6e8a..2252275cfd 100644 --- a/src/cunumeric/bits/packbits.cu +++ b/src/cupynumeric/bits/packbits.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/bits/packbits.h" -#include "cunumeric/bits/packbits_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/bits/packbits.h" +#include "cupynumeric/bits/packbits_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -76,7 +76,7 @@ struct PackbitsImplBody { in_hi_axis, axis); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -85,4 +85,4 @@ struct PackbitsImplBody { packbits_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/packbits.h b/src/cupynumeric/bits/packbits.h similarity index 91% rename from src/cunumeric/bits/packbits.h 
rename to src/cupynumeric/bits/packbits.h index f24497fe73..6d32bef8a1 100644 --- a/src/cunumeric/bits/packbits.h +++ b/src/cupynumeric/bits/packbits.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/bits/bits_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/bits/bits_util.h" -namespace cunumeric { +namespace cupynumeric { template struct Pack; @@ -101,9 +101,10 @@ struct Pack { } }; -class PackbitsTask : public CuNumericTask { +class PackbitsTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_PACKBITS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_PACKBITS}}; public: static void cpu_variant(legate::TaskContext context); @@ -115,4 +116,4 @@ class PackbitsTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/packbits_omp.cc b/src/cupynumeric/bits/packbits_omp.cc similarity index 93% rename from src/cunumeric/bits/packbits_omp.cc rename to src/cupynumeric/bits/packbits_omp.cc index c4dd57dd8c..18b39c9a55 100644 --- a/src/cunumeric/bits/packbits_omp.cc +++ b/src/cupynumeric/bits/packbits_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/bits/packbits.h" -#include "cunumeric/bits/packbits_template.inl" +#include "cupynumeric/bits/packbits.h" +#include "cupynumeric/bits/packbits_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,4 +57,4 @@ struct PackbitsImplBody { packbits_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/packbits_template.inl b/src/cupynumeric/bits/packbits_template.inl similarity index 95% rename from src/cunumeric/bits/packbits_template.inl rename to src/cupynumeric/bits/packbits_template.inl index 9046b85410..6b84138f0b 100644 --- a/src/cunumeric/bits/packbits_template.inl +++ 
b/src/cupynumeric/bits/packbits_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/bits/packbits.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/bits/packbits.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -50,13 +50,13 @@ struct PackbitsImpl { auto aligned_rect = out_rect; int64_t axis_extent = in_rect.hi[axis] - in_rect.lo[axis] + 1; aligned_rect.hi[axis] = aligned_rect.lo[axis] + axis_extent / 8 - 1; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(aligned_rect.hi[axis] <= out_rect.hi[axis]); #endif auto unaligned_rect = out_rect; unaligned_rect.lo[axis] = aligned_rect.hi[axis] + 1; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(unaligned_rect.union_bbox(aligned_rect) == out_rect); #endif @@ -106,4 +106,4 @@ static void packbits_template(TaskContext& context) } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/unpackbits.cc b/src/cupynumeric/bits/unpackbits.cc similarity index 86% rename from src/cunumeric/bits/unpackbits.cc rename to src/cupynumeric/bits/unpackbits.cc index 15217c5e86..2be36a8287 100644 --- a/src/cunumeric/bits/unpackbits.cc +++ b/src/cupynumeric/bits/unpackbits.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/bits/unpackbits.h" -#include "cunumeric/bits/unpackbits_template.inl" +#include "cupynumeric/bits/unpackbits.h" +#include "cupynumeric/bits/unpackbits_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -45,10 +45,10 @@ struct UnpackbitsImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { UnpackbitsTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/unpackbits.cu b/src/cupynumeric/bits/unpackbits.cu similarity index 89% rename from 
src/cunumeric/bits/unpackbits.cu rename to src/cupynumeric/bits/unpackbits.cu index 71413618a6..f1b5b66890 100644 --- a/src/cunumeric/bits/unpackbits.cu +++ b/src/cupynumeric/bits/unpackbits.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/bits/unpackbits.h" -#include "cunumeric/bits/unpackbits_template.inl" +#include "cupynumeric/bits/unpackbits.h" +#include "cupynumeric/bits/unpackbits_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -55,7 +55,7 @@ struct UnpackbitsImplBody { const size_t blocks = (in_volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; generic_kernel<<>>( in_volume, unpack, out, in, in_pitches, in_rect.lo, axis); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -64,4 +64,4 @@ struct UnpackbitsImplBody { unpackbits_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/unpackbits.h b/src/cupynumeric/bits/unpackbits.h similarity index 86% rename from src/cunumeric/bits/unpackbits.h rename to src/cupynumeric/bits/unpackbits.h index 96b5d39e03..92061ae43b 100644 --- a/src/cunumeric/bits/unpackbits.h +++ b/src/cupynumeric/bits/unpackbits.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/bits/bits_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/bits/bits_util.h" -namespace cunumeric { +namespace cupynumeric { template struct Unpack; @@ -58,9 +58,10 @@ struct Unpack { } }; -class UnpackbitsTask : public CuNumericTask { +class UnpackbitsTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNPACKBITS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNPACKBITS}}; public: static void cpu_variant(legate::TaskContext context); @@ -72,4 +73,4 @@ class UnpackbitsTask : public CuNumericTask { 
#endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/unpackbits_omp.cc b/src/cupynumeric/bits/unpackbits_omp.cc similarity index 90% rename from src/cunumeric/bits/unpackbits_omp.cc rename to src/cupynumeric/bits/unpackbits_omp.cc index 02151be529..3f12a5355d 100644 --- a/src/cunumeric/bits/unpackbits_omp.cc +++ b/src/cupynumeric/bits/unpackbits_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/bits/unpackbits.h" -#include "cunumeric/bits/unpackbits_template.inl" +#include "cupynumeric/bits/unpackbits.h" +#include "cupynumeric/bits/unpackbits_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -44,4 +44,4 @@ struct UnpackbitsImplBody { unpackbits_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/bits/unpackbits_template.inl b/src/cupynumeric/bits/unpackbits_template.inl similarity index 94% rename from src/cunumeric/bits/unpackbits_template.inl rename to src/cupynumeric/bits/unpackbits_template.inl index 0763818c47..2a710b8c01 100644 --- a/src/cunumeric/bits/unpackbits_template.inl +++ b/src/cupynumeric/bits/unpackbits_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/bits/unpackbits.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/bits/unpackbits.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -77,4 +77,4 @@ static void unpackbits_template(TaskContext& context) } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/cephes/chbevl.cc b/src/cupynumeric/cephes/chbevl.cc similarity index 100% rename from src/cunumeric/cephes/chbevl.cc rename to src/cupynumeric/cephes/chbevl.cc diff --git a/src/cunumeric/cephes/i0.cc b/src/cupynumeric/cephes/i0.cc similarity index 100% rename from src/cunumeric/cephes/i0.cc rename to src/cupynumeric/cephes/i0.cc diff --git a/src/cunumeric/convolution/convolve.cc 
b/src/cupynumeric/convolution/convolve.cc similarity index 96% rename from src/cunumeric/convolution/convolve.cc rename to src/cupynumeric/convolution/convolve.cc index 653933507e..9335606175 100644 --- a/src/cunumeric/convolution/convolve.cc +++ b/src/cupynumeric/convolution/convolve.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/divmod.h" -#include "cunumeric/convolution/convolve.h" -#include "cunumeric/convolution/convolve_template.inl" +#include "cupynumeric/divmod.h" +#include "cupynumeric/convolution/convolve.h" +#include "cupynumeric/convolution/convolve_template.inl" -namespace cunumeric { +namespace cupynumeric { // This is the easy to understand functional specification of the // algorithm, but it is commented out in favor of the faster one @@ -82,7 +82,8 @@ struct ConvolveImplBody { AccessorRO in, const Rect& root_rect, const Rect& subrect, - const Rect& filter_rect) const + const Rect& filter_rect, + CuPyNumericConvolveMethod method) const { const Point one = Point::ONES(); Point extents = filter_rect.hi - filter_rect.lo + one; @@ -272,7 +273,10 @@ struct ConvolveImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ConvolveTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ConvolveTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/convolution/convolve.cu b/src/cupynumeric/convolution/convolve.cu similarity index 93% rename from src/cunumeric/convolution/convolve.cu rename to src/cupynumeric/convolution/convolve.cu index c2c271577a..6cdacd3b9f 100644 --- a/src/cunumeric/convolution/convolve.cu +++ b/src/cupynumeric/convolution/convolve.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/divmod.h" -#include "cunumeric/cuda_help.h" -#include "cunumeric/convolution/convolve.h" -#include "cunumeric/convolution/convolve_template.inl" +#include "cupynumeric/divmod.h" +#include 
"cupynumeric/cuda_help.h" +#include "cupynumeric/convolution/convolve.h" +#include "cupynumeric/convolution/convolve_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -744,7 +744,7 @@ __host__ static inline void launch_small_tile_kernel(AccessorWO out, out, filter, in, root_rect, subrect, filter_rect, args); } } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } template @@ -766,24 +766,24 @@ __host__ void direct_convolution(AccessorWO out, static unsigned long long mask = 0; if (!(mask & (1 << device))) { if (properties.sharedMemPerBlock < max_smem_size) { - CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_smem_size)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_smem_size)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_large_tile, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_smem_size)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_smem_size)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_smem_size)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_large_tile, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_smem_size)); } if (sizeof(VAL) >= 8) { // Only need to set this on the first invocation - CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1, - cudaSharedMemBankSizeEightByte)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2, - cudaSharedMemBankSizeEightByte)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_large_tile, - cudaSharedMemBankSizeEightByte)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1, + cudaSharedMemBankSizeEightByte)); + 
CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2, + cudaSharedMemBankSizeEightByte)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig( + convolution_large_tile, cudaSharedMemBankSizeEightByte)); } // Make sure we have enough bits for every device assert(device < (8 * sizeof(mask))); @@ -848,7 +848,7 @@ __host__ void direct_convolution(AccessorWO out, } if (out_dense) { size_t bytes = sizeof(VAL) * out_pitch; - CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(out_ptr, 0, bytes)); + CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(out_ptr, 0, bytes)); } else { out_pitch = 1; ConvolutionInitArgs args; @@ -1168,7 +1168,7 @@ __host__ void direct_convolution(AccessorWO out, one, args); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } } @@ -1299,7 +1299,8 @@ __host__ static inline void cufft_convolution(AccessorWO out, AccessorRO in, const Rect& root_rect, const Rect& subrect, - const Rect& filter_rect) + const Rect& filter_rect, + CuPyNumericConvolveMethod method) { int device = get_device_ordinal(); auto& properties = get_device_properties(); @@ -1310,19 +1311,19 @@ __host__ static inline void cufft_convolution(AccessorWO out, static unsigned long long mask = 0; if (!(mask & (1 << device))) { if (properties.sharedMemPerBlock < max_smem_size) { - CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_smem_size)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_smem_size)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile1, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_smem_size)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetAttribute(convolution_small_tile2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_smem_size)); } if (sizeof(VAL) >= 8) { // Only need to set this on the first invocation - CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1, - 
cudaSharedMemBankSizeEightByte)); - CUNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2, - cudaSharedMemBankSizeEightByte)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile1, + cudaSharedMemBankSizeEightByte)); + CUPYNUMERIC_CHECK_CUDA(cudaFuncSetSharedMemConfig(convolution_small_tile2, + cudaSharedMemBankSizeEightByte)); } // Make sure we have enough bits for every device assert(device < (8 * sizeof(mask))); @@ -1354,7 +1355,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, for (int d = 0; d < DIM; d++) { smem_size *= (tile[d] + 2 * centers[d]); } - if (smem_size <= max_smem_size) { + if (method != CUPYNUMERIC_CONVOLVE_FFT && smem_size <= max_smem_size) { launch_small_tile_kernel(out, filter, in, @@ -1405,7 +1406,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, // Zero pad and copy in the input data auto signal_buffer = create_buffer(buffersize, Memory::GPU_FB_MEM, 128 /*alignment*/); VAL* signal_ptr = signal_buffer.ptr(zero); - CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream)); // Check to see if the input pointer is dense and we can do this with a CUDA memcpy size_t strides[DIM]; const VAL* input_ptr = in.ptr(input_bounds, strides); @@ -1421,7 +1422,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, // Zero pad and copy in the filter data auto filter_buffer = create_buffer(buffersize, Memory::GPU_FB_MEM, 128 /*alignment*/); VAL* filter_ptr = filter_buffer.ptr(zero); - CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream)); const VAL* filt_ptr = filter.ptr(filter_rect, strides); pitch = 1; for (int d = DIM - 1; d >= 0; d--) { @@ -1432,7 +1433,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, 
copy_into_buffer<<>>( filter, filter_buffer, filter_rect.lo, copy_pitches, pitch); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); auto forward_plan = get_cufft_plan(ForwardPlanType::value, cufftPlanParams(fftsize)); auto backward_plan = get_cufft_plan(BackwardPlanType::value, cufftPlanParams(fftsize)); @@ -1455,7 +1456,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, // FFT the filter data cufft_execute_forward(forward_plan.handle(), filter_ptr, filter_ptr); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); // Perform the pointwise multiplcation { @@ -1492,13 +1493,13 @@ __host__ static inline void cufft_convolution(AccessorWO out, copy_from_buffer<<>>( filter_ptr, out, buffer_offset, subrect.lo, copy_pitches, fft_pitches, pitch, scaling_factor); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); #if 0 // This is useful debugging code for finding the output VAL *buffer = (VAL*)malloc(buffervolume*sizeof(VAL)); - CUNUMERIC_CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) ); - CUNUMERIC_CHECK_CUDA( cudaStreamSynchronize(stream) ); + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) ); + CUPYNUMERIC_CHECK_CUDA( cudaStreamSynchronize(stream) ); for (unsigned idx = 0; idx < buffervolume; idx++) { if ((idx % fftsize[DIM-1]) == 0) printf("\n"); @@ -1515,7 +1516,7 @@ __host__ static inline void cufft_convolution(AccessorWO out, ///////////// template -struct UseCUFFT { +struct CanUseCUFFT { static constexpr bool value = 1 <= DIM && DIM <= 3 && std::is_floating_point::value; }; @@ -1523,24 +1524,34 @@ template struct ConvolveImplBody { using VAL = type_of; - template ::value>* = nullptr> + template ::value>* = nullptr> __host__ void dispatch(AccessorWO<_VAL, _DIM> out, AccessorRO<_VAL, _DIM> filter, AccessorRO<_VAL, _DIM> in, const Rect<_DIM>& 
root_rect, const Rect<_DIM>& subrect, - const Rect<_DIM>& filter_rect) const + const Rect<_DIM>& filter_rect, + CuPyNumericConvolveMethod method) const { - cufft_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect); + if (method == CUPYNUMERIC_CONVOLVE_DIRECT) { + direct_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect); + } else { + cufft_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect, method); + } } - template ::value>* = nullptr> + template ::value>* = nullptr> __host__ void dispatch(AccessorWO<_VAL, _DIM> out, AccessorRO<_VAL, _DIM> filter, AccessorRO<_VAL, _DIM> in, const Rect<_DIM>& root_rect, const Rect<_DIM>& subrect, - const Rect<_DIM>& filter_rect) const + const Rect<_DIM>& filter_rect, + CuPyNumericConvolveMethod method) const { direct_convolution<_VAL, _DIM>(out, filter, in, root_rect, subrect, filter_rect); } @@ -1550,9 +1561,10 @@ struct ConvolveImplBody { AccessorRO in, const Rect& root_rect, const Rect& subrect, - const Rect& filter_rect) const + const Rect& filter_rect, + CuPyNumericConvolveMethod method) const { - dispatch(out, filter, in, root_rect, subrect, filter_rect); + dispatch(out, filter, in, root_rect, subrect, filter_rect, method); } }; @@ -1561,4 +1573,4 @@ struct ConvolveImplBody { convolve_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/convolution/convolve.h b/src/cupynumeric/convolution/convolve.h similarity index 77% rename from src/cunumeric/convolution/convolve.h rename to src/cupynumeric/convolution/convolve.h index e20fe16031..1e2707dd45 100644 --- a/src/cunumeric/convolution/convolve.h +++ b/src/cupynumeric/convolution/convolve.h @@ -16,7 +16,7 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" // We'll make some assumptions here about cache size // that should hold up against most CPUs out there today @@ -27,18 +27,22 @@ // Most caches have 64B lines 
#define CACHE_LINE_SIZE 64 -namespace cunumeric { +namespace cupynumeric { struct ConvolveArgs { legate::PhysicalStore out{nullptr}; legate::PhysicalStore filter{nullptr}; std::vector inputs; legate::Domain root_domain; + CuPyNumericConvolveMethod method; }; -class ConvolveTask : public CuNumericTask { +class ConvolveTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_CONVOLVE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_CONVOLVE}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -50,4 +54,4 @@ class ConvolveTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/convolution/convolve_omp.cc b/src/cupynumeric/convolution/convolve_omp.cc similarity index 97% rename from src/cunumeric/convolution/convolve_omp.cc rename to src/cupynumeric/convolution/convolve_omp.cc index 6bb80383ba..f927edfb24 100644 --- a/src/cunumeric/convolution/convolve_omp.cc +++ b/src/cupynumeric/convolution/convolve_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/divmod.h" -#include "cunumeric/convolution/convolve.h" -#include "cunumeric/convolution/convolve_template.inl" +#include "cupynumeric/divmod.h" +#include "cupynumeric/convolution/convolve.h" +#include "cupynumeric/convolution/convolve_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -33,7 +33,8 @@ struct ConvolveImplBody { AccessorRO in, const Rect& root_rect, const Rect& subrect, - const Rect& filter_rect) const + const Rect& filter_rect, + CuPyNumericConvolveMethod method) const { const Point one = Point::ONES(); Point extents = filter_rect.hi - filter_rect.lo + one; @@ -238,4 +239,4 @@ struct ConvolveImplBody { convolve_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff 
--git a/src/cunumeric/convolution/convolve_template.inl b/src/cupynumeric/convolution/convolve_template.inl similarity index 97% rename from src/cunumeric/convolution/convolve_template.inl rename to src/cupynumeric/convolution/convolve_template.inl index dd54c0b722..ce7adf1041 100644 --- a/src/cunumeric/convolution/convolve_template.inl +++ b/src/cupynumeric/convolution/convolve_template.inl @@ -17,12 +17,12 @@ #pragma once // Useful for IDEs -#include "cunumeric/convolution/convolve.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/convolution/convolve.h" +#include "cupynumeric/pitches.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -54,7 +54,8 @@ struct ConvolveImpl { auto input = args.inputs[0].read_accessor(input_subrect); Rect root_rect(args.root_domain); - ConvolveImplBody()(out, filter, input, root_rect, subrect, filter_rect); + ConvolveImplBody()( + out, filter, input, root_rect, subrect, filter_rect, args.method); } template * = nullptr> @@ -85,6 +86,8 @@ static void convolve_template(TaskContext& context) args.root_domain.rect_data[dim + shape.dim] = shape[dim] - 1; } + args.method = static_cast(context.scalar(1).value()); + double_dispatch(args.out.dim(), args.out.code(), ConvolveImpl{}, args); } @@ -392,4 +395,4 @@ static unsigned roundup_tile(Point& tile, } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/cuda_help.h b/src/cupynumeric/cuda_help.h similarity index 66% rename from src/cunumeric/cuda_help.h rename to src/cupynumeric/cuda_help.h index 1be68ffdb4..dd81f60224 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cupynumeric/cuda_help.h @@ -23,10 +23,10 @@ #endif #include "legate/cuda/stream_pool.h" -#include "cunumeric/arg.h" +#include "cupynumeric/arg.h" #include #include -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) #include #include #endif @@ -42,7 +42,7 @@ #define COOPERATIVE_THREADS 256 #define 
COOPERATIVE_CTAS_PER_SM 4 -namespace cunumeric { +namespace cupynumeric { __host__ inline void check_cuda(cudaError_t error, const char* file, int line) { @@ -53,7 +53,7 @@ __host__ inline void check_cuda(cudaError_t error, const char* file, int line) cudaGetErrorName(error), file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(error); @@ -69,7 +69,7 @@ __host__ inline void check_cublas(cublasStatus_t status, const char* file, int l status, file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(status); @@ -85,7 +85,7 @@ __host__ inline void check_cufft(cufftResult result, const char* file, int line) result, file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(result); @@ -101,7 +101,7 @@ __host__ inline void check_cusolver(cusolverStatus_t status, const char* file, i status, file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(status); @@ -109,7 +109,7 @@ __host__ inline void check_cusolver(cusolverStatus_t status, const char* file, i } } -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) __host__ inline void check_cal(calError_t status, const char* file, int line) { if (status != CAL_OK) { @@ -118,7 +118,7 @@ __host__ inline void check_cal(calError_t status, const char* file, int line) status, file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(status); @@ -136,7 +136,7 @@ __host__ inline void check_cutensor(cutensorStatus_t result, const char* file, i result, file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(result); @@ -152,7 +152,7 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) ncclGetErrorString(error), file, line); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(false); #else exit(error); @@ -160,60 +160,60 @@ __host__ inline void check_nccl(ncclResult_t 
error, const char* file, int line) } } -} // namespace cunumeric +} // namespace cupynumeric -#define CHECK_CUBLAS(expr) \ - do { \ - cublasStatus_t __result__ = (expr); \ - cunumeric::check_cublas(__result__, __FILE__, __LINE__); \ +#define CHECK_CUBLAS(expr) \ + do { \ + cublasStatus_t __result__ = (expr); \ + cupynumeric::check_cublas(__result__, __FILE__, __LINE__); \ } while (false) -#define CHECK_CUFFT(expr) \ - do { \ - cufftResult __result__ = (expr); \ - cunumeric::check_cufft(__result__, __FILE__, __LINE__); \ +#define CHECK_CUFFT(expr) \ + do { \ + cufftResult __result__ = (expr); \ + cupynumeric::check_cufft(__result__, __FILE__, __LINE__); \ } while (false) -#define CHECK_CUSOLVER(expr) \ - do { \ - cusolverStatus_t __result__ = (expr); \ - cunumeric::check_cusolver(__result__, __FILE__, __LINE__); \ +#define CHECK_CUSOLVER(expr) \ + do { \ + cusolverStatus_t __result__ = (expr); \ + cupynumeric::check_cusolver(__result__, __FILE__, __LINE__); \ } while (false) -#define CHECK_CAL(expr) \ - do { \ - calError_t __result__ = (expr); \ - cunumeric::check_cal(__result__, __FILE__, __LINE__); \ +#define CHECK_CAL(expr) \ + do { \ + calError_t __result__ = (expr); \ + cupynumeric::check_cal(__result__, __FILE__, __LINE__); \ } while (false) -#define CHECK_CUTENSOR(expr) \ - do { \ - cutensorStatus_t __result__ = (expr); \ - cunumeric::check_cutensor(__result__, __FILE__, __LINE__); \ +#define CHECK_CUTENSOR(expr) \ + do { \ + cutensorStatus_t __result__ = (expr); \ + cupynumeric::check_cutensor(__result__, __FILE__, __LINE__); \ } while (false) -#define CHECK_NCCL(...) \ - do { \ - ncclResult_t __result__ = (__VA_ARGS__); \ - cunumeric::check_nccl(__result__, __FILE__, __LINE__); \ +#define CHECK_NCCL(...) \ + do { \ + ncclResult_t __result__ = (__VA_ARGS__); \ + cupynumeric::check_nccl(__result__, __FILE__, __LINE__); \ } while (false) -#define CUNUMERIC_CHECK_CUDA(...) 
\ - do { \ - cudaError_t __result__ = (__VA_ARGS__); \ - cunumeric::check_cuda(__result__, __FILE__, __LINE__); \ +#define CUPYNUMERIC_CHECK_CUDA(...) \ + do { \ + cudaError_t __result__ = (__VA_ARGS__); \ + cupynumeric::check_cuda(__result__, __FILE__, __LINE__); \ } while (false) -#ifdef DEBUG_CUNUMERIC -#define CUNUMERIC_CHECK_CUDA_STREAM(stream) \ - do { \ - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); \ - CUNUMERIC_CHECK_CUDA(cudaPeekAtLastError()); \ +#ifdef DEBUG_CUPYNUMERIC +#define CUPYNUMERIC_CHECK_CUDA_STREAM(stream) \ + do { \ + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); \ + CUPYNUMERIC_CHECK_CUDA(cudaPeekAtLastError()); \ } while (false) #else -#define CUNUMERIC_CHECK_CUDA_STREAM(stream) \ - do { \ - CUNUMERIC_CHECK_CUDA(cudaPeekAtLastError()); \ +#define CUPYNUMERIC_CHECK_CUDA_STREAM(stream) \ + do { \ + CUPYNUMERIC_CHECK_CUDA(cudaPeekAtLastError()); \ } while (false) #endif @@ -224,10 +224,10 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif -// Must go here since it depends on CUNUMERIC_CHECK_CUDA(), which is defined in this header... -#include "cunumeric/device_scalar_reduction_buffer.h" +// Must go here since it depends on CUPYNUMERIC_CHECK_CUDA(), which is defined in this header... 
+#include "cupynumeric/device_scalar_reduction_buffer.h" -namespace cunumeric { +namespace cupynumeric { template struct cudaTypeToDataType; @@ -311,6 +311,69 @@ struct cufftPlanParams { std::string to_string() const; }; +typedef cusolverStatus_t (*cusolverDnXgeev_bufferSize_handle)(cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobvl, + cusolverEigMode_t jobvr, + int64_t n, + cudaDataType dataTypeA, + const void* A, + int64_t lda, + cudaDataType dataTypeW, + const void* W, + cudaDataType dataTypeVL, + const void* VL, + int64_t ldvl, + cudaDataType dataTypeVR, + const void* VR, + int64_t ldvr, + cudaDataType computeType, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost); + +typedef cusolverStatus_t (*cusolverDnXgeev_handle)(cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobvl, + cusolverEigMode_t jobvr, + int64_t n, + cudaDataType dataTypeA, + void* A, + int64_t lda, + cudaDataType dataTypeW, + void* W, + cudaDataType dataTypeVL, + void* VL, + int64_t ldvl, + cudaDataType dataTypeVR, + void* VR, + int64_t ldvr, + cudaDataType computeType, + void* bufferOnDevice, + size_t workspaceInBytesOnDevice, + void* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info); + +struct CuSolverExtraSymbols { + private: + void* cusolver_lib; + + public: + // geev support (since 12.6) + cusolverDnXgeev_bufferSize_handle cusolver_geev_bufferSize; + cusolverDnXgeev_handle cusolver_geev; + bool has_geev; + + CuSolverExtraSymbols(); + ~CuSolverExtraSymbols(); + + // Prevent copying and overwriting + CuSolverExtraSymbols(const CuSolverExtraSymbols& rhs) = delete; + CuSolverExtraSymbols& operator=(const CuSolverExtraSymbols& rhs) = delete; + + void finalize(); +}; + // Defined in cudalibs.cu // Return a cached stream for the current GPU @@ -319,7 +382,8 @@ int get_device_ordinal(); const cudaDeviceProp& get_device_properties(); cublasHandle_t get_cublas(); cusolverDnHandle_t get_cusolver(); -#if 
LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +CuSolverExtraSymbols* get_cusolver_extra_symbols(); +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolverMpHandle_t get_cusolvermp(); #endif [[nodiscard]] const cutensorHandle_t& get_cutensor(); @@ -521,4 +585,4 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/cudalibs.cu b/src/cupynumeric/cudalibs.cu similarity index 83% rename from src/cunumeric/cudalibs.cu rename to src/cupynumeric/cudalibs.cu index 3b09f495aa..d575c61491 100644 --- a/src/cunumeric/cudalibs.cu +++ b/src/cupynumeric/cudalibs.cu @@ -14,18 +14,19 @@ * */ -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/random/bitgenerator.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/random/bitgenerator.h" #include "cudalibs.h" +#include #include using namespace legate; -namespace cunumeric { +namespace cupynumeric { -static Logger log_cudalibs("cunumeric.cudalibs"); +static Logger log_cudalibs("cupynumeric.cudalibs"); cufftContext::cufftContext(cufftPlan* plan) : plan_(plan) {} @@ -264,15 +265,50 @@ cufftPlan* cufftPlanCache::get_cufft_plan(const cufftPlanParams& params) } entry.lru_index = 0; } + auto stream = get_cached_stream(); + CHECK_CUFFT(cufftSetStream(result->handle, stream)); } return result; } +CuSolverExtraSymbols::CuSolverExtraSymbols() +{ + cusolver_lib = dlopen("libcusolver.so", RTLD_LAZY | RTLD_DEEPBIND); + void* fn1 = dlsym(cusolver_lib, "cusolverDnXgeev_bufferSize"); + if (fn1 == nullptr) { + dlerror(); + } else { + cusolver_geev_bufferSize = (cusolverDnXgeev_bufferSize_handle)fn1; + has_geev = true; + } + + void* fn2 = dlsym(cusolver_lib, "cusolverDnXgeev"); + if (fn2 == nullptr) { + has_geev = false; + cusolver_geev_bufferSize = nullptr; + dlerror(); + } else { + cusolver_geev = (cusolverDnXgeev_handle)fn2; + } +} + +void 
CuSolverExtraSymbols::finalize() +{ + cusolver_geev = nullptr; + cusolver_geev_bufferSize = nullptr; + has_geev = false; + if (cusolver_lib != nullptr) { + dlclose(cusolver_lib); + } +} + +CuSolverExtraSymbols::~CuSolverExtraSymbols() { finalize(); } + CUDALibraries::CUDALibraries() : finalized_(false), cublas_(nullptr), cusolver_(nullptr), -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolvermp_(nullptr), #endif plan_caches_() @@ -292,7 +328,8 @@ void CUDALibraries::finalize() if (cusolver_ != nullptr) { finalize_cusolver(); } -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) + +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) if (cusolvermp_ != nullptr) { finalize_cusolvermp(); } @@ -318,7 +355,7 @@ void CUDALibraries::finalize_cusolver() cusolver_ = nullptr; } -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) void CUDALibraries::finalize_cusolvermp() { CHECK_CUSOLVER(cusolverMpDestroy(cusolvermp_)); @@ -340,7 +377,7 @@ int CUDALibraries::get_device_ordinal() return *ordinal_; } int ordinal{-1}; - CUNUMERIC_CHECK_CUDA(cudaGetDevice(&ordinal)); + CUPYNUMERIC_CHECK_CUDA(cudaGetDevice(&ordinal)); ordinal_ = ordinal; return ordinal; } @@ -351,7 +388,7 @@ const cudaDeviceProp& CUDALibraries::get_device_properties() return *device_prop_; } device_prop_ = std::make_unique(); - CUNUMERIC_CHECK_CUDA(cudaGetDeviceProperties(device_prop_.get(), get_device_ordinal())); + CUPYNUMERIC_CHECK_CUDA(cudaGetDeviceProperties(device_prop_.get(), get_device_ordinal())); return *device_prop_; } @@ -359,7 +396,7 @@ cublasHandle_t CUDALibraries::get_cublas() { if (nullptr == cublas_) { CHECK_CUBLAS(cublasCreate(&cublas_)); - const char* fast_math = getenv("CUNUMERIC_FAST_MATH"); + const char* fast_math = getenv("CUPYNUMERIC_FAST_MATH"); if (fast_math != nullptr && atoi(fast_math) > 0) { // Enable acceleration of single precision routines using TF32 tensor cores. 
cublasStatus_t status = cublasSetMathMode(cublas_, CUBLAS_TF32_TENSOR_OP_MATH); @@ -379,12 +416,12 @@ cusolverDnHandle_t CUDALibraries::get_cusolver() return cusolver_; } -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolverMpHandle_t CUDALibraries::get_cusolvermp() { if (nullptr == cusolvermp_) { int device = -1; - CUNUMERIC_CHECK_CUDA(cudaGetDevice(&device)); + CUPYNUMERIC_CHECK_CUDA(cudaGetDevice(&device)); CHECK_CUSOLVER(cusolverMpCreate(&cusolvermp_, device, get_cached_stream())); } return cusolvermp_; @@ -443,7 +480,19 @@ cusolverDnContext* get_cusolver() return lib.get_cusolver(); } -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +static CuSolverExtraSymbols& static_cusolver_extra_symbols() +{ + static CuSolverExtraSymbols cusolver_extra_symbols; + return cusolver_extra_symbols; +} + +CuSolverExtraSymbols* get_cusolver_extra_symbols() +{ + auto& symbols = static_cusolver_extra_symbols(); + return &symbols; +} + +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolverMpHandle* get_cusolvermp() { const auto proc = legate::Processor::get_executing_processor(); @@ -480,9 +529,10 @@ int get_device_ordinal() return lib.get_device_ordinal(); } -class LoadCUDALibsTask : public CuNumericTask { +class LoadCUDALibsTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_LOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_LOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext context) @@ -491,16 +541,18 @@ class LoadCUDALibsTask : public CuNumericTask { auto& lib = get_cuda_libraries(proc); lib.get_cublas(); lib.get_cusolver(); -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) + auto* extra = get_cusolver_extra_symbols(); +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) lib.get_cusolvermp(); #endif static_cast(lib.get_cutensor()); } }; -class UnloadCUDALibsTask : public CuNumericTask { +class UnloadCUDALibsTask 
: public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNLOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNLOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext context) @@ -508,14 +560,21 @@ class UnloadCUDALibsTask : public CuNumericTask { const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.finalize(); + auto* extra = get_cusolver_extra_symbols(); + extra->finalize(); destroy_bitgenerator(proc); } }; -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { LoadCUDALibsTask::register_variants(); UnloadCUDALibsTask::register_variants(); -} + return 0; +}(); -} // namespace cunumeric +} // namespace cupynumeric + +extern "C" { + +bool cupynumeric_cusolver_has_geev() { return cupynumeric::get_cusolver_extra_symbols()->has_geev; } +} diff --git a/src/cunumeric/cudalibs.h b/src/cupynumeric/cudalibs.h similarity index 89% rename from src/cunumeric/cudalibs.h rename to src/cupynumeric/cudalibs.h index 6996815110..a0718b8500 100644 --- a/src/cunumeric/cudalibs.h +++ b/src/cupynumeric/cudalibs.h @@ -18,7 +18,7 @@ #include "cuda_help.h" -namespace cunumeric { +namespace cupynumeric { struct cufftPlanCache; @@ -38,7 +38,7 @@ struct CUDALibraries { const cudaDeviceProp& get_device_properties(); cublasHandle_t get_cublas(); cusolverDnHandle_t get_cusolver(); -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolverMpHandle_t get_cusolvermp(); #endif [[nodiscard]] const cutensorHandle_t& get_cutensor(); @@ -47,7 +47,7 @@ struct CUDALibraries { private: void finalize_cublas(); void finalize_cusolver(); -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) void finalize_cusolvermp(); #endif void finalize_cutensor(); @@ -58,11 +58,12 @@ struct CUDALibraries { 
std::unique_ptr device_prop_{}; cublasContext* cublas_; cusolverDnContext* cusolver_; -#if LEGATE_DEFINED(CUNUMERIC_USE_CUSOLVERMP) + +#if LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP) cusolverMpHandle* cusolvermp_; #endif std::optional cutensor_{}; std::map plan_caches_; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/cupynumeric.cc b/src/cupynumeric/cupynumeric.cc new file mode 100644 index 0000000000..09661b77a0 --- /dev/null +++ b/src/cupynumeric/cupynumeric.cc @@ -0,0 +1,90 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cupynumeric/cupynumeric_c.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/mapper.h" +#include "cupynumeric/runtime.h" +#include "cupynumeric/unary/unary_red_util.h" + +using namespace legate; + +namespace cupynumeric { + +static const char* const cupynumeric_library_name = "cupynumeric"; + +/*static*/ TaskRegistrar& CuPyNumericRegistrar::get_registrar() +{ + static TaskRegistrar registrar; + return registrar; +} + +void unload_cudalibs() noexcept +{ + auto machine = legate::get_machine(); + + auto num_gpus = machine.count(legate::mapping::TaskTarget::GPU); + if (0 == num_gpus) { + return; + } + + auto runtime = legate::Runtime::get_runtime(); + auto library = runtime->find_library(cupynumeric_library_name); + + // Issue an execution fence so all outstanding tasks are done before we start destroying handles + runtime->issue_execution_fence(); + + runtime->submit( + runtime->create_task(library, + legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_UNLOAD_CUDALIBS}, + legate::tuple{num_gpus})); +} + +void registration_callback() +{ + ResourceConfig config; + config.max_tasks = CUPYNUMERIC_MAX_TASKS; + config.max_reduction_ops = CUPYNUMERIC_MAX_REDOPS; + + auto runtime = legate::Runtime::get_runtime(); + constexpr auto options = legate::VariantOptions{}.with_has_allocations(false); + auto library = runtime->create_library( + cupynumeric_library_name, + config, + std::make_unique(), + {{LEGATE_CPU_VARIANT, options}, {LEGATE_GPU_VARIANT, options}, {LEGATE_OMP_VARIANT, options}}); + + CuPyNumericRegistrar::get_registrar().register_all_tasks(library); + CuPyNumericRuntime::initialize(runtime, library); + + legate::register_shutdown_callback(unload_cudalibs); +} + +} // namespace cupynumeric + +extern "C" { + +void cupynumeric_perform_registration(void) { cupynumeric::registration_callback(); } + +bool cupynumeric_has_cusolvermp() +{ + return LEGATE_DEFINED(LEGATE_USE_CUDA) && LEGATE_DEFINED(CUPYNUMERIC_USE_CUSOLVERMP); +} + +#if 
!LEGATE_DEFINED(LEGATE_USE_CUDA) +bool cupynumeric_cusolver_has_geev() { return false; } +#endif +} diff --git a/src/cunumeric/cunumeric.cu b/src/cupynumeric/cupynumeric.cu similarity index 95% rename from src/cunumeric/cunumeric.cu rename to src/cupynumeric/cupynumeric.cu index 1c44e57b59..59209290da 100644 --- a/src/cunumeric/cunumeric.cu +++ b/src/cupynumeric/cupynumeric.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric_task.h" +#include "cupynumeric_task.h" #include "arg.h" #include "arg.inl" -namespace cunumeric { +namespace cupynumeric { #define REGISTER_REDOPS(OP) \ { \ @@ -42,4 +42,4 @@ void register_reduction_operators(legate::LibraryContext& context) REGISTER_REDOPS(ArgminReduction); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/cupynumeric_c.h b/src/cupynumeric/cupynumeric_c.h new file mode 100644 index 0000000000..4c24685d66 --- /dev/null +++ b/src/cupynumeric/cupynumeric_c.h @@ -0,0 +1,358 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#ifndef __CUPYNUMERIC_C_H__ +#define __CUPYNUMERIC_C_H__ + +// Match these to CuPyNumericOpCode in config.py +// Also, sort these alphabetically except the first one for easy lookup later +enum CuPyNumericOpCode { + _CUPYNUMERIC_OP_CODE_BASE = 0, + CUPYNUMERIC_ADVANCED_INDEXING, + CUPYNUMERIC_ARANGE, + CUPYNUMERIC_ARGWHERE, + CUPYNUMERIC_BATCHED_CHOLESKY, + CUPYNUMERIC_BINARY_OP, + CUPYNUMERIC_BINARY_RED, + CUPYNUMERIC_BINCOUNT, + CUPYNUMERIC_BITGENERATOR, + CUPYNUMERIC_CHOOSE, + CUPYNUMERIC_CONTRACT, + CUPYNUMERIC_CONVERT, + CUPYNUMERIC_CONVOLVE, + CUPYNUMERIC_SCAN_GLOBAL, + CUPYNUMERIC_SCAN_LOCAL, + CUPYNUMERIC_DIAG, + CUPYNUMERIC_DOT, + CUPYNUMERIC_EYE, + CUPYNUMERIC_FFT, + CUPYNUMERIC_FILL, + CUPYNUMERIC_FLIP, + CUPYNUMERIC_GEEV, + CUPYNUMERIC_GEMM, + CUPYNUMERIC_HISTOGRAM, + CUPYNUMERIC_LOAD_CUDALIBS, + CUPYNUMERIC_MATMUL, + CUPYNUMERIC_MATVECMUL, + CUPYNUMERIC_MP_POTRF, + CUPYNUMERIC_MP_SOLVE, + CUPYNUMERIC_NONZERO, + CUPYNUMERIC_PACKBITS, + CUPYNUMERIC_POTRF, + CUPYNUMERIC_PUTMASK, + CUPYNUMERIC_QR, + CUPYNUMERIC_RAND, + CUPYNUMERIC_READ, + CUPYNUMERIC_REPEAT, + CUPYNUMERIC_SCALAR_UNARY_RED, + CUPYNUMERIC_SEARCHSORTED, + CUPYNUMERIC_SELECT, + CUPYNUMERIC_SOLVE, + CUPYNUMERIC_SORT, + CUPYNUMERIC_SVD, + CUPYNUMERIC_SYRK, + CUPYNUMERIC_TILE, + CUPYNUMERIC_TRANSPOSE_COPY_2D, + CUPYNUMERIC_TRILU, + CUPYNUMERIC_TRSM, + CUPYNUMERIC_UNARY_OP, + CUPYNUMERIC_UNARY_RED, + CUPYNUMERIC_UNIQUE, + CUPYNUMERIC_UNIQUE_REDUCE, + CUPYNUMERIC_UNLOAD_CUDALIBS, + CUPYNUMERIC_UNPACKBITS, + CUPYNUMERIC_WHERE, + CUPYNUMERIC_WINDOW, + CUPYNUMERIC_WRAP, + CUPYNUMERIC_WRITE, + CUPYNUMERIC_ZIP, +}; + +// Match these to UnaryOpCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericUnaryOpCode { + CUPYNUMERIC_UOP_ABSOLUTE = 1, + CUPYNUMERIC_UOP_ANGLE, + CUPYNUMERIC_UOP_ARCCOS, + CUPYNUMERIC_UOP_ARCCOSH, + CUPYNUMERIC_UOP_ARCSIN, + CUPYNUMERIC_UOP_ARCSINH, + CUPYNUMERIC_UOP_ARCTAN, + CUPYNUMERIC_UOP_ARCTANH, + CUPYNUMERIC_UOP_CBRT, + 
CUPYNUMERIC_UOP_CEIL, + CUPYNUMERIC_UOP_CLIP, + CUPYNUMERIC_UOP_CONJ, + CUPYNUMERIC_UOP_COPY, + CUPYNUMERIC_UOP_COS, + CUPYNUMERIC_UOP_COSH, + CUPYNUMERIC_UOP_DEG2RAD, + CUPYNUMERIC_UOP_EXP, + CUPYNUMERIC_UOP_EXP2, + CUPYNUMERIC_UOP_EXPM1, + CUPYNUMERIC_UOP_FLOOR, + CUPYNUMERIC_UOP_FREXP, + CUPYNUMERIC_UOP_GETARG, + CUPYNUMERIC_UOP_IMAG, + CUPYNUMERIC_UOP_INVERT, + CUPYNUMERIC_UOP_ISFINITE, + CUPYNUMERIC_UOP_ISINF, + CUPYNUMERIC_UOP_ISNAN, + CUPYNUMERIC_UOP_LOG, + CUPYNUMERIC_UOP_LOG10, + CUPYNUMERIC_UOP_LOG1P, + CUPYNUMERIC_UOP_LOG2, + CUPYNUMERIC_UOP_LOGICAL_NOT, + CUPYNUMERIC_UOP_MODF, + CUPYNUMERIC_UOP_NEGATIVE, + CUPYNUMERIC_UOP_POSITIVE, + CUPYNUMERIC_UOP_RAD2DEG, + CUPYNUMERIC_UOP_REAL, + CUPYNUMERIC_UOP_RECIPROCAL, + CUPYNUMERIC_UOP_RINT, + CUPYNUMERIC_UOP_ROUND, + CUPYNUMERIC_UOP_SIGN, + CUPYNUMERIC_UOP_SIGNBIT, + CUPYNUMERIC_UOP_SIN, + CUPYNUMERIC_UOP_SINH, + CUPYNUMERIC_UOP_SQRT, + CUPYNUMERIC_UOP_SQUARE, + CUPYNUMERIC_UOP_TAN, + CUPYNUMERIC_UOP_TANH, + CUPYNUMERIC_UOP_TRUNC, +}; + +// Match these to UnaryRedCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericUnaryRedCode { + CUPYNUMERIC_RED_ALL = 1, + CUPYNUMERIC_RED_ANY, + CUPYNUMERIC_RED_ARGMAX, + CUPYNUMERIC_RED_ARGMIN, + CUPYNUMERIC_RED_CONTAINS, + CUPYNUMERIC_RED_COUNT_NONZERO, + CUPYNUMERIC_RED_MAX, + CUPYNUMERIC_RED_MIN, + CUPYNUMERIC_RED_NANARGMAX, + CUPYNUMERIC_RED_NANARGMIN, + CUPYNUMERIC_RED_NANMAX, + CUPYNUMERIC_RED_NANMIN, + CUPYNUMERIC_RED_NANPROD, + CUPYNUMERIC_RED_NANSUM, + CUPYNUMERIC_RED_PROD, + CUPYNUMERIC_RED_SUM, + CUPYNUMERIC_RED_SUM_SQUARES, + CUPYNUMERIC_RED_VARIANCE +}; + +// Match these to BinaryOpCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericBinaryOpCode { + CUPYNUMERIC_BINOP_ADD = 1, + CUPYNUMERIC_BINOP_ARCTAN2, + CUPYNUMERIC_BINOP_BITWISE_AND, + CUPYNUMERIC_BINOP_BITWISE_OR, + CUPYNUMERIC_BINOP_BITWISE_XOR, + CUPYNUMERIC_BINOP_COPYSIGN, + CUPYNUMERIC_BINOP_DIVIDE, + 
CUPYNUMERIC_BINOP_EQUAL, + CUPYNUMERIC_BINOP_FLOAT_POWER, + CUPYNUMERIC_BINOP_FLOOR_DIVIDE, + CUPYNUMERIC_BINOP_FMOD, + CUPYNUMERIC_BINOP_GCD, + CUPYNUMERIC_BINOP_GREATER, + CUPYNUMERIC_BINOP_GREATER_EQUAL, + CUPYNUMERIC_BINOP_HYPOT, + CUPYNUMERIC_BINOP_ISCLOSE, + CUPYNUMERIC_BINOP_LCM, + CUPYNUMERIC_BINOP_LDEXP, + CUPYNUMERIC_BINOP_LEFT_SHIFT, + CUPYNUMERIC_BINOP_LESS, + CUPYNUMERIC_BINOP_LESS_EQUAL, + CUPYNUMERIC_BINOP_LOGADDEXP, + CUPYNUMERIC_BINOP_LOGADDEXP2, + CUPYNUMERIC_BINOP_LOGICAL_AND, + CUPYNUMERIC_BINOP_LOGICAL_OR, + CUPYNUMERIC_BINOP_LOGICAL_XOR, + CUPYNUMERIC_BINOP_MAXIMUM, + CUPYNUMERIC_BINOP_MINIMUM, + CUPYNUMERIC_BINOP_MOD, + CUPYNUMERIC_BINOP_MULTIPLY, + CUPYNUMERIC_BINOP_NEXTAFTER, + CUPYNUMERIC_BINOP_NOT_EQUAL, + CUPYNUMERIC_BINOP_POWER, + CUPYNUMERIC_BINOP_RIGHT_SHIFT, + CUPYNUMERIC_BINOP_SUBTRACT, +}; + +// Match these to WindowOpCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericWindowOpCode { + CUPYNUMERIC_WINDOW_BARLETT = 1, + CUPYNUMERIC_WINDOW_BLACKMAN, + CUPYNUMERIC_WINDOW_HAMMING, + CUPYNUMERIC_WINDOW_HANNING, + CUPYNUMERIC_WINDOW_KAISER, +}; + +// Match these to CuPyNumericRedopCode in config.py +enum CuPyNumericRedopID { + CUPYNUMERIC_ARGMAX_REDOP = 1, + CUPYNUMERIC_ARGMIN_REDOP = 2, +}; + +enum CuPyNumericBounds { + CUPYNUMERIC_MAX_REDOPS = 1024, + CUPYNUMERIC_MAX_TASKS = 1048576, +}; + +// Match these to ScanCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericScanCode { + CUPYNUMERIC_SCAN_PROD = 1, + CUPYNUMERIC_SCAN_SUM, +}; + +// Match these to ConvertCode in config.py +// Also, sort these alphabetically for easy lookup later +enum CuPyNumericConvertCode { + CUPYNUMERIC_CONVERT_NAN_NOOP = 1, + CUPYNUMERIC_CONVERT_NAN_PROD, + CUPYNUMERIC_CONVERT_NAN_SUM, +}; + +// Match these to BitGeneratorOperation in config.py +enum CuPyNumericBitGeneratorOperation { + CUPYNUMERIC_BITGENOP_CREATE = 1, + CUPYNUMERIC_BITGENOP_DESTROY = 2, + 
CUPYNUMERIC_BITGENOP_RAND_RAW = 3, + CUPYNUMERIC_BITGENOP_DISTRIBUTION = 4, +}; + +// Match these to BitGeneratorType in config.py +enum CuPyNumericBitGeneratorType { + CUPYNUMERIC_BITGENTYPE_DEFAULT = 0, + CUPYNUMERIC_BITGENTYPE_XORWOW = 1, + CUPYNUMERIC_BITGENTYPE_MRG32K3A = 2, + CUPYNUMERIC_BITGENTYPE_MTGP32 = 3, + CUPYNUMERIC_BITGENTYPE_MT19937 = 4, + CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10 = 5, +}; + +// Match these to BitGeneratorDistribution in config.py +enum CuPyNumericBitGeneratorDistribution { + CUPYNUMERIC_BITGENDIST_INTEGERS_16 = 1, + CUPYNUMERIC_BITGENDIST_INTEGERS_32, + CUPYNUMERIC_BITGENDIST_INTEGERS_64, + CUPYNUMERIC_BITGENDIST_UNIFORM_32, + CUPYNUMERIC_BITGENDIST_UNIFORM_64, + CUPYNUMERIC_BITGENDIST_LOGNORMAL_32, + CUPYNUMERIC_BITGENDIST_LOGNORMAL_64, + CUPYNUMERIC_BITGENDIST_NORMAL_32, + CUPYNUMERIC_BITGENDIST_NORMAL_64, + CUPYNUMERIC_BITGENDIST_POISSON, + CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32, + CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64, + CUPYNUMERIC_BITGENDIST_GUMBEL_32, + CUPYNUMERIC_BITGENDIST_GUMBEL_64, + CUPYNUMERIC_BITGENDIST_LAPLACE_32, + CUPYNUMERIC_BITGENDIST_LAPLACE_64, + CUPYNUMERIC_BITGENDIST_LOGISTIC_32, + CUPYNUMERIC_BITGENDIST_LOGISTIC_64, + CUPYNUMERIC_BITGENDIST_PARETO_32, + CUPYNUMERIC_BITGENDIST_PARETO_64, + CUPYNUMERIC_BITGENDIST_POWER_32, + CUPYNUMERIC_BITGENDIST_POWER_64, + CUPYNUMERIC_BITGENDIST_RAYLEIGH_32, + CUPYNUMERIC_BITGENDIST_RAYLEIGH_64, + CUPYNUMERIC_BITGENDIST_CAUCHY_32, + CUPYNUMERIC_BITGENDIST_CAUCHY_64, + CUPYNUMERIC_BITGENDIST_TRIANGULAR_32, + CUPYNUMERIC_BITGENDIST_TRIANGULAR_64, + CUPYNUMERIC_BITGENDIST_WEIBULL_32, + CUPYNUMERIC_BITGENDIST_WEIBULL_64, + CUPYNUMERIC_BITGENDIST_BYTES, + CUPYNUMERIC_BITGENDIST_BETA_32, + CUPYNUMERIC_BITGENDIST_BETA_64, + CUPYNUMERIC_BITGENDIST_F_32, + CUPYNUMERIC_BITGENDIST_F_64, + CUPYNUMERIC_BITGENDIST_LOGSERIES, + CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32, + CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64, + CUPYNUMERIC_BITGENDIST_CHISQUARE_32, + CUPYNUMERIC_BITGENDIST_CHISQUARE_64, + 
CUPYNUMERIC_BITGENDIST_GAMMA_32, + CUPYNUMERIC_BITGENDIST_GAMMA_64, + CUPYNUMERIC_BITGENDIST_STANDARD_T_32, + CUPYNUMERIC_BITGENDIST_STANDARD_T_64, + CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC, + CUPYNUMERIC_BITGENDIST_VONMISES_32, + CUPYNUMERIC_BITGENDIST_VONMISES_64, + CUPYNUMERIC_BITGENDIST_ZIPF, + CUPYNUMERIC_BITGENDIST_GEOMETRIC, + CUPYNUMERIC_BITGENDIST_WALD_32, + CUPYNUMERIC_BITGENDIST_WALD_64, + CUPYNUMERIC_BITGENDIST_BINOMIAL, + CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL, +}; + +// These fft types match CuPyNumericFFTType in config.py and cufftType +enum CuPyNumericFFTType { + CUPYNUMERIC_FFT_R2C = 0x2a, // Real to complex (interleaved) + CUPYNUMERIC_FFT_C2R = 0x2c, // Complex (interleaved) to real + CUPYNUMERIC_FFT_C2C = 0x29, // Complex to complex (interleaved) + CUPYNUMERIC_FFT_D2Z = 0x6a, // Double to double-complex (interleaved) + CUPYNUMERIC_FFT_Z2D = 0x6c, // Double-complex (interleaved) to double + CUPYNUMERIC_FFT_Z2Z = 0x69 // Double-complex to double-complex (interleaved) +}; + +enum CuPyNumericConvolveMethod { + CUPYNUMERIC_CONVOLVE_AUTO, + CUPYNUMERIC_CONVOLVE_DIRECT, + CUPYNUMERIC_CONVOLVE_FFT, +}; + +// These fft types match CuPyNumericFFTDirection in config.py and cufftDirection +enum CuPyNumericFFTDirection { CUPYNUMERIC_FFT_FORWARD = -1, CUPYNUMERIC_FFT_INVERSE = 1 }; + +// Match these to Bitorder in config.py +enum CuPyNumericBitorder { CUPYNUMERIC_BITORDER_BIG = 0, CUPYNUMERIC_BITORDER_LITTLE = 1 }; + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ReductionOpIds { + int argmax_redop_id; + int argmin_redop_id; +} ReductionOpIds; + +void cupynumeric_perform_registration(); +bool cupynumeric_has_cusolvermp(); + +bool cupynumeric_cusolver_has_geev(); + +unsigned cupynumeric_max_eager_volume(); + +unsigned cupynumeric_matmul_cache_size(); + +struct ReductionOpIds cupynumeric_register_reduction_ops(int code); + +#ifdef __cplusplus +} +#endif + +#endif // __CUPYNUMERIC_C_H__ diff --git a/src/cunumeric/cunumeric_task.h 
b/src/cupynumeric/cupynumeric_task.h similarity index 75% rename from src/cunumeric/cunumeric_task.h rename to src/cupynumeric/cupynumeric_task.h index fe710c93ef..729fbb7686 100644 --- a/src/cunumeric/cunumeric_task.h +++ b/src/cupynumeric/cupynumeric_task.h @@ -17,10 +17,10 @@ #pragma once #include "legate.h" -#include "cunumeric/typedefs.h" -#include "cunumeric/cunumeric_c.h" +#include "cupynumeric/typedefs.h" +#include "cupynumeric/cupynumeric_c.h" -namespace cunumeric { +namespace cupynumeric { enum class VariantKind : int { CPU = 0, @@ -28,13 +28,13 @@ enum class VariantKind : int { GPU = 2, }; -struct CuNumericRegistrar { +struct CuPyNumericRegistrar { static legate::TaskRegistrar& get_registrar(); }; template -struct CuNumericTask : public legate::LegateTask { - using Registrar = CuNumericRegistrar; +struct CuPyNumericTask : public legate::LegateTask { + using Registrar = CuPyNumericRegistrar; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/device_scalar_reduction_buffer.h b/src/cupynumeric/device_scalar_reduction_buffer.h similarity index 82% rename from src/cunumeric/device_scalar_reduction_buffer.h rename to src/cupynumeric/device_scalar_reduction_buffer.h index dd5f5ec673..e4701df2ac 100644 --- a/src/cunumeric/device_scalar_reduction_buffer.h +++ b/src/cupynumeric/device_scalar_reduction_buffer.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" #include "legate/data/buffer.h" -namespace cunumeric { +namespace cupynumeric { template class DeviceScalarReductionBuffer { @@ -27,12 +27,12 @@ class DeviceScalarReductionBuffer { using VAL = typename REDOP::RHS; public: - DeviceScalarReductionBuffer(cudaStream_t stream) - : buffer_(legate::create_buffer(1, legate::Memory::Kind::GPU_FB_MEM)) + DeviceScalarReductionBuffer(cudaStream_t stream, std::size_t alignment = 16) + : buffer_(legate::create_buffer(1, legate::Memory::Kind::GPU_FB_MEM, alignment)) { VAL 
identity{REDOP::identity}; ptr_ = buffer_.ptr(0); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice, stream)); } @@ -45,9 +45,9 @@ class DeviceScalarReductionBuffer { __host__ VAL read(cudaStream_t stream) const { VAL result{REDOP::identity}; - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream)); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); return result; } @@ -58,4 +58,4 @@ class DeviceScalarReductionBuffer { VAL* ptr_; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/divmod.h b/src/cupynumeric/divmod.h similarity index 99% rename from src/cunumeric/divmod.h rename to src/cupynumeric/divmod.h index 0794e31750..25f377629f 100644 --- a/src/cunumeric/divmod.h +++ b/src/cupynumeric/divmod.h @@ -32,7 +32,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { // uint128_t for host and device from CUTLASS @@ -465,4 +465,4 @@ struct FastDivmodU64 { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.cuh b/src/cupynumeric/execution_policy/indexing/parallel_loop.cuh similarity index 85% rename from src/cunumeric/execution_policy/indexing/parallel_loop.cuh rename to src/cupynumeric/execution_policy/indexing/parallel_loop.cuh index fe35ea4c9f..449e3b2e19 100644 --- a/src/cunumeric/execution_policy/indexing/parallel_loop.cuh +++ b/src/cupynumeric/execution_policy/indexing/parallel_loop.cuh @@ -16,11 +16,11 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/execution_policy/indexing/parallel_loop.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/execution_policy/indexing/parallel_loop.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { 
template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -48,8 +48,8 @@ struct ParallelLoopPolicy { parallel_loop_kernel<<>>( volume, std::forward(kernel), Tag{}); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.h b/src/cupynumeric/execution_policy/indexing/parallel_loop.h similarity index 91% rename from src/cunumeric/execution_policy/indexing/parallel_loop.h rename to src/cupynumeric/execution_policy/indexing/parallel_loop.h index 8a88f572b6..17e97a1f73 100644 --- a/src/cunumeric/execution_policy/indexing/parallel_loop.h +++ b/src/cupynumeric/execution_policy/indexing/parallel_loop.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { template struct ParallelLoopPolicy {}; @@ -35,4 +35,4 @@ struct ParallelLoopPolicy { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h b/src/cupynumeric/execution_policy/indexing/parallel_loop_omp.h similarity index 83% rename from src/cunumeric/execution_policy/indexing/parallel_loop_omp.h rename to src/cupynumeric/execution_policy/indexing/parallel_loop_omp.h index f474930d41..e9e0b6d5e4 100644 --- a/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h +++ b/src/cupynumeric/execution_policy/indexing/parallel_loop_omp.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/execution_policy/indexing/parallel_loop.h" -#include "cunumeric/omp_help.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/execution_policy/indexing/parallel_loop.h" +#include "cupynumeric/omp_help.h" #include -namespace cunumeric { +namespace cupynumeric { template struct ParallelLoopPolicy { @@ -37,4 +37,4 @@ struct 
ParallelLoopPolicy { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/reduction/scalar_reduction.cuh b/src/cupynumeric/execution_policy/reduction/scalar_reduction.cuh similarity index 84% rename from src/cunumeric/execution_policy/reduction/scalar_reduction.cuh rename to src/cupynumeric/execution_policy/reduction/scalar_reduction.cuh index ac3b1aef29..20a506944e 100644 --- a/src/cunumeric/execution_policy/reduction/scalar_reduction.cuh +++ b/src/cupynumeric/execution_policy/reduction/scalar_reduction.cuh @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/execution_policy/reduction/scalar_reduction.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/execution_policy/reduction/scalar_reduction.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { namespace scalar_reduction_impl { template @@ -49,10 +49,8 @@ static __global__ void __launch_bounds__(1, 1) copy_kernel(Buffer result, RedAcc template struct ScalarReductionPolicy { template - void __attribute__((visibility("hidden"))) operator()(size_t volume, - AccessorRD& out, - const LHS& identity, - Kernel&& kernel) + void __attribute__((visibility("hidden"))) operator()( + size_t volume, AccessorRD & out, const LHS & identity, Kernel && kernel) { if (0 == volume) { return; @@ -75,8 +73,8 @@ struct ScalarReductionPolicy { volume, 1, result, std::forward(kernel), identity, Tag{}); } scalar_reduction_impl::copy_kernel<<<1, 1, 0, stream>>>(result, out); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/reduction/scalar_reduction.h b/src/cupynumeric/execution_policy/reduction/scalar_reduction.h similarity index 94% rename from src/cunumeric/execution_policy/reduction/scalar_reduction.h rename to src/cupynumeric/execution_policy/reduction/scalar_reduction.h index 66c7851404..900f59e1a5 100644 --- 
a/src/cunumeric/execution_policy/reduction/scalar_reduction.h +++ b/src/cupynumeric/execution_policy/reduction/scalar_reduction.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { template struct ScalarReductionPolicy { @@ -49,4 +49,4 @@ struct ScalarReductionPolicy { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/execution_policy/reduction/scalar_reduction_omp.h b/src/cupynumeric/execution_policy/reduction/scalar_reduction_omp.h similarity index 89% rename from src/cunumeric/execution_policy/reduction/scalar_reduction_omp.h rename to src/cupynumeric/execution_policy/reduction/scalar_reduction_omp.h index 5b9334290d..ee0c26ff17 100644 --- a/src/cunumeric/execution_policy/reduction/scalar_reduction_omp.h +++ b/src/cupynumeric/execution_policy/reduction/scalar_reduction_omp.h @@ -16,12 +16,12 @@ #pragma once -#include "cunumeric/execution_policy/reduction/scalar_reduction.h" -#include "cunumeric/omp_help.h" +#include "cupynumeric/execution_policy/reduction/scalar_reduction.h" +#include "cupynumeric/omp_help.h" #include -namespace cunumeric { +namespace cupynumeric { template struct ScalarReductionPolicy { @@ -47,4 +47,4 @@ struct ScalarReductionPolicy { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/fft/fft.cu b/src/cupynumeric/fft/fft.cu similarity index 86% rename from src/cunumeric/fft/fft.cu rename to src/cupynumeric/fft/fft.cu index fd0052aa2e..f00261b568 100644 --- a/src/cunumeric/fft/fft.cu +++ b/src/cupynumeric/fft/fft.cu @@ -15,13 +15,13 @@ */ #include -#include "cunumeric/fft/fft.h" -#include "cunumeric/fft/fft_template.inl" +#include "cupynumeric/fft/fft.h" +#include "cupynumeric/fft/fft_template.inl" -#include "cunumeric/cuda_help.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/cuda_help.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace 
cupynumeric { using namespace legate; using dim_t = long long int32_t; @@ -46,7 +46,7 @@ __host__ static inline void copy_into_buffer(TYPE* target, cudaStream_t stream) { if (acc.accessor.is_dense_row_major(rect)) { - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( target, acc.ptr(rect.lo), volume * sizeof(TYPE), cudaMemcpyDeviceToDevice, stream)); } else { Pitches pitches{}; @@ -56,7 +56,7 @@ __host__ static inline void copy_into_buffer(TYPE* target, copy_kernel<<>>( volume, target, acc, pitches, rect.lo); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } } @@ -68,8 +68,8 @@ __host__ static inline void cufft_operation(AccessorWO out, const Rect& out_rect, const Rect& in_rect, std::vector& axes, - CuNumericFFTType type, - CuNumericFFTDirection direction) + CuPyNumericFFTType type, + CuPyNumericFFTDirection direction) { auto stream = get_cached_stream(); @@ -84,8 +84,8 @@ __host__ static inline void cufft_operation(AccessorWO out, Point fft_size_out = out_rect.hi - out_rect.lo + one; num_elements = 1; for (int32_t i = 0; i < DIM; ++i) { - n[i] = - (type == CUNUMERIC_FFT_R2C || type == CUNUMERIC_FFT_D2Z) ? fft_size_in[i] : fft_size_out[i]; + n[i] = (type == CUPYNUMERIC_FFT_R2C || type == CUPYNUMERIC_FFT_D2Z) ? fft_size_in[i] + : fft_size_out[i]; inembed[i] = fft_size_in[i]; onembed[i] = fft_size_out[i]; num_elements *= n[i]; @@ -115,7 +115,7 @@ __host__ static inline void cufft_operation(AccessorWO out, static_cast(out.ptr(out_rect.lo)), static_cast(direction))); // synchronize before cufft_context runs out of scope - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); } // Perform the FFT operation as multiple 1D FFTs along the specified axes (Complex-to-complex case). 
@@ -124,8 +124,8 @@ __host__ static inline void cufft_over_axes_c2c(INOUT_TYPE* out, const INOUT_TYPE* in, const Rect& inout_rect, std::vector& axes, - CuNumericFFTType type, - CuNumericFFTDirection direction) + CuPyNumericFFTType type, + CuPyNumericFFTDirection direction) { auto stream = get_cached_stream(); @@ -144,7 +144,7 @@ __host__ static inline void cufft_over_axes_c2c(INOUT_TYPE* out, // Copy input to output buffer (if needed) // the computation will be done inplace of the target if (in != out) { - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( out, in, num_elements * sizeof(INOUT_TYPE), cudaMemcpyDeviceToDevice, stream)); } @@ -193,7 +193,7 @@ __host__ static inline void cufft_over_axes_c2c(INOUT_TYPE* out, static_cast(direction))); } // synchronize before cufft_context runs out of scope - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); } } @@ -204,8 +204,8 @@ __host__ static inline void cufft_r2c_c2r(OUTPUT_TYPE* out, const Rect& out_rect, const Rect& in_rect, const int64_t axis, - CuNumericFFTType type, - CuNumericFFTDirection direction) + CuPyNumericFFTType type, + CuPyNumericFFTDirection direction) { auto stream = get_cached_stream(); @@ -220,7 +220,7 @@ __host__ static inline void cufft_r2c_c2r(OUTPUT_TYPE* out, size_t num_elements_in = 1; size_t num_elements_out = 1; for (int32_t i = 0; i < DIM; ++i) { - n[i] = (direction == CUNUMERIC_FFT_FORWARD) ? fft_size_in[i] : fft_size_out[i]; + n[i] = (direction == CUPYNUMERIC_FFT_FORWARD) ? fft_size_in[i] : fft_size_out[i]; inembed[i] = fft_size_in[i]; onembed[i] = fft_size_out[i]; num_elements_in *= fft_size_in[i]; @@ -237,7 +237,7 @@ __host__ static inline void cufft_r2c_c2r(OUTPUT_TYPE* out, num_slices *= n[i]; } } - dim_t batches = ((direction == CUNUMERIC_FFT_FORWARD) ? num_elements_in : num_elements_out) / + dim_t batches = ((direction == CUPYNUMERIC_FFT_FORWARD) ? 
num_elements_in : num_elements_out) / (num_slices * size_1d); int64_t offset_in = num_elements_in / num_slices; int64_t offset_out = num_elements_out / num_slices; @@ -269,7 +269,7 @@ __host__ static inline void cufft_r2c_c2r(OUTPUT_TYPE* out, static_cast(direction))); } // synchronize before cufft_context runs out of scope - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); } // Perform the FFT operation as multiple 1D FFTs along the specified axes. @@ -282,16 +282,16 @@ __host__ static inline void cufft_over_axes(AccessorWO out, const Rect& out_rect, const Rect& in_rect, std::vector& axes, - CuNumericFFTType type, - CuNumericFFTDirection direction) + CuPyNumericFFTType type, + CuPyNumericFFTDirection direction) { - bool is_c2c = (type == CUNUMERIC_FFT_Z2Z || type == CUNUMERIC_FFT_C2C); - bool is_r2c = !is_c2c && (type == CUNUMERIC_FFT_D2Z || type == CUNUMERIC_FFT_R2C); + bool is_c2c = (type == CUPYNUMERIC_FFT_Z2Z || type == CUPYNUMERIC_FFT_C2C); + bool is_r2c = !is_c2c && (type == CUPYNUMERIC_FFT_D2Z || type == CUPYNUMERIC_FFT_R2C); bool is_c2r = !is_c2c && !is_r2c; bool is_double_precision = - (type == CUNUMERIC_FFT_Z2Z || type == CUNUMERIC_FFT_D2Z || type == CUNUMERIC_FFT_Z2D); - auto c2c_subtype = is_double_precision ? CUNUMERIC_FFT_Z2Z : CUNUMERIC_FFT_C2C; + (type == CUPYNUMERIC_FFT_Z2Z || type == CUPYNUMERIC_FFT_D2Z || type == CUPYNUMERIC_FFT_Z2D); + auto c2c_subtype = is_double_precision ? 
CUPYNUMERIC_FFT_Z2Z : CUPYNUMERIC_FFT_C2C; // C2C, R2C, C2R all modify input buffer --> create a copy OUTPUT_TYPE* out_ptr = out.ptr(out_rect.lo); @@ -340,7 +340,7 @@ __host__ static inline void cufft_over_axes(AccessorWO out, } } -template +template struct FFTImplBody { using INPUT_TYPE = type_of; using OUTPUT_TYPE = type_of; @@ -350,7 +350,7 @@ struct FFTImplBody { const Rect& out_rect, const Rect& in_rect, std::vector& axes, - CuNumericFFTDirection direction, + CuPyNumericFFTDirection direction, bool operate_over_axes) const { assert(out.accessor.is_dense_row_major(out_rect)); @@ -377,7 +377,10 @@ struct FFTImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { FFTTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + FFTTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/fft/fft.h b/src/cupynumeric/fft/fft.h similarity index 66% rename from src/cunumeric/fft/fft.h rename to src/cupynumeric/fft/fft.h index 5e8e358cc1..938fb960c0 100644 --- a/src/cunumeric/fft/fft.h +++ b/src/cupynumeric/fft/fft.h @@ -16,23 +16,25 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/fft/fft_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/fft/fft_util.h" -namespace cunumeric { +namespace cupynumeric { struct FFTArgs { legate::PhysicalStore output{nullptr}; legate::PhysicalStore input{nullptr}; - CuNumericFFTType type; - CuNumericFFTDirection direction; + CuPyNumericFFTType type; + CuPyNumericFFTDirection direction; bool operate_over_axes; std::vector axes; }; -class FFTTask : public CuNumericTask { +class FFTTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_FFT}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_FFT}}; + + static constexpr auto GPU_VARIANT_OPTIONS = 
legate::VariantOptions{}.with_has_allocations(true); public: #if LEGATE_DEFINED(LEGATE_USE_CUDA) @@ -40,4 +42,4 @@ class FFTTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/fft/fft_template.inl b/src/cupynumeric/fft/fft_template.inl similarity index 85% rename from src/cunumeric/fft/fft_template.inl rename to src/cupynumeric/fft/fft_template.inl index 647241e738..3de312b218 100644 --- a/src/cunumeric/fft/fft_template.inl +++ b/src/cupynumeric/fft/fft_template.inl @@ -17,22 +17,22 @@ #pragma once // Useful for IDEs -#include "cunumeric/fft/fft.h" -#include "cunumeric/pitches.h" -#include "cunumeric/fft/fft_util.h" +#include "cupynumeric/fft/fft.h" +#include "cupynumeric/pitches.h" +#include "cupynumeric/fft/fft_util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; template struct FFTImplBody; -template +template struct FFTImpl { template struct FFTDispatch { - template + template void operator()(FFTArgs& args) const { // Not expecting changing dimensions, at least for now @@ -85,8 +85,8 @@ static void fft_template(TaskContext& context) args.output = context.output(0); args.input = context.input(0); // Scalar arguments. 
Pay attention to indexes / ranges when adding or reordering arguments - args.type = context.scalar(0).value(); - args.direction = context.scalar(1).value(); + args.type = context.scalar(0).value(); + args.direction = context.scalar(1).value(); args.operate_over_axes = context.scalar(2).value(); const auto num_scalars = context.num_scalars(); @@ -96,4 +96,4 @@ static void fft_template(TaskContext& context) fft_dispatch(args.type, FFTDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/fft/fft_util.h b/src/cupynumeric/fft/fft_util.h similarity index 53% rename from src/cunumeric/fft/fft_util.h rename to src/cupynumeric/fft/fft_util.h index 7f76501928..b5b0afc2ab 100644 --- a/src/cunumeric/fft/fft_util.h +++ b/src/cupynumeric/fft/fft_util.h @@ -16,73 +16,73 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; template -constexpr decltype(auto) fft_dispatch(CuNumericFFTType type, Functor f, Fnargs&&... args) +constexpr decltype(auto) fft_dispatch(CuPyNumericFFTType type, Functor f, Fnargs&&... 
args) { switch (type) { - case CUNUMERIC_FFT_R2C: - return f.template operator()(std::forward(args)...); - case CUNUMERIC_FFT_C2R: - return f.template operator()(std::forward(args)...); - case CUNUMERIC_FFT_C2C: - return f.template operator()(std::forward(args)...); - case CUNUMERIC_FFT_D2Z: - return f.template operator()(std::forward(args)...); - case CUNUMERIC_FFT_Z2D: - return f.template operator()(std::forward(args)...); - case CUNUMERIC_FFT_Z2Z: - return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_R2C: + return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_C2R: + return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_C2C: + return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_D2Z: + return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_Z2D: + return f.template operator()(std::forward(args)...); + case CUPYNUMERIC_FFT_Z2Z: + return f.template operator()(std::forward(args)...); default: break; } assert(false); - return f.template operator()(std::forward(args)...); + return f.template operator()(std::forward(args)...); } -template +template struct FFT { static constexpr bool valid = false; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = true; static constexpr Type::Code CODE_OUT = Type::Code::COMPLEX64; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = true; static constexpr Type::Code CODE_OUT = Type::Code::FLOAT32; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = true; static constexpr Type::Code CODE_OUT = Type::Code::COMPLEX64; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = true; static constexpr Type::Code CODE_OUT = Type::Code::COMPLEX128; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = true; static constexpr Type::Code CODE_OUT = Type::Code::FLOAT64; }; template <> -struct FFT { +struct FFT { static constexpr bool valid = 
true; static constexpr Type::Code CODE_OUT = Type::Code::COMPLEX128; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/advanced_indexing.cc b/src/cupynumeric/index/advanced_indexing.cc similarity index 94% rename from src/cunumeric/index/advanced_indexing.cc rename to src/cupynumeric/index/advanced_indexing.cc index 498901a2c4..ac590c620c 100644 --- a/src/cunumeric/index/advanced_indexing.cc +++ b/src/cupynumeric/index/advanced_indexing.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/advanced_indexing.h" -#include "cunumeric/index/advanced_indexing_template.inl" +#include "cupynumeric/index/advanced_indexing.h" +#include "cupynumeric/index/advanced_indexing_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -111,10 +111,10 @@ struct AdvancedIndexingImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { AdvancedIndexingTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cupynumeric/index/advanced_indexing.cu similarity index 94% rename from src/cunumeric/index/advanced_indexing.cu rename to src/cupynumeric/index/advanced_indexing.cu index 23c7373b11..925e0081a8 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cupynumeric/index/advanced_indexing.cu @@ -14,14 +14,14 @@ * */ -#include "cunumeric/index/advanced_indexing.h" -#include "cunumeric/index/advanced_indexing_template.inl" -#include "cunumeric/utilities/thrust_util.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/advanced_indexing.h" +#include "cupynumeric/index/advanced_indexing_template.inl" +#include "cupynumeric/utilities/thrust_util.h" +#include "cupynumeric/cuda_help.h" #include -namespace cunumeric { +namespace cupynumeric { template static __global__ void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -109,7 +109,7 @@ struct AdvancedIndexingImplBody { volume, size, offsets, in, pitches, rect.lo, 1, skip_size, key_dim); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); auto off_ptr = offsets.ptr(0); thrust::exclusive_scan(DEFAULT_POLICY.on(stream), off_ptr, off_ptr + volume, off_ptr); @@ -158,7 +158,7 @@ struct AdvancedIndexingImplBody { advanced_indexing_kernel<<>>( volume, input, index, out, pitches, rect.lo, offsets, skip_size, key_dim); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -166,4 +166,4 @@ struct AdvancedIndexingImplBody { { advanced_indexing_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/advanced_indexing.h b/src/cupynumeric/index/advanced_indexing.h similarity index 69% rename from src/cunumeric/index/advanced_indexing.h rename to src/cupynumeric/index/advanced_indexing.h index 6ed94abcc1..5e2d591640 100644 --- a/src/cunumeric/index/advanced_indexing.h +++ b/src/cupynumeric/index/advanced_indexing.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct AdvancedIndexingArgs { legate::PhysicalStore output; @@ -28,9 +28,14 @@ struct AdvancedIndexingArgs { const int64_t key_dim; }; -class AdvancedIndexingTask : public CuNumericTask { +class AdvancedIndexingTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_ADVANCED_INDEXING}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_ADVANCED_INDEXING}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = 
legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -54,4 +59,4 @@ constexpr void fill_out(T& out, legate::Point& p, const T& in) out = in; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/advanced_indexing_omp.cc b/src/cupynumeric/index/advanced_indexing_omp.cc similarity index 95% rename from src/cunumeric/index/advanced_indexing_omp.cc rename to src/cupynumeric/index/advanced_indexing_omp.cc index a0bf6c80ee..fae843dd99 100644 --- a/src/cunumeric/index/advanced_indexing_omp.cc +++ b/src/cupynumeric/index/advanced_indexing_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/index/advanced_indexing.h" -#include "cunumeric/index/advanced_indexing_template.inl" -#include "cunumeric/omp_help.h" +#include "cupynumeric/index/advanced_indexing.h" +#include "cupynumeric/index/advanced_indexing_template.inl" +#include "cupynumeric/omp_help.h" #include #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -127,4 +127,4 @@ struct AdvancedIndexingImplBody { advanced_indexing_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/advanced_indexing_template.inl b/src/cupynumeric/index/advanced_indexing_template.inl similarity index 94% rename from src/cunumeric/index/advanced_indexing_template.inl rename to src/cupynumeric/index/advanced_indexing_template.inl index 8370973738..476f536f1a 100644 --- a/src/cunumeric/index/advanced_indexing_template.inl +++ b/src/cupynumeric/index/advanced_indexing_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/advanced_indexing.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/advanced_indexing.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -43,7 +43,7 @@ struct AdvancedIndexingImpl { auto index_rect = 
args.indexing_array.shape(); // this task is executed only for the case when index array is a bool type auto index_arr = args.indexing_array.read_accessor(index_rect); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC // we make sure that index and input shapes are the same on the python side. // checking this one more time here assert(index_rect == input_rect); @@ -75,4 +75,4 @@ static void advanced_indexing_template(TaskContext& context) args.input_array.dim(), args.input_array.code(), AdvancedIndexingImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/choose.cc b/src/cupynumeric/index/choose.cc similarity index 86% rename from src/cunumeric/index/choose.cc rename to src/cupynumeric/index/choose.cc index 33b0832ec9..6f63bf989b 100644 --- a/src/cunumeric/index/choose.cc +++ b/src/cupynumeric/index/choose.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/choose.h" -#include "cunumeric/index/choose_template.inl" +#include "cupynumeric/index/choose.h" +#include "cupynumeric/index/choose_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -37,7 +37,7 @@ struct ChooseImplBody { auto outptr = out.ptr(rect); auto indexptr = index_arr.ptr(rect); for (size_t idx = 0; idx < volume; ++idx) { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(indexptr[idx] < static_cast(choices.size())); #endif auto chptr = choices[indexptr[idx]].ptr(rect); @@ -46,7 +46,7 @@ struct ChooseImplBody { } else { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(index_arr[p] < static_cast(choices.size())); #endif out[p] = choices[index_arr[p]][p]; @@ -62,7 +62,10 @@ struct ChooseImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ChooseTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ChooseTask::register_variants(); + 
return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/choose.cu b/src/cupynumeric/index/choose.cu similarity index 93% rename from src/cunumeric/index/choose.cu rename to src/cupynumeric/index/choose.cu index e9d8c8f9e6..f204b7e5c2 100644 --- a/src/cunumeric/index/choose.cu +++ b/src/cupynumeric/index/choose.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/index/choose.h" -#include "cunumeric/index/choose_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/choose.h" +#include "cupynumeric/index/choose_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -82,7 +82,7 @@ struct ChooseImplBody { choose_kernel <<>>(out, index_arr, ch_arr, rect, pitches, volume); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -90,4 +90,4 @@ struct ChooseImplBody { { choose_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/choose.h b/src/cupynumeric/index/choose.h similarity index 73% rename from src/cunumeric/index/choose.h rename to src/cupynumeric/index/choose.h index 70b11238d8..eaa0e9177d 100644 --- a/src/cunumeric/index/choose.h +++ b/src/cupynumeric/index/choose.h @@ -16,18 +16,21 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ChooseArgs { legate::PhysicalStore out; std::vector inputs; }; -class ChooseTask : public CuNumericTask { +class ChooseTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_CHOOSE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_CHOOSE}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); 
public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +42,4 @@ class ChooseTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/choose_omp.cc b/src/cupynumeric/index/choose_omp.cc similarity index 91% rename from src/cunumeric/index/choose_omp.cc rename to src/cupynumeric/index/choose_omp.cc index 8c5441d19a..0969fe305e 100644 --- a/src/cunumeric/index/choose_omp.cc +++ b/src/cupynumeric/index/choose_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/choose.h" -#include "cunumeric/index/choose_template.inl" +#include "cupynumeric/index/choose.h" +#include "cupynumeric/index/choose_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -38,7 +38,7 @@ struct ChooseImplBody { auto indexptr = index_arr.ptr(rect); #pragma omp parallel for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(indexptr[idx] < static_cast(choices.size())); #endif auto chptr = choices[indexptr[idx]].ptr(rect); @@ -59,4 +59,4 @@ struct ChooseImplBody { choose_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/choose_template.inl b/src/cupynumeric/index/choose_template.inl similarity index 94% rename from src/cunumeric/index/choose_template.inl rename to src/cupynumeric/index/choose_template.inl index d93930ccd3..ee40e770c7 100644 --- a/src/cunumeric/index/choose_template.inl +++ b/src/cupynumeric/index/choose_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/choose.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/choose.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -74,4 +74,4 @@ static void choose_template(TaskContext& context) double_dispatch(args.inputs[0].dim(), args.inputs[0].code(), ChooseImpl{}, args); } -} // 
namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/putmask.cc b/src/cupynumeric/index/putmask.cc similarity index 75% rename from src/cunumeric/index/putmask.cc rename to src/cupynumeric/index/putmask.cc index a8d50ef1df..7c7e6b2908 100644 --- a/src/cunumeric/index/putmask.cc +++ b/src/cupynumeric/index/putmask.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/putmask.h" -#include "cunumeric/index/putmask_template.inl" +#include "cupynumeric/index/putmask.h" +#include "cupynumeric/index/putmask_template.inl" -namespace cunumeric { +namespace cupynumeric { /*static*/ void PutmaskTask::cpu_variant(TaskContext context) { @@ -26,7 +26,10 @@ namespace cunumeric { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { PutmaskTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + PutmaskTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/putmask.cu b/src/cupynumeric/index/putmask.cu similarity index 77% rename from src/cunumeric/index/putmask.cu rename to src/cupynumeric/index/putmask.cu index 588ed1834b..90308278ad 100644 --- a/src/cunumeric/index/putmask.cu +++ b/src/cupynumeric/index/putmask.cu @@ -14,15 +14,15 @@ * */ -#include "cunumeric/execution_policy/indexing/parallel_loop.cuh" -#include "cunumeric/index/putmask.h" -#include "cunumeric/index/putmask_template.inl" +#include "cupynumeric/execution_policy/indexing/parallel_loop.cuh" +#include "cupynumeric/index/putmask.h" +#include "cupynumeric/index/putmask_template.inl" -namespace cunumeric { +namespace cupynumeric { /*static*/ void PutmaskTask::gpu_variant(TaskContext context) { putmask_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/putmask.h b/src/cupynumeric/index/putmask.h similarity index 79% rename from src/cunumeric/index/putmask.h rename to 
src/cupynumeric/index/putmask.h index 0b9289487b..9827a7da06 100644 --- a/src/cunumeric/index/putmask.h +++ b/src/cupynumeric/index/putmask.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct PutmaskArgs { legate::PhysicalStore input; @@ -26,9 +26,10 @@ struct PutmaskArgs { legate::PhysicalStore values; }; -class PutmaskTask : public CuNumericTask { +class PutmaskTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_PUTMASK}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_PUTMASK}}; public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +41,4 @@ class PutmaskTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/putmask_omp.cc b/src/cupynumeric/index/putmask_omp.cc similarity index 77% rename from src/cunumeric/index/putmask_omp.cc rename to src/cupynumeric/index/putmask_omp.cc index 6ffa91ee9c..30256281ab 100644 --- a/src/cunumeric/index/putmask_omp.cc +++ b/src/cupynumeric/index/putmask_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/execution_policy/indexing/parallel_loop_omp.h" -#include "cunumeric/index/putmask.h" -#include "cunumeric/index/putmask_template.inl" +#include "cupynumeric/execution_policy/indexing/parallel_loop_omp.h" +#include "cupynumeric/index/putmask.h" +#include "cupynumeric/index/putmask_template.inl" -namespace cunumeric { +namespace cupynumeric { /*static*/ void PutmaskTask::omp_variant(TaskContext context) { putmask_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/putmask_template.inl b/src/cupynumeric/index/putmask_template.inl similarity index 93% rename from src/cunumeric/index/putmask_template.inl rename to src/cupynumeric/index/putmask_template.inl index 
0b438cec1e..ec5c6611a5 100644 --- a/src/cunumeric/index/putmask_template.inl +++ b/src/cupynumeric/index/putmask_template.inl @@ -18,11 +18,11 @@ // Useful for IDEs #include -#include "cunumeric/index/putmask.h" -#include "cunumeric/pitches.h" -#include "cunumeric/execution_policy/indexing/parallel_loop.h" +#include "cupynumeric/index/putmask.h" +#include "cupynumeric/pitches.h" +#include "cupynumeric/execution_policy/indexing/parallel_loop.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -117,4 +117,4 @@ static void putmask_template(TaskContext& context) double_dispatch(dim, args.input.code(), PutmaskImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/repeat.cc b/src/cupynumeric/index/repeat.cc similarity index 93% rename from src/cunumeric/index/repeat.cc rename to src/cupynumeric/index/repeat.cc index b23610ff0d..cf16231423 100644 --- a/src/cunumeric/index/repeat.cc +++ b/src/cupynumeric/index/repeat.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/repeat.h" -#include "cunumeric/index/repeat_template.inl" +#include "cupynumeric/index/repeat.h" +#include "cupynumeric/index/repeat_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -119,7 +119,10 @@ struct RepeatImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { RepeatTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + RepeatTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/repeat.cu b/src/cupynumeric/index/repeat.cu similarity index 92% rename from src/cunumeric/index/repeat.cu rename to src/cupynumeric/index/repeat.cu index 67184e6a92..88d69aa6b1 100644 --- a/src/cunumeric/index/repeat.cu +++ b/src/cupynumeric/index/repeat.cu @@ -14,14 +14,14 @@ * */ -#include "cunumeric/index/repeat.h" -#include 
"cunumeric/index/repeat_template.inl" -#include "cunumeric/utilities/thrust_util.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/repeat.h" +#include "cupynumeric/index/repeat_template.inl" +#include "cupynumeric/utilities/thrust_util.h" +#include "cupynumeric/cuda_help.h" #include -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -110,13 +110,16 @@ struct RepeatImplBody { auto out = out_array.write_accessor(out_rect); Pitches pitches{}; - auto out_volume = pitches.flatten(out_rect); + const auto out_volume = pitches.flatten(out_rect); + if (out_volume == 0) { + return; + } const auto blocks = (out_volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; auto stream = get_cached_stream(); repeat_kernel<<>>( out, in, repeats, axis, out_rect.lo, pitches, out_volume); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } void operator()(legate::PhysicalStore& out_array, @@ -146,7 +149,7 @@ struct RepeatImplBody { count_repeat_kernel<<>>( extent, sum, repeats, in_rect.lo, axis, 1, offsets); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); Point out_extents = in_rect.hi - in_rect.lo + Point::ONES(); out_extents[axis] = static_cast(sum.read(stream)); @@ -159,7 +162,7 @@ struct RepeatImplBody { const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; repeat_kernel<<>>( out, in, repeats, offsets, axis, in_rect.lo, pitches, volume); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -167,4 +170,4 @@ struct RepeatImplBody { { repeat_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/index/repeat.h b/src/cupynumeric/index/repeat.h new file mode 100644 index 0000000000..aadb48f406 --- /dev/null +++ b/src/cupynumeric/index/repeat.h @@ -0,0 +1,51 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +struct RepeatArgs { + legate::PhysicalStore output; + legate::PhysicalStore input; + legate::PhysicalStore repeats_arr; + int64_t repeats; + int32_t axis; + const bool scalar_repeats; +}; + +class RepeatTask : public CuPyNumericTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_REPEAT}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + + public: + static void cpu_variant(legate::TaskContext context); +#if LEGATE_DEFINED(LEGATE_USE_OPENMP) + static void omp_variant(legate::TaskContext context); +#endif +#if LEGATE_DEFINED(LEGATE_USE_CUDA) + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/index/repeat_omp.cc b/src/cupynumeric/index/repeat_omp.cc similarity index 95% rename from src/cunumeric/index/repeat_omp.cc rename to src/cupynumeric/index/repeat_omp.cc index d74e6a3246..01923c1931 100644 --- a/src/cunumeric/index/repeat_omp.cc +++ b/src/cupynumeric/index/repeat_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/index/repeat.h" -#include 
"cunumeric/index/repeat_template.inl" -#include "cunumeric/omp_help.h" +#include "cupynumeric/index/repeat.h" +#include "cupynumeric/index/repeat_template.inl" +#include "cupynumeric/omp_help.h" #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -117,4 +117,4 @@ struct RepeatImplBody { repeat_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/repeat_template.inl b/src/cupynumeric/index/repeat_template.inl similarity index 94% rename from src/cunumeric/index/repeat_template.inl rename to src/cupynumeric/index/repeat_template.inl index c24cace01b..8ce7b5ed40 100644 --- a/src/cunumeric/index/repeat_template.inl +++ b/src/cupynumeric/index/repeat_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/repeat.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/repeat.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -74,4 +74,4 @@ static void repeat_template(TaskContext& context) } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/select.cc b/src/cupynumeric/index/select.cc similarity index 89% rename from src/cunumeric/index/select.cc rename to src/cupynumeric/index/select.cc index b375c2e631..56f28d7545 100644 --- a/src/cunumeric/index/select.cc +++ b/src/cupynumeric/index/select.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/select.h" -#include "cunumeric/index/select_template.inl" +#include "cupynumeric/index/select.h" +#include "cupynumeric/index/select_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -35,7 +35,7 @@ struct SelectImplBody { { const size_t volume = rect.volume(); uint32_t narrays = condlist.size(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(narrays == choicelist.size()); #endif @@ -77,7 +77,10 @@ struct SelectImplBody { namespace // 
unnamed { -static void __attribute__((constructor)) register_tasks(void) { SelectTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + SelectTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/select.cu b/src/cupynumeric/index/select.cu similarity index 94% rename from src/cunumeric/index/select.cu rename to src/cupynumeric/index/select.cu index 2a9320a833..007367b68b 100644 --- a/src/cunumeric/index/select.cu +++ b/src/cupynumeric/index/select.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/index/select.h" -#include "cunumeric/index/select_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/select.h" +#include "cupynumeric/index/select_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -80,7 +80,7 @@ struct SelectImplBody { bool dense) const { uint32_t narrays = condlist.size(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(narrays == choicelist.size()); #endif const size_t blocks = (rect.volume() + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; @@ -118,7 +118,7 @@ struct SelectImplBody { out, narrays, cond_arr, choice_arr, default_val, rect, pitches, rect.volume()); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -127,4 +127,4 @@ struct SelectImplBody { select_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/select.h b/src/cupynumeric/index/select.h similarity index 74% rename from src/cunumeric/index/select.h rename to src/cupynumeric/index/select.h index e9857c9f0a..7f210dce36 100644 --- a/src/cunumeric/index/select.h +++ b/src/cupynumeric/index/select.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" 
-namespace cunumeric { +namespace cupynumeric { struct SelectArgs { legate::PhysicalArray out; @@ -26,9 +26,12 @@ struct SelectArgs { const legate::Scalar& default_value; }; -class SelectTask : public CuNumericTask { +class SelectTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SELECT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SELECT}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +43,4 @@ class SelectTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/select_omp.cc b/src/cupynumeric/index/select_omp.cc similarity index 93% rename from src/cunumeric/index/select_omp.cc rename to src/cupynumeric/index/select_omp.cc index 353d5c1ba5..9fa7a42593 100644 --- a/src/cunumeric/index/select_omp.cc +++ b/src/cupynumeric/index/select_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/select.h" -#include "cunumeric/index/select_template.inl" +#include "cupynumeric/index/select.h" +#include "cupynumeric/index/select_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -35,7 +35,7 @@ struct SelectImplBody { { const size_t volume = rect.volume(); uint32_t narrays = condlist.size(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(narrays == choicelist.size()); #endif @@ -79,4 +79,4 @@ struct SelectImplBody { select_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/select_template.inl b/src/cupynumeric/index/select_template.inl similarity index 93% rename from src/cunumeric/index/select_template.inl rename to src/cupynumeric/index/select_template.inl index 425be1ab44..54f530d837 100644 --- a/src/cunumeric/index/select_template.inl +++ 
b/src/cupynumeric/index/select_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/select.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/select.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -55,7 +55,7 @@ struct SelectImpl { condlist.reserve(args.inputs.size() / 2); for (int32_t i = 0; i < args.inputs.size() / 2; i++) { auto rect_c = args.inputs[i].shape(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(rect_c == out_rect); #endif condlist.push_back(args.inputs[i].data().read_accessor(rect_c)); @@ -66,7 +66,7 @@ struct SelectImpl { choicelist.reserve(args.inputs.size() / 2); for (int32_t i = args.inputs.size() / 2; i < args.inputs.size(); i++) { auto rect_c = args.inputs[i].shape(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(rect_c == out_rect); #endif choicelist.push_back(args.inputs[i].data().read_accessor(rect_c)); @@ -86,4 +86,4 @@ static void select_template(TaskContext& context) double_dispatch(args.out.dim(), args.out.type().code(), SelectImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/wrap.cc b/src/cupynumeric/index/wrap.cc similarity index 89% rename from src/cunumeric/index/wrap.cc rename to src/cupynumeric/index/wrap.cc index bed8a68e02..1b768c34af 100644 --- a/src/cunumeric/index/wrap.cc +++ b/src/cupynumeric/index/wrap.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/wrap.h" -#include "cunumeric/index/wrap_template.inl" +#include "cupynumeric/index/wrap.h" +#include "cupynumeric/index/wrap_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -66,7 +66,10 @@ struct WrapImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { WrapTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + WrapTask::register_variants(); + return 0; +}(); } // 
namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/wrap.cu b/src/cupynumeric/index/wrap.cu similarity index 96% rename from src/cunumeric/index/wrap.cu rename to src/cupynumeric/index/wrap.cu index 3ee9ac0bd4..8edf256c7e 100644 --- a/src/cunumeric/index/wrap.cu +++ b/src/cupynumeric/index/wrap.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/index/wrap.h" -#include "cunumeric/index/wrap_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/wrap.h" +#include "cupynumeric/index/wrap_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -113,7 +113,7 @@ void check_out_of_bounds(const AccessorRO& indices, check_kernel<<>>( out_of_bounds, indices, start, volume, volume_base, 1); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); bool res = out_of_bounds.read(stream); if (res) { @@ -158,7 +158,7 @@ struct WrapImplBody { volume_base, indices); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -167,4 +167,4 @@ struct WrapImplBody { wrap_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/wrap.h b/src/cupynumeric/index/wrap.h similarity index 88% rename from src/cunumeric/index/wrap.h rename to src/cupynumeric/index/wrap.h index 3b4afce930..91f22c7df8 100644 --- a/src/cunumeric/index/wrap.h +++ b/src/cupynumeric/index/wrap.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct WrapArgs { legate::PhysicalStore out{nullptr}; // Array with Point type that is used to @@ -30,9 +30,11 @@ struct WrapArgs { legate::PhysicalStore in{nullptr}; }; -class WrapTask : public CuNumericTask { +class WrapTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_WRAP}; + 
static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_WRAP}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -83,4 +85,4 @@ inline bool check_idx_omp(const int64_t i, } inline bool check_idx_omp(const int64_t i, const int64_t volume, const bool&) { return false; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/wrap_omp.cc b/src/cupynumeric/index/wrap_omp.cc similarity index 94% rename from src/cunumeric/index/wrap_omp.cc rename to src/cupynumeric/index/wrap_omp.cc index 888b819f00..2d593ad98d 100644 --- a/src/cunumeric/index/wrap_omp.cc +++ b/src/cupynumeric/index/wrap_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/wrap.h" -#include "cunumeric/index/wrap_template.inl" +#include "cupynumeric/index/wrap.h" +#include "cupynumeric/index/wrap_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -75,4 +75,4 @@ struct WrapImplBody { wrap_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/wrap_template.inl b/src/cupynumeric/index/wrap_template.inl similarity index 93% rename from src/cunumeric/index/wrap_template.inl rename to src/cupynumeric/index/wrap_template.inl index 4fd95d574c..e06ad34c50 100644 --- a/src/cunumeric/index/wrap_template.inl +++ b/src/cupynumeric/index/wrap_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/wrap.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/wrap.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -56,7 +56,7 @@ struct WrapImpl { Pitches pitches_base{}; size_t volume_base = pitches_base.flatten(rect_base); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(volume_base != 0); #else static_cast(volume_base); @@ -65,7 +65,7 
@@ struct WrapImpl { if (args.has_input) { auto rect_in = args.in.shape<1>(); auto in = args.in.read_accessor(rect_in); // input should be always integer type -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(rect_in == rect_out); #endif WrapImplBody()( @@ -92,4 +92,4 @@ static void wrap_template(TaskContext& context) dim_dispatch(dim, WrapImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/zip.cc b/src/cupynumeric/index/zip.cc similarity index 91% rename from src/cunumeric/index/zip.cc rename to src/cupynumeric/index/zip.cc index 7712ce756f..0da3d0ec44 100644 --- a/src/cunumeric/index/zip.cc +++ b/src/cupynumeric/index/zip.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/zip.h" -#include "cunumeric/index/zip_template.inl" +#include "cupynumeric/index/zip.h" +#include "cupynumeric/index/zip_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -59,7 +59,7 @@ struct ZipImplBody { } } } else { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(index_arrays.size() < N); #endif const size_t volume = rect.volume(); @@ -89,7 +89,10 @@ struct ZipImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ZipTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ZipTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/zip.cu b/src/cupynumeric/index/zip.cu similarity index 96% rename from src/cunumeric/index/zip.cu rename to src/cupynumeric/index/zip.cu index ecbe10b6b7..53a8ded958 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cupynumeric/index/zip.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/index/zip.h" -#include "cunumeric/index/zip_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/index/zip.h" +#include "cupynumeric/index/zip_template.inl" +#include "cupynumeric/cuda_help.h" 
-namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -146,7 +146,7 @@ struct ZipImplBody { check_kernel<<>>( out_of_bounds, index_arrays, volume, 1, rect, pitches, narrays, start_index, shape); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); bool res = out_of_bounds.read(stream); if (res) { @@ -191,14 +191,14 @@ struct ZipImplBody { out, index_buf, rect, pitches, volume, shape, std::make_index_sequence()); } } else { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(index_arrays.size() < N); #endif int num_arrays = index_arrays.size(); zip_kernel<<>>( out, index_buf, rect, pitches, num_arrays, volume, key_dim, start_index, shape); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -206,4 +206,4 @@ struct ZipImplBody { { zip_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/zip.h b/src/cupynumeric/index/zip.h similarity index 84% rename from src/cunumeric/index/zip.h rename to src/cupynumeric/index/zip.h index a1056af0a5..967f401ec6 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cupynumeric/index/zip.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ZipArgs { legate::PhysicalStore out; @@ -29,9 +29,11 @@ struct ZipArgs { const legate::DomainPoint shape; }; -class ZipTask : public CuNumericTask { +class ZipTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_ZIP}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_ZIP}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -66,4 +68,4 @@ constexpr legate::coord_t 
compute_idx_cuda(legate::coord_t index, legate::coord_ return new_index; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/zip_omp.cc b/src/cupynumeric/index/zip_omp.cc similarity index 95% rename from src/cunumeric/index/zip_omp.cc rename to src/cupynumeric/index/zip_omp.cc index b48f695c8c..49e39ec449 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cupynumeric/index/zip_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/index/zip.h" -#include "cunumeric/index/zip_template.inl" +#include "cupynumeric/index/zip.h" +#include "cupynumeric/index/zip_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -70,7 +70,7 @@ struct ZipImplBody { } } // else } else { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(index_arrays.size() < N); #endif #pragma omp parallel for schedule(static) @@ -105,4 +105,4 @@ struct ZipImplBody { zip_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/zip_template.inl b/src/cupynumeric/index/zip_template.inl similarity index 96% rename from src/cunumeric/index/zip_template.inl rename to src/cupynumeric/index/zip_template.inl index 642bd33db5..16eff841af 100644 --- a/src/cunumeric/index/zip_template.inl +++ b/src/cupynumeric/index/zip_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/index/zip.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/index/zip.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -108,4 +108,4 @@ static void zip_template(TaskContext& context) double_dispatch(dim, N, ZipImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/read.cc b/src/cupynumeric/item/read.cc similarity index 79% rename from src/cunumeric/item/read.cc rename to src/cupynumeric/item/read.cc index 477498abda..f3cb8c2a85 100644 --- a/src/cunumeric/item/read.cc +++ 
b/src/cupynumeric/item/read.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/item/read.h" -#include "cunumeric/item/read_template.inl" +#include "cupynumeric/item/read.h" +#include "cupynumeric/item/read_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -33,7 +33,10 @@ struct ReadImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ReadTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ReadTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/read.cu b/src/cupynumeric/item/read.cu similarity index 84% rename from src/cunumeric/item/read.cu rename to src/cupynumeric/item/read.cu index 20d69e5d7d..a18e5d863a 100644 --- a/src/cunumeric/item/read.cu +++ b/src/cupynumeric/item/read.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/item/read.h" -#include "cunumeric/item/read_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/item/read.h" +#include "cupynumeric/item/read_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(1, 1) @@ -33,7 +33,7 @@ struct ReadImplBody { { auto stream = get_cached_stream(); read_value<<<1, 1, 0, stream>>>(out, in); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -42,4 +42,4 @@ struct ReadImplBody { read_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/read.h b/src/cupynumeric/item/read.h similarity index 79% rename from src/cunumeric/item/read.h rename to src/cupynumeric/item/read.h index c7fbdd909a..1541736e60 100644 --- a/src/cunumeric/item/read.h +++ b/src/cupynumeric/item/read.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { 
+namespace cupynumeric { -class ReadTask : public CuNumericTask { +class ReadTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_READ}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_READ}}; public: static void cpu_variant(legate::TaskContext context); @@ -34,4 +34,4 @@ class ReadTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/read_template.inl b/src/cupynumeric/item/read_template.inl similarity index 93% rename from src/cunumeric/item/read_template.inl rename to src/cupynumeric/item/read_template.inl index 8af240773c..644856c253 100644 --- a/src/cunumeric/item/read_template.inl +++ b/src/cupynumeric/item/read_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/item/read.h" +#include "cupynumeric/item/read.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -46,4 +46,4 @@ static void read_template(TaskContext& context) type_dispatch(in.type().code(), ReadImpl{}, out, in); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/write.cc b/src/cupynumeric/item/write.cc similarity index 80% rename from src/cunumeric/item/write.cc rename to src/cupynumeric/item/write.cc index 34cd747c60..4ec987ea5f 100644 --- a/src/cunumeric/item/write.cc +++ b/src/cupynumeric/item/write.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/item/write.h" -#include "cunumeric/item/write_template.inl" +#include "cupynumeric/item/write.h" +#include "cupynumeric/item/write_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -36,7 +36,10 @@ struct WriteImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { WriteTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + WriteTask::register_variants(); + return 0; +}(); } // 
namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/write.cu b/src/cupynumeric/item/write.cu similarity index 84% rename from src/cunumeric/item/write.cu rename to src/cupynumeric/item/write.cu index aa056c9b80..725402e300 100644 --- a/src/cunumeric/item/write.cu +++ b/src/cupynumeric/item/write.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/item/write.h" -#include "cunumeric/item/write_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/item/write.h" +#include "cupynumeric/item/write_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(1, 1) @@ -33,7 +33,7 @@ struct WriteImplBody { { auto stream = get_cached_stream(); write_value<<<1, 1, 0, stream>>>(out, value); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -42,4 +42,4 @@ struct WriteImplBody { write_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/write.h b/src/cupynumeric/item/write.h similarity index 78% rename from src/cunumeric/item/write.h rename to src/cupynumeric/item/write.h index 65ec95d9ce..f05994a798 100644 --- a/src/cunumeric/item/write.h +++ b/src/cupynumeric/item/write.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class WriteTask : public CuNumericTask { +class WriteTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_WRITE}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_WRITE}}; public: static void cpu_variant(legate::TaskContext context); @@ -34,4 +34,4 @@ class WriteTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/item/write_template.inl 
b/src/cupynumeric/item/write_template.inl similarity index 93% rename from src/cunumeric/item/write_template.inl rename to src/cupynumeric/item/write_template.inl index a7f828efa6..b584606f23 100644 --- a/src/cunumeric/item/write_template.inl +++ b/src/cupynumeric/item/write_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/item/write.h" +#include "cupynumeric/item/write.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -47,4 +47,4 @@ static void write_template(TaskContext& context) legate::double_dispatch(dim, out.type().code(), WriteImpl(), out, in); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/mapper.cc b/src/cupynumeric/mapper.cc new file mode 100644 index 0000000000..d3365fad2f --- /dev/null +++ b/src/cupynumeric/mapper.cc @@ -0,0 +1,524 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cupynumeric/mapper.h" +#include "legate/utilities/assert.h" + +using namespace legate; +using namespace legate::mapping; + +namespace cupynumeric { + +Scalar CuPyNumericMapper::tunable_value(TunableID tunable_id) +{ + LEGATE_ABORT("cuPyNumeric does not use any tunable values"); +} + +std::vector CuPyNumericMapper::store_mappings( + const mapping::Task& task, const std::vector& options) +{ + const auto task_id = static_cast(task.task_id()); + + switch (task_id) { + case CUPYNUMERIC_CONVOLVE: { + std::vector mappings; + auto inputs = task.inputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front())); + mappings.push_back(StoreMapping::default_mapping(inputs[1].data(), options.front())); + auto& input_mapping = mappings.back(); + for (uint32_t idx = 2; idx < inputs.size(); ++idx) { + input_mapping.add_store(inputs[idx].data()); + } + return mappings; + } + case CUPYNUMERIC_FFT: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + mappings.push_back(StoreMapping::default_mapping(inputs[0].data(), options.front())); + mappings.push_back( + StoreMapping::default_mapping(outputs[0].data(), options.front(), true /*exact*/)); + return mappings; + } + case CUPYNUMERIC_TRANSPOSE_COPY_2D: { + std::vector mappings; + auto output = task.output(0); + mappings.push_back(StoreMapping::default_mapping(output.data(), options.front())); + mappings.back().policy().ordering.set_fortran_order(); + mappings.back().policy().exact = true; + return std::move(mappings); + } + case CUPYNUMERIC_MATMUL: { + std::vector mappings; + auto inputA = task.input(1); + auto inputB = task.input(2); + + mappings.push_back( + StoreMapping::default_mapping(inputA.data(), options.front(), true /*exact*/)); + mappings.back().policy().redundant = true; + mappings.push_back( + StoreMapping::default_mapping(inputB.data(), options.front(), true /*exact*/)); + mappings.back().policy().redundant = true; + + auto outputC 
= task.output(0); + mappings.push_back( + StoreMapping::default_mapping(outputC.data(), options.front(), true /*exact*/)); + + return mappings; + } + case CUPYNUMERIC_MATVECMUL: + case CUPYNUMERIC_UNIQUE_REDUCE: { + // TODO: Our actual requirements are a little less strict than this; we require each array or + // vector to have a stride of 1 on at least one dimension. + std::vector mappings; + auto inputs = task.inputs(); + auto reductions = task.reductions(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + } + for (auto& reduction : reductions) { + mappings.push_back( + StoreMapping::default_mapping(reduction.data(), options.front(), true /*exact*/)); + } + return mappings; + } + case CUPYNUMERIC_POTRF: + case CUPYNUMERIC_QR: + case CUPYNUMERIC_TRSM: + case CUPYNUMERIC_SOLVE: + case CUPYNUMERIC_SVD: + case CUPYNUMERIC_SYRK: + case CUPYNUMERIC_GEMM: + case CUPYNUMERIC_MP_POTRF: + case CUPYNUMERIC_MP_SOLVE: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_fortran_order(); + } + for (auto& output : outputs) { + mappings.push_back( + StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_fortran_order(); + } + return mappings; + } + case CUPYNUMERIC_GEEV: { + std::vector mappings; + auto input_a = task.input(0); + auto output_ew = task.output(0); + + auto dimensions = input_a.dim(); + + // last 2 (matrix) dimensions col-major + // batch dimensions 0, ..., dim-3 row-major + std::vector dim_order; + dim_order.push_back(dimensions - 2); + dim_order.push_back(dimensions - 1); + for (int32_t i = dimensions - 3; i >= 0; i--) { + dim_order.push_back(i); + } + + mappings.push_back( + 
StoreMapping::default_mapping(input_a.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_custom_order(dim_order); + + // eigenvalue computation is optional + if (task.outputs().size() > 1) { + auto output_ev = task.output(1); + mappings.push_back( + StoreMapping::default_mapping(output_ev.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_custom_order(dim_order); + } + + // remove last dimension for eigenvalues + dim_order.erase(std::next(dim_order.begin())); + mappings.push_back( + StoreMapping::default_mapping(output_ew.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_custom_order(dim_order); + + return mappings; + } + // CHANGE: If this code is changed, make sure all layouts are + // consistent with those assumed in batched_cholesky.cu, etc + case CUPYNUMERIC_BATCHED_CHOLESKY: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + mappings.reserve(inputs.size() + outputs.size()); + for (auto& input : inputs) { + mappings.push_back(StoreMapping::default_mapping(input.data(), options.front())); + mappings.back().policy().exact = true; + mappings.back().policy().ordering.set_c_order(); + } + for (auto& output : outputs) { + mappings.push_back(StoreMapping::default_mapping(output.data(), options.front())); + mappings.back().policy().exact = true; + mappings.back().policy().ordering.set_c_order(); + } + return std::move(mappings); + } + case CUPYNUMERIC_TRILU: { + if (task.scalars().size() == 2) { + return {}; + } + // If we're here, this task was the post-processing for Cholesky. 
+ // So we will request fortran ordering + std::vector mappings; + auto input = task.input(0); + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + mappings.back().policy().ordering.set_fortran_order(); + return mappings; + } + case CUPYNUMERIC_SEARCHSORTED: { + std::vector mappings; + auto inputs = task.inputs(); + mappings.push_back( + StoreMapping::default_mapping(inputs[0].data(), options.front(), true /*exact*/)); + return mappings; + } + case CUPYNUMERIC_SORT: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + } + for (auto& output : outputs) { + mappings.push_back( + StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); + } + return mappings; + } + case CUPYNUMERIC_SCAN_LOCAL: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + } + for (auto& output : outputs) { + mappings.push_back( + StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); + } + return mappings; + } + case CUPYNUMERIC_SCAN_GLOBAL: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), options.front(), true /*exact*/)); + } + for (auto& output : outputs) { + mappings.push_back( + StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); + } + return mappings; + } + case CUPYNUMERIC_BITGENERATOR: { + std::vector mappings; + auto inputs = task.inputs(); + auto outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back( + StoreMapping::default_mapping(input.data(), 
options.front(), true /*exact*/)); + } + for (auto& output : outputs) { + mappings.push_back( + StoreMapping::default_mapping(output.data(), options.front(), true /*exact*/)); + } + return mappings; + } + default: { + return {}; + } + } + LEGATE_ABORT("Unsupported task id: " + std::to_string(task_id)); + return {}; +} + +namespace { + +// Use an accessor type with the maximum number of dimensions for the size approximation +using ACC_TYPE = legate::AccessorRO; + +[[nodiscard]] constexpr std::size_t aligned_size(std::size_t size, std::size_t alignment) +{ + return (size + alignment - 1) / alignment * alignment; +} + +constexpr std::size_t DEFAULT_ALIGNMENT = 16; + +} // namespace + +std::optional CuPyNumericMapper::allocation_pool_size( + const legate::mapping::Task& task, legate::mapping::StoreTarget memory_kind) +{ + const auto task_id = static_cast(task.task_id()); + + switch (task_id) { + case CUPYNUMERIC_ADVANCED_INDEXING: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + return std::nullopt; + } + case CUPYNUMERIC_ARGWHERE: { + auto&& input = task.input(0); + auto in_count = input.domain().get_volume(); + auto out_size = in_count * input.dim() * sizeof(std::int64_t); + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + return out_size; + } + case legate::mapping::StoreTarget::FBMEM: { + return out_size + in_count * sizeof(std::int64_t); + } + case legate::mapping::StoreTarget::ZCMEM: { + return 0; + } + } + } + case CUPYNUMERIC_BATCHED_CHOLESKY: [[fallthrough]]; + case CUPYNUMERIC_GEEV: [[fallthrough]]; + case CUPYNUMERIC_POTRF: [[fallthrough]]; + // FIXME(wonchanl): These tasks actually don't need unbound pools on CPUs. 
They are being used + // only to finish up the first implementation quickly + case CUPYNUMERIC_QR: [[fallthrough]]; + case CUPYNUMERIC_SOLVE: [[fallthrough]]; + case CUPYNUMERIC_SVD: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return aligned_size(sizeof(std::int32_t), DEFAULT_ALIGNMENT); + } + return std::nullopt; + } + case CUPYNUMERIC_BINARY_RED: { + return memory_kind == legate::mapping::StoreTarget::FBMEM + ? aligned_size(sizeof(bool), DEFAULT_ALIGNMENT) + : 0; + } + case CUPYNUMERIC_CHOOSE: { + return memory_kind == legate::mapping::StoreTarget::ZCMEM + ? sizeof(ACC_TYPE) * task.num_inputs() + : 0; + } + case CUPYNUMERIC_CONTRACT: { + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + auto&& lhs = task.reduction(0); + if (lhs.type().code() != legate::Type::Code::FLOAT16) { + return 0; + } + constexpr auto compute_buffer_size = [](auto&& arr) { + return aligned_size(arr.domain().get_volume() * sizeof(float), DEFAULT_ALIGNMENT); + }; + return compute_buffer_size(lhs) + compute_buffer_size(task.input(0)) + + compute_buffer_size(task.input(1)); + } + case legate::mapping::StoreTarget::FBMEM: { + return std::nullopt; + } + case legate::mapping::StoreTarget::ZCMEM: { + return 0; + } + } + } + case CUPYNUMERIC_CONVOLVE: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + return std::nullopt; + } + case CUPYNUMERIC_DOT: { + return memory_kind == legate::mapping::StoreTarget::FBMEM + ? aligned_size(task.reduction(0).type().size(), DEFAULT_ALIGNMENT) + : 0; + } + case CUPYNUMERIC_FFT: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + return std::nullopt; + } + case CUPYNUMERIC_FLIP: { + return memory_kind == legate::mapping::StoreTarget::ZCMEM + ? 
sizeof(std::int32_t) * task.scalar(0).values().size() + : 0; + } + case CUPYNUMERIC_HISTOGRAM: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + return std::nullopt; + } + case CUPYNUMERIC_MATMUL: [[fallthrough]]; + case CUPYNUMERIC_MATVECMUL: { + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + const auto rhs1_idx = task.num_inputs() - 2; + const auto rhs2_idx = task.num_inputs() - 1; + auto&& rhs1 = task.input(rhs1_idx); + if (rhs1.type().code() != legate::Type::Code::FLOAT16) { + return 0; + } + constexpr auto compute_buffer_size = [](auto&& arr) { + return aligned_size(arr.domain().get_volume() * sizeof(float), DEFAULT_ALIGNMENT); + }; + return compute_buffer_size(rhs1) + compute_buffer_size(task.input(rhs2_idx)); + } + // The GPU implementation needs no temporary allocations + case legate::mapping::StoreTarget::FBMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::ZCMEM: { + LEGATE_ABORT("GPU tasks shouldn't reach here"); + return 0; + } + } + } + case CUPYNUMERIC_NONZERO: { + auto&& input = task.input(0); + auto&& output = task.output(0); + auto in_count = input.domain().get_volume(); + auto max_out_size = in_count * output.type().size() * input.dim(); + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + return max_out_size; + } + case legate::mapping::StoreTarget::FBMEM: { + // The GPU task creates a buffer to keep offsets + return max_out_size + in_count * sizeof(std::int64_t) + + aligned_size(sizeof(std::uint64_t), DEFAULT_ALIGNMENT); + } + case legate::mapping::StoreTarget::ZCMEM: { + // The doubling here shouldn't be necessary, but the memory fragmentation seems to be + // causing allocation failures even though there's enough space. 
+ return input.dim() * sizeof(std::int64_t*) * 2; + } + } + } + case CUPYNUMERIC_REPEAT: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + if (const auto scalar_repeats = task.scalar(1).value(); scalar_repeats) { + return 0; + } + const auto axis = task.scalar(0).value(); + const auto in_domain = task.input(0).domain(); + const auto lo = in_domain.lo(); + const auto hi = in_domain.hi(); + return aligned_size((hi[axis] - lo[axis] + 1) * sizeof(std::int64_t), DEFAULT_ALIGNMENT); + } + return std::nullopt; + } + case CUPYNUMERIC_SCALAR_UNARY_RED: { + return memory_kind == legate::mapping::StoreTarget::FBMEM + ? aligned_size(task.reduction(0).type().size(), DEFAULT_ALIGNMENT) + : 0; + } + case CUPYNUMERIC_SCAN_LOCAL: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + const auto output = task.output(0); + const auto domain = output.domain(); + const auto ndim = domain.dim; + auto tmp_volume = std::size_t{1}; + for (std::int32_t dim = 0; dim < ndim; ++dim) { + tmp_volume *= + std::max<>(legate::coord_t{0}, domain.rect_data[dim + ndim] - domain.rect_data[dim] + 1); + } + return aligned_size(tmp_volume * output.type().size(), output.type().alignment()); + } + case CUPYNUMERIC_SELECT: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return aligned_size(sizeof(ACC_TYPE) * task.num_inputs(), DEFAULT_ALIGNMENT); + } + return 0; + } + case CUPYNUMERIC_SORT: { + // There can be up to seven buffers on the zero-copy memory holding pointers and sizes + auto compute_zc_alloc_size = [&]() -> std::optional { + return task.is_single_task() ? 0 + : 7 * task.get_launch_domain().get_volume() * sizeof(void*); + }; + return memory_kind == legate::mapping::StoreTarget::ZCMEM ? 
compute_zc_alloc_size() + : std::nullopt; + } + case CUPYNUMERIC_UNIQUE: { + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + auto&& input = task.input(0); + return input.domain().get_volume() * input.type().size(); + } + case legate::mapping::StoreTarget::FBMEM: { + return std::nullopt; + } + case legate::mapping::StoreTarget::ZCMEM: { + return task.get_launch_domain().get_volume() * sizeof(std::size_t); + } + } + } + case CUPYNUMERIC_UNIQUE_REDUCE: { + switch (memory_kind) { + case legate::mapping::StoreTarget::SYSMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::SOCKETMEM: { + auto inputs = task.inputs(); + auto elem_type = inputs.front().type(); + auto total_volume = std::size_t{0}; + + for (auto&& input : inputs) { + total_volume += input.domain().get_volume(); + } + return aligned_size(total_volume * elem_type.size(), elem_type.alignment()); + } + // The GPU implementation needs no temporary allocations + case legate::mapping::StoreTarget::FBMEM: [[fallthrough]]; + case legate::mapping::StoreTarget::ZCMEM: { + LEGATE_ABORT("GPU tasks shouldn't reach here"); + return 0; + } + } + } + case CUPYNUMERIC_WRAP: { + if (memory_kind == legate::mapping::StoreTarget::ZCMEM) { + return 0; + } + return aligned_size(sizeof(bool), DEFAULT_ALIGNMENT); + } + case CUPYNUMERIC_ZIP: { + using ACC = legate::AccessorRO; + return memory_kind == legate::mapping::StoreTarget::ZCMEM + ? 
(task.num_inputs() * sizeof(ACC_TYPE) + 15) + : 0; + } + } + LEGATE_ABORT("Unsupported task id: " + std::to_string(task_id)); + return {}; +} + +} // namespace cupynumeric diff --git a/src/cunumeric/mapper.h b/src/cupynumeric/mapper.h similarity index 74% rename from src/cunumeric/mapper.h rename to src/cupynumeric/mapper.h index 3cd7eb4149..a3d122f4a5 100644 --- a/src/cunumeric/mapper.h +++ b/src/cupynumeric/mapper.h @@ -16,20 +16,19 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class CuNumericMapper final : public legate::mapping::Mapper { +class CuPyNumericMapper final : public legate::mapping::Mapper { // Legate mapping functions public: - [[nodiscard]] legate::mapping::TaskTarget task_target( - const legate::mapping::Task& task, - const std::vector& options) override; [[nodiscard]] std::vector store_mappings( const legate::mapping::Task& task, const std::vector& options) override; [[nodiscard]] legate::Scalar tunable_value(legate::TunableID tunable_id) override; + [[nodiscard]] std::optional allocation_pool_size( + const legate::mapping::Task& task, legate::mapping::StoreTarget memory_kind) override; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/batched_cholesky.cc b/src/cupynumeric/matrix/batched_cholesky.cc similarity index 88% rename from src/cunumeric/matrix/batched_cholesky.cc rename to src/cupynumeric/matrix/batched_cholesky.cc index 67823830ec..079fae80e4 100644 --- a/src/cunumeric/matrix/batched_cholesky.cc +++ b/src/cupynumeric/matrix/batched_cholesky.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/matrix/batched_cholesky.h" -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/matrix/batched_cholesky_template.inl" +#include "cupynumeric/matrix/batched_cholesky.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/matrix/batched_cholesky_template.inl" #include -#include +#include 
#include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -78,10 +78,10 @@ struct BatchedTransposeImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { BatchedCholeskyTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/batched_cholesky.cu b/src/cupynumeric/matrix/batched_cholesky.cu similarity index 91% rename from src/cunumeric/matrix/batched_cholesky.cu rename to src/cupynumeric/matrix/batched_cholesky.cu index cf3c81c848..0ef285d537 100644 --- a/src/cunumeric/matrix/batched_cholesky.cu +++ b/src/cupynumeric/matrix/batched_cholesky.cu @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/batched_cholesky.h" -#include "cunumeric/matrix/potrf.h" -#include "cunumeric/matrix/batched_cholesky_template.inl" +#include "cupynumeric/matrix/batched_cholesky.h" +#include "cupynumeric/matrix/potrf.h" +#include "cupynumeric/matrix/batched_cholesky_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -101,7 +101,7 @@ struct BatchedTransposeImplBody { // the lower diagonal transpose_2d_lower<<>>(out, n); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -110,4 +110,4 @@ struct BatchedTransposeImplBody { batched_cholesky_task_context_dispatch(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/index/repeat.h b/src/cupynumeric/matrix/batched_cholesky.h similarity index 68% rename from src/cunumeric/index/repeat.h rename to src/cupynumeric/matrix/batched_cholesky.h index a041f9ffa3..197d7b9f10 100644 --- a/src/cunumeric/index/repeat.h +++ b/src/cupynumeric/matrix/batched_cholesky.h @@ -16,22 +16,17 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include 
"cupynumeric/cupynumeric_task.h" +#include "cupynumeric/cupynumeric_c.h" -namespace cunumeric { +namespace cupynumeric { -struct RepeatArgs { - legate::PhysicalStore output; - legate::PhysicalStore input; - legate::PhysicalStore repeats_arr; - int64_t repeats; - int32_t axis; - const bool scalar_repeats; -}; - -class RepeatTask : public CuNumericTask { +class BatchedCholeskyTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_REPEAT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BATCHED_CHOLESKY}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +38,4 @@ class RepeatTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/batched_cholesky_omp.cc b/src/cupynumeric/matrix/batched_cholesky_omp.cc similarity index 91% rename from src/cunumeric/matrix/batched_cholesky_omp.cc rename to src/cupynumeric/matrix/batched_cholesky_omp.cc index 0f861f2d8a..2d1872b6c9 100644 --- a/src/cunumeric/matrix/batched_cholesky_omp.cc +++ b/src/cupynumeric/matrix/batched_cholesky_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/matrix/batched_cholesky.h" -#include "cunumeric/matrix/batched_cholesky_template.inl" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/matrix/batched_cholesky.h" +#include "cupynumeric/matrix/batched_cholesky_template.inl" #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -82,4 +82,4 @@ struct BatchedTransposeImplBody { batched_cholesky_task_context_dispatch(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/batched_cholesky_template.inl b/src/cupynumeric/matrix/batched_cholesky_template.inl similarity index 94% 
rename from src/cunumeric/matrix/batched_cholesky_template.inl rename to src/cupynumeric/matrix/batched_cholesky_template.inl index 4bf807450b..8b330232f9 100644 --- a/src/cunumeric/matrix/batched_cholesky_template.inl +++ b/src/cupynumeric/matrix/batched_cholesky_template.inl @@ -18,13 +18,13 @@ // Useful for IDEs #include -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/matrix/batched_cholesky.h" -#include "cunumeric/matrix/potrf_template.inl" -#include "cunumeric/matrix/transpose_template.inl" -#include "cunumeric/pitches.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/matrix/batched_cholesky.h" +#include "cupynumeric/matrix/potrf_template.inl" +#include "cupynumeric/matrix/transpose_template.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -147,4 +147,4 @@ static void batched_cholesky_task_context_dispatch(TaskContext& context) batched_output); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/contract.cc b/src/cupynumeric/matrix/contract.cc similarity index 96% rename from src/cunumeric/matrix/contract.cc rename to src/cupynumeric/matrix/contract.cc index cf94c94629..e144908116 100644 --- a/src/cunumeric/matrix/contract.cc +++ b/src/cupynumeric/matrix/contract.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/contract.h" -#include "cunumeric/matrix/contract_template.inl" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/contract.h" +#include "cupynumeric/matrix/contract_template.inl" +#include "cupynumeric/matrix/util.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace tblis; @@ -247,7 +247,10 @@ struct ContractImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ContractTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ContractTask::register_variants(); + return 0; +}(); } // namespace -} // 
namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/contract.cu b/src/cupynumeric/matrix/contract.cu similarity index 98% rename from src/cunumeric/matrix/contract.cu rename to src/cupynumeric/matrix/contract.cu index 48130bce9d..b5d636c661 100644 --- a/src/cunumeric/matrix/contract.cu +++ b/src/cupynumeric/matrix/contract.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/contract.h" -#include "cunumeric/matrix/contract_template.inl" +#include "cupynumeric/matrix/contract.h" +#include "cupynumeric/matrix/contract_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { namespace { // anonymous @@ -143,7 +143,7 @@ __host__ void contract(T* lhs_data, work_size, task_stream)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); CHECK_CUTENSOR(cutensorDestroyPlan(plan)); CHECK_CUTENSOR(cutensorDestroyPlanPreference(plan_pref)); @@ -348,4 +348,4 @@ struct ContractImplBody { contract_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/contract.h b/src/cupynumeric/matrix/contract.h similarity index 67% rename from src/cunumeric/matrix/contract.h rename to src/cupynumeric/matrix/contract.h index 59be08ba40..86873c2610 100644 --- a/src/cunumeric/matrix/contract.h +++ b/src/cupynumeric/matrix/contract.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ContractArgs { legate::PhysicalStore lhs; @@ -29,9 +29,14 @@ struct ContractArgs { legate::Span rhs2_dim_mask; }; -class ContractTask : public CuNumericTask { +class ContractTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_CONTRACT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_CONTRACT}}; + + static 
constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +48,4 @@ class ContractTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/contract_omp.cc b/src/cupynumeric/matrix/contract_omp.cc similarity index 98% rename from src/cunumeric/matrix/contract_omp.cc rename to src/cupynumeric/matrix/contract_omp.cc index 06f914d4e7..9aca0acd9f 100644 --- a/src/cunumeric/matrix/contract_omp.cc +++ b/src/cupynumeric/matrix/contract_omp.cc @@ -14,14 +14,14 @@ * */ -#include "cunumeric/matrix/contract.h" -#include "cunumeric/matrix/contract_template.inl" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/contract.h" +#include "cupynumeric/matrix/contract_template.inl" +#include "cupynumeric/matrix/util.h" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace tblis; @@ -242,4 +242,4 @@ struct ContractImplBody { contract_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/contract_template.inl b/src/cupynumeric/matrix/contract_template.inl similarity index 98% rename from src/cunumeric/matrix/contract_template.inl rename to src/cupynumeric/matrix/contract_template.inl index 107a2d6d6f..4351cc1af4 100644 --- a/src/cunumeric/matrix/contract_template.inl +++ b/src/cupynumeric/matrix/contract_template.inl @@ -17,14 +17,14 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/contract.h" +#include "cupynumeric/matrix/contract.h" #if 0 // debugging output #include "legate/utilities/debug.h" #include #endif -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -221,7 +221,7 @@ static void 
contract_template(legate::TaskContext& context) auto dim = args.lhs.dim(); auto code = args.lhs.code(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(dim == args.rhs1.dim()); assert(dim == args.rhs2.dim()); assert(dim == static_cast(args.lhs_dim_mask.size())); @@ -234,4 +234,4 @@ static void contract_template(legate::TaskContext& context) double_dispatch(dim, code, ContractImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/diag.cc b/src/cupynumeric/matrix/diag.cc similarity index 90% rename from src/cunumeric/matrix/diag.cc rename to src/cupynumeric/matrix/diag.cc index 19b0948f09..9f189aa94c 100644 --- a/src/cunumeric/matrix/diag.cc +++ b/src/cupynumeric/matrix/diag.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/diag.h" -#include "cunumeric/matrix/diag_template.inl" +#include "cupynumeric/matrix/diag.h" +#include "cupynumeric/matrix/diag_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -79,7 +79,10 @@ struct DiagImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { DiagTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + DiagTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/diag.cu b/src/cupynumeric/matrix/diag.cu similarity index 93% rename from src/cunumeric/matrix/diag.cu rename to src/cupynumeric/matrix/diag.cu index 1532e75d99..eea655de8c 100644 --- a/src/cunumeric/matrix/diag.cu +++ b/src/cupynumeric/matrix/diag.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/matrix/diag.h" -#include "cunumeric/matrix/diag_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/matrix/diag.h" +#include "cupynumeric/matrix/diag_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -93,7 +93,7 @@ struct DiagImplBody { auto stream = get_cached_stream(); diag_extract<<>>( out, in, distance, volume, skip_size, start, naxes, m_pitches, m_shape); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -110,7 +110,7 @@ struct DiagImplBody { const size_t blocks = (distance + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; auto stream = get_cached_stream(); diag_populate<<>>(out, in, distance, start); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -119,4 +119,4 @@ struct DiagImplBody { diag_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/diag.h b/src/cupynumeric/matrix/diag.h similarity index 80% rename from src/cunumeric/matrix/diag.h rename to src/cupynumeric/matrix/diag.h index 2a7ef9f752..eda14a003d 100644 --- a/src/cunumeric/matrix/diag.h +++ b/src/cupynumeric/matrix/diag.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct DiagArgs { int naxes; @@ -27,9 +27,9 @@ struct DiagArgs { legate::PhysicalStore diag; }; -class DiagTask : public CuNumericTask { +class DiagTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_DIAG}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_DIAG}}; public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +41,4 @@ class DiagTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/diag_omp.cc b/src/cupynumeric/matrix/diag_omp.cc similarity index 94% rename from src/cunumeric/matrix/diag_omp.cc rename to src/cupynumeric/matrix/diag_omp.cc index 700023296e..c1edbc9271 100644 --- a/src/cunumeric/matrix/diag_omp.cc +++ 
b/src/cupynumeric/matrix/diag_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/diag.h" -#include "cunumeric/matrix/diag_template.inl" +#include "cupynumeric/matrix/diag.h" +#include "cupynumeric/matrix/diag_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -82,4 +82,4 @@ struct DiagImplBody { diag_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/diag_template.inl b/src/cupynumeric/matrix/diag_template.inl similarity index 95% rename from src/cunumeric/matrix/diag_template.inl rename to src/cupynumeric/matrix/diag_template.inl index fae129c4d8..72718f93aa 100644 --- a/src/cunumeric/matrix/diag_template.inl +++ b/src/cupynumeric/matrix/diag_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/diag.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/matrix/diag.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -82,13 +82,13 @@ struct DiagImpl { const Point<2> stop1(shape.hi[0], shape.hi[0]); // y <= shape.hi[1] const Point<2> stop2(shape.hi[1], shape.hi[1]); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(shape.contains(stop1) || shape.contains(stop2)); #endif const Point<2> stop = shape.contains(stop1) ? 
stop1 : stop2; // Walk the path from the stop to the start const coord_t distance = (stop[0] - start[0]) + 1; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC // Should be the same along both dimensions assert(distance == ((stop[1] - start[1]) + 1)); // no extract is supported only for 1d input array (2d output) @@ -115,4 +115,4 @@ static void diag_template(TaskContext& context) double_dispatch(matrix.dim(), matrix.code(), DiagImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/dot.cc b/src/cupynumeric/matrix/dot.cc similarity index 87% rename from src/cunumeric/matrix/dot.cc rename to src/cupynumeric/matrix/dot.cc index 514a269275..eaf00cf46c 100644 --- a/src/cunumeric/matrix/dot.cc +++ b/src/cupynumeric/matrix/dot.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/dot.h" -#include "cunumeric/matrix/dot_template.inl" +#include "cupynumeric/matrix/dot.h" +#include "cupynumeric/matrix/dot_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,7 +57,10 @@ struct DotImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { DotTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + DotTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/dot.cu b/src/cupynumeric/matrix/dot.cu similarity index 93% rename from src/cunumeric/matrix/dot.cu rename to src/cupynumeric/matrix/dot.cu index ac498f540c..fd4163559e 100644 --- a/src/cunumeric/matrix/dot.cu +++ b/src/cupynumeric/matrix/dot.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/matrix/dot.h" -#include "cunumeric/matrix/dot_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/matrix/dot.h" +#include "cupynumeric/matrix/dot_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ 
void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduction_kernel( @@ -72,7 +72,7 @@ struct DotImplBody { } copy_kernel<<<1, 1, 0, stream>>>(result, out); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -81,4 +81,4 @@ struct DotImplBody { dot_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/dot.h b/src/cupynumeric/matrix/dot.h similarity index 74% rename from src/cunumeric/matrix/dot.h rename to src/cupynumeric/matrix/dot.h index 07520bbb6c..e4cf99c11f 100644 --- a/src/cunumeric/matrix/dot.h +++ b/src/cupynumeric/matrix/dot.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct DotArgs { legate::PhysicalStore lhs; @@ -26,9 +26,11 @@ struct DotArgs { legate::PhysicalStore rhs2; }; -class DotTask : public CuNumericTask { +class DotTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_DOT}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_DOT}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +42,4 @@ class DotTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/dot_omp.cc b/src/cupynumeric/matrix/dot_omp.cc similarity index 93% rename from src/cunumeric/matrix/dot_omp.cc rename to src/cupynumeric/matrix/dot_omp.cc index be11b880a4..f7b8dd1c62 100644 --- a/src/cunumeric/matrix/dot_omp.cc +++ b/src/cupynumeric/matrix/dot_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/dot.h" -#include "cunumeric/matrix/dot_template.inl" -#include "cunumeric/omp_help.h" +#include "cupynumeric/matrix/dot.h" +#include "cupynumeric/matrix/dot_template.inl" +#include 
"cupynumeric/omp_help.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -78,4 +78,4 @@ struct DotImplBody { dot_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/dot_template.inl b/src/cupynumeric/matrix/dot_template.inl similarity index 95% rename from src/cunumeric/matrix/dot_template.inl rename to src/cupynumeric/matrix/dot_template.inl index 7d24e2afa2..fa4a4e838f 100644 --- a/src/cunumeric/matrix/dot_template.inl +++ b/src/cupynumeric/matrix/dot_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/dot.h" +#include "cupynumeric/matrix/dot.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -79,4 +79,4 @@ static void dot_template(TaskContext& context) type_dispatch(args.rhs1.code(), DotImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_reduce.h b/src/cupynumeric/matrix/geev.cc similarity index 52% rename from src/cunumeric/set/unique_reduce.h rename to src/cupynumeric/matrix/geev.cc index 4a754a5f66..08ba4ba68b 100644 --- a/src/cunumeric/set/unique_reduce.h +++ b/src/cupynumeric/matrix/geev.cc @@ -14,21 +14,30 @@ * */ -#pragma once +#include "cupynumeric/matrix/geev.h" +#include "cupynumeric/matrix/geev_template.inl" +#include "cupynumeric/matrix/geev_cpu.inl" -#include "cunumeric/cunumeric_task.h" +namespace cupynumeric { -namespace cunumeric { +using namespace legate; -class UniqueReduceTask : public CuNumericTask { - public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNIQUE_REDUCE}; +/*static*/ const char* GeevTask::ERROR_MESSAGE = "Factorization failed"; - public: - static void cpu_variant(legate::TaskContext context); +/*static*/ void GeevTask::cpu_variant(TaskContext context) +{ #if LEGATE_DEFINED(LEGATE_USE_OPENMP) - static void omp_variant(legate::TaskContext context); + openblas_set_num_threads(1); // make sure this 
isn't overzealous #endif -}; + geev_template(context); +} -} // namespace cunumeric +namespace // unnamed +{ +static const auto cupynumeric_reg_task_ = []() -> char { + GeevTask::register_variants(); + return 0; +}(); +} // namespace + +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cupynumeric/matrix/geev.cu b/src/cupynumeric/matrix/geev.cu new file mode 100644 index 0000000000..66655ce60e --- /dev/null +++ b/src/cupynumeric/matrix/geev.cu @@ -0,0 +1,261 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cupynumeric/matrix/geev.h" +#include "cupynumeric/matrix/geev_template.inl" +#include "cupynumeric/utilities/thrust_util.h" + +#include +#include + +#include "cupynumeric/cuda_help.h" +#include + +namespace cupynumeric { + +using namespace legate; + +template +struct assembleEvs : public thrust::unary_function { + const VAL_COMPLEX* ew_in_; + const VAL* ev_in_; + const int64_t m_; + + assembleEvs(VAL_COMPLEX* ew_in, VAL* ev_in, int64_t m) : ew_in_(ew_in), ev_in_(ev_in), m_(m) {} + + __CUDA_HD__ VAL_COMPLEX operator()(const int64_t& idx) const + { + int64_t col_idx = idx / m_; + auto ew_i = ew_in_[col_idx].y; + // if img == 0 -> ev = ev[idx] + // if img positive -> ev = ev[idx] + i*ev[idx+1] + // if img negative -> ev = ev[idx-1] - i*ev[idx] + const int64_t real_idx = idx - ((ew_i < 0) ? m_ : 0); + const int64_t img_idx = idx + ((ew_i > 0) ? 
m_ : 0); + VAL factor = ((ew_i > 0) ? VAL(1.0) : ((ew_i < 0) ? VAL(-1.0) : VAL(0.0))); + VAL_COMPLEX result; + result.x = ev_in_[real_idx]; + result.y = factor * ev_in_[img_idx]; + return result; + } +}; + +template +void assemble_complex_evs(VAL_COMPLEX* ev_out, VAL_COMPLEX* ew_in, VAL* ev_in, int64_t m) +{ + auto stream = get_cached_stream(); + thrust::transform(DEFAULT_POLICY.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(m * m), + ev_out, + assembleEvs(ew_in, ev_in, m)); +} + +template +static inline void geev_template( + DataType valTypeC, DataType valTypeA, int64_t m, const void* a, void* ew, void* ev) +{ + auto handle = get_cusolver(); + auto stream = get_cached_stream(); + auto geev_handles = get_cusolver_extra_symbols(); + + bool compute_evs = ev != nullptr; + + auto a_copy = create_buffer(m * m, Memory::Kind::GPU_FB_MEM); + + CUPYNUMERIC_CHECK_CUDA( + cudaMemcpyAsync(a_copy.ptr(0), a, m * m * sizeof(VAL), cudaMemcpyDeviceToDevice, stream)); + + CHECK_CUSOLVER(cusolverDnSetStream(handle, stream)); + + size_t lwork_device, lwork_host; + CHECK_CUSOLVER(geev_handles->cusolver_geev_bufferSize( + handle, + nullptr, + CUSOLVER_EIG_MODE_NOVECTOR, + compute_evs ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR, + m, + valTypeA, + reinterpret_cast(a_copy.ptr(0)), + m, + valTypeC, + ew, + valTypeA, + nullptr, // left EVs + m, + valTypeA, + ev, + m, + valTypeA, + &lwork_device, + &lwork_host)); + + auto buffer = create_buffer(lwork_device, Memory::Kind::GPU_FB_MEM); + std::vector buffer_host(std::max(1ul, lwork_host)); + auto info = create_buffer(1, Memory::Kind::Z_COPY_MEM); + + CHECK_CUSOLVER( + geev_handles->cusolver_geev(handle, + nullptr, + CUSOLVER_EIG_MODE_NOVECTOR, + compute_evs ? 
CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR, + m, + valTypeA, + reinterpret_cast(a_copy.ptr(0)), + m, + valTypeC, + ew, + valTypeA, + nullptr, // left EVs + m, + valTypeA, + ev, + m, + valTypeA, + buffer.ptr(0), + lwork_device, + buffer_host.data(), + lwork_host, + info.ptr(0))); + + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); + + if (info[0] != 0) { + throw legate::TaskException(GeevTask::ERROR_MESSAGE); + } +} + +template <> +struct GeevImplBody { + void operator()(int64_t m, + int64_t num_batches, + int64_t batch_stride_ew, + int64_t batch_stride_ev, + const float* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + + // for real input --> create real buffer and assemble afterwards + auto ev_tmp = create_buffer(compute_evs ? m * m : 0, Memory::Kind::GPU_FB_MEM); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + geev_template(CUDA_C_32F, + CUDA_R_32F, + m, + a + batch_idx * batch_stride_ev, + reinterpret_cast(ew + batch_idx * batch_stride_ew), + compute_evs ? reinterpret_cast(ev_tmp.ptr(0)) : nullptr); + + if (compute_evs) { + assemble_complex_evs(reinterpret_cast(ev + batch_idx * batch_stride_ev), + reinterpret_cast(ew + batch_idx * batch_stride_ew), + ev_tmp.ptr(0), + m); + } + } + } +}; + +template <> +struct GeevImplBody { + void operator()(int64_t m, + int64_t num_batches, + int64_t batch_stride_ew, + int64_t batch_stride_ev, + const double* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + + // for real input --> create real buffer and assemble afterwards + auto ev_tmp = create_buffer(compute_evs ? m * m : 0, Memory::Kind::GPU_FB_MEM); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + geev_template(CUDA_C_64F, + CUDA_R_64F, + m, + a + batch_idx * batch_stride_ev, + reinterpret_cast(ew + batch_idx * batch_stride_ew), + compute_evs ? 
reinterpret_cast(ev_tmp.ptr(0)) : nullptr); + + if (compute_evs) { + assemble_complex_evs(reinterpret_cast(ev + batch_idx * batch_stride_ev), + reinterpret_cast(ew + batch_idx * batch_stride_ew), + ev_tmp.ptr(0), + m); + } + } + } +}; + +template <> +struct GeevImplBody { + void operator()(int64_t m, + int64_t num_batches, + int64_t batch_stride_ew, + int64_t batch_stride_ev, + const complex* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + geev_template>( + CUDA_C_32F, + CUDA_C_32F, + m, + reinterpret_cast(a + batch_idx * batch_stride_ev), + reinterpret_cast(ew + batch_idx * batch_stride_ew), + compute_evs ? reinterpret_cast(ev + batch_idx * batch_stride_ev) : nullptr); + } + } +}; + +template <> +struct GeevImplBody { + void operator()(int64_t m, + int64_t num_batches, + int64_t batch_stride_ew, + int64_t batch_stride_ev, + const complex* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + geev_template>( + CUDA_C_64F, + CUDA_C_64F, + m, + reinterpret_cast(a + batch_idx * batch_stride_ev), + reinterpret_cast(ew + batch_idx * batch_stride_ew), + compute_evs ? 
reinterpret_cast(ev + batch_idx * batch_stride_ev) + : nullptr); + } + } +}; + +/*static*/ void GeevTask::gpu_variant(TaskContext context) +{ + geev_template(context); +} + +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/solve.h b/src/cupynumeric/matrix/geev.h similarity index 63% rename from src/cunumeric/matrix/solve.h rename to src/cupynumeric/matrix/geev.h index 512d0878e2..0e7f0ffb7b 100644 --- a/src/cunumeric/matrix/solve.h +++ b/src/cupynumeric/matrix/geev.h @@ -16,15 +16,19 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class SolveTask : public CuNumericTask { +class GeevTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SOLVE}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_GEEV}}; static const char* ERROR_MESSAGE; + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + public: static void cpu_variant(legate::TaskContext context); #if LEGATE_DEFINED(LEGATE_USE_OPENMP) @@ -35,4 +39,4 @@ class SolveTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cupynumeric/matrix/geev_cpu.inl b/src/cupynumeric/matrix/geev_cpu.inl new file mode 100644 index 0000000000..f777c26ac0 --- /dev/null +++ b/src/cupynumeric/matrix/geev_cpu.inl @@ -0,0 +1,341 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include +#include +#include + +namespace cupynumeric { + +using namespace legate; + +namespace { + +template +void assemble_complex(complex* ew, complex* ev, T* ew_r, T* ew_i, T* ev_r, size_t m) +{ + bool skip_next_ev = false; + for (int i = 0; i < m; ++i) { + ew[i] = complex(ew_r[i], ew_i[i]); + if (ev != nullptr) { + if (skip_next_ev) { + skip_next_ev = false; + } else { + T* src1 = &ev_r[i * m]; + complex* dst1 = &ev[i * m]; + if (ew_i[i] != T(0)) { + // define next 2 EVs + T* src2 = src1 + m; + complex* dst2 = dst1 + m; + for (int k = 0; k < m; ++k) { + dst1[k] = complex(src1[k], src2[k]); + dst2[k] = complex(src1[k], T(-1) * src2[k]); + } + skip_next_ev = true; + } else { + for (int k = 0; k < m; ++k) { + dst1[k] = complex(src1[k], T(0)); + } + } + } + } + } +} +} // namespace + +template +struct GeevImplBody { + void operator()(int32_t m, + int32_t num_batches, + int32_t batch_stride_ew, + int32_t batch_stride_ev, + const float* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + auto a_copy = create_buffer(m * m); + + // for real input --> create real buffer and assemble afterwards + auto ev_tmp = create_buffer(m * m); + float* ev_tmp_prt = ev_tmp.ptr(0); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + std::copy(a, a + (m * m), a_copy.ptr(0)); + + std::vector ew_r(m); + std::vector ew_i(m); + + int32_t info = 0; + float wkopt = 0; + int32_t lwork = -1; + LAPACK_sgeev("N", + compute_evs ? 
"V" : "N", + &m, + a_copy.ptr(0), + &m, + ew_r.data(), + ew_i.data(), + nullptr, + &m, + ev_tmp_prt, + &m, + &wkopt, + &lwork, + &info); + lwork = (int)wkopt; + + std::vector work_tmp(lwork); + LAPACK_sgeev("N", + compute_evs ? "V" : "N", + &m, + a_copy.ptr(0), + &m, + ew_r.data(), + ew_i.data(), + nullptr, + &m, + ev_tmp_prt, + &m, + work_tmp.data(), + &lwork, + &info); + + if (info != 0) { + throw legate::TaskException(GeevTask::ERROR_MESSAGE); + } + + assemble_complex(ew, ev, ew_r.data(), ew_i.data(), ev_tmp_prt, m); + + a += batch_stride_ev; + ew += batch_stride_ew; + if (compute_evs) { + ev += batch_stride_ev; + } + } + } +}; + +template +struct GeevImplBody { + void operator()(int32_t m, + int32_t num_batches, + int32_t batch_stride_ew, + int32_t batch_stride_ev, + const double* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + auto a_copy = create_buffer(m * m); + + // for real input --> create real buffer and assemble afterwards + auto ev_tmp = create_buffer(m * m); + double* ev_tmp_prt = ev_tmp.ptr(0); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + std::copy(a, a + (m * m), a_copy.ptr(0)); + + std::vector ew_r(m); + std::vector ew_i(m); + + int32_t info = 0; + double wkopt = 0; + int32_t lwork = -1; + + LAPACK_dgeev("N", + compute_evs ? "V" : "N", + &m, + a_copy.ptr(0), + &m, + ew_r.data(), + ew_i.data(), + nullptr, + &m, + ev_tmp_prt, + &m, + &wkopt, + &lwork, + &info); + lwork = (int)wkopt; + + std::vector work_tmp(lwork); + LAPACK_dgeev("N", + compute_evs ? 
"V" : "N", + &m, + a_copy.ptr(0), + &m, + ew_r.data(), + ew_i.data(), + nullptr, + &m, + ev_tmp_prt, + &m, + work_tmp.data(), + &lwork, + &info); + + if (info != 0) { + throw legate::TaskException(GeevTask::ERROR_MESSAGE); + } + + assemble_complex(ew, ev, ew_r.data(), ew_i.data(), ev_tmp_prt, m); + + a += batch_stride_ev; + ew += batch_stride_ew; + if (compute_evs) { + ev += batch_stride_ev; + } + } + } +}; + +template +struct GeevImplBody { + void operator()(int32_t m, + int32_t num_batches, + int32_t batch_stride_ew, + int32_t batch_stride_ev, + const complex* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + auto a_copy = create_buffer>(m * m); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + std::copy(a, a + (m * m), a_copy.ptr(0)); + + int32_t info = 0; + int32_t lwork = -1; + __complex__ float wkopt = 0; + std::vector rwork(2 * m); + + LAPACK_cgeev("N", + compute_evs ? "V" : "N", + &m, + reinterpret_cast<__complex__ float*>(a_copy.ptr(0)), + &m, + reinterpret_cast<__complex__ float*>(ew), + nullptr, + &m, + reinterpret_cast<__complex__ float*>(ev), + &m, + &wkopt, + &lwork, + rwork.data(), + &info); + + lwork = __real__ wkopt; + + std::vector<__complex__ float> work_tmp(lwork); + LAPACK_cgeev("N", + compute_evs ? 
"V" : "N", + &m, + reinterpret_cast<__complex__ float*>(a_copy.ptr(0)), + &m, + reinterpret_cast<__complex__ float*>(ew), + nullptr, + &m, + reinterpret_cast<__complex__ float*>(ev), + &m, + work_tmp.data(), + &lwork, + rwork.data(), + &info); + + if (info != 0) { + throw legate::TaskException(GeevTask::ERROR_MESSAGE); + } + + a += batch_stride_ev; + ew += batch_stride_ew; + if (compute_evs) { + ev += batch_stride_ev; + } + } + } +}; + +template +struct GeevImplBody { + void operator()(int32_t m, + int32_t num_batches, + int32_t batch_stride_ew, + int32_t batch_stride_ev, + const complex* a, + complex* ew, + complex* ev) + { + bool compute_evs = ev != nullptr; + auto a_copy = create_buffer>(m * m); + + for (int64_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) { + std::copy(a, a + (m * m), a_copy.ptr(0)); + + int32_t info = 0; + int32_t lwork = -1; + __complex__ double wkopt = 0; + std::vector rwork(2 * m); + LAPACK_zgeev("N", + compute_evs ? "V" : "N", + &m, + reinterpret_cast<__complex__ double*>(a_copy.ptr(0)), + &m, + reinterpret_cast<__complex__ double*>(ew), + nullptr, + &m, + reinterpret_cast<__complex__ double*>(ev), + &m, + &wkopt, + &lwork, + rwork.data(), + &info); + + lwork = __real__ wkopt; + + std::vector<__complex__ double> work_tmp(lwork); + LAPACK_zgeev("N", + compute_evs ? 
"V" : "N", + &m, + reinterpret_cast<__complex__ double*>(a_copy.ptr(0)), + &m, + reinterpret_cast<__complex__ double*>(ew), + nullptr, + &m, + reinterpret_cast<__complex__ double*>(ev), + &m, + work_tmp.data(), + &lwork, + rwork.data(), + &info); + + if (info != 0) { + throw legate::TaskException(GeevTask::ERROR_MESSAGE); + } + + a += batch_stride_ev; + ew += batch_stride_ew; + if (compute_evs) { + ev += batch_stride_ev; + } + } + } +}; + +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cupynumeric/matrix/geev_omp.cc b/src/cupynumeric/matrix/geev_omp.cc new file mode 100644 index 0000000000..e9dccc0cda --- /dev/null +++ b/src/cupynumeric/matrix/geev_omp.cc @@ -0,0 +1,31 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cupynumeric/matrix/geev.h" +#include "cupynumeric/matrix/geev_template.inl" +#include "cupynumeric/matrix/geev_cpu.inl" + +#include + +namespace cupynumeric { + +/*static*/ void GeevTask::omp_variant(TaskContext context) +{ + openblas_set_num_threads(omp_get_max_threads()); + geev_template(context); +} + +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cupynumeric/matrix/geev_template.inl b/src/cupynumeric/matrix/geev_template.inl new file mode 100644 index 0000000000..d11122d340 --- /dev/null +++ b/src/cupynumeric/matrix/geev_template.inl @@ -0,0 +1,194 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include + +// Useful for IDEs +#include "cupynumeric/matrix/geev.h" + +namespace cupynumeric { + +using namespace legate; + +template +struct GeevImplBody; + +template +struct support_geev : std::false_type {}; +template <> +struct support_geev : std::true_type {}; +template <> +struct support_geev : std::true_type {}; +template <> +struct support_geev : std::true_type {}; +template <> +struct support_geev : std::true_type {}; + +template +struct complex_type { + using TYPE = complex; +}; +template <> +struct complex_type { + using TYPE = complex; +}; +template <> +struct complex_type { + using TYPE = complex; +}; + +template +struct GeevImpl { + template ::value && DIM >= 2>* = nullptr> + void operator()(TaskContext& context) const + { + using VAL = type_of; + using VAL_COMPLEX = typename complex_type::TYPE; + + legate::PhysicalStore a_array = context.input(0); + legate::PhysicalStore ew_array = context.output(0); + +#ifdef DEBUG_CUPYNUMERIC + assert(a_array.dim() >= 2); + assert(a_array.dim() == DIM); + assert(ew_array.dim() == DIM - 1); +#endif + const auto a_shape = a_array.shape(); + const auto ew_shape = ew_array.shape(); + + if (a_shape.empty()) { + return; + } + + int64_t batchsize_total = 1; + std::vector batchdims; + for (auto i = 0; i < DIM - 2; ++i) { + batchdims.push_back(a_shape.hi[i] - a_shape.lo[i] + 1); + batchsize_total *= batchdims.back(); + } + + const int64_t m = a_shape.hi[DIM - 1] - a_shape.lo[DIM - 1] + 1; + +#ifdef DEBUG_CUPYNUMERIC + assert(m > 0); + assert(batchsize_total > 0); + assert(a_shape.hi[DIM - 2] - a_shape.lo[DIM - 2] + 1 == m); + assert(ew_shape.hi[DIM - 2] - ew_shape.lo[DIM - 2] + 1 == m); + for (auto i = 0; i < batchdims.size(); ++i) { + assert(ew_shape.hi[i] - ew_shape.lo[i] + 1 == batchdims[i]); + } +#endif + size_t a_strides[DIM]; + size_t ew_strides[DIM - 1]; + size_t ev_strides[DIM]; + + auto* a_acc = a_array.read_accessor(a_shape).ptr(a_shape, a_strides); + auto* ew_acc = + 
ew_array.write_accessor(ew_shape).ptr(ew_shape, ew_strides); + VAL_COMPLEX* ev_acc = nullptr; + + // optional computation of eigenvectors + bool compute_evs = context.outputs().size() > 1; + if (compute_evs) { + legate::PhysicalStore ev_array = context.output(1); +#ifdef DEBUG_CUPYNUMERIC + assert(ev_array.dim() == DIM); +#endif + const auto ev_shape = ev_array.shape(); +#ifdef DEBUG_CUPYNUMERIC + assert(ev_shape.hi[DIM - 2] - ev_shape.lo[DIM - 2] + 1 == m); + assert(ev_shape.hi[DIM - 1] - ev_shape.lo[DIM - 1] + 1 == m); + for (auto i = 0; i < batchdims.size(); ++i) { + assert(ev_shape.hi[i] - ev_shape.lo[i] + 1 == batchdims[i]); + } +#endif + ev_acc = ev_array.write_accessor(ev_shape).ptr(ev_shape, ev_strides); + } + + // Find the outer most batch dimension on which we can iterate with constant batch stride. + // Then loop over remaining 'outer' batches + // Example: + // a-shape = (1, 4, 2, 7, 1, M, M) + // => inner_batch_dim=3, inner_batch_size=7 + // => outer_batch_size=8 + + // 1. find batch dimension to perform computation with constant stride + int64_t inner_batch_dim = -1; + int64_t inner_batch_size = 1; + int64_t inner_batch_stride_ev = m * m; + int64_t inner_batch_stride_ew = m; + for (int i = batchdims.size() - 1; i >= 0; --i) { + if (batchdims[i] > 1) { + inner_batch_dim = i; + inner_batch_size = batchdims[i]; + inner_batch_stride_ev = a_strides[i]; + inner_batch_stride_ew = ew_strides[i]; + break; + } + } + + const int64_t outer_batch_size = batchsize_total / inner_batch_size; + + // 2. 
loop over prod(dims(0..idx-1)), need to update offsets every start + for (int64_t batch_idx = 0; batch_idx < outer_batch_size; ++batch_idx) { + // duplicate pointers to data + auto a_acc_cur = a_acc; + auto ew_acc_cur = ew_acc; + auto ev_acc_cur = ev_acc; + + // apply offsets for pointers / assuming row wise batch order + int64_t remainder_idx = batch_idx; + for (int i = inner_batch_dim - 1; i >= 0; i--) { + int64_t dim_position = remainder_idx % batchdims[i]; + a_acc_cur += a_strides[i] * dim_position; + ew_acc_cur += ew_strides[i] * dim_position; + if (compute_evs) { + ev_acc_cur += ev_strides[i] * dim_position; + } + remainder_idx /= batchdims[i]; + } + + GeevImplBody()(m, + inner_batch_size, + inner_batch_stride_ew, + inner_batch_stride_ev, + a_acc_cur, + ew_acc_cur, + ev_acc_cur); + } + } + + template ::value || DIM<2>* = nullptr> void + operator()(TaskContext& context) const + { + assert(false); + } +}; + +template +static void geev_template(TaskContext& context) +{ + auto a_array = context.input(0); + double_dispatch(a_array.dim(), a_array.type().code(), GeevImpl{}, context); +} + +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/gemm.cc b/src/cupynumeric/matrix/gemm.cc similarity index 92% rename from src/cunumeric/matrix/gemm.cc rename to src/cupynumeric/matrix/gemm.cc index e420d3dfd1..a707d78c97 100644 --- a/src/cunumeric/matrix/gemm.cc +++ b/src/cupynumeric/matrix/gemm.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/gemm.h" -#include "cunumeric/matrix/gemm_template.inl" +#include "cupynumeric/matrix/gemm.h" +#include "cupynumeric/matrix/gemm_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -107,7 +107,10 @@ struct GemmImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { GemmTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + GemmTask::register_variants(); + return 0; +}(); } // namespace -} // namespace 
cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/gemm.cu b/src/cupynumeric/matrix/gemm.cu similarity index 93% rename from src/cunumeric/matrix/gemm.cu rename to src/cupynumeric/matrix/gemm.cu index 03502e6fc3..cce3c342e7 100644 --- a/src/cunumeric/matrix/gemm.cu +++ b/src/cupynumeric/matrix/gemm.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/gemm.h" -#include "cunumeric/matrix/gemm_template.inl" +#include "cupynumeric/matrix/gemm.h" +#include "cupynumeric/matrix/gemm_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -39,7 +39,7 @@ static inline void gemm_template( CHECK_CUBLAS(gemm(context, transa, transb, m, n, k, &alpha, rhs1, m, rhs2, n, &beta, lhs, m)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } template @@ -58,7 +58,7 @@ static inline void complex_gemm_template( CHECK_CUBLAS(gemm(context, transa, transb, m, n, k, &alpha, rhs1, m, rhs2, n, &beta, lhs, m)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } template <> @@ -117,4 +117,4 @@ struct GemmImplBody { gemm_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/gemm.h b/src/cupynumeric/matrix/gemm.h similarity index 78% rename from src/cunumeric/matrix/gemm.h rename to src/cupynumeric/matrix/gemm.h index c0c7a53b74..30e6e2798b 100644 --- a/src/cunumeric/matrix/gemm.h +++ b/src/cupynumeric/matrix/gemm.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class GemmTask : public CuNumericTask { +class GemmTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_GEMM}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_GEMM}}; public: static void 
cpu_variant(legate::TaskContext context); @@ -34,4 +34,4 @@ class GemmTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/gemm_omp.cc b/src/cupynumeric/matrix/gemm_omp.cc similarity index 95% rename from src/cunumeric/matrix/gemm_omp.cc rename to src/cupynumeric/matrix/gemm_omp.cc index 80f21e1639..45cefdfd31 100644 --- a/src/cunumeric/matrix/gemm_omp.cc +++ b/src/cupynumeric/matrix/gemm_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/gemm.h" -#include "cunumeric/matrix/gemm_template.inl" +#include "cupynumeric/matrix/gemm.h" +#include "cupynumeric/matrix/gemm_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -104,4 +104,4 @@ struct GemmImplBody { gemm_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/gemm_template.inl b/src/cupynumeric/matrix/gemm_template.inl similarity index 97% rename from src/cunumeric/matrix/gemm_template.inl rename to src/cupynumeric/matrix/gemm_template.inl index c3ffa0967d..e96334eefe 100644 --- a/src/cunumeric/matrix/gemm_template.inl +++ b/src/cupynumeric/matrix/gemm_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/gemm.h" +#include "cupynumeric/matrix/gemm.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -93,4 +93,4 @@ static void gemm_template(TaskContext& context) type_dispatch(lhs.type().code(), GemmImpl{}, lhs, rhs1, rhs2); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul.cc b/src/cupynumeric/matrix/matmul.cc similarity index 76% rename from src/cunumeric/matrix/matmul.cc rename to src/cupynumeric/matrix/matmul.cc index 9ab62c5727..d8cc635825 100644 --- a/src/cunumeric/matrix/matmul.cc +++ b/src/cupynumeric/matrix/matmul.cc @@ -14,16 +14,16 @@ * */ -#include "cunumeric/matrix/matmul.h" -#include 
"cunumeric/matrix/matmul_template.inl" -#include "cunumeric/matrix/matmul_cpu.inl" +#include "cupynumeric/matrix/matmul.h" +#include "cupynumeric/matrix/matmul_template.inl" +#include "cupynumeric/matrix/matmul_cpu.inl" #include #if LEGATE_DEFINED(LEGATE_USE_OPENMP) #include #endif -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -37,7 +37,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { MatMulTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + MatMulTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul.cu b/src/cupynumeric/matrix/matmul.cu similarity index 95% rename from src/cunumeric/matrix/matmul.cu rename to src/cupynumeric/matrix/matmul.cu index e11c51909f..00a52d6a12 100644 --- a/src/cunumeric/matrix/matmul.cu +++ b/src/cupynumeric/matrix/matmul.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/matmul.h" -#include "cunumeric/matrix/matmul_template.inl" +#include "cupynumeric/matrix/matmul.h" +#include "cupynumeric/matrix/matmul_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { // NOTE: // cuBLAS doesn't support row-major, so reverse the matrix order so it thinks things are @@ -68,7 +68,7 @@ struct MatMulImplBody { CUDA_R_32F, lhs_stride)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -109,7 +109,7 @@ struct MatMulImplBody { lhs, lhs_stride)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -153,7 +153,7 @@ struct MatMulImplBody { CUDA_R_32F, lhs_stride)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -201,7 +201,7 @@ struct MatMulImplBody { CUDA_C_32F, lhs_stride)); - 
CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -246,7 +246,7 @@ struct MatMulImplBody { lhs, lhs_stride)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -255,4 +255,4 @@ struct MatMulImplBody { matmul_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/matrix/matmul.h b/src/cupynumeric/matrix/matmul.h new file mode 100644 index 0000000000..695515e8c3 --- /dev/null +++ b/src/cupynumeric/matrix/matmul.h @@ -0,0 +1,48 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +struct MatMulArgs { + legate::PhysicalStore lhs; + legate::PhysicalStore rhs1; + legate::PhysicalStore rhs2; +}; + +class MatMulTask : public CuPyNumericTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MATMUL}}; + + // Only the CPU implementation needs temporary allocations due to lack of float16 support + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + + public: + static void cpu_variant(legate::TaskContext context); +#if LEGATE_DEFINED(LEGATE_USE_OPENMP) + static void omp_variant(legate::TaskContext context); +#endif +#if LEGATE_DEFINED(LEGATE_USE_CUDA) + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul_cpu.inl b/src/cupynumeric/matrix/matmul_cpu.inl similarity index 97% rename from src/cunumeric/matrix/matmul_cpu.inl rename to src/cupynumeric/matrix/matmul_cpu.inl index 575e0c6e9f..4bfb039ee6 100644 --- a/src/cunumeric/matrix/matmul_cpu.inl +++ b/src/cupynumeric/matrix/matmul_cpu.inl @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/matrix/matmul.h" -#include "cunumeric/matrix/matmul_template.inl" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/matmul.h" +#include "cupynumeric/matrix/matmul_template.inl" +#include "cupynumeric/matrix/util.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -215,4 +215,4 @@ struct MatMulImplBody { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul_omp.cc b/src/cupynumeric/matrix/matmul_omp.cc similarity index 81% rename from src/cunumeric/matrix/matmul_omp.cc rename to src/cupynumeric/matrix/matmul_omp.cc 
index 3e9bdfbcb7..ceebbb0074 100644 --- a/src/cunumeric/matrix/matmul_omp.cc +++ b/src/cupynumeric/matrix/matmul_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/matmul.h" -#include "cunumeric/matrix/matmul_template.inl" -#include "cunumeric/matrix/matmul_cpu.inl" +#include "cupynumeric/matrix/matmul.h" +#include "cupynumeric/matrix/matmul_template.inl" +#include "cupynumeric/matrix/matmul_cpu.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -30,4 +30,4 @@ using namespace legate; matmul_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul_template.inl b/src/cupynumeric/matrix/matmul_template.inl similarity index 95% rename from src/cunumeric/matrix/matmul_template.inl rename to src/cupynumeric/matrix/matmul_template.inl index 2b041f923c..adffe16928 100644 --- a/src/cunumeric/matrix/matmul_template.inl +++ b/src/cupynumeric/matrix/matmul_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/matmul.h" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/matmul.h" +#include "cupynumeric/matrix/util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -70,7 +70,7 @@ struct MatMulImpl { const auto n = shape_lhs.hi[1] - shape_lhs.lo[1] + 1; const auto k = shape_rhs1.hi[1] - shape_rhs1.lo[1] + 1; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(m == shape_rhs1.hi[0] - shape_rhs1.lo[0] + 1); assert(k == shape_rhs2.hi[0] - shape_rhs2.lo[0] + 1); assert(n == shape_rhs2.hi[1] - shape_rhs2.lo[1] + 1); @@ -84,7 +84,7 @@ struct MatMulImpl { auto rhs2 = args.rhs2.read_accessor(shape_rhs2).ptr(shape_rhs2, strides_rhs2); auto lhs = args.lhs.read_write_accessor(shape_lhs).ptr(shape_lhs, strides_lhs); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(strides_rhs1[0] == 1 || strides_rhs1[1] == 1); assert(strides_rhs2[0] == 1 || strides_rhs2[1] == 1); assert(strides_lhs[1] == 1); @@ 
-128,4 +128,4 @@ static void matmul_template(TaskContext& context) type_dispatch(args.rhs1.code(), MatMulImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matvecmul.cc b/src/cupynumeric/matrix/matvecmul.cc similarity index 79% rename from src/cunumeric/matrix/matvecmul.cc rename to src/cupynumeric/matrix/matvecmul.cc index 214624489b..29bd3c0acf 100644 --- a/src/cunumeric/matrix/matvecmul.cc +++ b/src/cupynumeric/matrix/matvecmul.cc @@ -14,16 +14,16 @@ * */ -#include "cunumeric/matrix/matvecmul.h" -#include "cunumeric/matrix/matvecmul_template.inl" -#include "cunumeric/matrix/matvecmul_cpu.inl" +#include "cupynumeric/matrix/matvecmul.h" +#include "cupynumeric/matrix/matvecmul_template.inl" +#include "cupynumeric/matrix/matvecmul_cpu.inl" #include #if LEGATE_DEFINED(LEGATE_USE_OPENMP) #include #endif -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -37,10 +37,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { MatVecMulTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matvecmul.cu b/src/cupynumeric/matrix/matvecmul.cu similarity index 95% rename from src/cunumeric/matrix/matvecmul.cu rename to src/cupynumeric/matrix/matvecmul.cu index d7ce65a721..f85b350826 100644 --- a/src/cunumeric/matrix/matvecmul.cu +++ b/src/cupynumeric/matrix/matvecmul.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/matvecmul.h" -#include "cunumeric/matrix/matvecmul_template.inl" +#include "cupynumeric/matrix/matvecmul.h" +#include "cupynumeric/matrix/matvecmul_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template <> struct MatVecMulImplBody { @@ -71,7 +71,7 @@ struct MatVecMulImplBody { transpose_mat ? 
n : m)); } - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -119,7 +119,7 @@ struct MatVecMulImplBody { transpose_mat ? n : m)); } - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -161,7 +161,7 @@ struct MatVecMulImplBody { CUDA_R_32F, transpose_mat ? n : m)); - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -216,7 +216,7 @@ struct MatVecMulImplBody { transpose_mat ? n : m)); } - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -268,7 +268,7 @@ struct MatVecMulImplBody { transpose_mat ? n : m)); } - CUNUMERIC_CHECK_CUDA_STREAM(task_stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(task_stream); } }; @@ -277,4 +277,4 @@ struct MatVecMulImplBody { matvecmul_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matvecmul.h b/src/cupynumeric/matrix/matvecmul.h similarity index 64% rename from src/cunumeric/matrix/matvecmul.h rename to src/cupynumeric/matrix/matvecmul.h index c012ab1910..607070a200 100644 --- a/src/cunumeric/matrix/matvecmul.h +++ b/src/cupynumeric/matrix/matvecmul.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct MatVecMulArgs { legate::PhysicalStore lhs; @@ -26,9 +26,14 @@ struct MatVecMulArgs { legate::PhysicalStore rhs2; }; -class MatVecMulTask : public CuNumericTask { +class MatVecMulTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_MATVECMUL}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MATVECMUL}}; + + // Only the CPU implementation needs temporary allocations due to lack of float16 support + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + 
static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +45,4 @@ class MatVecMulTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matvecmul_cpu.inl b/src/cupynumeric/matrix/matvecmul_cpu.inl similarity index 96% rename from src/cunumeric/matrix/matvecmul_cpu.inl rename to src/cupynumeric/matrix/matvecmul_cpu.inl index 5879d93701..977292433b 100644 --- a/src/cunumeric/matrix/matvecmul_cpu.inl +++ b/src/cupynumeric/matrix/matvecmul_cpu.inl @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/matrix/matvecmul.h" -#include "cunumeric/matrix/matvecmul_template.inl" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/matvecmul.h" +#include "cupynumeric/matrix/matvecmul_template.inl" +#include "cupynumeric/matrix/util.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -130,4 +130,4 @@ struct MatVecMulImplBody { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matvecmul_omp.cc b/src/cupynumeric/matrix/matvecmul_omp.cc similarity index 81% rename from src/cunumeric/matrix/matvecmul_omp.cc rename to src/cupynumeric/matrix/matvecmul_omp.cc index ff332d95ca..7470ea6d1f 100644 --- a/src/cunumeric/matrix/matvecmul_omp.cc +++ b/src/cupynumeric/matrix/matvecmul_omp.cc @@ -14,14 +14,14 @@ * */ -#include "cunumeric/matrix/matvecmul.h" -#include "cunumeric/matrix/matvecmul_template.inl" -#include "cunumeric/matrix/matvecmul_cpu.inl" +#include "cupynumeric/matrix/matvecmul.h" +#include "cupynumeric/matrix/matvecmul_template.inl" +#include "cupynumeric/matrix/matvecmul_cpu.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -31,4 +31,4 @@ using namespace legate; matvecmul_template(context); } -} // namespace cunumeric +} // namespace 
cupynumeric diff --git a/src/cunumeric/matrix/matvecmul_template.inl b/src/cupynumeric/matrix/matvecmul_template.inl similarity index 95% rename from src/cunumeric/matrix/matvecmul_template.inl rename to src/cupynumeric/matrix/matvecmul_template.inl index 819b3da809..f8583a87f1 100644 --- a/src/cunumeric/matrix/matvecmul_template.inl +++ b/src/cupynumeric/matrix/matvecmul_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/matvecmul.h" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/matvecmul.h" +#include "cupynumeric/matrix/util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -81,7 +81,7 @@ struct MatVecMulImpl { size_t lhs_strides[2]; auto lhs = args.lhs.reduce_accessor, true, 2>().ptr(shape, lhs_strides); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(vec_strides[0] == 0 && vec_strides[1] == 1); assert(lhs_strides[0] == 1 && lhs_strides[1] == 0); #endif @@ -109,4 +109,4 @@ static void matvecmul_template(TaskContext& context) type_dispatch(args.rhs1.code(), MatVecMulImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/mp_potrf.cu b/src/cupynumeric/matrix/mp_potrf.cu similarity index 93% rename from src/cunumeric/matrix/mp_potrf.cu rename to src/cupynumeric/matrix/mp_potrf.cu index aa55691033..980bcce4e0 100644 --- a/src/cunumeric/matrix/mp_potrf.cu +++ b/src/cupynumeric/matrix/mp_potrf.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/mp_potrf.h" -#include "cunumeric/matrix/mp_potrf_template.inl" +#include "cupynumeric/matrix/mp_potrf.h" +#include "cupynumeric/matrix/mp_potrf_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -77,7 +77,7 @@ static inline void mp_potrf_template( // TODO: We need a deferred exception to avoid this synchronization CHECK_CAL(cal_stream_sync(comm, 
stream)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(desc)); CHECK_CUSOLVER(cusolverMpDestroyGrid(grid)); @@ -140,7 +140,10 @@ struct MpPotrfImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { MpPotrfTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + MpPotrfTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/mp_potrf.h b/src/cupynumeric/matrix/mp_potrf.h similarity index 74% rename from src/cunumeric/matrix/mp_potrf.h rename to src/cupynumeric/matrix/mp_potrf.h index 0c03021ecc..2ccedcef79 100644 --- a/src/cunumeric/matrix/mp_potrf.h +++ b/src/cupynumeric/matrix/mp_potrf.h @@ -16,13 +16,14 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class MpPotrfTask : public CuNumericTask { +class MpPotrfTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_MP_POTRF}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MP_POTRF}}; public: #if LEGATE_DEFINED(LEGATE_USE_CUDA) @@ -30,4 +31,4 @@ class MpPotrfTask : public CuNumericTask { #endif }; -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/mp_potrf_template.inl b/src/cupynumeric/matrix/mp_potrf_template.inl similarity index 96% rename from src/cunumeric/matrix/mp_potrf_template.inl rename to src/cupynumeric/matrix/mp_potrf_template.inl index 521dfa2c56..1c2133d761 100644 --- a/src/cunumeric/matrix/mp_potrf_template.inl +++ b/src/cupynumeric/matrix/mp_potrf_template.inl @@ -21,13 +21,13 @@ #include "legate/comm/coll.h" // Useful for 
IDEs -#include "cunumeric/matrix/mp_potrf.h" -#include "cunumeric/cuda_help.h" -#include "cunumeric/utilities/repartition.h" +#include "cupynumeric/matrix/mp_potrf.h" +#include "cupynumeric/cuda_help.h" +#include "cupynumeric/utilities/repartition.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -164,4 +164,4 @@ static void mp_potrf_template(TaskContext& context) context.get_launch_domain()); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/mp_solve.cu b/src/cupynumeric/matrix/mp_solve.cu similarity index 96% rename from src/cunumeric/matrix/mp_solve.cu rename to src/cupynumeric/matrix/mp_solve.cu index 0e2670c897..6c3ac2f8c7 100644 --- a/src/cunumeric/matrix/mp_solve.cu +++ b/src/cupynumeric/matrix/mp_solve.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/mp_solve.h" -#include "cunumeric/matrix/mp_solve_template.inl" +#include "cupynumeric/matrix/mp_solve.h" +#include "cupynumeric/matrix/mp_solve_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -136,7 +136,7 @@ static inline void mp_solve_template(cal_comm_t comm, // TODO: We need a deferred exception to avoid this synchronization CHECK_CAL(cal_stream_sync(comm, stream)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(a_desc)); CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(b_desc)); @@ -241,7 +241,10 @@ struct MpSolveImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { MpSolveTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + MpSolveTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git 
a/src/cunumeric/matrix/mp_solve.h b/src/cupynumeric/matrix/mp_solve.h similarity index 74% rename from src/cunumeric/matrix/mp_solve.h rename to src/cupynumeric/matrix/mp_solve.h index e329f95e87..d858682cb1 100644 --- a/src/cunumeric/matrix/mp_solve.h +++ b/src/cupynumeric/matrix/mp_solve.h @@ -16,13 +16,14 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class MpSolveTask : public CuNumericTask { +class MpSolveTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_MP_SOLVE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_MP_SOLVE}}; public: #if LEGATE_DEFINED(LEGATE_USE_CUDA) @@ -30,4 +31,4 @@ class MpSolveTask : public CuNumericTask { #endif }; -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/mp_solve_template.inl b/src/cupynumeric/matrix/mp_solve_template.inl similarity index 97% rename from src/cunumeric/matrix/mp_solve_template.inl rename to src/cupynumeric/matrix/mp_solve_template.inl index b2e0a6739c..573e90db32 100644 --- a/src/cunumeric/matrix/mp_solve_template.inl +++ b/src/cupynumeric/matrix/mp_solve_template.inl @@ -21,13 +21,13 @@ #include "legate/comm/coll.h" // Useful for IDEs -#include "cunumeric/matrix/mp_solve.h" -#include "cunumeric/cuda_help.h" -#include "cunumeric/utilities/repartition.h" +#include "cupynumeric/matrix/mp_solve.h" +#include "cupynumeric/cuda_help.h" +#include "cupynumeric/utilities/repartition.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -192,4 +192,4 @@ static void mp_solve_template(TaskContext& context) context.get_launch_domain()); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/potrf.cc b/src/cupynumeric/matrix/potrf.cc similarity 
index 91% rename from src/cunumeric/matrix/potrf.cc rename to src/cupynumeric/matrix/potrf.cc index bd2f7af5ee..6b724ff76d 100644 --- a/src/cunumeric/matrix/potrf.cc +++ b/src/cupynumeric/matrix/potrf.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/potrf.h" -#include "cunumeric/matrix/potrf_template.inl" +#include "cupynumeric/matrix/potrf.h" +#include "cupynumeric/matrix/potrf_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -86,7 +86,10 @@ void PotrfImplBody::operator()(complex namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { PotrfTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + PotrfTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/potrf.cu b/src/cupynumeric/matrix/potrf.cu similarity index 91% rename from src/cunumeric/matrix/potrf.cu rename to src/cupynumeric/matrix/potrf.cu index bdae5cd1c2..d8a6016c84 100644 --- a/src/cunumeric/matrix/potrf.cu +++ b/src/cupynumeric/matrix/potrf.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/potrf.h" -#include "cunumeric/matrix/potrf_template.inl" +#include "cupynumeric/matrix/potrf.h" +#include "cupynumeric/matrix/potrf_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -42,8 +42,8 @@ static inline void potrf_template( CHECK_CUSOLVER(potrf(context, uplo, n, array, m, buffer.ptr(0), bufferSize, info.ptr(0))); // TODO: We need a deferred exception to avoid this synchronization - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); if (info[0] != 0) { throw legate::TaskException("Matrix is not positive definite"); @@ -89,4 +89,4 @@ void 
PotrfImplBody::operator()(complex potrf_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/matmul.h b/src/cupynumeric/matrix/potrf.h similarity index 72% rename from src/cunumeric/matrix/matmul.h rename to src/cupynumeric/matrix/potrf.h index 0420d8045e..3a8891f6e5 100644 --- a/src/cunumeric/matrix/matmul.h +++ b/src/cupynumeric/matrix/potrf.h @@ -16,19 +16,15 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -struct MatMulArgs { - legate::PhysicalStore lhs; - legate::PhysicalStore rhs1; - legate::PhysicalStore rhs2; -}; - -class MatMulTask : public CuNumericTask { +class PotrfTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_MATMUL}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_POTRF}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +36,4 @@ class MatMulTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/potrf_omp.cc b/src/cupynumeric/matrix/potrf_omp.cc similarity index 95% rename from src/cunumeric/matrix/potrf_omp.cc rename to src/cupynumeric/matrix/potrf_omp.cc index e91d55429a..6031e50f04 100644 --- a/src/cunumeric/matrix/potrf_omp.cc +++ b/src/cupynumeric/matrix/potrf_omp.cc @@ -14,14 +14,14 @@ * */ -#include "cunumeric/matrix/potrf.h" -#include "cunumeric/matrix/potrf_template.inl" +#include "cupynumeric/matrix/potrf.h" +#include "cupynumeric/matrix/potrf_template.inl" #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -83,4 +83,4 @@ void PotrfImplBody::operator()(complex potrf_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff 
--git a/src/cunumeric/matrix/potrf_template.inl b/src/cupynumeric/matrix/potrf_template.inl similarity index 96% rename from src/cunumeric/matrix/potrf_template.inl rename to src/cupynumeric/matrix/potrf_template.inl index b8fc45d703..4728e47861 100644 --- a/src/cunumeric/matrix/potrf_template.inl +++ b/src/cupynumeric/matrix/potrf_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/potrf.h" +#include "cupynumeric/matrix/potrf.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -94,4 +94,4 @@ static void potrf_template(TaskContext& context) type_dispatch(array.type().code(), PotrfImpl{}, array); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr.cc b/src/cupynumeric/matrix/qr.cc similarity index 77% rename from src/cunumeric/matrix/qr.cc rename to src/cupynumeric/matrix/qr.cc index 9d5881ab68..4c31141713 100644 --- a/src/cunumeric/matrix/qr.cc +++ b/src/cupynumeric/matrix/qr.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/matrix/qr.h" -#include "cunumeric/matrix/qr_template.inl" -#include "cunumeric/matrix/qr_cpu.inl" +#include "cupynumeric/matrix/qr.h" +#include "cupynumeric/matrix/qr_template.inl" +#include "cupynumeric/matrix/qr_cpu.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -34,7 +34,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { QrTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + QrTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr.cu b/src/cupynumeric/matrix/qr.cu similarity index 90% rename from src/cunumeric/matrix/qr.cu rename to src/cupynumeric/matrix/qr.cu index 7505565351..2b0f9c7d7e 100644 --- a/src/cunumeric/matrix/qr.cu +++ b/src/cupynumeric/matrix/qr.cu @@ -14,12 +14,12 @@ * */ -#include 
"cunumeric/matrix/qr.h" -#include "cunumeric/matrix/qr_template.inl" +#include "cupynumeric/matrix/qr.h" +#include "cupynumeric/matrix/qr_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -52,9 +52,9 @@ static inline void qr_template(GeqrfBufferSize geqrf_buffer_size, q_tmp = q_copy.ptr(0); } - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(q_tmp, a, sizeof(VAL) * m * n, cudaMemcpyDeviceToDevice, stream)); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); CHECK_CUSOLVER(cusolverDnSetStream(handle, stream)); @@ -71,27 +71,27 @@ static inline void qr_template(GeqrfBufferSize geqrf_buffer_size, CHECK_CUSOLVER( geqrf(handle, m, n, q_tmp, m, tau.ptr(0), buffer.ptr(0), lwork_total, info.ptr(0))); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); if (info[0] != 0) { throw legate::TaskException(QrTask::ERROR_MESSAGE); } // extract R from upper triangular of geqrf result - CUNUMERIC_CHECK_CUDA(cudaMemsetAsync(r, 0, k * n * sizeof(VAL), stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemsetAsync(r, 0, k * n * sizeof(VAL), stream)); for (int i = 0; i < k; ++i) { int elements = i + 1; if (i == k - 1 && n > k) { elements = k * (n - k + 1); } - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( r + i * k, q_tmp + i * m, sizeof(VAL) * elements, cudaMemcpyDeviceToDevice, stream)); } // assemble Q CHECK_CUSOLVER( orgqr(handle, m, k, k, q_tmp, m, tau.ptr(0), buffer.ptr(0), lwork_total, info.ptr(0))); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); if (info[0] != 0) { throw legate::TaskException(QrTask::ERROR_MESSAGE); @@ -100,13 +100,13 @@ static inline void qr_template(GeqrfBufferSize geqrf_buffer_size, // if we used a tmp storage we still need 
to copy back Q if (q_tmp != q) { assert(n > m); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(q, q_tmp, sizeof(VAL) * m * m, cudaMemcpyDeviceToDevice, stream)); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(info[0] == 0); #endif } @@ -187,4 +187,4 @@ struct QrImplBody { /*static*/ void QrTask::gpu_variant(TaskContext context) { qr_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr.h b/src/cupynumeric/matrix/qr.h similarity index 63% rename from src/cunumeric/matrix/qr.h rename to src/cupynumeric/matrix/qr.h index 9a29199ddb..f43865e2f8 100644 --- a/src/cunumeric/matrix/qr.h +++ b/src/cupynumeric/matrix/qr.h @@ -16,15 +16,19 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class QrTask : public CuNumericTask { +class QrTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_QR}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_QR}}; static const char* ERROR_MESSAGE; + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + public: static void cpu_variant(legate::TaskContext context); #if LEGATE_DEFINED(LEGATE_USE_OPENMP) @@ -35,4 +39,4 @@ class QrTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr_cpu.inl b/src/cupynumeric/matrix/qr_cpu.inl similarity index 98% rename from src/cunumeric/matrix/qr_cpu.inl rename to src/cupynumeric/matrix/qr_cpu.inl index ef143a57a0..56c2979ed6 100644 --- 
a/src/cunumeric/matrix/qr_cpu.inl +++ b/src/cupynumeric/matrix/qr_cpu.inl @@ -20,7 +20,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -128,4 +128,4 @@ struct QrImplBody { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr_omp.cc b/src/cupynumeric/matrix/qr_omp.cc similarity index 81% rename from src/cunumeric/matrix/qr_omp.cc rename to src/cupynumeric/matrix/qr_omp.cc index c28c4c496b..a954689b2d 100644 --- a/src/cunumeric/matrix/qr_omp.cc +++ b/src/cupynumeric/matrix/qr_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/qr.h" -#include "cunumeric/matrix/qr_template.inl" -#include "cunumeric/matrix/qr_cpu.inl" +#include "cupynumeric/matrix/qr.h" +#include "cupynumeric/matrix/qr_template.inl" +#include "cupynumeric/matrix/qr_cpu.inl" #include -namespace cunumeric { +namespace cupynumeric { /*static*/ void QrTask::omp_variant(TaskContext context) { @@ -28,4 +28,4 @@ namespace cunumeric { qr_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/qr_template.inl b/src/cupynumeric/matrix/qr_template.inl similarity index 94% rename from src/cunumeric/matrix/qr_template.inl rename to src/cupynumeric/matrix/qr_template.inl index 2040b555b6..0fa17e40d6 100644 --- a/src/cunumeric/matrix/qr_template.inl +++ b/src/cupynumeric/matrix/qr_template.inl @@ -19,9 +19,9 @@ #include // Useful for IDEs -#include "cunumeric/matrix/qr.h" +#include "cupynumeric/matrix/qr.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -48,7 +48,7 @@ struct QrImpl { { using VAL = type_of; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_array.dim() == 2); assert(q_array.dim() == 2); assert(r_array.dim() == 2); @@ -61,7 +61,7 @@ struct QrImpl { const int64_t n = a_shape.hi[1] - a_shape.lo[1] + 1; const int64_t k = std::min(m, n); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(q_shape.hi[0] - 
q_shape.lo[0] + 1 == m); assert(q_shape.hi[1] - q_shape.lo[1] + 1 == k); assert(r_shape.hi[0] - r_shape.lo[0] + 1 == k); @@ -71,7 +71,7 @@ struct QrImpl { auto a_acc = a_array.read_accessor(a_shape); auto q_acc = q_array.write_accessor(q_shape); auto r_acc = r_array.write_accessor(r_shape); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_acc.accessor.is_dense_col_major(a_shape)); assert(q_acc.accessor.is_dense_col_major(q_shape)); assert(r_acc.accessor.is_dense_col_major(r_shape)); @@ -99,4 +99,4 @@ static void qr_template(TaskContext& context) type_dispatch(a_array.type().code(), QrImpl{}, a_array, q_array, r_array); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/solve.cc b/src/cupynumeric/matrix/solve.cc similarity index 76% rename from src/cunumeric/matrix/solve.cc rename to src/cupynumeric/matrix/solve.cc index c86de37283..28c8b7e03c 100644 --- a/src/cunumeric/matrix/solve.cc +++ b/src/cupynumeric/matrix/solve.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/matrix/solve.h" -#include "cunumeric/matrix/solve_template.inl" -#include "cunumeric/matrix/solve_cpu.inl" +#include "cupynumeric/matrix/solve.h" +#include "cupynumeric/matrix/solve_template.inl" +#include "cupynumeric/matrix/solve_cpu.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -34,7 +34,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { SolveTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + SolveTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/solve.cu b/src/cupynumeric/matrix/solve.cu similarity index 92% rename from src/cunumeric/matrix/solve.cu rename to src/cupynumeric/matrix/solve.cu index 58e24baee3..147395e13a 100644 --- a/src/cunumeric/matrix/solve.cu +++ b/src/cupynumeric/matrix/solve.cu @@ -14,12 +14,12 @@ 
* */ -#include "cunumeric/matrix/solve.h" -#include "cunumeric/matrix/solve_template.inl" +#include "cupynumeric/matrix/solve.h" +#include "cupynumeric/matrix/solve_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -47,7 +47,7 @@ static inline void solve_template(GetrfBufferSize getrf_buffer_size, auto info = create_buffer(1, Memory::Kind::Z_COPY_MEM); CHECK_CUSOLVER(getrf(handle, m, n, a, m, buffer.ptr(0), ipiv.ptr(0), info.ptr(0))); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); if (info[0] != 0) { throw legate::TaskException(SolveTask::ERROR_MESSAGE); @@ -55,9 +55,9 @@ static inline void solve_template(GetrfBufferSize getrf_buffer_size, CHECK_CUSOLVER(getrs(handle, trans, n, nrhs, a, m, ipiv.ptr(0), b, n, info.ptr(0))); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(info[0] == 0); #endif } @@ -115,4 +115,4 @@ struct SolveImplBody { solve_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/matrix/solve.h b/src/cupynumeric/matrix/solve.h new file mode 100644 index 0000000000..8f6a76e50f --- /dev/null +++ b/src/cupynumeric/matrix/solve.h @@ -0,0 +1,42 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +class SolveTask : public CuPyNumericTask { + public: + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SOLVE}}; + static const char* ERROR_MESSAGE; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + + public: + static void cpu_variant(legate::TaskContext context); +#if LEGATE_DEFINED(LEGATE_USE_OPENMP) + static void omp_variant(legate::TaskContext context); +#endif +#if LEGATE_DEFINED(LEGATE_USE_CUDA) + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/solve_cpu.inl b/src/cupynumeric/matrix/solve_cpu.inl similarity index 97% rename from src/cunumeric/matrix/solve_cpu.inl rename to src/cupynumeric/matrix/solve_cpu.inl index 367bd346bc..4d38569ada 100644 --- a/src/cunumeric/matrix/solve_cpu.inl +++ b/src/cupynumeric/matrix/solve_cpu.inl @@ -19,7 +19,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -89,4 +89,4 @@ struct SolveImplBody { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/solve_omp.cc b/src/cupynumeric/matrix/solve_omp.cc similarity index 81% rename from src/cunumeric/matrix/solve_omp.cc rename to src/cupynumeric/matrix/solve_omp.cc index 223715011b..a709da5a9d 100644 --- a/src/cunumeric/matrix/solve_omp.cc +++ b/src/cupynumeric/matrix/solve_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/solve.h" -#include "cunumeric/matrix/solve_template.inl" -#include "cunumeric/matrix/solve_cpu.inl" +#include "cupynumeric/matrix/solve.h" +#include "cupynumeric/matrix/solve_template.inl" +#include 
"cupynumeric/matrix/solve_cpu.inl" #include -namespace cunumeric { +namespace cupynumeric { /*static*/ void SolveTask::omp_variant(TaskContext context) { @@ -28,4 +28,4 @@ namespace cunumeric { solve_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/solve_template.inl b/src/cupynumeric/matrix/solve_template.inl similarity index 92% rename from src/cunumeric/matrix/solve_template.inl rename to src/cupynumeric/matrix/solve_template.inl index 8ccd85e578..c04ceb4da7 100644 --- a/src/cunumeric/matrix/solve_template.inl +++ b/src/cupynumeric/matrix/solve_template.inl @@ -19,9 +19,9 @@ #include // Useful for IDEs -#include "cunumeric/matrix/solve.h" +#include "cupynumeric/matrix/solve.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -46,7 +46,7 @@ struct SolveImpl { { using VAL = type_of; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_array.dim() == 2); assert(b_array.dim() == 1 || b_array.dim() == 2); #endif @@ -55,14 +55,14 @@ struct SolveImpl { const int64_t m = a_shape.hi[0] - a_shape.lo[0] + 1; const int64_t n = a_shape.hi[1] - a_shape.lo[1] + 1; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC // The Python code guarantees this property assert(m == n); #endif size_t a_strides[2]; VAL* a = a_array.read_write_accessor(a_shape).ptr(a_shape, a_strides); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_array.is_future() || (a_strides[0] == 1 && static_cast(a_strides[1]) == m)); #endif VAL* b = nullptr; @@ -70,25 +70,25 @@ struct SolveImpl { int64_t nrhs = 1; if (b_array.dim() == 1) { const auto b_shape = b_array.shape<1>(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(m == b_shape.hi[0] - b_shape.lo[0] + 1); #endif size_t b_strides; b = b_array.read_write_accessor(b_shape).ptr(b_shape, &b_strides); } else { const auto b_shape = b_array.shape<2>(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(m == b_shape.hi[0] - b_shape.lo[0] + 1); #endif 
nrhs = b_shape.hi[1] - b_shape.lo[1] + 1; size_t b_strides[2]; b = b_array.read_write_accessor(b_shape).ptr(b_shape, b_strides); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(b_array.is_future() || (b_strides[0] == 1 && static_cast(b_strides[1]) == m)); #endif } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(m > 0 && n > 0 && nrhs > 0); #endif @@ -110,4 +110,4 @@ static void solve_template(TaskContext& context) type_dispatch(a_array.type().code(), SolveImpl{}, a_array, b_array); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/svd.cc b/src/cupynumeric/matrix/svd.cc similarity index 77% rename from src/cunumeric/matrix/svd.cc rename to src/cupynumeric/matrix/svd.cc index 97d49d694f..f87019d033 100644 --- a/src/cunumeric/matrix/svd.cc +++ b/src/cupynumeric/matrix/svd.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/matrix/svd.h" -#include "cunumeric/matrix/svd_template.inl" -#include "cunumeric/matrix/svd_cpu.inl" +#include "cupynumeric/matrix/svd.h" +#include "cupynumeric/matrix/svd_template.inl" +#include "cupynumeric/matrix/svd_cpu.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -34,7 +34,10 @@ using namespace legate; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { SvdTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + SvdTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/svd.cu b/src/cupynumeric/matrix/svd.cu similarity index 95% rename from src/cunumeric/matrix/svd.cu rename to src/cupynumeric/matrix/svd.cu index 8df7cdac05..932c499070 100644 --- a/src/cunumeric/matrix/svd.cu +++ b/src/cupynumeric/matrix/svd.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/svd.h" -#include "cunumeric/matrix/svd_template.inl" +#include 
"cupynumeric/matrix/svd.h" +#include "cupynumeric/matrix/svd_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -39,7 +39,7 @@ static inline void svd_template(DataType valTypeC, auto stream = get_cached_stream(); auto a_copy = create_buffer(m * n, Memory::Kind::GPU_FB_MEM); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(a_copy.ptr(0), a, m * n * sizeof(VAL), cudaMemcpyDeviceToDevice, stream)); // a[m][n], u[m][m] s[k] vh[n][n] @@ -95,15 +95,15 @@ static inline void svd_template(DataType valTypeC, lwork_host, info.ptr(0))); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); if (info[0] != 0) { throw legate::TaskException(SvdTask::ERROR_MESSAGE); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(info[0] == 0); #endif } @@ -191,4 +191,4 @@ struct SvdImplBody { svd_template(context); } -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/svd.h b/src/cupynumeric/matrix/svd.h similarity index 63% rename from src/cunumeric/matrix/svd.h rename to src/cupynumeric/matrix/svd.h index bd71ba2e49..758ef371cc 100644 --- a/src/cunumeric/matrix/svd.h +++ b/src/cupynumeric/matrix/svd.h @@ -16,15 +16,19 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class SvdTask : public CuNumericTask { +class SvdTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SVD}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SVD}}; static const char* ERROR_MESSAGE; + static constexpr auto CPU_VARIANT_OPTIONS = 
legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + public: static void cpu_variant(legate::TaskContext context); #if LEGATE_DEFINED(LEGATE_USE_OPENMP) @@ -35,4 +39,4 @@ class SvdTask : public CuNumericTask { #endif }; -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/svd_cpu.inl b/src/cupynumeric/matrix/svd_cpu.inl similarity index 99% rename from src/cunumeric/matrix/svd_cpu.inl rename to src/cupynumeric/matrix/svd_cpu.inl index bb49aa2f65..8f43b79ce7 100644 --- a/src/cunumeric/matrix/svd_cpu.inl +++ b/src/cupynumeric/matrix/svd_cpu.inl @@ -20,7 +20,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -254,4 +254,4 @@ struct SvdImplBody { } }; -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/svd_omp.cc b/src/cupynumeric/matrix/svd_omp.cc similarity index 81% rename from src/cunumeric/matrix/svd_omp.cc rename to src/cupynumeric/matrix/svd_omp.cc index fac2cf1b1b..36fc1ad209 100644 --- a/src/cunumeric/matrix/svd_omp.cc +++ b/src/cupynumeric/matrix/svd_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/svd.h" -#include "cunumeric/matrix/svd_template.inl" -#include "cunumeric/matrix/svd_cpu.inl" +#include "cupynumeric/matrix/svd.h" +#include "cupynumeric/matrix/svd_template.inl" +#include "cupynumeric/matrix/svd_cpu.inl" #include -namespace cunumeric { +namespace cupynumeric { /*static*/ void SvdTask::omp_variant(TaskContext context) { @@ -28,4 +28,4 @@ namespace cunumeric { svd_template(context); } -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/matrix/svd_template.inl 
b/src/cupynumeric/matrix/svd_template.inl similarity index 96% rename from src/cunumeric/matrix/svd_template.inl rename to src/cupynumeric/matrix/svd_template.inl index c529ed40e3..a43f0ef3c6 100644 --- a/src/cunumeric/matrix/svd_template.inl +++ b/src/cupynumeric/matrix/svd_template.inl @@ -19,9 +19,9 @@ #include // Useful for IDEs -#include "cunumeric/matrix/svd.h" +#include "cupynumeric/matrix/svd.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -63,7 +63,7 @@ struct SvdImpl { using VAL = type_of; using VAL_REAL = typename real_type::TYPE; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_array.dim() == 2); assert(u_array.dim() == 2); assert(s_array.dim() == 1); @@ -81,7 +81,7 @@ struct SvdImpl { assert(m >= n); bool full_matrices = (u_shape.hi[1] - u_shape.lo[1] + 1 == m); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(u_shape.hi[0] - u_shape.lo[0] + 1 == m); if (full_matrices) { assert(u_shape.hi[1] - u_shape.lo[1] + 1 == m); @@ -97,7 +97,7 @@ struct SvdImpl { auto u_acc = u_array.write_accessor(u_shape); auto s_acc = s_array.write_accessor(s_shape); auto vh_acc = vh_array.write_accessor(vh_shape); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(a_acc.accessor.is_dense_col_major(a_shape)); assert(u_acc.accessor.is_dense_col_major(u_shape)); assert(vh_acc.accessor.is_dense_col_major(vh_shape)); @@ -134,4 +134,4 @@ static void svd_template(TaskContext& context) type_dispatch(a_array.type().code(), SvdImpl{}, a_array, u_array, s_array, vh_array); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/syrk.cc b/src/cupynumeric/matrix/syrk.cc similarity index 91% rename from src/cunumeric/matrix/syrk.cc rename to src/cupynumeric/matrix/syrk.cc index 2154cba597..1f646d7fe0 100644 --- a/src/cunumeric/matrix/syrk.cc +++ b/src/cupynumeric/matrix/syrk.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/syrk.h" -#include "cunumeric/matrix/syrk_template.inl" +#include 
"cupynumeric/matrix/syrk.h" +#include "cupynumeric/matrix/syrk_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -85,7 +85,10 @@ struct SyrkImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { SyrkTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + SyrkTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/syrk.cu b/src/cupynumeric/matrix/syrk.cu similarity index 92% rename from src/cunumeric/matrix/syrk.cu rename to src/cupynumeric/matrix/syrk.cu index 032d86c31a..4be72374e8 100644 --- a/src/cunumeric/matrix/syrk.cu +++ b/src/cupynumeric/matrix/syrk.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/syrk.h" -#include "cunumeric/matrix/syrk_template.inl" +#include "cupynumeric/matrix/syrk.h" +#include "cupynumeric/matrix/syrk_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -38,7 +38,7 @@ static inline void syrk_template( CHECK_CUBLAS(syrk(context, uplo, trans, m, n, &alpha, rhs, m, &beta, lhs, m)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } template <> @@ -86,4 +86,4 @@ struct SyrkImplBody { syrk_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/syrk.h b/src/cupynumeric/matrix/syrk.h similarity index 78% rename from src/cunumeric/matrix/syrk.h rename to src/cupynumeric/matrix/syrk.h index 0ef23b0a58..5074c62d3c 100644 --- a/src/cunumeric/matrix/syrk.h +++ b/src/cupynumeric/matrix/syrk.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class SyrkTask : public CuNumericTask { +class SyrkTask : public CuPyNumericTask { 
public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SYRK}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SYRK}}; public: static void cpu_variant(legate::TaskContext context); @@ -34,4 +34,4 @@ class SyrkTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/syrk_omp.cc b/src/cupynumeric/matrix/syrk_omp.cc similarity index 94% rename from src/cunumeric/matrix/syrk_omp.cc rename to src/cupynumeric/matrix/syrk_omp.cc index 5146390236..5b25d7707c 100644 --- a/src/cunumeric/matrix/syrk_omp.cc +++ b/src/cupynumeric/matrix/syrk_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/syrk.h" -#include "cunumeric/matrix/syrk_template.inl" +#include "cupynumeric/matrix/syrk.h" +#include "cupynumeric/matrix/syrk_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -82,4 +82,4 @@ struct SyrkImplBody { syrk_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/syrk_template.inl b/src/cupynumeric/matrix/syrk_template.inl similarity index 96% rename from src/cunumeric/matrix/syrk_template.inl rename to src/cupynumeric/matrix/syrk_template.inl index 58307e05e3..72aaa949c8 100644 --- a/src/cunumeric/matrix/syrk_template.inl +++ b/src/cupynumeric/matrix/syrk_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/syrk.h" +#include "cupynumeric/matrix/syrk.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -80,4 +80,4 @@ static void syrk_template(TaskContext& context) type_dispatch(lhs.type().code(), SyrkImpl{}, lhs, rhs); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/tile.cc b/src/cupynumeric/matrix/tile.cc similarity index 85% rename from src/cunumeric/matrix/tile.cc rename to src/cupynumeric/matrix/tile.cc index 
123bd9387f..ec6d92bf21 100644 --- a/src/cunumeric/matrix/tile.cc +++ b/src/cupynumeric/matrix/tile.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/tile.h" -#include "cunumeric/matrix/tile_template.inl" +#include "cupynumeric/matrix/tile.h" +#include "cupynumeric/matrix/tile_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -45,7 +45,10 @@ struct TileImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { TileTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + TileTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/tile.cu b/src/cupynumeric/matrix/tile.cu similarity index 90% rename from src/cunumeric/matrix/tile.cu rename to src/cupynumeric/matrix/tile.cu index 5750f54359..24f46b419d 100644 --- a/src/cunumeric/matrix/tile.cu +++ b/src/cupynumeric/matrix/tile.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/tile.h" -#include "cunumeric/matrix/tile_template.inl" +#include "cupynumeric/matrix/tile.h" +#include "cupynumeric/matrix/tile_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -53,7 +53,7 @@ struct TileImplBody { auto stream = get_cached_stream(); tile_kernel<<>>( out_rect, out_pitches, out_volume, in_strides, out, in); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -62,4 +62,4 @@ struct TileImplBody { tile_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/tile.h b/src/cupynumeric/matrix/tile.h similarity index 79% rename from src/cunumeric/matrix/tile.h rename to src/cupynumeric/matrix/tile.h index eeca4d858d..266ecdfe2f 100644 --- a/src/cunumeric/matrix/tile.h +++ 
b/src/cupynumeric/matrix/tile.h @@ -16,18 +16,18 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct TileArgs { legate::PhysicalStore in; legate::PhysicalStore out; }; -class TileTask : public CuNumericTask { +class TileTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_TILE}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_TILE}}; public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +39,4 @@ class TileTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/tile_omp.cc b/src/cupynumeric/matrix/tile_omp.cc similarity index 91% rename from src/cunumeric/matrix/tile_omp.cc rename to src/cupynumeric/matrix/tile_omp.cc index 72f5610eeb..3d1553dc75 100644 --- a/src/cunumeric/matrix/tile_omp.cc +++ b/src/cupynumeric/matrix/tile_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/tile.h" -#include "cunumeric/matrix/tile_template.inl" +#include "cupynumeric/matrix/tile.h" +#include "cupynumeric/matrix/tile_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -44,4 +44,4 @@ struct TileImplBody { tile_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/tile_template.inl b/src/cupynumeric/matrix/tile_template.inl similarity index 95% rename from src/cunumeric/matrix/tile_template.inl rename to src/cupynumeric/matrix/tile_template.inl index 423ece1cf7..4c5c305617 100644 --- a/src/cunumeric/matrix/tile_template.inl +++ b/src/cupynumeric/matrix/tile_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/tile.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/matrix/tile.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { 
using namespace legate; @@ -85,4 +85,4 @@ static void tile_template(TaskContext& context) type_dispatch(args.in.code(), TileDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/transpose.cc b/src/cupynumeric/matrix/transpose.cc similarity index 89% rename from src/cunumeric/matrix/transpose.cc rename to src/cupynumeric/matrix/transpose.cc index 22bdf46d2f..f38f6e5f87 100644 --- a/src/cunumeric/matrix/transpose.cc +++ b/src/cupynumeric/matrix/transpose.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/matrix/transpose.h" -#include "cunumeric/matrix/transpose_template.inl" +#include "cupynumeric/matrix/transpose.h" +#include "cupynumeric/matrix/transpose_template.inl" #if LEGATE_DEFINED(LEGATE_USE_OPENMP) #include "omp.h" #endif #include "cblas.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -59,10 +59,10 @@ struct TransposeImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto reg = []() -> char { TransposeTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/transpose.cu b/src/cupynumeric/matrix/transpose.cu similarity index 93% rename from src/cunumeric/matrix/transpose.cu rename to src/cupynumeric/matrix/transpose.cu index 9528792d83..67212a05d2 100644 --- a/src/cunumeric/matrix/transpose.cu +++ b/src/cupynumeric/matrix/transpose.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/transpose.h" -#include "cunumeric/matrix/transpose_template.inl" +#include "cupynumeric/matrix/transpose.h" +#include "cupynumeric/matrix/transpose_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { #define TILE_DIM 32 #define BLOCK_ROWS 8 @@ -99,7 +99,7 @@ struct TransposeImplBody { auto stream = get_cached_stream(); transpose_2d_physical<<>>(out, in, rect.lo, 
rect.hi); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -108,4 +108,4 @@ struct TransposeImplBody { transpose_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/transpose.h b/src/cupynumeric/matrix/transpose.h similarity index 78% rename from src/cunumeric/matrix/transpose.h rename to src/cupynumeric/matrix/transpose.h index d87339e542..172e5815aa 100644 --- a/src/cunumeric/matrix/transpose.h +++ b/src/cupynumeric/matrix/transpose.h @@ -16,18 +16,19 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct TransposeArgs { legate::PhysicalStore out; legate::PhysicalStore in; }; -class TransposeTask : public CuNumericTask { +class TransposeTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_TRANSPOSE_COPY_2D}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_TRANSPOSE_COPY_2D}}; public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +40,4 @@ class TransposeTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/transpose_omp.cc b/src/cupynumeric/matrix/transpose_omp.cc similarity index 91% rename from src/cunumeric/matrix/transpose_omp.cc rename to src/cupynumeric/matrix/transpose_omp.cc index ea4db6abe4..1b439ae9ae 100644 --- a/src/cunumeric/matrix/transpose_omp.cc +++ b/src/cupynumeric/matrix/transpose_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/transpose.h" -#include "cunumeric/matrix/transpose_template.inl" +#include "cupynumeric/matrix/transpose.h" +#include "cupynumeric/matrix/transpose_template.inl" #include "omp.h" #include "cblas.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -54,4 +54,4 @@ struct TransposeImplBody { 
transpose_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/transpose_template.inl b/src/cupynumeric/matrix/transpose_template.inl similarity index 93% rename from src/cunumeric/matrix/transpose_template.inl rename to src/cupynumeric/matrix/transpose_template.inl index 9e1935471c..3a32efe913 100644 --- a/src/cunumeric/matrix/transpose_template.inl +++ b/src/cupynumeric/matrix/transpose_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/transpose.h" +#include "cupynumeric/matrix/transpose.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -55,4 +55,4 @@ static void transpose_template(TaskContext& context) type_dispatch(input.type().code(), TransposeImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trilu.cc b/src/cupynumeric/matrix/trilu.cc similarity index 87% rename from src/cunumeric/matrix/trilu.cc rename to src/cupynumeric/matrix/trilu.cc index ee694e2695..9c41b45193 100644 --- a/src/cunumeric/matrix/trilu.cc +++ b/src/cupynumeric/matrix/trilu.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/trilu.h" -#include "cunumeric/matrix/trilu_template.inl" +#include "cupynumeric/matrix/trilu.h" +#include "cupynumeric/matrix/trilu_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -62,7 +62,10 @@ struct TriluImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { TriluTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + TriluTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trilu.cu b/src/cupynumeric/matrix/trilu.cu similarity index 90% rename from src/cunumeric/matrix/trilu.cu rename to src/cupynumeric/matrix/trilu.cu index 0c4a24f97c..616581a6ea 100644 --- 
a/src/cunumeric/matrix/trilu.cu +++ b/src/cupynumeric/matrix/trilu.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/trilu.h" -#include "cunumeric/matrix/trilu_template.inl" +#include "cupynumeric/matrix/trilu.h" +#include "cupynumeric/matrix/trilu_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -70,7 +70,7 @@ struct TriluImplBody { auto stream = get_cached_stream(); trilu_kernel <<>>(out, in, pitches, lo, volume, k); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -79,4 +79,4 @@ struct TriluImplBody { trilu_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trilu.h b/src/cupynumeric/matrix/trilu.h similarity index 80% rename from src/cunumeric/matrix/trilu.h rename to src/cupynumeric/matrix/trilu.h index f53a0864fe..6364992157 100644 --- a/src/cunumeric/matrix/trilu.h +++ b/src/cupynumeric/matrix/trilu.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct TriluArgs { bool lower; @@ -27,9 +27,9 @@ struct TriluArgs { legate::PhysicalStore input; }; -class TriluTask : public CuNumericTask { +class TriluTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_TRILU}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_TRILU}}; public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +41,4 @@ class TriluTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trilu_omp.cc b/src/cupynumeric/matrix/trilu_omp.cc similarity index 92% rename from src/cunumeric/matrix/trilu_omp.cc rename to src/cupynumeric/matrix/trilu_omp.cc index 3a5e1cc1e4..8b61824687 100644 --- 
a/src/cunumeric/matrix/trilu_omp.cc +++ b/src/cupynumeric/matrix/trilu_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/matrix/trilu.h" -#include "cunumeric/matrix/trilu_template.inl" +#include "cupynumeric/matrix/trilu.h" +#include "cupynumeric/matrix/trilu_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -61,4 +61,4 @@ struct TriluImplBody { trilu_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trilu_template.inl b/src/cupynumeric/matrix/trilu_template.inl similarity index 95% rename from src/cunumeric/matrix/trilu_template.inl rename to src/cupynumeric/matrix/trilu_template.inl index 5be34c2e7d..f1c51255c8 100644 --- a/src/cunumeric/matrix/trilu_template.inl +++ b/src/cupynumeric/matrix/trilu_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/trilu.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/matrix/trilu.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -81,4 +81,4 @@ static void trilu_template(TaskContext& context) double_dispatch(args.output.dim(), args.output.type().code(), TriluImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trsm.cc b/src/cupynumeric/matrix/trsm.cc similarity index 91% rename from src/cunumeric/matrix/trsm.cc rename to src/cupynumeric/matrix/trsm.cc index 75d50d3138..74cc26ed40 100644 --- a/src/cunumeric/matrix/trsm.cc +++ b/src/cupynumeric/matrix/trsm.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/matrix/trsm.h" -#include "cunumeric/matrix/trsm_template.inl" +#include "cupynumeric/matrix/trsm.h" +#include "cupynumeric/matrix/trsm_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -96,7 +96,10 @@ struct TrsmImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { 
TrsmTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + TrsmTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trsm.cu b/src/cupynumeric/matrix/trsm.cu similarity index 92% rename from src/cunumeric/matrix/trsm.cu rename to src/cupynumeric/matrix/trsm.cu index 7f77ad3711..adddfc5b33 100644 --- a/src/cunumeric/matrix/trsm.cu +++ b/src/cupynumeric/matrix/trsm.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/matrix/trsm.h" -#include "cunumeric/matrix/trsm_template.inl" +#include "cupynumeric/matrix/trsm.h" +#include "cupynumeric/matrix/trsm_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -39,7 +39,7 @@ static inline void trsm_template( CHECK_CUBLAS(trsm(context, side, uplo, transa, diag, m, n, &alpha, rhs, n, lhs, m)); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } template <> @@ -85,4 +85,4 @@ struct TrsmImplBody { trsm_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trsm.h b/src/cupynumeric/matrix/trsm.h similarity index 78% rename from src/cunumeric/matrix/trsm.h rename to src/cupynumeric/matrix/trsm.h index 658a29494a..ca3fa55b77 100644 --- a/src/cunumeric/matrix/trsm.h +++ b/src/cupynumeric/matrix/trsm.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class TrsmTask : public CuNumericTask { +class TrsmTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_TRSM}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_TRSM}}; public: static void cpu_variant(legate::TaskContext context); @@ -34,4 +34,4 @@ class TrsmTask : public 
CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trsm_omp.cc b/src/cupynumeric/matrix/trsm_omp.cc similarity index 95% rename from src/cunumeric/matrix/trsm_omp.cc rename to src/cupynumeric/matrix/trsm_omp.cc index 039e1a59d4..f743a6e6bc 100644 --- a/src/cunumeric/matrix/trsm_omp.cc +++ b/src/cupynumeric/matrix/trsm_omp.cc @@ -14,14 +14,14 @@ * */ -#include "cunumeric/matrix/trsm.h" -#include "cunumeric/matrix/trsm_template.inl" +#include "cupynumeric/matrix/trsm.h" +#include "cupynumeric/matrix/trsm_template.inl" #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -93,4 +93,4 @@ struct TrsmImplBody { trsm_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/trsm_template.inl b/src/cupynumeric/matrix/trsm_template.inl similarity index 96% rename from src/cunumeric/matrix/trsm_template.inl rename to src/cupynumeric/matrix/trsm_template.inl index 83ec265936..32006bf486 100644 --- a/src/cunumeric/matrix/trsm_template.inl +++ b/src/cupynumeric/matrix/trsm_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/matrix/trsm.h" +#include "cupynumeric/matrix/trsm.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -80,4 +80,4 @@ static void trsm_template(TaskContext& context) type_dispatch(lhs.type().code(), TrsmImpl{}, lhs, rhs); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/util.cc b/src/cupynumeric/matrix/util.cc similarity index 96% rename from src/cunumeric/matrix/util.cc rename to src/cupynumeric/matrix/util.cc index e1678c040a..066a493833 100644 --- a/src/cunumeric/matrix/util.cc +++ b/src/cupynumeric/matrix/util.cc @@ -15,7 +15,7 @@ */ #include "legate/data/buffer.h" -#include "cunumeric/matrix/util.h" +#include "cupynumeric/matrix/util.h" #include "legate/utilities/macros.h" #include 
"legate_defines.h" @@ -23,7 +23,7 @@ #include #endif -namespace cunumeric { +namespace cupynumeric { size_t stride_for_blas(size_t m, size_t n, size_t x_stride, size_t y_stride, bool& transpose) { @@ -31,14 +31,14 @@ size_t stride_for_blas(size_t m, size_t n, size_t x_stride, size_t y_stride, boo if (n == 1) { // Column matrix: Every row has exactly 1 element, therefore it is trivially contiguous. Any // stride between rows is acceptable. -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(x_stride >= 1); #endif blas_stride = x_stride; transpose = false; } else if (m == 1) { // Row matrix -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(y_stride >= 1); #endif if (y_stride == 1) { @@ -56,7 +56,7 @@ size_t stride_for_blas(size_t m, size_t n, size_t x_stride, size_t y_stride, boo // General case: One dimension needs to be contiguous. If that's not the last dimension, then // the matrix represents the transpose of a row-major nxm matrix. We then tell the BLAS library // that we are passing a row-major nxm matrix, and ask for the matrix to be transposed. 
-#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert((x_stride == 1 && y_stride > 1) || (y_stride == 1 && x_stride > 1)); #endif blas_stride = std::max(x_stride, y_stride); @@ -159,4 +159,4 @@ void float_tensor_to_half( } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/matrix/util.h b/src/cupynumeric/matrix/util.h similarity index 97% rename from src/cunumeric/matrix/util.h rename to src/cupynumeric/matrix/util.h index 91b9797510..a9f4a13b35 100644 --- a/src/cunumeric/matrix/util.h +++ b/src/cupynumeric/matrix/util.h @@ -18,7 +18,7 @@ #include "mathtypes/half.h" -namespace cunumeric { +namespace cupynumeric { size_t stride_for_blas(size_t m, size_t n, size_t x_stride, size_t y_stride, bool& transpose); @@ -55,4 +55,4 @@ void half_tensor_to_float( void float_tensor_to_half( __half* out, const float* in, size_t ndim, const int64_t* shape, const int64_t* out_strides); -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ndarray.cc b/src/cupynumeric/ndarray.cc similarity index 69% rename from src/cunumeric/ndarray.cc rename to src/cupynumeric/ndarray.cc index f43a9ef7a6..10f0d017a3 100644 --- a/src/cunumeric/ndarray.cc +++ b/src/cupynumeric/ndarray.cc @@ -14,19 +14,19 @@ * */ -#include "cunumeric/ndarray.h" +#include "cupynumeric/ndarray.h" #include #include -#include "cunumeric/binary/binary_op_util.h" -#include "cunumeric/operators.h" -#include "cunumeric/random/rand_util.h" -#include "cunumeric/runtime.h" -#include "cunumeric/unary/convert_util.h" -#include "cunumeric/unary/unary_op_util.h" -#include "cunumeric/unary/unary_red_util.h" +#include "cupynumeric/binary/binary_op_util.h" +#include "cupynumeric/operators.h" +#include "cupynumeric/random/rand_util.h" +#include "cupynumeric/runtime.h" +#include "cupynumeric/unary/convert_util.h" +#include "cupynumeric/unary/unary_op_util.h" +#include "cupynumeric/unary/unary_red_util.h" -namespace cunumeric { +namespace cupynumeric { // 
========================================================================================== @@ -45,7 +45,7 @@ struct generate_zero_fn { struct check_nonzero_scalar_fn { template - bool operator()(cunumeric::NDArray array) + bool operator()(cupynumeric::NDArray array) { assert(array.dim() == 0); using VAL = legate::type_of; @@ -83,7 +83,7 @@ struct generate_identity_fn { Scalar operator()(const legate::Type& type) { auto value = UnaryRedOp::OP::identity; - auto argred_type = CuNumericRuntime::get_runtime()->get_argred_type(type); + auto argred_type = CuPyNumericRuntime::get_runtime()->get_argred_type(type); return Scalar(value, argred_type); } @@ -165,7 +165,7 @@ NDArray NDArray::operator+(const NDArray& other) const { return add(*this, other NDArray NDArray::operator+(const legate::Scalar& other) const { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto scalar = runtime->create_scalar_store(other); return operator+(NDArray(std::move(scalar))); } @@ -180,7 +180,7 @@ NDArray NDArray::operator*(const NDArray& other) const { return multiply(*this, NDArray NDArray::operator*(const legate::Scalar& other) const { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto scalar = runtime->create_scalar_store(other); return operator*(NDArray(std::move(scalar))); } @@ -191,6 +191,15 @@ NDArray& NDArray::operator*=(const NDArray& other) return *this; } +NDArray NDArray::operator/(const NDArray& other) const { return divide(*this, other); } + +NDArray NDArray::operator/(const legate::Scalar& other) const +{ + auto runtime = CuPyNumericRuntime::get_runtime(); + auto scalar = runtime->create_scalar_store(other); + return operator/(NDArray(std::move(scalar))); +} + NDArray NDArray::operator[](std::initializer_list slices) const { if (slices.size() > static_cast(dim())) { @@ -202,7 +211,7 @@ NDArray NDArray::operator[](std::initializer_list slices) const uint32_t dim = 0; auto 
sliced = store_; for (const auto& sl : slices) { - sliced = sliced.slice(0, sl); + sliced = sliced.slice(dim, sl); ++dim; } @@ -218,7 +227,7 @@ void NDArray::assign(const NDArray& other) void NDArray::assign(const legate::Scalar& other) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto scalar = runtime->create_scalar_store(other); assign(NDArray(std::move(scalar))); } @@ -229,9 +238,9 @@ void NDArray::random(int32_t gen_code) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_RAND); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_RAND); task.add_output(store_); task.add_scalar_arg(legate::Scalar(static_cast(RandGenCode::UNIFORM))); @@ -248,7 +257,7 @@ void NDArray::fill(const Scalar& value) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (!store_.transformed()) { legate::Runtime::get_runtime()->issue_fill(store_, value); @@ -257,7 +266,7 @@ void NDArray::fill(const Scalar& value) auto fill_value = runtime->create_scalar_store(value); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_FILL); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_FILL); task.add_output(store_); task.add_input(fill_value); @@ -277,8 +286,8 @@ void NDArray::_fill(legate::LogicalStore const& value) return; } - auto runtime = CuNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_FILL); + auto runtime = CuPyNumericRuntime::get_runtime(); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_FILL); task.add_output(store_); task.add_input(value); task.add_scalar_arg(Scalar(false)); @@ -296,9 +305,9 @@ void NDArray::eye(int32_t k) auto zero = legate::type_dispatch(type().code(), generate_zero_fn{}); fill(zero); - auto runtime = 
CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_EYE); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_EYE); task.add_input(store_); task.add_output(store_); @@ -315,7 +324,7 @@ void NDArray::bincount(NDArray rhs, std::optional weights /*=std::nullo assert(dim() == 1); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (weights.has_value()) { assert(rhs.shape() == weights.value().shape()); @@ -324,7 +333,7 @@ void NDArray::bincount(NDArray rhs, std::optional weights /*=std::nullo auto zero = legate::type_dispatch(type().code(), generate_zero_fn{}); fill(zero); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_BINCOUNT); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_BINCOUNT); legate::ReductionOpKind redop = legate::ReductionOpKind::ADD; auto p_lhs = task.add_reduction(store_, redop); @@ -340,8 +349,8 @@ void NDArray::bincount(NDArray rhs, std::optional weights /*=std::nullo void NDArray::sort_task(NDArray rhs, bool argsort, bool stable) { - auto runtime = CuNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_SORT); + auto runtime = CuPyNumericRuntime::get_runtime(); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_SORT); auto p_rhs = task.add_input(rhs.store_); auto machine = legate::Runtime::get_runtime()->get_machine(); @@ -375,7 +384,7 @@ void NDArray::sort_swapped(NDArray rhs, bool argsort, int32_t sort_axis, bool st sort_axis = normalize_axis_index(sort_axis, rhs.dim()); auto swapped = rhs.swapaxes(sort_axis, rhs.dim() - 1); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto swapped_copy = runtime->create_array(swapped.shape(), swapped.type()); swapped_copy.assign(swapped); @@ -430,9 +439,9 @@ void NDArray::trilu(NDArray rhs, int32_t k, bool lower) 
return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_TRILU); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_TRILU); auto& out_shape = shape(); rhs = rhs.broadcast(out_shape, rhs.store_); @@ -448,6 +457,8 @@ void NDArray::trilu(NDArray rhs, int32_t k, bool lower) runtime->submit(std::move(task)); } +void NDArray::dot(NDArray rhs1, NDArray rhs2) { dot_MM(rhs1.get_store(), rhs2.get_store()); } + void NDArray::binary_op(int32_t op_code, NDArray rhs1, NDArray rhs2) { if (rhs1.type() != rhs2.type()) { @@ -458,9 +469,9 @@ void NDArray::binary_op(int32_t op_code, NDArray rhs1, NDArray rhs2) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_BINARY_OP); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_BINARY_OP); auto& out_shape = shape(); auto rhs1_store = broadcast(out_shape, rhs1.store_); @@ -483,7 +494,7 @@ void NDArray::binary_reduction(int32_t op_code, NDArray rhs1, NDArray rhs2) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto rhs1_store = broadcast(rhs1, rhs2); auto rhs2_store = broadcast(rhs2, rhs1); @@ -496,7 +507,7 @@ void NDArray::binary_reduction(int32_t op_code, NDArray rhs1, NDArray rhs2) redop = get_reduction_op(UnaryRedCode::PROD); fill(legate::Scalar(true)); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_BINARY_RED); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_BINARY_RED); task.add_reduction(store_, redop); auto p_rhs1 = task.add_input(rhs1_store); @@ -516,9 +527,9 @@ void NDArray::unary_op(int32_t op_code, return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = 
runtime->create_task(CuNumericOpCode::CUNUMERIC_UNARY_OP); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_UNARY_OP); auto rhs = broadcast(shape(), input.store_); @@ -541,13 +552,13 @@ void NDArray::unary_reduction(int32_t op_code_, NDArray input) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto op_code = static_cast(op_code_); fill(get_reduction_identity(op_code, type())); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_SCALAR_UNARY_RED); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_SCALAR_UNARY_RED); task.add_reduction(store_, get_reduction_op(op_code)); task.add_input(input.store_); @@ -560,51 +571,65 @@ void NDArray::unary_reduction(int32_t op_code_, NDArray input) uint64_t ceildiv(uint64_t a, uint64_t b) { return (a + b - 1) / b; } -void NDArray::dot(NDArray rhs1, NDArray rhs2) +void NDArray::dot_MM(const legate::LogicalStore& rhs1_store, const legate::LogicalStore& rhs2_store) { - if (size() == 0) { - return; - } - - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); + const auto num_procs = legate::Runtime::get_runtime()->get_machine().count(); + bool is_single_proc = num_procs == 1; fill(get_reduction_identity(UnaryRedCode::SUM, type())); - assert(dim() == 2 && rhs1.dim() == 2 && rhs2.dim() == 2); + assert(dim() == 2 && rhs1_store.dim() == 2 && rhs2_store.dim() == 2); + + auto m = rhs1_store.shape()[0]; + auto n = rhs2_store.shape()[1]; + auto k = rhs1_store.shape()[1]; + + static constexpr std::size_t MIN_MATRIX_SIZE = 1 << 20; - auto m = rhs1.shape()[0]; - auto n = rhs2.shape()[1]; - auto k = rhs1.shape()[1]; + auto get_color_shape = + [&](const std::vector& shape) -> std::vector { + if (!is_in_test_mode() && shape[0] * shape[1] <= MIN_MATRIX_SIZE) { + return {1, 1}; + } + auto color_shape = std::vector{num_procs, 1}; - // compute tilesize for lhs and batch_size for k - // TODO make 
generic - std::vector initial_tile_shape = {512, 512}; + while ((shape[0] / color_shape[0] < 2 * shape[1] / color_shape[1]) && color_shape[0] % 2 == 0) { + color_shape[0] /= 2; + color_shape[1] *= 2; + } - legate::tuple color_shape = {ceildiv(m, initial_tile_shape[0]), - ceildiv(n, initial_tile_shape[1])}; - std::vector tile_shape = {ceildiv(m, color_shape[0]), ceildiv(n, color_shape[1])}; + return color_shape; + }; auto get_batchsize = [&](const std::vector& tilesize, std::uint64_t k) { uint64_t typesize = legate::type_dispatch(type().code(), get_typesize_fn{}); // default corresponds to 128MB (to store A and B tile) - uint64_t max_elements_per_tile = cunumeric_matmul_cache_size() / typesize; + uint64_t max_elements_per_tile = cupynumeric_matmul_cache_size() / typesize; uint64_t total_elements_rhs = (tilesize[0] + tilesize[1]) * k; uint64_t num_batches = ceildiv(total_elements_rhs, max_elements_per_tile); uint64_t batch_size = ceildiv(k, num_batches); return batch_size; }; - std::uint64_t k_batch_size = get_batchsize(tile_shape, k); + + auto initial_color_shape = get_color_shape({m, n}); + auto tile_shape = std::vector{ceildiv(m, initial_color_shape[0]), + ceildiv(n, initial_color_shape[1])}; + auto color_shape = + legate::tuple{ceildiv(m, tile_shape[0]), ceildiv(n, tile_shape[1])}; + + std::uint64_t k_batch_size = is_single_proc ? 
k : get_batchsize(tile_shape, k); std::vector tile_shape_rhs1 = {tile_shape[0], k_batch_size}; std::vector tile_shape_rhs2 = {k_batch_size, tile_shape[1]}; auto color_k = ceildiv(k, k_batch_size); auto p_lhs = store_.partition_by_tiling(tile_shape); - auto p_rhs1 = rhs1.store_.partition_by_tiling(tile_shape_rhs1); - auto p_rhs2 = rhs2.store_.partition_by_tiling(tile_shape_rhs2); + auto p_rhs1 = rhs1_store.partition_by_tiling(tile_shape_rhs1); + auto p_rhs2 = rhs2_store.partition_by_tiling(tile_shape_rhs2); for (std::uint64_t i = 0; i < color_k; ++i) { - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_MATMUL, color_shape); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_MATMUL, color_shape); task.add_output(p_lhs); task.add_input(p_lhs); task.add_input(p_rhs1, legate::SymbolicPoint{legate::dimension(0), legate::constant(i)}); @@ -619,7 +644,7 @@ void NDArray::arange(Scalar start, Scalar stop, Scalar step) return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (start.type() != type() || stop.type() != type() || step.type() != type()) { throw std::invalid_argument("start/stop/step should have the same type as the array"); @@ -629,7 +654,7 @@ void NDArray::arange(Scalar start, Scalar stop, Scalar step) // TODO: Optimization when value is a scalar - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_ARANGE); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_ARANGE); task.add_output(store_); @@ -641,7 +666,7 @@ void NDArray::arange(Scalar start, Scalar stop, Scalar step) std::vector NDArray::nonzero() { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); std::vector outputs; auto ndim = dim(); @@ -649,7 +674,7 @@ std::vector NDArray::nonzero() outputs.emplace_back(runtime->create_array(legate::int64())); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_NONZERO); + auto task = 
runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_NONZERO); for (auto& output : outputs) { task.add_output(output.store_); @@ -670,18 +695,21 @@ NDArray NDArray::unique() auto machine = legate::Runtime::get_runtime()->get_machine(); bool has_gpus = machine.count(legate::mapping::TaskTarget::GPU) > 0; - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto result = runtime->create_array(type()); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_UNIQUE); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_UNIQUE); auto part_out = task.declare_partition(); auto part_in = task.declare_partition(); task.add_output(result.store_, part_out); task.add_input(store_, part_in); - task.add_communicator("nccl"); - if (!has_gpus) { + + if (has_gpus) { + task.add_communicator("nccl"); + } else { task.add_constraint(legate::broadcast(part_in, legate::from_range(0, dim()))); } + runtime->submit(std::move(task)); return result; } @@ -708,13 +736,13 @@ NDArray NDArray::swapaxes(int32_t axis1, int32_t axis2) std::swap(dims[axis1], dims[axis2]); auto transposed = store_.transpose(std::move(dims)); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); return runtime->create_array(std::move(transposed)); } -NDArray NDArray::as_type(const legate::Type& type) +NDArray NDArray::as_type(const legate::Type& type) const { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); // TODO: Check if conversion is valid @@ -734,9 +762,9 @@ void NDArray::create_window(int32_t op_code, int64_t M, std::vector args return; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_WINDOW); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_WINDOW); task.add_output(store_); 
task.add_scalar_arg(legate::Scalar(op_code)); @@ -751,9 +779,9 @@ void NDArray::create_window(int32_t op_code, int64_t M, std::vector args void NDArray::convolve(NDArray input, NDArray filter) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_CONVOLVE); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_CONVOLVE); auto p_filter = task.add_input(filter.store_); auto p_input = task.add_input(input.store_); @@ -761,6 +789,8 @@ void NDArray::convolve(NDArray input, NDArray filter) task.add_input(input.store_, p_halo); auto p_output = task.add_output(store_); task.add_scalar_arg(legate::Scalar(shape())); + task.add_scalar_arg( + legate::Scalar(static_cast(CuPyNumericConvolveMethod::CUPYNUMERIC_CONVOLVE_AUTO))); auto offsets = (filter.store_.extents() + 1) / 2; @@ -796,7 +826,7 @@ NDArray NDArray::transpose(std::vector axes) NDArray NDArray::argwhere() { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (dim() == 0) { auto not_zero = legate::type_dispatch(type().code(), check_nonzero_scalar_fn{}, *this); if (not_zero) { @@ -810,7 +840,7 @@ NDArray NDArray::argwhere() auto result = runtime->create_array(legate::int64(), 2); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_ARGWHERE); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_ARGWHERE); auto part_out = task.declare_partition(); auto part_in = task.declare_partition(); task.add_output(result.store_, part_out); @@ -824,7 +854,7 @@ NDArray NDArray::argwhere() NDArray NDArray::flip(std::optional> axis) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto result = runtime->create_array(shape(), type()); result.flip(*this, axis); @@ -846,8 +876,8 @@ void NDArray::flip(NDArray rhs, std::optional> axis) axes = normalize_axis_vector(axis.value(), dim()); } - 
auto runtime = CuNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_FLIP); + auto runtime = CuPyNumericRuntime::get_runtime(); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_FLIP); auto p_out = task.add_output(output); auto p_in = task.add_input(input); task.add_scalar_arg(legate::Scalar(axes)); @@ -857,9 +887,9 @@ void NDArray::flip(NDArray rhs, std::optional> axis) runtime->submit(std::move(task)); } -NDArray NDArray::all(std::optional> axis, +NDArray NDArray::all(std::vector axis, std::optional out, - std::optional keepdims, + bool keepdims, std::optional initial, std::optional where) { @@ -870,22 +900,53 @@ NDArray NDArray::all(std::optional> axis, legate::bool_(), out, keepdims, - std::nullopt, + {}, initial, where); } NDArray NDArray::_perform_unary_reduction(int32_t op, NDArray src, - std::optional> axis, + const std::vector& axis, std::optional dtype, std::optional res_dtype, std::optional out, - std::optional keepdims, - std::optional> args, + bool keepdims, + const std::vector& args, std::optional initial, std::optional where) { + if (src.size() == 0 && !initial.has_value()) { + if (static_cast(op) == UnaryRedCode::MAX || + static_cast(op) == UnaryRedCode::MIN) { + throw std::invalid_argument("Min/max reduction is not yet supported for empty arrays"); + } + } + + if (src.type() == legate::complex64() || src.type() == legate::complex128()) { + if (static_cast(op) == UnaryRedCode::MAX || + static_cast(op) == UnaryRedCode::MIN || + static_cast(op) == UnaryRedCode::ARGMAX || + static_cast(op) == UnaryRedCode::ARGMIN) { + throw std::runtime_error("(arg)max/min not supported for complex-type arrays"); + } + } + + if (where.has_value() && where.value().type() != legate::bool_()) { + throw std::invalid_argument("where array should be bool"); + } + + if ((dtype.has_value() && !dtype.value().is_primitive()) || + (res_dtype.has_value() && !res_dtype.value().is_primitive())) { + throw 
std::invalid_argument("dtype and res_dtype should be primitive type"); + } + + // Handle scalar array without any other inputs + if (src.dim() == 0 && !dtype.has_value() && !res_dtype.has_value() && !out.has_value() && + !initial.has_value() && !where.has_value()) { + return src; + } + if (res_dtype.has_value()) { assert(!dtype.has_value()); dtype = src.type(); @@ -901,38 +962,25 @@ NDArray NDArray::_perform_unary_reduction(int32_t op, } } - if (src.type() == legate::complex64() || src.type() == legate::complex128()) { - auto ops = {UnaryRedCode::ARGMAX, UnaryRedCode::ARGMIN, UnaryRedCode::MAX, UnaryRedCode::MIN}; - if (std::find(ops.begin(), ops.end(), static_cast(op)) != ops.end()) { - throw std::runtime_error("(arg)max/min not supported for complex-type arrays"); - } - } - - if (where.has_value()) { - if (where.value().type() != legate::bool_()) { - throw std::invalid_argument("where array should be bool"); - } - } - std::vector axes; - if (!axis.has_value()) { + if (axis.empty()) { for (auto i = 0; i < src.dim(); ++i) { axes.push_back(i); } } else { - axes = normalize_axis_vector(axis.value(), src.dim()); + axes = normalize_axis_vector(axis, src.dim()); } std::vector out_shape; for (auto i = 0; i < src.dim(); ++i) { if (std::find(axes.begin(), axes.end(), i) == axes.end()) { out_shape.push_back(src.shape()[i]); - } else if (keepdims.value_or(false)) { + } else if (keepdims) { out_shape.push_back(1); } } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (!out.has_value()) { out = runtime->create_array(out_shape, res_dtype.value()); } else if (out.value().shape() != out_shape) { @@ -956,9 +1004,10 @@ NDArray NDArray::_perform_unary_reduction(int32_t op, where_array = broadcast_where(where.value(), src); } - std::vector ops = { - UnaryRedCode::ARGMAX, UnaryRedCode::ARGMIN, UnaryRedCode::NANARGMAX, UnaryRedCode::NANARGMIN}; - auto argred = std::find(ops.begin(), ops.end(), static_cast(op)) != ops.end(); + bool 
argred = static_cast(op) == UnaryRedCode::ARGMAX || + static_cast(op) == UnaryRedCode::ARGMIN || + static_cast(op) == UnaryRedCode::NANARGMAX || + static_cast(op) == UnaryRedCode::NANARGMIN; if (argred) { assert(!initial.has_value()); auto argred_dtype = runtime->get_argred_type(src.type()); @@ -980,17 +1029,17 @@ NDArray NDArray::_perform_unary_reduction(int32_t op, void NDArray::unary_reduction(int32_t op, NDArray src, std::optional where, - std::optional> orig_axis, - std::optional> axes, - std::optional keepdims, - std::optional> args, + const std::vector& orig_axis, + const std::vector& axes, + bool keepdims, + const std::vector& args, std::optional initial) { auto lhs_array = *this; auto rhs_array = src; assert(lhs_array.dim() <= rhs_array.dim()); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto op_code = static_cast(op); if (initial.has_value()) { @@ -999,70 +1048,66 @@ void NDArray::unary_reduction(int32_t op, lhs_array.fill(get_reduction_identity(op_code, lhs_array.type())); } - auto is_where = where.has_value(); - bool is_keepdims = keepdims.value_or(false); + auto is_where = where.has_value(); if (lhs_array.size() == 1) { - assert(!axes.has_value() || - lhs_array.dim() == - (rhs_array.dim() - (is_keepdims ? 0 : static_cast(axes.value().size())))); + assert(axes.empty() || lhs_array.dim() == (rhs_array.dim() - + (keepdims ? 
0 : static_cast(axes.size())))); auto p_lhs = lhs_array.store_; while (p_lhs.dim() > 1) { p_lhs = p_lhs.project(0, 0); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_SCALAR_UNARY_RED); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_SCALAR_UNARY_RED); task.add_reduction(p_lhs, get_reduction_op(op_code)); auto p_rhs = task.add_input(rhs_array.store_); task.add_scalar_arg(legate::Scalar(op)); - task.add_scalar_arg(legate::Scalar(rhs_array.shape())); + if (rhs_array.dim() > 0) { + task.add_scalar_arg(legate::Scalar(rhs_array.shape())); + } else { + task.add_scalar_arg(legate::Scalar(std::vector({1}))); + } task.add_scalar_arg(legate::Scalar(is_where)); if (is_where) { auto p_where = task.add_input(where.value().store_); task.add_constraint(align(p_rhs, p_where)); } - if (args.has_value()) { - auto arg_array = args.value(); - for (auto& arg : arg_array) { - task.add_input(arg.store_); - } + for (auto& arg : args) { + task.add_input(arg.store_); } runtime->submit(std::move(task)); } else { - assert(axes.has_value()); + assert(!axes.empty()); auto result = lhs_array.store_; - if (is_keepdims) { - for (auto axis : axes.value()) { + if (keepdims) { + for (auto axis : axes) { result = result.project(axis, 0); } } auto rhs_shape = rhs_array.shape(); - for (auto axis : axes.value()) { + for (auto axis : axes) { result = result.promote(axis, rhs_shape[axis]); } - if (axes.value().size() > 1) { + if (axes.size() > 1) { throw std::runtime_error("Need support for reducing multiple dimensions"); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_UNARY_RED); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_UNARY_RED); auto p_lhs = task.add_reduction(result, get_reduction_op(op_code)); auto p_rhs = task.add_input(rhs_array.store_); - task.add_scalar_arg(legate::Scalar(axes.value()[0])); + task.add_scalar_arg(legate::Scalar(axes[0])); task.add_scalar_arg(legate::Scalar(op)); 
task.add_scalar_arg(legate::Scalar(is_where)); if (is_where) { auto p_where = task.add_input(where.value().store_); task.add_constraint(align(p_rhs, p_where)); } - if (args != std::nullopt) { - auto arg_array = args.value(); - for (auto& arg : arg_array) { - task.add_input(arg.store_); - } + for (auto& arg : args) { + task.add_input(arg.store_); } task.add_constraint(align(p_lhs, p_rhs)); @@ -1079,7 +1124,7 @@ NDArray NDArray::broadcast_where(NDArray where, NDArray source) auto where_shape = broadcast_shapes({where, source}); auto where_store = broadcast(where_shape, where.store_); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); return runtime->create_array(std::move(where_store)); } @@ -1092,8 +1137,8 @@ void NDArray::convert(NDArray rhs, int32_t nan_op) auto lhs_s = lhs_array.store_; auto rhs_s = rhs_array.store_; - auto runtime = CuNumericRuntime::get_runtime(); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_CONVERT); + auto runtime = CuPyNumericRuntime::get_runtime(); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_CONVERT); auto p_lhs = task.add_output(lhs_s); auto p_rhs = task.add_input(rhs_s); task.add_scalar_arg(legate::Scalar(nan_op)); @@ -1108,7 +1153,7 @@ NDArray NDArray::diag_helper(int32_t offset, const std::optional& type, std::optional out) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (dim() <= 1) { throw std::invalid_argument("diag_helper is implemented for dim > 1"); @@ -1127,7 +1172,7 @@ NDArray NDArray::diag_helper(int32_t offset, if (N != s_axes.size()) { throw std::invalid_argument("axes passed to diag_helper should be all different"); } - if (dim() < N) { + if (static_cast(dim()) < N) { throw std::invalid_argument("Dimension of input array shouldn't be less than number of axes"); } std::vector transpose_axes; @@ -1150,7 +1195,7 @@ NDArray NDArray::diag_helper(int32_t offset, offset = -offset; } a 
= transpose(transpose_axes); - if (offset >= a.shape()[dim() - 1]) { + if (offset >= static_cast(a.shape()[dim() - 1])) { throw std::invalid_argument("'offset' for diag or diagonal must be in range"); } diag_size = std::max(static_cast(0), @@ -1224,7 +1269,7 @@ NDArray NDArray::diag_helper(int32_t offset, void NDArray::diag_task(NDArray rhs, int32_t offset, int32_t naxes, bool extract, bool trace) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); legate::LogicalStore diag = get_store(); legate::LogicalStore matrix = get_store(); @@ -1264,9 +1309,8 @@ void NDArray::diag_task(NDArray rhs, int32_t offset, int32_t naxes, bool extract } } } else { - matrix = store_; - diag = rhs.store_; - auto ndim = dim(); + matrix = store_; + diag = rhs.store_; if (offset > 0) { matrix = matrix.slice(1, slice(offset)); } else if (offset < 0) { @@ -1280,7 +1324,7 @@ void NDArray::diag_task(NDArray rhs, int32_t offset, int32_t naxes, bool extract } } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_DIAG); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_DIAG); if (extract) { auto p_diag = task.add_reduction(diag, get_reduction_op(UnaryRedCode::SUM)); auto p_matrix = task.add_input(matrix); @@ -1347,9 +1391,9 @@ void NDArray::put(NDArray indices, NDArray values, std::string mode) self_tmp = self_tmp._convert_future_to_regionfield(change_shape); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); bool check_bounds = (mode == "raise"); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_WRAP); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_WRAP); auto indirect = runtime->create_array(indices.shape(), legate::point_type(self_tmp.dim()), false); auto p_indirect = task.add_output(indirect.store_); auto p_indices = task.add_input(indices.store_); @@ -1373,7 +1417,7 @@ void NDArray::put(NDArray indices, NDArray values, std::string 
mode) NDArray NDArray::copy() { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto legate_runtime = legate::Runtime::get_runtime(); auto out = runtime->create_array(shape(), type()); if (store_.has_scalar_storage() && out.store_.has_scalar_storage()) { @@ -1390,7 +1434,7 @@ NDArray NDArray::repeat(int64_t repeats, std::optional axis) throw std::invalid_argument("negative dimensions are not allowed"); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); // when array is a scalar if (dim() == 0) { @@ -1423,7 +1467,7 @@ NDArray NDArray::repeat(int64_t repeats, std::optional axis) return runtime->create_array(empty_shape, src.type()); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_REPEAT); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_REPEAT); auto out_shape = src.shape(); out_shape[axis_int] *= repeats; @@ -1487,10 +1531,10 @@ NDArray NDArray::repeat(NDArray repeats, std::optional axis) repeats = repeats._warn_and_convert(legate::int64()); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto legate_runtime = legate::Runtime::get_runtime(); auto out_store = legate_runtime->create_store(src.type(), src.dim()); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_REPEAT); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_REPEAT); auto p_src = task.add_input(src.store_); task.add_output(out_store); task.add_scalar_arg(Scalar(axis_int)); @@ -1512,7 +1556,7 @@ NDArray NDArray::repeat(NDArray repeats, std::optional axis) NDArray NDArray::_convert_future_to_regionfield(bool change_shape) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (change_shape && dim() == 0) { auto out = runtime->create_array({1}, type(), false); out.assign(*this); @@ -1525,7 +1569,7 @@ NDArray 
NDArray::_convert_future_to_regionfield(bool change_shape) NDArray NDArray::_wrap(size_t new_len) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (0 == new_len) { return runtime->create_array({0}, type()); @@ -1550,7 +1594,7 @@ NDArray NDArray::_wrap(size_t new_len) src = src._convert_future_to_regionfield(change_shape); } - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_WRAP); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_WRAP); auto indirect = runtime->create_array({new_len}, legate::point_type(src.dim()), false); task.add_output(indirect.store_); task.add_scalar_arg(legate::Scalar(src.shape())); @@ -1570,7 +1614,7 @@ NDArray NDArray::_warn_and_convert(legate::Type const& type) if (this->type() != type) { std::stringstream ss; ss << "converting array to " << type.to_string() << " type"; - cunumeric_log().warning() << ss.str(); + cupynumeric_log().warning() << ss.str(); return as_type(type); } return *this; @@ -1578,18 +1622,18 @@ NDArray NDArray::_warn_and_convert(legate::Type const& type) NDArray NDArray::wrap_indices(Scalar const& n) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(shape(), type()); - auto divisor = cunumeric::full({}, n); - out.binary_op(static_cast(cunumeric::BinaryOpCode::MOD), *this, divisor); + auto divisor = cupynumeric::full({}, n); + out.binary_op(static_cast(cupynumeric::BinaryOpCode::MOD), *this, divisor); return out; } NDArray NDArray::clip_indices(Scalar const& min, Scalar const& max) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(shape(), type()); - auto task = runtime->create_task(CuNumericOpCode::CUNUMERIC_UNARY_OP); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_UNARY_OP); auto p_out = task.add_output(out.store_); auto p_in = 
task.add_input(store_); task.add_scalar_arg(legate::Scalar(static_cast(UnaryOpCode::CLIP))); @@ -1612,7 +1656,7 @@ NDArray NDArray::diagonal(int32_t offset, if (axis1 || axis2) { throw std::invalid_argument("Axes shouldn't be specified when getting diagonal for 1D array"); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto m = shape()[0] + std::abs(offset); auto res = runtime->create_array({m, m}, store_.type()); res.diag_task(*this, offset, 0, false, false); @@ -1652,7 +1696,7 @@ NDArray NDArray::reshape(std::vector newshape, std::string order) } if (order == "F") { throw std::invalid_argument( - "cuNumeric has not implemented reshape using Fortran-like index order."); + "cuPyNumeric has not implemented reshape using Fortran-like index order."); } if (order != "C") { throw std::invalid_argument("order must be one of 'C', 'F', 'A'"); @@ -1662,7 +1706,7 @@ NDArray NDArray::reshape(std::vector newshape, std::string order) NDArray NDArray::reshape(std::vector newshape) { - auto runtime = cunumeric::CuNumericRuntime::get_runtime(); + auto runtime = cupynumeric::CuPyNumericRuntime::get_runtime(); int num_unknowns = std::count_if(newshape.begin(), newshape.end(), [](auto x) { return x < 0; }); if (num_unknowns > 1) { throw std::invalid_argument("can only specify one unknown dimension"); @@ -1671,8 +1715,7 @@ NDArray NDArray::reshape(std::vector newshape) // case 1: zero size if (size() == 0) { if (1 == num_unknowns) { - std::replace_if( - newshape.begin(), newshape.end(), [](auto x) { return x < 0; }, 0); + std::replace_if(newshape.begin(), newshape.end(), [](auto x) { return x < 0; }, 0); } auto out_size = vec_prod(newshape); if (out_size != 0) { @@ -1694,8 +1737,7 @@ NDArray NDArray::reshape(std::vector newshape) if (unknown_extent * known_volume != size()) { throw std::invalid_argument("cannot reshape, size mismatch"); } - std::replace_if( - newshape.begin(), newshape.end(), [](auto x) { return x < 0; }, 
unknown_extent); + std::replace_if(newshape.begin(), newshape.end(), [](auto x) { return x < 0; }, unknown_extent); auto in_shape = shape(); auto out_shape = vec_convert(newshape); @@ -1768,14 +1810,317 @@ NDArray NDArray::reshape(std::vector newshape) return NDArray(std::move(out_store)); } +NDArray NDArray::squeeze( + std::optional const>> axis) const +{ + auto result = store_; + if (!axis.has_value()) { + int shift = 0; + for (int d = 0; d < dim(); d++) { + if (result.extents().data()[d + shift] == 1) { + result = result.project(d + shift, 0); + shift -= 1; + } + } + } else { + auto computed_axis = normalize_axis_vector(axis.value(), dim()); + for (auto ax : computed_axis) { + if (shape()[ax] != 1) { + throw std::invalid_argument("can only select axes to squeeze out with size equal to one"); + } + } + int shift = 0; + for (auto dim : computed_axis) { + result = result.project(dim + shift, 0); + shift -= 1; + } + } + if (result.extents().data() == store_.extents().data()) { + return *this; + } else { + auto runtime = CuPyNumericRuntime::get_runtime(); + return runtime->create_array(std::move(result)); + } +} + +void NDArray::where(NDArray rhs1, NDArray rhs2, NDArray rhs3) +{ + const auto& out_shape = shape(); + auto rhs1_store = broadcast(out_shape, rhs1.store_); + auto rhs2_store = broadcast(out_shape, rhs2.store_); + auto rhs3_store = broadcast(out_shape, rhs3.store_); + assert(store_.type() == rhs2.store_.type()); + assert(store_.type() == rhs3.store_.type()); + + auto runtime = CuPyNumericRuntime::get_runtime(); + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_WHERE); + + auto p_lhs = task.declare_partition(); + auto p_rhs1 = task.declare_partition(); + auto p_rhs2 = task.declare_partition(); + auto p_rhs3 = task.declare_partition(); + + task.add_output(store_, p_lhs); + task.add_input(rhs1_store, p_rhs1); + task.add_input(rhs2_store, p_rhs2); + task.add_input(rhs3_store, p_rhs3); + + task.add_constraint(legate::align(p_lhs, p_rhs1)); + 
task.add_constraint(legate::align(p_lhs, p_rhs2)); + task.add_constraint(legate::align(p_lhs, p_rhs3)); + + runtime->submit(std::move(task)); +} + +NDArray NDArray::_maybe_convert(const legate::Type& type) const +{ + if (type == store_.type()) { + return *this; + } else { + return as_type(type); + } +} + +void NDArray::_verify_mode_extent(const std::map& mode2extent, + const std::vector& modes, + const std::vector& shape) const +{ + for (int32_t i = 0; i < modes.size(); i++) { + assert(mode2extent.at(modes[i]) == shape[i]); + } +} + +legate::LogicalStore NDArray::_alphabetical_transpose(legate::LogicalStore store, + const std::vector& modes) const +{ + std::map map_mode_id; + for (int i = 0; i < modes.size(); i++) { + map_mode_id[modes[i]] = i; + } + + auto modes_copy{modes}; + std::sort(modes_copy.begin(), modes_copy.end()); + std::vector axes; + for (int i = 0; i < modes_copy.size(); i++) { + axes.push_back(map_mode_id[modes_copy[i]]); + } + return store.transpose(std::move(axes)); +} + +enum BlasOps { + BlasOperationNone = 0, + BlasOperationVV = 1, + BlasOperationMV = 2, + BlasOperationMM = 3, +}; + +// This function ports contract function in cupynumeric/_thunk/deferred.py +// to-do: handle store overlap +// to-do: support np.float16 +void NDArray::contract(const std::vector& lhs_modes, + NDArray rhs1, + const std::vector& rhs1_modes, + NDArray rhs2, + const std::vector& rhs2_modes, + const std::map& mode2extent) +{ + // Sanity checks + // no duplicate modes within an array + std::set s_lhs_modes(lhs_modes.begin(), lhs_modes.end()); + assert(lhs_modes.size() == s_lhs_modes.size()); + std::set s_rhs1_modes(rhs1_modes.begin(), rhs1_modes.end()); + assert(rhs1_modes.size() == s_rhs1_modes.size()); + std::set s_rhs2_modes(rhs2_modes.begin(), rhs2_modes.end()); + assert(rhs2_modes.size() == s_rhs2_modes.size()); + // no singleton modes + std::unordered_map mode_counts; + for (auto v : lhs_modes) { + ++mode_counts[v]; + } + for (auto v : rhs1_modes) { + 
++mode_counts[v]; + } + for (auto v : rhs2_modes) { + ++mode_counts[v]; + } + for (auto const& count : mode_counts) { + assert(count.second == 2 || count.second == 3); + } + + // arrays and mode lists agree on dimensionality + assert(dim() == lhs_modes.size()); + assert(rhs1.dim() == rhs1_modes.size()); + assert(rhs2.dim() == rhs2_modes.size()); + // array shapes agree with mode extents (broadcasting should have been + // handled by the frontend) + _verify_mode_extent(mode2extent, lhs_modes, shape()); + _verify_mode_extent(mode2extent, rhs1_modes, rhs1.shape()); + _verify_mode_extent(mode2extent, rhs2_modes, rhs2.shape()); + // casting has been handled by the frontend + assert(type() == rhs1.type()); + assert(type() == rhs2.type()); + + // to-do: Handle store overlap + + enum BlasOps blas_op = BlasOperationNone; + bool count_unequals_two = false; + for (auto const& count : mode_counts) { + if (count.second != 2) { + count_unequals_two = true; + break; + } + } + if (!count_unequals_two) { + if (lhs_modes.size() == 0 && rhs1_modes.size() == 1 && rhs2_modes.size() == 1) { + blas_op = BlasOperationVV; + } else if (lhs_modes.size() == 1 && (rhs1_modes.size() == 2 and rhs2_modes.size() == 1 || + rhs1_modes.size() == 1 and rhs2_modes.size() == 2)) { + blas_op = BlasOperationMV; + } else if (lhs_modes.size() == 2 and rhs1_modes.size() == 2 and rhs2_modes.size() == 2) { + blas_op = BlasOperationMM; + } + } + // to-do: support np.float16 + + // Clear output array + auto zero = legate::type_dispatch(type().code(), generate_zero_fn{}); + fill(zero); + // Pull out the stores + auto lhs_s = store_; + auto rhs1_s = rhs1.store_; + auto rhs2_s = rhs2.store_; + auto runtime = CuPyNumericRuntime::get_runtime(); + + if (blas_op != BlasOperationNone) { + if (blas_op == BlasOperationVV) { // for Scalar, dim() == 0 or 1? 
+ auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_DOT); + auto redop = get_reduction_op(UnaryRedCode::SUM); + task.add_reduction(lhs_s, redop); + auto p_rhs1 = task.add_input(rhs1_s); + auto p_rhs2 = task.add_input(rhs2_s); + task.add_constraint(align(p_rhs1, p_rhs2)); + runtime->submit(std::move(task)); + } else if (blas_op == BlasOperationMV) { + // Matrix-vector or vector-matrix multiply + // b,(ab/ba)->a --> (ab/ba),b->a + if (rhs1_modes.size() == 1) { + std::swap(rhs1_s, rhs2_s); + } + // ba,b->a --> ab,b->a + if (rhs1_modes[0] == rhs2_modes[0]) { + rhs1_s = rhs1_s.transpose({1, 0}); + } + auto m = rhs1_s.extents().data()[0]; + auto n = rhs1_s.extents().data()[1]; + rhs2_s = rhs2_s.promote(0, m); + lhs_s = lhs_s.promote(1, n); + + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_MATVECMUL); + auto redop = get_reduction_op(UnaryRedCode::SUM); + auto p_lhs = task.add_reduction(lhs_s, redop); + auto p_rhs1 = task.add_input(rhs1_s); + auto p_rhs2 = task.add_input(rhs2_s); + task.add_constraint(align(p_lhs, p_rhs1)); + task.add_constraint(align(p_rhs1, p_rhs2)); + runtime->submit(std::move(task)); + } else if (blas_op == BlasOperationMM) { + auto rhs1_modes_copy{rhs1_modes}; + auto rhs2_modes_copy{rhs2_modes}; + // (cb/bc),(ab/ba)->ac --> (ab/ba),(cb/bc)->ac + if (!_is_in_vector(rhs1_modes, lhs_modes[0])) { + std::swap(rhs1_s, rhs2_s); + std::swap(rhs1_modes_copy, rhs2_modes_copy); + } + assert(_is_in_vector(rhs1_modes_copy, lhs_modes[0]) && + _is_in_vector(rhs2_modes_copy, lhs_modes[1])); + // ba,?->ac --> ab,?->ac + if (lhs_modes[0] != rhs1_modes_copy[0]) { + rhs1_s = rhs1_s.transpose({1, 0}); + } + // ?,cb->ac --> ?,bc->ac + if (lhs_modes[1] != rhs2_modes_copy[1]) { + rhs2_s = rhs2_s.transpose({1, 0}); + } + auto m = shape()[0]; + auto n = shape()[1]; + auto k = rhs1.shape()[1]; + assert(m == rhs1.shape()[0]); + assert(n == rhs2.shape()[1]); + assert(k == rhs2.shape()[0]); + + dot_MM(rhs1_s, rhs2_s); + } else { + assert(false); + } 
+ return; + } + + lhs_s = _alphabetical_transpose(lhs_s, lhs_modes); + rhs1_s = _alphabetical_transpose(rhs1_s, rhs1_modes); + rhs2_s = _alphabetical_transpose(rhs2_s, rhs2_modes); + + std::vector lhs_dim_mask; + std::vector rhs1_dim_mask; + std::vector rhs2_dim_mask; + + std::vector sorted_modes; + for (std::map::const_iterator it = mode2extent.begin(); it != mode2extent.end(); + it++) { + sorted_modes.push_back(it->first); + } + std::sort(sorted_modes.begin(), sorted_modes.end()); + for (int i = 0; i < sorted_modes.size(); i++) { + auto dim = i; + auto mode = sorted_modes[i]; + auto extent = mode2extent.at(mode); + + auto add_mode = [&](legate::LogicalStore store, + const std::vector& modes, + std::vector& dim_mask) { + if (!_is_in_vector(modes, mode)) { + dim_mask.emplace_back(false); + return store.promote(dim, extent); + } else { + dim_mask.emplace_back(true); + return store; + } + }; + + lhs_s = add_mode(lhs_s, lhs_modes, lhs_dim_mask); + rhs1_s = add_mode(rhs1_s, rhs1_modes, rhs1_dim_mask); + rhs2_s = add_mode(rhs2_s, rhs2_modes, rhs2_dim_mask); + } + + assert(lhs_s.extents().data() == rhs1_s.extents().data()); + assert(lhs_s.extents().data() == rhs2_s.extents().data()); + + auto task = runtime->create_task(CuPyNumericOpCode::CUPYNUMERIC_CONTRACT); + auto redop = get_reduction_op(UnaryRedCode::SUM); + auto p_lhs = task.add_reduction(lhs_s, redop); + auto p_rhs1 = task.add_input(rhs1_s); + auto p_rhs2 = task.add_input(rhs2_s); + + auto add_scalar_arg = [&](std::vector dim_mask) { + auto arg_type = legate::fixed_array_type(legate::bool_(), dim_mask.size()); + task.add_scalar_arg(legate::Scalar(arg_type, dim_mask.data(), true)); + }; + add_scalar_arg(lhs_dim_mask); + add_scalar_arg(rhs1_dim_mask); + add_scalar_arg(rhs2_dim_mask); + + task.add_constraint(align(p_lhs, p_rhs1)); + task.add_constraint(align(p_rhs1, p_rhs2)); + runtime->submit(std::move(task)); +} + legate::LogicalStore NDArray::get_store() { return store_; } legate::LogicalStore 
NDArray::broadcast(const std::vector& shape, - legate::LogicalStore& store) + legate::LogicalStore& store) const { int32_t diff = static_cast(shape.size()) - store.dim(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(diff >= 0); #endif @@ -1787,14 +2132,14 @@ legate::LogicalStore NDArray::broadcast(const std::vector& shape, std::vector orig_shape = result.extents().data(); for (uint32_t dim = 0; dim < shape.size(); ++dim) { if (orig_shape[dim] != shape[dim]) { -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(orig_shape[dim] == 1); #endif result = result.project(dim, 0).promote(dim, shape[dim]); } } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(static_cast(result.dim()) == shape.size()); #endif @@ -1812,7 +2157,7 @@ legate::LogicalStore NDArray::broadcast(NDArray rhs1, NDArray rhs2) /*static*/ legate::Library NDArray::get_library() { - return CuNumericRuntime::get_runtime()->get_library(); + return CuPyNumericRuntime::get_runtime()->get_library(); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ndarray.h b/src/cupynumeric/ndarray.h similarity index 72% rename from src/cunumeric/ndarray.h rename to src/cupynumeric/ndarray.h index 1bc32dceea..c7a24a0669 100644 --- a/src/cunumeric/ndarray.h +++ b/src/cupynumeric/ndarray.h @@ -20,13 +20,13 @@ #include #include "legate.h" -#include "cunumeric/slice.h" -#include "cunumeric/typedefs.h" +#include "cupynumeric/slice.h" +#include "cupynumeric/typedefs.h" -namespace cunumeric { +namespace cupynumeric { class NDArray { - friend class CuNumericRuntime; + friend class CuPyNumericRuntime; private: NDArray(legate::LogicalStore&& store); @@ -57,6 +57,8 @@ class NDArray { NDArray& operator+=(const NDArray& other); NDArray operator*(const NDArray& other) const; NDArray operator*(const legate::Scalar& other) const; + NDArray operator/(const NDArray& other) const; + NDArray operator/(const legate::Scalar& other) const; NDArray& operator*=(const NDArray& other); NDArray 
operator[](std::initializer_list slices) const; operator bool() const; @@ -91,11 +93,11 @@ class NDArray { NDArray transpose(std::vector axes); NDArray argwhere(); NDArray flip(std::optional> axis = std::nullopt); - NDArray all(std::optional> axis = std::nullopt, - std::optional out = std::nullopt, - std::optional keepdims = std::nullopt, - std::optional initial = std::nullopt, - std::optional where = std::nullopt); + NDArray all(std::vector axis = {}, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional initial = std::nullopt, + std::optional where = std::nullopt); void put(NDArray indices, NDArray values, std::string mode = "raise"); NDArray diagonal(int32_t offset = 0, std::optional axis1 = std::nullopt, @@ -111,9 +113,18 @@ class NDArray { NDArray reshape(std::vector newshape, std::string order); NDArray reshape(std::vector newshape); NDArray ravel(std::string order = "C"); + NDArray squeeze( + std::optional const>> axis = std::nullopt) const; + void where(NDArray rhs1, NDArray rhs2, NDArray rhs3); + void contract(const std::vector& lhs_modes, + NDArray rhs1, + const std::vector& rhs1_modes, + NDArray rhs2, + const std::vector& rhs2_modes, + const std::map& mode2extent); public: - NDArray as_type(const legate::Type& type); + NDArray as_type(const legate::Type& type) const; legate::LogicalStore get_store(); void sort(NDArray rhs, bool argsort, std::optional axis = -1, bool stable = false); NDArray _convert_future_to_regionfield(bool change_shape = false); @@ -121,10 +132,22 @@ class NDArray { NDArray _warn_and_convert(legate::Type const& type); NDArray wrap_indices(Scalar const& n); NDArray clip_indices(Scalar const& min, Scalar const& max); + NDArray _perform_unary_reduction(int32_t op, + NDArray src, + const std::vector& axis, + std::optional dtype, + std::optional res_dtype, + std::optional out, + bool keepdims, + const std::vector& args, + std::optional initial, + std::optional where); NDArray copy(); + NDArray _maybe_convert(const 
legate::Type& type) const; private: - legate::LogicalStore broadcast(const std::vector& shape, legate::LogicalStore& store); + legate::LogicalStore broadcast(const std::vector& shape, + legate::LogicalStore& store) const; legate::LogicalStore broadcast(NDArray rhs1, NDArray rhs2); void sort_task(NDArray rhs, bool argsort, bool stable); void sort_swapped(NDArray rhs, bool argsort, int32_t sort_axis, bool stable); @@ -132,22 +155,12 @@ class NDArray { void unary_reduction(int32_t op, NDArray src, std::optional where, - std::optional> orig_axis, - std::optional> axes, - std::optional keepdims, - std::optional> args, + const std::vector& orig_axis, + const std::vector& axes, + bool keepdims, + const std::vector& args, std::optional initial); NDArray broadcast_where(NDArray where, NDArray source); - NDArray _perform_unary_reduction(int32_t op, - NDArray src, - std::optional> axis = std::nullopt, - std::optional dtype = std::nullopt, - std::optional res_dtype = std::nullopt, - std::optional out = std::nullopt, - std::optional keepdims = std::nullopt, - std::optional> args = std::nullopt, - std::optional initial = std::nullopt, - std::optional where = std::nullopt); void flip(NDArray rhs, std::optional> axis); void diag_task(NDArray rhs, int32_t offset, int32_t naxes, bool extract, bool trace); NDArray diag_helper(int32_t offset, @@ -158,6 +171,13 @@ class NDArray { std::optional out = std::nullopt); void _fill(legate::LogicalStore const& value); + void dot_MM(const legate::LogicalStore& rhs1_store, const legate::LogicalStore& rhs2_store); + void _verify_mode_extent(const std::map& mode2extent, + const std::vector& modes, + const std::vector& shape) const; + legate::LogicalStore _alphabetical_transpose(legate::LogicalStore store, + const std::vector& modes) const; + public: static legate::Library get_library(); @@ -165,6 +185,6 @@ class NDArray { legate::LogicalStore store_; }; -} // namespace cunumeric +} // namespace cupynumeric -#include "cunumeric/ndarray.inl" 
+#include "cupynumeric/ndarray.inl" diff --git a/src/cunumeric/ndarray.inl b/src/cupynumeric/ndarray.inl similarity index 94% rename from src/cunumeric/ndarray.inl rename to src/cupynumeric/ndarray.inl index 341e957ab6..31cbe83a3a 100644 --- a/src/cunumeric/ndarray.inl +++ b/src/cupynumeric/ndarray.inl @@ -14,7 +14,7 @@ * */ -namespace cunumeric { +namespace cupynumeric { template legate::AccessorRO NDArray::get_read_accessor() @@ -30,4 +30,4 @@ legate::AccessorWO NDArray::get_write_accessor() return mapped.write_accessor(); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/arange.cc b/src/cupynumeric/nullary/arange.cc similarity index 82% rename from src/cunumeric/nullary/arange.cc rename to src/cupynumeric/nullary/arange.cc index fc3b42608d..8773cb3639 100644 --- a/src/cunumeric/nullary/arange.cc +++ b/src/cupynumeric/nullary/arange.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/arange.h" -#include "cunumeric/nullary/arange_template.inl" +#include "cupynumeric/nullary/arange.h" +#include "cupynumeric/nullary/arange_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -41,7 +41,10 @@ struct ArangeImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ArangeTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ArangeTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/arange.cu b/src/cupynumeric/nullary/arange.cu similarity index 88% rename from src/cunumeric/nullary/arange.cu rename to src/cupynumeric/nullary/arange.cu index 08968221cd..94ba2e7472 100644 --- a/src/cunumeric/nullary/arange.cu +++ b/src/cupynumeric/nullary/arange.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/nullary/arange.h" -#include "cunumeric/nullary/arange_template.inl" +#include "cupynumeric/nullary/arange.h" +#include 
"cupynumeric/nullary/arange_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) arange_kernel( @@ -45,7 +45,7 @@ struct ArangeImplBody { auto stream = get_cached_stream(); arange_kernel <<>>(out, rect.lo[0], start, step, distance); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -54,4 +54,4 @@ struct ArangeImplBody { arange_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/arange.h b/src/cupynumeric/nullary/arange.h similarity index 79% rename from src/cunumeric/nullary/arange.h rename to src/cupynumeric/nullary/arange.h index 46ac863c0a..c3b993a157 100644 --- a/src/cunumeric/nullary/arange.h +++ b/src/cupynumeric/nullary/arange.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ArangeArgs { legate::PhysicalStore out; @@ -26,9 +26,10 @@ struct ArangeArgs { legate::Scalar step; }; -class ArangeTask : public CuNumericTask { +class ArangeTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_ARANGE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_ARANGE}}; public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +41,4 @@ class ArangeTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/arange_omp.cc b/src/cupynumeric/nullary/arange_omp.cc similarity index 88% rename from src/cunumeric/nullary/arange_omp.cc rename to src/cupynumeric/nullary/arange_omp.cc index 9fccfee3a0..6ebc80bbeb 100644 --- a/src/cunumeric/nullary/arange_omp.cc +++ b/src/cupynumeric/nullary/arange_omp.cc @@ -14,10 +14,10 @@ * */ -#include 
"cunumeric/nullary/arange.h" -#include "cunumeric/nullary/arange_template.inl" +#include "cupynumeric/nullary/arange.h" +#include "cupynumeric/nullary/arange_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -40,4 +40,4 @@ struct ArangeImplBody { arange_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/arange_template.inl b/src/cupynumeric/nullary/arange_template.inl similarity index 88% rename from src/cunumeric/nullary/arange_template.inl rename to src/cupynumeric/nullary/arange_template.inl index 82bf335b91..29ba759c2f 100644 --- a/src/cunumeric/nullary/arange_template.inl +++ b/src/cupynumeric/nullary/arange_template.inl @@ -17,12 +17,12 @@ #pragma once // Useful for IDEs -#include "cunumeric/nullary/arange.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/pitches.h" +#include "cupynumeric/nullary/arange.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -58,4 +58,4 @@ static void arange_template(TaskContext& context) type_dispatch(args.out.code(), ArangeImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/eye.cc b/src/cupynumeric/nullary/eye.cc similarity index 81% rename from src/cunumeric/nullary/eye.cc rename to src/cupynumeric/nullary/eye.cc index c1750cafab..d1fdcd101d 100644 --- a/src/cunumeric/nullary/eye.cc +++ b/src/cupynumeric/nullary/eye.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/eye.h" -#include "cunumeric/nullary/eye_template.inl" +#include "cupynumeric/nullary/eye.h" +#include "cupynumeric/nullary/eye_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -40,7 +40,10 @@ struct EyeImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { 
EyeTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + EyeTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/eye.cu b/src/cupynumeric/nullary/eye.cu similarity index 87% rename from src/cunumeric/nullary/eye.cu rename to src/cupynumeric/nullary/eye.cu index 99b0fa5f16..e2c79c2609 100644 --- a/src/cunumeric/nullary/eye.cu +++ b/src/cupynumeric/nullary/eye.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/nullary/eye.h" -#include "cunumeric/nullary/eye_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/nullary/eye.h" +#include "cupynumeric/nullary/eye_template.inl" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -40,7 +40,7 @@ struct EyeImplBody { const size_t blocks = (distance + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; auto stream = get_cached_stream(); eye_kernel<<>>(out, start, distance); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -49,4 +49,4 @@ struct EyeImplBody { eye_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/eye.h b/src/cupynumeric/nullary/eye.h similarity index 79% rename from src/cunumeric/nullary/eye.h rename to src/cupynumeric/nullary/eye.h index 09d29bf6b7..921101510f 100644 --- a/src/cunumeric/nullary/eye.h +++ b/src/cupynumeric/nullary/eye.h @@ -16,18 +16,18 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct EyeArgs { legate::PhysicalStore out; int32_t k; }; -class EyeTask : public CuNumericTask { +class EyeTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_EYE}; + static inline const auto TASK_CONFIG = 
legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_EYE}}; public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +39,4 @@ class EyeTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/eye_omp.cc b/src/cupynumeric/nullary/eye_omp.cc similarity index 88% rename from src/cunumeric/nullary/eye_omp.cc rename to src/cupynumeric/nullary/eye_omp.cc index b28c9b6528..294477f7cd 100644 --- a/src/cunumeric/nullary/eye_omp.cc +++ b/src/cupynumeric/nullary/eye_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/eye.h" -#include "cunumeric/nullary/eye_template.inl" +#include "cupynumeric/nullary/eye.h" +#include "cupynumeric/nullary/eye_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -39,4 +39,4 @@ struct EyeImplBody { eye_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/eye_template.inl b/src/cupynumeric/nullary/eye_template.inl similarity index 92% rename from src/cunumeric/nullary/eye_template.inl rename to src/cupynumeric/nullary/eye_template.inl index 3786cbd9cd..0a2bc20180 100644 --- a/src/cunumeric/nullary/eye_template.inl +++ b/src/cupynumeric/nullary/eye_template.inl @@ -17,12 +17,12 @@ #pragma once // Useful for IDEs -#include "cunumeric/nullary/eye.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/pitches.h" +#include "cupynumeric/nullary/eye.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -75,4 +75,4 @@ static void eye_template(TaskContext& context) type_dispatch(args.out.code(), EyeImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/fill.cc b/src/cupynumeric/nullary/fill.cc similarity index 85% rename from src/cunumeric/nullary/fill.cc rename to 
src/cupynumeric/nullary/fill.cc index f375b897fc..56ee2b6985 100644 --- a/src/cunumeric/nullary/fill.cc +++ b/src/cupynumeric/nullary/fill.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/fill.h" -#include "cunumeric/nullary/fill_template.inl" +#include "cupynumeric/nullary/fill.h" +#include "cupynumeric/nullary/fill_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -52,7 +52,10 @@ struct FillImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { FillTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + FillTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/fill.cu b/src/cupynumeric/nullary/fill.cu similarity index 90% rename from src/cunumeric/nullary/fill.cu rename to src/cupynumeric/nullary/fill.cu index 5b2a42a798..249f0203d5 100644 --- a/src/cunumeric/nullary/fill.cu +++ b/src/cupynumeric/nullary/fill.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/nullary/fill.h" -#include "cunumeric/nullary/fill_template.inl" +#include "cupynumeric/nullary/fill.h" +#include "cupynumeric/nullary/fill_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -61,7 +61,7 @@ struct FillImplBody { } else { generic_kernel<<>>(volume, out, in, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -70,4 +70,4 @@ struct FillImplBody { fill_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/fill.h b/src/cupynumeric/nullary/fill.h similarity index 80% rename from src/cunumeric/nullary/fill.h rename to src/cupynumeric/nullary/fill.h index e6023e850e..bd18797cd5 100644 --- 
a/src/cunumeric/nullary/fill.h +++ b/src/cupynumeric/nullary/fill.h @@ -16,18 +16,18 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct FillArgs { legate::PhysicalStore out; legate::PhysicalStore fill_value; }; -class FillTask : public CuNumericTask { +class FillTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_FILL}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_FILL}}; public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +39,4 @@ class FillTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/fill_omp.cc b/src/cupynumeric/nullary/fill_omp.cc similarity index 92% rename from src/cunumeric/nullary/fill_omp.cc rename to src/cupynumeric/nullary/fill_omp.cc index afb7ec78d3..35c23b38f9 100644 --- a/src/cunumeric/nullary/fill_omp.cc +++ b/src/cupynumeric/nullary/fill_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/fill.h" -#include "cunumeric/nullary/fill_template.inl" +#include "cupynumeric/nullary/fill.h" +#include "cupynumeric/nullary/fill_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,4 +57,4 @@ struct FillImplBody { fill_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/fill_template.inl b/src/cupynumeric/nullary/fill_template.inl similarity index 90% rename from src/cunumeric/nullary/fill_template.inl rename to src/cupynumeric/nullary/fill_template.inl index da92359312..79c015afd7 100644 --- a/src/cunumeric/nullary/fill_template.inl +++ b/src/cupynumeric/nullary/fill_template.inl @@ -17,12 +17,12 @@ #pragma once // Useful for IDEs -#include "cunumeric/nullary/fill.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/pitches.h" 
+#include "cupynumeric/nullary/fill.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -71,4 +71,4 @@ static void fill_template(TaskContext& context) double_dispatch(std::max(args.out.dim(), 1), args.out.code(), FillImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/window.cc b/src/cupynumeric/nullary/window.cc similarity index 84% rename from src/cunumeric/nullary/window.cc rename to src/cupynumeric/nullary/window.cc index 273ce7cabb..ed3d71575d 100644 --- a/src/cunumeric/nullary/window.cc +++ b/src/cupynumeric/nullary/window.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/window.h" -#include "cunumeric/nullary/window_template.inl" +#include "cupynumeric/nullary/window.h" +#include "cupynumeric/nullary/window_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -48,7 +48,10 @@ struct WindowImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { WindowTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + WindowTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/window.cu b/src/cupynumeric/nullary/window.cu similarity index 90% rename from src/cunumeric/nullary/window.cu rename to src/cupynumeric/nullary/window.cu index 0dfbce2044..2abcf01e95 100644 --- a/src/cunumeric/nullary/window.cu +++ b/src/cupynumeric/nullary/window.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/nullary/window.h" -#include "cunumeric/nullary/window_template.inl" +#include "cupynumeric/nullary/window.h" +#include "cupynumeric/nullary/window_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -64,7 +64,7 @@ struct WindowImplBody { <<>>(gen, volume, out, rect.lo[0]); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -73,4 +73,4 @@ struct WindowImplBody { window_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/window.h b/src/cupynumeric/nullary/window.h similarity index 77% rename from src/cunumeric/nullary/window.h rename to src/cupynumeric/nullary/window.h index 0b688a3951..aa0d7c28fe 100644 --- a/src/cunumeric/nullary/window.h +++ b/src/cupynumeric/nullary/window.h @@ -16,13 +16,14 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { -class WindowTask : public CuNumericTask { +class WindowTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_WINDOW}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_WINDOW}}; public: static void cpu_variant(legate::TaskContext context); @@ -34,4 +35,4 @@ class WindowTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/window_omp.cc b/src/cupynumeric/nullary/window_omp.cc similarity index 90% rename from src/cunumeric/nullary/window_omp.cc rename to src/cupynumeric/nullary/window_omp.cc index e43d3a23b8..c0adf0e298 100644 --- a/src/cunumeric/nullary/window_omp.cc +++ b/src/cupynumeric/nullary/window_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/nullary/window.h" -#include "cunumeric/nullary/window_template.inl" +#include "cupynumeric/nullary/window.h" +#include "cupynumeric/nullary/window_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -47,4 +47,4 @@ struct WindowImplBody { window_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff 
--git a/src/cunumeric/nullary/window_template.inl b/src/cupynumeric/nullary/window_template.inl similarity index 92% rename from src/cunumeric/nullary/window_template.inl rename to src/cupynumeric/nullary/window_template.inl index f744e2ad21..216d0f03aa 100644 --- a/src/cunumeric/nullary/window_template.inl +++ b/src/cupynumeric/nullary/window_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/nullary/window.h" -#include "cunumeric/nullary/window_util.h" +#include "cupynumeric/nullary/window.h" +#include "cupynumeric/nullary/window_util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -63,4 +63,4 @@ static void window_template(TaskContext& context) op_dispatch(op_code, WindowImpl{}, output, M, beta); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/nullary/window_util.h b/src/cupynumeric/nullary/window_util.h similarity index 91% rename from src/cunumeric/nullary/window_util.h rename to src/cupynumeric/nullary/window_util.h index 4d6e400316..84993548cb 100644 --- a/src/cunumeric/nullary/window_util.h +++ b/src/cupynumeric/nullary/window_util.h @@ -18,19 +18,19 @@ #define _USE_MATH_DEFINES -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" #include extern double i0(double); -namespace cunumeric { +namespace cupynumeric { enum class WindowOpCode : int { - BARLETT = CUNUMERIC_WINDOW_BARLETT, - BLACKMAN = CUNUMERIC_WINDOW_BLACKMAN, - HAMMING = CUNUMERIC_WINDOW_HAMMING, - HANNING = CUNUMERIC_WINDOW_HANNING, - KAISER = CUNUMERIC_WINDOW_KAISER, + BARLETT = CUPYNUMERIC_WINDOW_BARLETT, + BLACKMAN = CUPYNUMERIC_WINDOW_BLACKMAN, + HAMMING = CUPYNUMERIC_WINDOW_HAMMING, + HANNING = CUPYNUMERIC_WINDOW_HANNING, + KAISER = CUPYNUMERIC_WINDOW_KAISER, }; template @@ -121,4 +121,4 @@ struct WindowOp { double beta_; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/omp_help.h b/src/cupynumeric/omp_help.h similarity index 96% 
rename from src/cunumeric/omp_help.h rename to src/cupynumeric/omp_help.h index fe132280fa..9c3cdf738d 100644 --- a/src/cunumeric/omp_help.h +++ b/src/cupynumeric/omp_help.h @@ -18,7 +18,7 @@ #include -namespace cunumeric { +namespace cupynumeric { // Simple STL vector-based thread local storage for OpenMP threads to avoid false sharing template @@ -47,4 +47,4 @@ struct ThreadLocalStorage { size_t num_threads_; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/operators.cc b/src/cupynumeric/operators.cc similarity index 66% rename from src/cunumeric/operators.cc rename to src/cupynumeric/operators.cc index 997f8f7731..c5a4fde9e5 100644 --- a/src/cunumeric/operators.cc +++ b/src/cupynumeric/operators.cc @@ -14,31 +14,31 @@ * */ -#include "cunumeric/operators.h" +#include "cupynumeric/operators.h" -#include "cunumeric/runtime.h" -#include "cunumeric/binary/binary_op_util.h" -#include "cunumeric/unary/unary_op_util.h" -#include "cunumeric/unary/unary_red_util.h" -#include "cunumeric/random/rand_util.h" -#include "cunumeric/nullary/window_util.h" +#include "cupynumeric/runtime.h" +#include "cupynumeric/binary/binary_op_util.h" +#include "cupynumeric/unary/unary_op_util.h" +#include "cupynumeric/unary/unary_red_util.h" +#include "cupynumeric/random/rand_util.h" +#include "cupynumeric/nullary/window_util.h" -namespace cunumeric { +namespace cupynumeric { -static legate::Logger log_cunumeric("cunumeric"); +static legate::Logger log_cupynumeric("cupynumeric"); -legate::Logger& cunumeric_log() { return log_cunumeric; } +legate::Logger& cupynumeric_log() { return log_cupynumeric; } NDArray array(std::vector shape, const legate::Type& type) { - return CuNumericRuntime::get_runtime()->create_array(std::move(shape), type); + return CuPyNumericRuntime::get_runtime()->create_array(std::move(shape), type); } NDArray unary_op(UnaryOpCode op_code, NDArray input, const std::vector& extra_args = {}) { - auto runtime = CuNumericRuntime::get_runtime(); 
+ auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(input.shape(), input.type()); out.unary_op(static_cast(op_code), std::move(input), extra_args); return out; @@ -46,7 +46,7 @@ NDArray unary_op(UnaryOpCode op_code, NDArray unary_reduction(UnaryRedCode op_code, NDArray input) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array({1}, input.type()); out.unary_reduction(static_cast(op_code), std::move(input)); return out; @@ -54,7 +54,7 @@ NDArray unary_reduction(UnaryRedCode op_code, NDArray input) NDArray binary_op(BinaryOpCode op_code, NDArray rhs1, NDArray rhs2, std::optional out) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (!out.has_value()) { auto out_shape = broadcast_shapes({rhs1, rhs2}); out = runtime->create_array(out_shape, rhs1.type()); @@ -82,11 +82,16 @@ NDArray multiply(NDArray rhs1, NDArray rhs2, std::optional out) return binary_op(BinaryOpCode::MULTIPLY, std::move(rhs1), std::move(rhs2), std::move(out)); } +NDArray divide(NDArray rhs1, NDArray rhs2, std::optional out) +{ + return binary_op(BinaryOpCode::DIVIDE, std::move(rhs1), std::move(rhs2), std::move(out)); +} + NDArray negative(NDArray input) { return unary_op(UnaryOpCode::NEGATIVE, std::move(input)); } NDArray random(std::vector shape) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(std::move(shape), legate::float64()); out.random(static_cast(RandGenCode::UNIFORM)); return out; @@ -161,7 +166,7 @@ NDArray zeros(std::vector shape, std::optional type) NDArray full(std::vector shape, const Scalar& value) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(std::move(shape), value.type()); out.fill(value); return out; @@ -177,7 +182,7 @@ NDArray 
eye(int32_t n, std::optional m, int32_t k, const legate::Type& throw std::invalid_argument("Type must be a primitive type"); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array({static_cast(n), static_cast(m.value_or(n))}, type); out.eye(k); @@ -214,7 +219,7 @@ NDArray bincount(NDArray x, min_length = max_val + 1; } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (!weights.has_value()) { auto out = runtime->create_array({min_length}, legate::int64()); out.bincount(x); @@ -250,7 +255,7 @@ NDArray trilu(NDArray rhs, int32_t k, bool lower) out_shape.emplace_back(shape[0]); } - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto out = runtime->create_array(std::move(out_shape), rhs.type()); out.trilu(std::move(rhs), k, lower); return out; @@ -260,41 +265,10 @@ NDArray tril(NDArray rhs, int32_t k) { return trilu(rhs, k, true); } NDArray triu(NDArray rhs, int32_t k) { return trilu(rhs, k, false); } -NDArray dot(NDArray rhs1, NDArray rhs2) -{ - if (rhs1.dim() != 2 || rhs2.dim() != 2) { - LEGATE_ABORT("cunumeric::dot only supports matrices now"); - } - - auto& rhs1_shape = rhs1.shape(); - auto& rhs2_shape = rhs2.shape(); - - if (rhs1_shape[1] != rhs2_shape[0]) { - LEGATE_ABORT("Incompatible matrices: (", - rhs1_shape[0], - ", ", - rhs1_shape[1], - ") x (", - rhs2_shape[0], - ", ", - rhs2_shape[1], - ")"); - } - - auto runtime = CuNumericRuntime::get_runtime(); - std::vector shape; - shape.push_back(rhs1_shape[0]); - shape.push_back(rhs2_shape[1]); - - auto out = runtime->create_array(std::move(shape), rhs1.type()); - out.dot(std::move(rhs1), std::move(rhs2)); - return out; -} - NDArray all(NDArray input, - std::optional> axis, + std::vector axis, std::optional out, - std::optional keepdims, + bool keepdims, std::optional where) { return input.all(axis, out, keepdims, std::nullopt, 
where); @@ -302,9 +276,45 @@ NDArray all(NDArray input, NDArray sum(NDArray input) { return unary_reduction(UnaryRedCode::SUM, std::move(input)); } -NDArray amax(NDArray input) { return unary_reduction(UnaryRedCode::MAX, std::move(input)); } - -NDArray amin(NDArray input) { return unary_reduction(UnaryRedCode::MIN, std::move(input)); } +NDArray amax(NDArray input, + std::vector axis, + std::optional dtype, + std::optional out, + bool keepdims, + std::optional initial, + std::optional where) +{ + return input._perform_unary_reduction(static_cast(UnaryRedCode::MAX), + input, + axis, + dtype, + std::nullopt, + out, + keepdims, + {}, + initial, + where); +} + +NDArray amin(NDArray input, + std::vector axis, + std::optional dtype, + std::optional out, + bool keepdims, + std::optional initial, + std::optional where) +{ + return input._perform_unary_reduction(static_cast(UnaryRedCode::MIN), + input, + axis, + dtype, + std::nullopt, + out, + keepdims, + {}, + initial, + where); +} NDArray unique(NDArray input) { return input.unique(); } @@ -322,19 +332,19 @@ NDArray arange(Scalar start, Scalar stop, Scalar step) throw std::invalid_argument("start/stop/step should be of the same type"); } - auto out = CuNumericRuntime::get_runtime()->create_array({N}, start.type()); + auto out = CuPyNumericRuntime::get_runtime()->create_array({N}, start.type()); out.arange(start, stop, step); return out; } NDArray as_array(legate::LogicalStore store) { - return CuNumericRuntime::get_runtime()->create_array(std::move(store)); + return CuPyNumericRuntime::get_runtime()->create_array(std::move(store)); } NDArray array_equal(NDArray input0, NDArray input1) { - auto dst = CuNumericRuntime::get_runtime()->create_array({1}, legate::bool_()); + auto dst = CuPyNumericRuntime::get_runtime()->create_array({1}, legate::bool_()); if (input0.shape() != input1.shape()) { dst.fill(legate::Scalar(false)); @@ -350,7 +360,7 @@ std::vector nonzero(NDArray input) { return input.nonzero(); } NDArray 
create_window(int64_t M, WindowOpCode op_code, std::vector args) { auto type = legate::float64(); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); if (M <= 0) { return runtime->create_array({0}, std::move(type)); } else if (M == 1) { @@ -382,7 +392,7 @@ NDArray convolve(NDArray a, NDArray v) if (a.dim() > 3) { throw std::runtime_error(std::to_string(a.dim()) + "-D arrays are not yet supported"); } - auto out = CuNumericRuntime::get_runtime()->create_array(a.shape(), a.type()); + auto out = CuPyNumericRuntime::get_runtime()->create_array(a.shape(), a.type()); if (a.type() != v.type()) { v = v.as_type(a.type()); } @@ -392,7 +402,7 @@ NDArray convolve(NDArray a, NDArray v) NDArray sort(NDArray input, std::optional axis /*=-1*/, std::string kind /*="quicksort"*/) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto result = runtime->create_array(input.shape(), input.type()); result.sort(input, false, axis, kind); return result; @@ -402,7 +412,7 @@ NDArray argsort(NDArray input, std::optional axis /*=-1*/, std::string kind /*="quicksort"*/) { - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto result = runtime->create_array(input.shape(), legate::int64()); result.sort(input, true, axis, kind); return result; @@ -441,7 +451,7 @@ int32_t normalize_axis_index(int32_t axis, int32_t ndim) return axis; } -std::vector normalize_axis_vector(std::vector axis, +std::vector normalize_axis_vector(const std::vector& axis, int32_t ndim, bool allow_duplicate) { @@ -540,4 +550,137 @@ NDArray reshape(NDArray a, std::vector newshape, std::string order) NDArray ravel(NDArray a, std::string order) { return a.ravel(order); } -} // namespace cunumeric +NDArray squeeze(NDArray a, std::optional const>> axis) +{ + return a.squeeze(axis); +} + +std::vector where(NDArray a) { return nonzero(a); } + +NDArray where(NDArray a, NDArray x, 
NDArray y) +{ + auto rhs1 = a._maybe_convert(legate::bool_()); + auto common_type = find_common_type({x, y}); + auto rhs2 = x._maybe_convert(common_type); + auto rhs3 = y._maybe_convert(common_type); + + auto out_shape = broadcast_shapes({rhs1, rhs2, rhs3}); + auto runtime = CuPyNumericRuntime::get_runtime(); + auto out = runtime->create_array(std::move(out_shape), common_type); + out.where(std::move(rhs1), std::move(rhs2), std::move(rhs3)); + return out; +} + +legate::Type find_common_type(const std::vector& arrays) +{ + legate::Type max_type = legate::bool_(); + for (auto arr : arrays) { + if (!arr.type().is_primitive()) { + throw std::invalid_argument("Type must be a primitive type"); + } + if (arr.type().code() > max_type.code()) { + max_type = arr.type(); + } + } + return max_type; +} + +NDArray dot(NDArray a, NDArray b) +{ + if (a.type().code() != b.type().code()) { + throw std::invalid_argument("Type of array a is not equal to type of array b"); + } + + if (a.dim() == 0 || b.dim() == 0) { + return multiply(a, b); + } + + auto modes_vec = dot_modes(a.dim(), b.dim()); + auto a_modes = modes_vec[0]; + auto b_modes = modes_vec[1]; + auto c_modes = modes_vec[2]; + std::map mode2extent; + for (uint i = 0; i < a_modes.size(); i++) { + auto mode = a_modes[i]; + auto extent = a.shape()[i]; + auto search = mode2extent.find(mode); + if (search != mode2extent.end()) { + if (search->second != extent) { + throw std::invalid_argument("Incompatible sizes between matched dimensions"); + } + } + mode2extent[mode] = extent; + } + for (uint i = 0; i < b_modes.size(); i++) { + auto mode = b_modes[i]; + auto extent = b.shape()[i]; + auto search = mode2extent.find(mode); + if (search != mode2extent.end()) { + if (search->second != extent) { + throw std::invalid_argument("Incompatible sizes between matched dimensions"); + } + } + mode2extent[mode] = extent; + } + + std::vector c_shape; + for (auto mode : c_modes) { + auto search = mode2extent.find(mode); + if (search != 
mode2extent.end()) { + c_shape.push_back(mode2extent[mode]); + } else { + c_shape.push_back(1); + } + } + + auto runtime = CuPyNumericRuntime::get_runtime(); + auto c_dtype = a.type(); + auto c = runtime->create_array(std::move(c_shape), c_dtype); + // Perform operation + c.contract(modes_vec[2], std::move(a), modes_vec[0], std::move(b), modes_vec[1], mode2extent); + return c; +} + +template +std::vector merge_vectors( + std::vector a, std::vector b, uint start_a, uint end_a, uint start_b, uint end_b) +{ + std::vector out; + for (uint i = start_a; i < end_a; i++) { + out.push_back(a[i]); + } + for (uint i = start_b; i < end_b; i++) { + out.push_back(b[i]); + } + return out; +} + +std::vector> dot_modes(uint a_ndim, uint b_ndim) +{ + std::vector a_modes, b_modes, out_modes; + assert(a_ndim < 26); + assert(b_ndim < 26); + for (uint i = 0; i < a_ndim; i++) { + a_modes.push_back('a' + i); + } + for (uint i = 0; i < b_ndim; i++) { + b_modes.push_back('A' + i); + } + if (a_ndim == 0) { + out_modes = b_modes; + } else if (b_ndim == 0) { + out_modes = a_modes; + } else if (b_ndim == 1) { + b_modes[b_modes.size() - 1] = a_modes[a_modes.size() - 1]; + for (int i = 0; i < a_modes.size() - 1; i++) { + out_modes.push_back(a_modes[i]); + } + } else { + b_modes[b_modes.size() - 2] = a_modes[a_modes.size() - 1]; + out_modes = merge_vectors(a_modes, b_modes, 0, a_modes.size() - 1, 0, b_modes.size() - 2); + out_modes.push_back(b_modes[b_modes.size() - 1]); + } + return {a_modes, b_modes, out_modes}; +} + +} // namespace cupynumeric diff --git a/src/cunumeric/operators.h b/src/cupynumeric/operators.h similarity index 69% rename from src/cunumeric/operators.h rename to src/cupynumeric/operators.h index 4b1c96ed8b..31b4093f75 100644 --- a/src/cunumeric/operators.h +++ b/src/cupynumeric/operators.h @@ -20,12 +20,12 @@ #include #include "legate.h" -#include "cunumeric/ndarray.h" -#include "cunumeric/typedefs.h" +#include "cupynumeric/ndarray.h" +#include "cupynumeric/typedefs.h" 
-namespace cunumeric { +namespace cupynumeric { -legate::Logger& cunumeric_log(); +legate::Logger& cupynumeric_log(); void initialize(int32_t argc, char** argv); @@ -37,7 +37,9 @@ NDArray add(NDArray rhs1, NDArray rhs2, std::optional out = std::nullop NDArray multiply(NDArray rhs1, NDArray rhs2, std::optional out = std::nullopt); -NDArray dot(NDArray rhs1, NDArray rhs2); +NDArray divide(NDArray rhs1, NDArray rhs2, std::optional out = std::nullopt); + +NDArray dot(NDArray a, NDArray b); NDArray negative(NDArray input); @@ -48,16 +50,28 @@ NDArray zeros(std::vector shape, std::optional type = st NDArray full(std::vector shape, const Scalar& value); NDArray all(NDArray input, - std::optional> axis = std::nullopt, - std::optional out = std::nullopt, - std::optional keepdims = std::nullopt, - std::optional where = std::nullopt); + std::vector axis = {}, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional where = std::nullopt); NDArray sum(NDArray input); -NDArray amax(NDArray input); - -NDArray amin(NDArray input); +NDArray amax(NDArray input, + std::vector axis = {}, + std::optional dtype = std::nullopt, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional initial = std::nullopt, + std::optional where = std::nullopt); + +NDArray amin(NDArray input, + std::vector axis = {}, + std::optional dtype = std::nullopt, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional initial = std::nullopt, + std::optional where = std::nullopt); NDArray unique(NDArray input); @@ -124,7 +138,7 @@ NDArray repeat(NDArray a, int64_t repeats, std::optional axis = std::nu // helper methods int32_t normalize_axis_index(int32_t axis, int32_t ndim); -std::vector normalize_axis_vector(std::vector axis, +std::vector normalize_axis_vector(const std::vector& axis, int32_t ndim, bool allow_duplicate = false); @@ -147,6 +161,16 @@ NDArray reshape(NDArray a, std::vector newshape, std::string order = "C NDArray ravel(NDArray a, 
std::string order = "C"); +NDArray squeeze( + NDArray a, std::optional const>> axis = std::nullopt); + +std::vector where(NDArray a); + +NDArray where(NDArray a, NDArray x, NDArray y); + +// helper methods +legate::Type find_common_type(const std::vector& arrays); + template bool vec_is_equal(const std::vector& vec1, const std::vector& vec2) { @@ -174,5 +198,21 @@ std::vector vec_convert(const std::vector& input) return output; } -} // namespace cunumeric -#include "cunumeric/operators.inl" +template +bool _is_in_vector(const std::vector& vec, T item) +{ + if (std::find(vec.begin(), vec.end(), item) != vec.end()) { + return true; + } else { + return false; + } +} + +std::vector> dot_modes(uint a_ndim, uint b_ndim); + +template +std::vector merge_vectors( + std::vector a, std::vector b, uint start_a, uint end_a, uint start_b, uint end_b); + +} // namespace cupynumeric +#include "cupynumeric/operators.inl" diff --git a/src/cunumeric/operators.inl b/src/cupynumeric/operators.inl similarity index 85% rename from src/cunumeric/operators.inl rename to src/cupynumeric/operators.inl index 0249892881..0b7a24955e 100644 --- a/src/cunumeric/operators.inl +++ b/src/cupynumeric/operators.inl @@ -13,8 +13,8 @@ * limitations under the License. 
* */ -#include "cunumeric/runtime.h" -namespace cunumeric { +#include "cupynumeric/runtime.h" +namespace cupynumeric { template NDArray arange(T start, std::optional stop, T step) @@ -30,9 +30,9 @@ NDArray arange(T start, std::optional stop, T step) auto s_start = Scalar(start); auto s_stop = Scalar(stop.value()); auto s_step = Scalar(step); - auto out = CuNumericRuntime::get_runtime()->create_array({N}, s_start.type()); + auto out = CuPyNumericRuntime::get_runtime()->create_array({N}, s_start.type()); out.arange(s_start, s_stop, s_step); return out; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/pitches.h b/src/cupynumeric/pitches.h similarity index 98% rename from src/cunumeric/pitches.h rename to src/cupynumeric/pitches.h index 445d1ea47a..05808dc82b 100644 --- a/src/cunumeric/pitches.h +++ b/src/cupynumeric/pitches.h @@ -18,7 +18,7 @@ #include "legate/utilities/typedefs.h" -namespace cunumeric { +namespace cupynumeric { // This is a small helper class that will also work if we have zero-sized arrays // We also need to have this instead of std::array so that it works on devices @@ -120,4 +120,4 @@ class Pitches<0, C_ORDER> { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/bitgenerator.cc b/src/cupynumeric/random/bitgenerator.cc similarity index 79% rename from src/cunumeric/random/bitgenerator.cc rename to src/cupynumeric/random/bitgenerator.cc index 6c3728b85c..9d2dbab21a 100644 --- a/src/cunumeric/random/bitgenerator.cc +++ b/src/cupynumeric/random/bitgenerator.cc @@ -18,33 +18,33 @@ // CPU Builds: // -#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUNUMERIC_CURAND_FOR_CPU_BUILD) -#define CUNUMERIC_USE_STL_RANDOM_ENGINE +#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUPYNUMERIC_CURAND_FOR_CPU_BUILD) +#define CUPYNUMERIC_USE_STL_RANDOM_ENGINE #endif -#include "cunumeric/random/bitgenerator.h" -#include "cunumeric/random/bitgenerator_template.inl" -#include 
"cunumeric/random/bitgenerator_util.h" +#include "cupynumeric/random/bitgenerator.h" +#include "cupynumeric/random/bitgenerator_template.inl" +#include "cupynumeric/random/bitgenerator_util.h" -#include "cunumeric/random/rnd_types.h" +#include "cupynumeric/random/rnd_types.h" -#if !LEGATE_DEFINED(CUNUMERIC_USE_STL_RANDOM_ENGINE) -#include "cunumeric/random/curand_help.h" +#if !LEGATE_DEFINED(CUPYNUMERIC_USE_STL_RANDOM_ENGINE) +#include "cupynumeric/random/curand_help.h" #endif -#include "cunumeric/random/randutil/randutil.h" +#include "cupynumeric/random/randutil/randutil.h" -#include "cunumeric/random/bitgenerator_curand.inl" +#include "cupynumeric/random/bitgenerator_curand.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; -static Logger log_curand("cunumeric.random"); +static Logger log_curand("cupynumeric.random"); Logger& randutil_log() { return log_curand; } -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE void randutil_check_status(rnd_status_t error, std::string_view file, int line) { if (error) { @@ -108,11 +108,11 @@ std::mutex BitGeneratorImplBody::lock_generators = {}; namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { BitGeneratorTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/bitgenerator.cu b/src/cupynumeric/random/bitgenerator.cu similarity index 83% rename from src/cunumeric/random/bitgenerator.cu rename to src/cupynumeric/random/bitgenerator.cu index 4471964d45..b4dbbee4a9 100644 --- a/src/cunumeric/random/bitgenerator.cu +++ b/src/cupynumeric/random/bitgenerator.cu @@ -14,19 +14,19 @@ * */ -#include "cunumeric/random/bitgenerator.h" -#include "cunumeric/random/bitgenerator_template.inl" -#include "cunumeric/random/bitgenerator_util.h" +#include "cupynumeric/random/bitgenerator.h" +#include 
"cupynumeric/random/bitgenerator_template.inl" +#include "cupynumeric/random/bitgenerator_util.h" -#include "cunumeric/cuda_help.h" -#include "cunumeric/random/curand_help.h" +#include "cupynumeric/cuda_help.h" +#include "cupynumeric/random/curand_help.h" -#include "cunumeric/random/bitgenerator_curand.inl" +#include "cupynumeric/random/bitgenerator_curand.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -46,13 +46,13 @@ struct GPUGenerator : public CURANDGenerator { GPUGenerator(BitGeneratorType gentype, uint64_t seed, uint64_t generatorId, uint32_t flags) : CURANDGenerator(gentype, seed, generatorId) { - CUNUMERIC_CHECK_CUDA(::cudaStreamCreate(&stream_)); + CUPYNUMERIC_CHECK_CUDA(::cudaStreamCreate(&stream_)); CHECK_CURAND(::randutilCreateGenerator(&gen_, type_, seed, generatorId, flags, stream_)); } virtual ~GPUGenerator() { - CUNUMERIC_CHECK_CUDA(::cudaStreamSynchronize(stream_)); + CUPYNUMERIC_CHECK_CUDA(::cudaStreamSynchronize(stream_)); CHECK_CURAND(::randutilDestroyGenerator(gen_)); } }; @@ -87,4 +87,4 @@ void destroy_bitgenerator(const legate::Processor& proc) BitGeneratorImplBody::destroy_bitgenerator(proc); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/bitgenerator.h b/src/cupynumeric/random/bitgenerator.h similarity index 89% rename from src/cunumeric/random/bitgenerator.h rename to src/cupynumeric/random/bitgenerator.h index fff2d3a617..9ace4b0356 100644 --- a/src/cunumeric/random/bitgenerator.h +++ b/src/cupynumeric/random/bitgenerator.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/random/bitgenerator_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/random/bitgenerator_util.h" -namespace cunumeric { +namespace cupynumeric { struct BitGeneratorArgs { BitGeneratorOperation bitgen_op; @@ -78,9 +78,10 @@ struct BitGeneratorArgs { } }; -class BitGeneratorTask : public CuNumericTask { +class 
BitGeneratorTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BITGENERATOR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BITGENERATOR}}; public: static void cpu_variant(legate::TaskContext context); @@ -96,4 +97,4 @@ class BitGeneratorTask : public CuNumericTask { void destroy_bitgenerator(const legate::Processor& proc); -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/bitgenerator_curand.inl b/src/cupynumeric/random/bitgenerator_curand.inl similarity index 99% rename from src/cunumeric/random/bitgenerator_curand.inl rename to src/cupynumeric/random/bitgenerator_curand.inl index ddf19b0244..94058161ea 100644 --- a/src/cunumeric/random/bitgenerator_curand.inl +++ b/src/cupynumeric/random/bitgenerator_curand.inl @@ -21,14 +21,14 @@ #include #include -#include "cunumeric/random/bitgenerator.h" -#include "cunumeric/random/bitgenerator_template.inl" -#include "cunumeric/random/bitgenerator_util.h" +#include "cupynumeric/random/bitgenerator.h" +#include "cupynumeric/random/bitgenerator_template.inl" +#include "cupynumeric/random/bitgenerator_util.h" -#include "cunumeric/random/rnd_types.h" -#include "cunumeric/random/randutil/randutil.h" +#include "cupynumeric/random/rnd_types.h" +#include "cupynumeric/random/randutil/randutil.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -1633,4 +1633,4 @@ struct BitGeneratorImplBody { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/bitgenerator_template.inl b/src/cupynumeric/random/bitgenerator_template.inl similarity index 97% rename from src/cunumeric/random/bitgenerator_template.inl rename to src/cupynumeric/random/bitgenerator_template.inl index 7ed2467a0d..a9375acd25 100644 --- a/src/cunumeric/random/bitgenerator_template.inl +++ b/src/cupynumeric/random/bitgenerator_template.inl @@ -16,12 +16,12 @@ #pragma once 
-#include "cunumeric/arg.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/pitches.h" #include "bitgenerator_util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -128,4 +128,4 @@ static void bitgenerator_template(TaskContext& context) BitGeneratorImpl{}(args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/random/bitgenerator_util.h b/src/cupynumeric/random/bitgenerator_util.h new file mode 100644 index 0000000000..ba87111289 --- /dev/null +++ b/src/cupynumeric/random/bitgenerator_util.h @@ -0,0 +1,98 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +// Match these to BitGeneratorOperation in config.py +enum class BitGeneratorOperation : int32_t { + CREATE = CUPYNUMERIC_BITGENOP_CREATE, + DESTROY = CUPYNUMERIC_BITGENOP_DESTROY, + RAND_RAW = CUPYNUMERIC_BITGENOP_RAND_RAW, + DISTRIBUTION = CUPYNUMERIC_BITGENOP_DISTRIBUTION, +}; + +// Match these to BitGeneratorType in config.py +enum class BitGeneratorType : uint32_t { + DEFAULT = CUPYNUMERIC_BITGENTYPE_DEFAULT, + XORWOW = CUPYNUMERIC_BITGENTYPE_XORWOW, + MRG32K3A = CUPYNUMERIC_BITGENTYPE_MRG32K3A, + MTGP32 = CUPYNUMERIC_BITGENTYPE_MTGP32, + MT19937 = CUPYNUMERIC_BITGENTYPE_MT19937, + PHILOX4_32_10 = CUPYNUMERIC_BITGENTYPE_PHILOX4_32_10, +}; + +// Match these to BitGeneratorDistribution in config.py +enum class BitGeneratorDistribution : int32_t { + INTEGERS_16 = CUPYNUMERIC_BITGENDIST_INTEGERS_16, + INTEGERS_32 = CUPYNUMERIC_BITGENDIST_INTEGERS_32, + INTEGERS_64 = CUPYNUMERIC_BITGENDIST_INTEGERS_64, + UNIFORM_32 = CUPYNUMERIC_BITGENDIST_UNIFORM_32, + UNIFORM_64 = CUPYNUMERIC_BITGENDIST_UNIFORM_64, + LOGNORMAL_32 = CUPYNUMERIC_BITGENDIST_LOGNORMAL_32, + LOGNORMAL_64 = CUPYNUMERIC_BITGENDIST_LOGNORMAL_64, + NORMAL_32 = CUPYNUMERIC_BITGENDIST_NORMAL_32, + NORMAL_64 = CUPYNUMERIC_BITGENDIST_NORMAL_64, + POISSON = CUPYNUMERIC_BITGENDIST_POISSON, + EXPONENTIAL_32 = CUPYNUMERIC_BITGENDIST_EXPONENTIAL_32, + EXPONENTIAL_64 = CUPYNUMERIC_BITGENDIST_EXPONENTIAL_64, + GUMBEL_32 = CUPYNUMERIC_BITGENDIST_GUMBEL_32, + GUMBEL_64 = CUPYNUMERIC_BITGENDIST_GUMBEL_64, + LAPLACE_32 = CUPYNUMERIC_BITGENDIST_LAPLACE_32, + LAPLACE_64 = CUPYNUMERIC_BITGENDIST_LAPLACE_64, + LOGISTIC_32 = CUPYNUMERIC_BITGENDIST_LOGISTIC_32, + LOGISTIC_64 = CUPYNUMERIC_BITGENDIST_LOGISTIC_64, + PARETO_32 = CUPYNUMERIC_BITGENDIST_PARETO_32, + PARETO_64 = CUPYNUMERIC_BITGENDIST_PARETO_64, + POWER_32 = CUPYNUMERIC_BITGENDIST_POWER_32, + POWER_64 = CUPYNUMERIC_BITGENDIST_POWER_64, + RAYLEIGH_32 = 
CUPYNUMERIC_BITGENDIST_RAYLEIGH_32, + RAYLEIGH_64 = CUPYNUMERIC_BITGENDIST_RAYLEIGH_64, + CAUCHY_32 = CUPYNUMERIC_BITGENDIST_CAUCHY_32, + CAUCHY_64 = CUPYNUMERIC_BITGENDIST_CAUCHY_64, + TRIANGULAR_32 = CUPYNUMERIC_BITGENDIST_TRIANGULAR_32, + TRIANGULAR_64 = CUPYNUMERIC_BITGENDIST_TRIANGULAR_64, + WEIBULL_32 = CUPYNUMERIC_BITGENDIST_WEIBULL_32, + WEIBULL_64 = CUPYNUMERIC_BITGENDIST_WEIBULL_64, + BYTES = CUPYNUMERIC_BITGENDIST_BYTES, + BETA_32 = CUPYNUMERIC_BITGENDIST_BETA_32, + BETA_64 = CUPYNUMERIC_BITGENDIST_BETA_64, + F_32 = CUPYNUMERIC_BITGENDIST_F_32, + F_64 = CUPYNUMERIC_BITGENDIST_F_64, + LOGSERIES = CUPYNUMERIC_BITGENDIST_LOGSERIES, + NONCENTRAL_F_32 = CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_32, + NONCENTRAL_F_64 = CUPYNUMERIC_BITGENDIST_NONCENTRAL_F_64, + CHISQUARE_32 = CUPYNUMERIC_BITGENDIST_CHISQUARE_32, + CHISQUARE_64 = CUPYNUMERIC_BITGENDIST_CHISQUARE_64, + GAMMA_32 = CUPYNUMERIC_BITGENDIST_GAMMA_32, + GAMMA_64 = CUPYNUMERIC_BITGENDIST_GAMMA_64, + STANDARD_T_32 = CUPYNUMERIC_BITGENDIST_STANDARD_T_32, + STANDARD_T_64 = CUPYNUMERIC_BITGENDIST_STANDARD_T_64, + HYPERGEOMETRIC = CUPYNUMERIC_BITGENDIST_HYPERGEOMETRIC, + VONMISES_32 = CUPYNUMERIC_BITGENDIST_VONMISES_32, + VONMISES_64 = CUPYNUMERIC_BITGENDIST_VONMISES_64, + ZIPF = CUPYNUMERIC_BITGENDIST_ZIPF, + GEOMETRIC = CUPYNUMERIC_BITGENDIST_GEOMETRIC, + WALD_32 = CUPYNUMERIC_BITGENDIST_WALD_32, + WALD_64 = CUPYNUMERIC_BITGENDIST_WALD_64, + BINOMIAL = CUPYNUMERIC_BITGENDIST_BINOMIAL, + NEGATIVE_BINOMIAL = CUPYNUMERIC_BITGENDIST_NEGATIVE_BINOMIAL, +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/random/curand_help.h b/src/cupynumeric/random/curand_help.h similarity index 75% rename from src/cunumeric/random/curand_help.h rename to src/cupynumeric/random/curand_help.h index 39847860aa..b30ac6b117 100644 --- a/src/cunumeric/random/curand_help.h +++ b/src/cupynumeric/random/curand_help.h @@ -38,7 +38,7 @@ randutil_check_curand_device(__result__, __FILE__, __LINE__); \ } while (false) -namespace 
cunumeric { +namespace cupynumeric { legate::Logger& randutil_log(); void randutil_check_curand(curandStatus_t error, std::string_view file, int line); @@ -46,19 +46,19 @@ void randutil_check_curand(curandStatus_t error, std::string_view file, int line // void randutil_check_curand_device(curandStatus_t error, const char* file, int line); -static inline curandRngType get_curandRngType(cunumeric::BitGeneratorType kind) +static inline curandRngType get_curandRngType(cupynumeric::BitGeneratorType kind) { switch (kind) { - case cunumeric::BitGeneratorType::DEFAULT: return curandRngType::CURAND_RNG_PSEUDO_XORWOW; - case cunumeric::BitGeneratorType::XORWOW: return curandRngType::CURAND_RNG_PSEUDO_XORWOW; - case cunumeric::BitGeneratorType::MRG32K3A: return curandRngType::CURAND_RNG_PSEUDO_MRG32K3A; - case cunumeric::BitGeneratorType::MTGP32: return curandRngType::CURAND_RNG_PSEUDO_MTGP32; - case cunumeric::BitGeneratorType::MT19937: return curandRngType::CURAND_RNG_PSEUDO_MT19937; - case cunumeric::BitGeneratorType::PHILOX4_32_10: + case cupynumeric::BitGeneratorType::DEFAULT: return curandRngType::CURAND_RNG_PSEUDO_XORWOW; + case cupynumeric::BitGeneratorType::XORWOW: return curandRngType::CURAND_RNG_PSEUDO_XORWOW; + case cupynumeric::BitGeneratorType::MRG32K3A: return curandRngType::CURAND_RNG_PSEUDO_MRG32K3A; + case cupynumeric::BitGeneratorType::MTGP32: return curandRngType::CURAND_RNG_PSEUDO_MTGP32; + case cupynumeric::BitGeneratorType::MT19937: return curandRngType::CURAND_RNG_PSEUDO_MT19937; + case cupynumeric::BitGeneratorType::PHILOX4_32_10: return curandRngType::CURAND_RNG_PSEUDO_PHILOX4_32_10; default: LEGATE_ABORT("Unknown generator type"); } return curandRngType::CURAND_RNG_TEST; } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/philox.h b/src/cupynumeric/random/philox.h similarity index 97% rename from src/cunumeric/random/philox.h rename to src/cupynumeric/random/philox.h index acada656e4..5666d6def7 100644 --- 
a/src/cunumeric/random/philox.h +++ b/src/cupynumeric/random/philox.h @@ -26,7 +26,7 @@ #endif #endif -namespace cunumeric { +namespace cupynumeric { template class Philox_2x32 { @@ -118,7 +118,7 @@ class Philox_2x32 { // This syntax is only supported on >= c++17 const float scale = 0x1.p-32; // 2^-32 #else - const float scale = 0.00000000023283064365386962890625; + const float scale = 0.00000000023283064365386962890625; #endif return (bits * scale); } @@ -139,4 +139,4 @@ class Philox_2x32 { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand.cc b/src/cupynumeric/random/rand.cc similarity index 85% rename from src/cunumeric/random/rand.cc rename to src/cupynumeric/random/rand.cc index 690a81fdeb..4dffe7d5cf 100644 --- a/src/cunumeric/random/rand.cc +++ b/src/cupynumeric/random/rand.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/random/rand.h" -#include "cunumeric/random/rand_template.inl" +#include "cupynumeric/random/rand.h" +#include "cupynumeric/random/rand_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -48,7 +48,10 @@ struct RandImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { RandTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + RandTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand.cu b/src/cupynumeric/random/rand.cu similarity index 89% rename from src/cunumeric/random/rand.cu rename to src/cupynumeric/random/rand.cu index 85ab708376..7d02293c51 100644 --- a/src/cunumeric/random/rand.cu +++ b/src/cupynumeric/random/rand.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/random/rand.h" -#include "cunumeric/random/rand_template.inl" +#include "cupynumeric/random/rand.h" +#include "cupynumeric/random/rand_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" 
-namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) rand_kernel( @@ -50,7 +50,7 @@ struct RandImplBody { auto stream = get_cached_stream(); rand_kernel<<>>( volume, out, rng, strides, pitches, rect.lo); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -59,4 +59,4 @@ struct RandImplBody { rand_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand.h b/src/cupynumeric/random/rand.h similarity index 78% rename from src/cunumeric/random/rand.h rename to src/cupynumeric/random/rand.h index 1269c25df0..f9d2d9ec17 100644 --- a/src/cunumeric/random/rand.h +++ b/src/cupynumeric/random/rand.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/random/rand_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/random/rand_util.h" -namespace cunumeric { +namespace cupynumeric { struct RandArgs { legate::PhysicalStore out; @@ -29,9 +29,9 @@ struct RandArgs { std::vector args; }; -class RandTask : public CuNumericTask { +class RandTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_RAND}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_RAND}}; public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +43,4 @@ class RandTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand_omp.cc b/src/cupynumeric/random/rand_omp.cc similarity index 91% rename from src/cunumeric/random/rand_omp.cc rename to src/cupynumeric/random/rand_omp.cc index 089a8829e6..6eaaccdca8 100644 --- a/src/cunumeric/random/rand_omp.cc +++ b/src/cupynumeric/random/rand_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/random/rand.h" -#include "cunumeric/random/rand_template.inl" +#include 
"cupynumeric/random/rand.h" +#include "cupynumeric/random/rand_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -47,4 +47,4 @@ struct RandImplBody { rand_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand_template.inl b/src/cupynumeric/random/rand_template.inl similarity index 92% rename from src/cunumeric/random/rand_template.inl rename to src/cupynumeric/random/rand_template.inl index 9aac043c44..4db3fec1c2 100644 --- a/src/cunumeric/random/rand_template.inl +++ b/src/cupynumeric/random/rand_template.inl @@ -17,12 +17,12 @@ #pragma once // Useful for IDEs -#include "cunumeric/random/rand.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/pitches.h" +#include "cupynumeric/random/rand.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -89,4 +89,4 @@ static void rand_template(TaskContext& context) op_dispatch(args.gen_code, RandDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/rand_util.h b/src/cupynumeric/random/rand_util.h similarity index 97% rename from src/cunumeric/random/rand_util.h rename to src/cupynumeric/random/rand_util.h index 69412cbc30..b5ff655a7b 100644 --- a/src/cunumeric/random/rand_util.h +++ b/src/cupynumeric/random/rand_util.h @@ -16,13 +16,13 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/random/philox.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/random/philox.h" #define HI_BITS(x) (static_cast((x) >> 32)) -#define LO_BITS(x) (static_cast((x)&0x00000000FFFFFFFF)) +#define LO_BITS(x) (static_cast((x) & 0x00000000FFFFFFFF)) -namespace cunumeric { +namespace cupynumeric { // Match these to RandGenCode in config.py enum class RandGenCode : int32_t { @@ -202,4 +202,4 @@ struct 
RandomGenerator { uint64_t diff; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/random/randutil/generator.cuh b/src/cupynumeric/random/randutil/generator.cuh similarity index 84% rename from src/cunumeric/random/randutil/generator.cuh rename to src/cupynumeric/random/randutil/generator.cuh index b0046bf683..45e1706cd6 100644 --- a/src/cunumeric/random/randutil/generator.cuh +++ b/src/cupynumeric/random/randutil/generator.cuh @@ -18,7 +18,7 @@ #include "generator.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" namespace randutilimpl { static constexpr int blocksPerMultiProcessor = 2; // TODO: refine => number of blocks per mp @@ -73,47 +73,47 @@ struct inner_generator : basegenerato : seed(seed), generatorID(generatorID), stream(stream) { int deviceId; - CUNUMERIC_CHECK_CUDA(::cudaGetDevice(&deviceId)); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA(::cudaGetDevice(&deviceId)); + CUPYNUMERIC_CHECK_CUDA( ::cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); // get number of generators ngenerators = blockDimX * multiProcessorCount * blocksPerMultiProcessor; -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(ngenerators > 0); #endif // allocate buffer for generators state int driverVersion, runtimeVersion; - CUNUMERIC_CHECK_CUDA(::cudaDriverGetVersion(&driverVersion)); - CUNUMERIC_CHECK_CUDA(::cudaRuntimeGetVersion(&runtimeVersion)); + CUPYNUMERIC_CHECK_CUDA(::cudaDriverGetVersion(&driverVersion)); + CUPYNUMERIC_CHECK_CUDA(::cudaRuntimeGetVersion(&runtimeVersion)); asyncsupported = ((driverVersion >= 10020) && (runtimeVersion >= 10020)); if (asyncsupported) { #if (__CUDACC_VER_MAJOR__ > 11 || ((__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 2))) - CUNUMERIC_CHECK_CUDA(::cudaMallocAsync(&generators, ngenerators * sizeof(gen_t), stream)); + CUPYNUMERIC_CHECK_CUDA(::cudaMallocAsync(&generators, ngenerators * sizeof(gen_t), stream)); #else - 
CUNUMERIC_CHECK_CUDA(::cudaMalloc(&generators, ngenerators * sizeof(gen_t))); + CUPYNUMERIC_CHECK_CUDA(::cudaMalloc(&generators, ngenerators * sizeof(gen_t))); #endif } else { - CUNUMERIC_CHECK_CUDA(::cudaMalloc(&generators, ngenerators * sizeof(gen_t))); + CUPYNUMERIC_CHECK_CUDA(::cudaMalloc(&generators, ngenerators * sizeof(gen_t))); } // initialize generators initgenerators<<>>( generators, seed, generatorID); - CUNUMERIC_CHECK_CUDA(::cudaPeekAtLastError()); + CUPYNUMERIC_CHECK_CUDA(::cudaPeekAtLastError()); } virtual void destroy() override { - CUNUMERIC_CHECK_CUDA(::cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(::cudaStreamSynchronize(stream)); if (asyncsupported) { #if (__CUDACC_VER_MAJOR__ > 11 || ((__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 2))) - CUNUMERIC_CHECK_CUDA(::cudaFreeAsync(generators, stream)); + CUPYNUMERIC_CHECK_CUDA(::cudaFreeAsync(generators, stream)); #else - CUNUMERIC_CHECK_CUDA(::cudaFree(generators)); + CUPYNUMERIC_CHECK_CUDA(::cudaFree(generators)); #endif } else { - CUNUMERIC_CHECK_CUDA(::cudaFree(generators)); + CUPYNUMERIC_CHECK_CUDA(::cudaFree(generators)); } generators = nullptr; diff --git a/src/cunumeric/random/randutil/generator.h b/src/cupynumeric/random/randutil/generator.h similarity index 96% rename from src/cunumeric/random/randutil/generator.h rename to src/cupynumeric/random/randutil/generator.h index 50d3b0489f..41b2fe35fe 100644 --- a/src/cunumeric/random/randutil/generator.h +++ b/src/cupynumeric/random/randutil/generator.h @@ -23,7 +23,7 @@ #include "randutil_curand.h" #include "randutil_impl.h" -#include "cunumeric/random/rnd_aliases.h" +#include "cupynumeric/random/rnd_aliases.h" namespace randutilimpl { @@ -42,7 +42,7 @@ struct generatorid { static constexpr int rng_type = RND_RNG_PSEUDO_XORWOW; }; -#ifndef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifndef CUPYNUMERIC_USE_STL_RANDOM_ENGINE // Curand *different* specializations, not possible with only one generator // template <> @@ -64,12 +64,12 @@ 
struct inner_generator : basegenerator inner_generator(uint64_t seed, uint64_t generatorID, stream_t) : seed(seed), generatorID(generatorID) -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE , generator(seed) #endif { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE std::srand(seed); #else curand_init(seed, generatorID, 0, &generator); diff --git a/src/cunumeric/random/randutil/generator_beta.inl b/src/cupynumeric/random/randutil/generator_beta.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_beta.inl rename to src/cupynumeric/random/randutil/generator_beta.inl diff --git a/src/cunumeric/random/randutil/generator_binomial.inl b/src/cupynumeric/random/randutil/generator_binomial.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_binomial.inl rename to src/cupynumeric/random/randutil/generator_binomial.inl diff --git a/src/cunumeric/random/randutil/generator_cauchy.inl b/src/cupynumeric/random/randutil/generator_cauchy.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_cauchy.inl rename to src/cupynumeric/random/randutil/generator_cauchy.inl diff --git a/src/cunumeric/random/randutil/generator_chisquare.inl b/src/cupynumeric/random/randutil/generator_chisquare.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_chisquare.inl rename to src/cupynumeric/random/randutil/generator_chisquare.inl diff --git a/src/cunumeric/random/randutil/generator_create.inl b/src/cupynumeric/random/randutil/generator_create.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_create.inl rename to src/cupynumeric/random/randutil/generator_create.inl diff --git a/src/cunumeric/random/randutil/generator_device.cu b/src/cupynumeric/random/randutil/generator_device.cu similarity index 100% rename from src/cunumeric/random/randutil/generator_device.cu rename to 
src/cupynumeric/random/randutil/generator_device.cu diff --git a/src/cunumeric/random/randutil/generator_device_advanced.cu b/src/cupynumeric/random/randutil/generator_device_advanced.cu similarity index 100% rename from src/cunumeric/random/randutil/generator_device_advanced.cu rename to src/cupynumeric/random/randutil/generator_device_advanced.cu diff --git a/src/cunumeric/random/randutil/generator_device_straightforward.cu b/src/cupynumeric/random/randutil/generator_device_straightforward.cu similarity index 100% rename from src/cunumeric/random/randutil/generator_device_straightforward.cu rename to src/cupynumeric/random/randutil/generator_device_straightforward.cu diff --git a/src/cunumeric/random/randutil/generator_exponential.inl b/src/cupynumeric/random/randutil/generator_exponential.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_exponential.inl rename to src/cupynumeric/random/randutil/generator_exponential.inl diff --git a/src/cunumeric/random/randutil/generator_f.inl b/src/cupynumeric/random/randutil/generator_f.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_f.inl rename to src/cupynumeric/random/randutil/generator_f.inl diff --git a/src/cunumeric/random/randutil/generator_gamma.inl b/src/cupynumeric/random/randutil/generator_gamma.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_gamma.inl rename to src/cupynumeric/random/randutil/generator_gamma.inl diff --git a/src/cunumeric/random/randutil/generator_geometric.inl b/src/cupynumeric/random/randutil/generator_geometric.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_geometric.inl rename to src/cupynumeric/random/randutil/generator_geometric.inl diff --git a/src/cunumeric/random/randutil/generator_gumbel.inl b/src/cupynumeric/random/randutil/generator_gumbel.inl similarity index 93% rename from src/cunumeric/random/randutil/generator_gumbel.inl rename to 
src/cupynumeric/random/randutil/generator_gumbel.inl index e60b8a60bc..dffc255fd0 100644 --- a/src/cunumeric/random/randutil/generator_gumbel.inl +++ b/src/cupynumeric/random/randutil/generator_gumbel.inl @@ -31,7 +31,9 @@ struct gumbel_t { { auto y = randutilimpl::engine_uniform(gen); // y cannot be zero - if (y == 1.0f) return mu; + if (y == 1.0f) { + return mu; + } float lny = ::logf(y); return mu - beta * ::logf(-lny); } @@ -46,7 +48,9 @@ struct gumbel_t { { auto y = randutilimpl::engine_uniform(gen); // y cannot be zero - if (y == 1.0) return mu; + if (y == 1.0) { + return mu; + } double lny = ::log(y); return mu - beta * ::log(-lny); } diff --git a/src/cunumeric/random/randutil/generator_host.cc b/src/cupynumeric/random/randutil/generator_host.cc similarity index 96% rename from src/cunumeric/random/randutil/generator_host.cc rename to src/cupynumeric/random/randutil/generator_host.cc index 0752149222..6a20278c95 100644 --- a/src/cunumeric/random/randutil/generator_host.cc +++ b/src/cupynumeric/random/randutil/generator_host.cc @@ -18,15 +18,15 @@ // CPU Builds: // -#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUNUMERIC_CURAND_FOR_CPU_BUILD) -#define CUNUMERIC_USE_STL_RANDOM_ENGINE +#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUPYNUMERIC_CURAND_FOR_CPU_BUILD) +#define CUPYNUMERIC_USE_STL_RANDOM_ENGINE #endif #include "generator.h" #include "generator_create.inl" -#include "cunumeric/random/rnd_aliases.h" +#include "cupynumeric/random/rnd_aliases.h" -#if LEGATE_DEFINED(CUNUMERIC_CURAND_FOR_CPU_BUILD) +#if LEGATE_DEFINED(CUPYNUMERIC_CURAND_FOR_CPU_BUILD) // the host code of cuRAND try to extern these variables out of nowhere, // so we need to define them somewhere. 
const dim3 blockDim{}; diff --git a/src/cunumeric/random/randutil/generator_host_advanced.cc b/src/cupynumeric/random/randutil/generator_host_advanced.cc similarity index 98% rename from src/cunumeric/random/randutil/generator_host_advanced.cc rename to src/cupynumeric/random/randutil/generator_host_advanced.cc index fc3817c8b5..97d47570f6 100644 --- a/src/cunumeric/random/randutil/generator_host_advanced.cc +++ b/src/cupynumeric/random/randutil/generator_host_advanced.cc @@ -18,8 +18,8 @@ // CPU Builds: // -#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUNUMERIC_CURAND_FOR_CPU_BUILD) -#define CUNUMERIC_USE_STL_RANDOM_ENGINE +#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUPYNUMERIC_CURAND_FOR_CPU_BUILD) +#define CUPYNUMERIC_USE_STL_RANDOM_ENGINE #endif #include "generator.h" diff --git a/src/cunumeric/random/randutil/generator_host_straightforward.cc b/src/cupynumeric/random/randutil/generator_host_straightforward.cc similarity index 98% rename from src/cunumeric/random/randutil/generator_host_straightforward.cc rename to src/cupynumeric/random/randutil/generator_host_straightforward.cc index 3deffdf216..6984fb8664 100644 --- a/src/cunumeric/random/randutil/generator_host_straightforward.cc +++ b/src/cupynumeric/random/randutil/generator_host_straightforward.cc @@ -18,8 +18,8 @@ // CPU Builds: // -#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUNUMERIC_CURAND_FOR_CPU_BUILD) -#define CUNUMERIC_USE_STL_RANDOM_ENGINE +#if !LEGATE_DEFINED(LEGATE_USE_CUDA) && !LEGATE_DEFINED(CUPYNUMERIC_CURAND_FOR_CPU_BUILD) +#define CUPYNUMERIC_USE_STL_RANDOM_ENGINE #endif #include "generator.h" diff --git a/src/cunumeric/random/randutil/generator_hypergeometric.inl b/src/cupynumeric/random/randutil/generator_hypergeometric.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_hypergeometric.inl rename to src/cupynumeric/random/randutil/generator_hypergeometric.inl diff --git a/src/cunumeric/random/randutil/generator_integers.inl 
b/src/cupynumeric/random/randutil/generator_integers.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_integers.inl rename to src/cupynumeric/random/randutil/generator_integers.inl diff --git a/src/cunumeric/random/randutil/generator_laplace.inl b/src/cupynumeric/random/randutil/generator_laplace.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_laplace.inl rename to src/cupynumeric/random/randutil/generator_laplace.inl diff --git a/src/cunumeric/random/randutil/generator_logistic.inl b/src/cupynumeric/random/randutil/generator_logistic.inl similarity index 94% rename from src/cunumeric/random/randutil/generator_logistic.inl rename to src/cupynumeric/random/randutil/generator_logistic.inl index e0db0e4dd4..913ab06028 100644 --- a/src/cunumeric/random/randutil/generator_logistic.inl +++ b/src/cupynumeric/random/randutil/generator_logistic.inl @@ -30,7 +30,9 @@ struct logistic_t { { float y = randutilimpl::engine_uniform(gen); // y cannot be 0 float t = 1.0f / y - 1.0f; - if (t == 0) t = 1.0f; + if (t == 0) { + t = 1.0f; + } return mu - beta * ::logf(t); } }; @@ -44,7 +46,9 @@ struct logistic_t { { auto y = randutilimpl::engine_uniform(gen); // y cannot be 0 auto t = 1.0 / y - 1.0; - if (t == 0) t = 1.0; + if (t == 0) { + t = 1.0; + } return mu - beta * ::log(t); } }; diff --git a/src/cunumeric/random/randutil/generator_lognormal.inl b/src/cupynumeric/random/randutil/generator_lognormal.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_lognormal.inl rename to src/cupynumeric/random/randutil/generator_lognormal.inl diff --git a/src/cunumeric/random/randutil/generator_logseries.inl b/src/cupynumeric/random/randutil/generator_logseries.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_logseries.inl rename to src/cupynumeric/random/randutil/generator_logseries.inl diff --git a/src/cunumeric/random/randutil/generator_negative_binomial.inl 
b/src/cupynumeric/random/randutil/generator_negative_binomial.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_negative_binomial.inl rename to src/cupynumeric/random/randutil/generator_negative_binomial.inl diff --git a/src/cunumeric/random/randutil/generator_normal.inl b/src/cupynumeric/random/randutil/generator_normal.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_normal.inl rename to src/cupynumeric/random/randutil/generator_normal.inl diff --git a/src/cunumeric/random/randutil/generator_pareto.inl b/src/cupynumeric/random/randutil/generator_pareto.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_pareto.inl rename to src/cupynumeric/random/randutil/generator_pareto.inl diff --git a/src/cunumeric/random/randutil/generator_poisson.inl b/src/cupynumeric/random/randutil/generator_poisson.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_poisson.inl rename to src/cupynumeric/random/randutil/generator_poisson.inl diff --git a/src/cunumeric/random/randutil/generator_power.inl b/src/cupynumeric/random/randutil/generator_power.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_power.inl rename to src/cupynumeric/random/randutil/generator_power.inl diff --git a/src/cunumeric/random/randutil/generator_raw.inl b/src/cupynumeric/random/randutil/generator_raw.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_raw.inl rename to src/cupynumeric/random/randutil/generator_raw.inl diff --git a/src/cunumeric/random/randutil/generator_rayleigh.inl b/src/cupynumeric/random/randutil/generator_rayleigh.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_rayleigh.inl rename to src/cupynumeric/random/randutil/generator_rayleigh.inl diff --git a/src/cunumeric/random/randutil/generator_standard_t.inl b/src/cupynumeric/random/randutil/generator_standard_t.inl similarity index 100% rename from 
src/cunumeric/random/randutil/generator_standard_t.inl rename to src/cupynumeric/random/randutil/generator_standard_t.inl diff --git a/src/cunumeric/random/randutil/generator_triangular.inl b/src/cupynumeric/random/randutil/generator_triangular.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_triangular.inl rename to src/cupynumeric/random/randutil/generator_triangular.inl diff --git a/src/cunumeric/random/randutil/generator_uniform.inl b/src/cupynumeric/random/randutil/generator_uniform.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_uniform.inl rename to src/cupynumeric/random/randutil/generator_uniform.inl diff --git a/src/cunumeric/random/randutil/generator_vonmises.inl b/src/cupynumeric/random/randutil/generator_vonmises.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_vonmises.inl rename to src/cupynumeric/random/randutil/generator_vonmises.inl diff --git a/src/cunumeric/random/randutil/generator_wald.inl b/src/cupynumeric/random/randutil/generator_wald.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_wald.inl rename to src/cupynumeric/random/randutil/generator_wald.inl diff --git a/src/cunumeric/random/randutil/generator_weibull.inl b/src/cupynumeric/random/randutil/generator_weibull.inl similarity index 93% rename from src/cunumeric/random/randutil/generator_weibull.inl rename to src/cupynumeric/random/randutil/generator_weibull.inl index 4f0411e2ac..19d2875979 100644 --- a/src/cunumeric/random/randutil/generator_weibull.inl +++ b/src/cupynumeric/random/randutil/generator_weibull.inl @@ -31,7 +31,9 @@ struct weibull_t { float y = randutilimpl::engine_uniform(gen); // y cannot be 0 // log(y) can be zero ! 
auto lny = ::logf(y); - if (lny == 0.0f) return 0.0f; + if (lny == 0.0f) { + return 0.0f; + } return lambda * ::expf(::logf(-lny) * invk); } }; @@ -46,7 +48,9 @@ struct weibull_t { double y = randutilimpl::engine_uniform(gen); // y cannot be 0 // log(y) can be zero ! auto lny = ::log(y); - if (lny == 0.0f) return 0.0f; + if (lny == 0.0f) { + return 0.0f; + } return lambda * ::exp(::log(-lny) * invk); } }; diff --git a/src/cunumeric/random/randutil/generator_zipf.inl b/src/cupynumeric/random/randutil/generator_zipf.inl similarity index 100% rename from src/cunumeric/random/randutil/generator_zipf.inl rename to src/cupynumeric/random/randutil/generator_zipf.inl diff --git a/src/cunumeric/random/randutil/random_distributions.h b/src/cupynumeric/random/randutil/random_distributions.h similarity index 90% rename from src/cunumeric/random/randutil/random_distributions.h rename to src/cupynumeric/random/randutil/random_distributions.h index 244c8d83f4..5e0977309d 100644 --- a/src/cunumeric/random/randutil/random_distributions.h +++ b/src/cupynumeric/random/randutil/random_distributions.h @@ -145,11 +145,15 @@ RANDUTIL_QUALIFIERS double rk_standard_gamma(rk_state* state, double shape) V = rk_standard_exponential(state); if (U <= 1.0 - shape) { X = pow(U, 1. / shape); - if (X <= V) { return X; } + if (X <= V) { + return X; + } } else { Y = -log((1 - U) / shape); X = pow(1.0 - shape + shape * Y, 1. / shape); - if (X <= (V + Y)) { return X; } + if (X <= (V + Y)) { + return X; + } } } } else { @@ -162,8 +166,12 @@ RANDUTIL_QUALIFIERS double rk_standard_gamma(rk_state* state, double shape) } while (V <= 0.0); V = V * V * V; U = rk_double(state); - if (U < 1.0 - 0.0331 * (X * X) * (X * X)) return (b * V); - if (log(U) < 0.5 * X * X + b * (1. - V + log(V))) return (b * V); + if (U < 1.0 - 0.0331 * (X * X) * (X * X)) { + return (b * V); + } + if (log(U) < 0.5 * X * X + b * (1. 
- V + log(V))) { + return (b * V); + } } } } @@ -223,8 +231,12 @@ RANDUTIL_QUALIFIERS long rk_poisson_ptrs(rk_state* state, double lam) V = rk_double(state); us = 0.5 - fabs(U); k = (long)floor((2 * a / us + b) * U + lam + 0.43); - if ((us >= 0.07) && (V <= vr)) { return k; } - if ((k < 0) || ((us < 0.013) && (V > us))) { continue; } + if ((us >= 0.07) && (V <= vr)) { + return k; + } + if ((k < 0) || ((us < 0.013) && (V > us))) { + continue; + } if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= (-lam + k * loglam - loggam(k + 1))) { return k; } @@ -274,7 +286,9 @@ RANDUTIL_QUALIFIERS double rk_chisquare(rk_state* state, double df) template RANDUTIL_QUALIFIERS double rk_noncentral_chisquare(rk_state* state, double df, double nonc) { - if (nonc == 0) { return rk_chisquare(state, df); } + if (nonc == 0) { + return rk_chisquare(state, df); + } if (1 < df) { const double Chi2 = rk_chisquare(state, df - 1); const double N = rk_gauss(state) + sqrt(nonc); @@ -306,7 +320,9 @@ RANDUTIL_QUALIFIERS long rk_logseries(rk_state* state, double p) r = log(1.0 - p); while (1) { V = rk_double(state); - if (V >= p) { return 1; } + if (V >= p) { + return 1; + } U = rk_double(state); q = 1.0 - exp(r * U); if (V <= q * q) { @@ -317,7 +333,9 @@ RANDUTIL_QUALIFIERS long rk_logseries(rk_state* state, double p) return result; } } - if (V >= q) { return 1; } + if (V >= q) { + return 1; + } return 2; } } @@ -373,9 +391,13 @@ RANDUTIL_QUALIFIERS long rk_zipf(rk_state* state, double a) U = 1.0 - rk_double(state); V = rk_double(state); X = floor(pow(U, -1.0 / am1)); - if (X < 1.0) { continue; } + if (X < 1.0) { + continue; + } T = pow(1.0 + 1.0 / X, am1); - if (V * X * (T - 1.0) / (b - 1.0) <= T / b) { return (long)X; } + if (V * X * (T - 1.0) / (b - 1.0) <= T / b) { + return (long)X; + } } } @@ -407,16 +429,22 @@ RANDUTIL_QUALIFIERS double rk_vonmises(rk_state* state, double mu, double kappa) W = (1 + s * Z) / (s + Z); Y = kappa * (s - W); V = rk_double(state); - if ((Y * (2 - Y) - V >= 
0) || (log(Y / V) + 1 - Y >= 0)) { break; } + if ((Y * (2 - Y) - V >= 0) || (log(Y / V) + 1 - Y >= 0)) { + break; + } } U = rk_double(state); result = acos(W); - if (U < 0.5) { result = -result; } + if (U < 0.5) { + result = -result; + } result += mu; neg = (result < 0); mod = fabs(result); mod = (fmod(mod + M_PI, 2 * M_PI) - M_PI); - if (neg) { mod *= -1; } + if (neg) { + mod *= -1; + } return mod; } } @@ -437,10 +465,14 @@ RANDUTIL_QUALIFIERS long rk_hypergeometric_hyp(rk_state* state, long good, long U = rk_double(state); Y -= (long)floor(U + Y / (d1 + K)); K--; - if (K == 0) break; + if (K == 0) { + break; + } } Z = (long)(d2 - Y); - if (good > bad) Z = sample - Z; + if (good > bad) { + Z = sample - Z; + } return Z; } /* D1 = 2*sqrt(2/e) */ @@ -473,20 +505,32 @@ RANDUTIL_QUALIFIERS long rk_hypergeometric_hrua(rk_state* state, long good, long Y = rk_double(state); W = d6 + d8 * (Y - 0.5) / X; /* fast rejection: */ - if ((W < 0.0) || (W >= d11)) continue; + if ((W < 0.0) || (W >= d11)) { + continue; + } Z = (long)floor(W); T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) + loggam(m - Z + 1) + loggam(maxgoodbad - m + Z + 1)); /* fast acceptance: */ - if ((X * (4.0 - X) - 3.0) <= T) break; + if ((X * (4.0 - X) - 3.0) <= T) { + break; + } /* fast rejection: */ - if (X * (X - T) >= 1) continue; - if (2.0 * log(X) <= T) break; /* acceptance */ + if (X * (X - T) >= 1) { + continue; + } + if (2.0 * log(X) <= T) { + break; /* acceptance */ + } } /* this is a correction to HRUA* by Ivan Frohne in rv.py */ - if (good > bad) Z = m - Z; + if (good > bad) { + Z = m - Z; + } /* another fix from rv.py to allow sample to exceed popsize/2 */ - if (m < sample) Z = good - Z; + if (m < sample) { + Z = good - Z; + } return Z; } #undef D1 @@ -531,45 +575,69 @@ RANDUTIL_QUALIFIERS unsigned rk_binomial_btpe(rk_state* state, unsigned n, doubl nrq = n * r * q; u = rk_double(state) * p4; v = rk_double(state); - if (u > p1) goto Step20; + if (u > p1) { + goto Step20; + } y = 
(long)floor(xm - p1 * v + u); goto Step60; Step20: - if (u > p2) goto Step30; + if (u > p2) { + goto Step30; + } x = xl + (u - p1) / c; v = v * c + 1.0 - fabs(m - x + 0.5) / p1; - if (v > 1.0) goto Step10; + if (v > 1.0) { + goto Step10; + } y = (long)floor(x); goto Step50; Step30: - if (u > p3) goto Step40; + if (u > p3) { + goto Step40; + } y = (long)floor(xl + log(v) / laml); - if (y < 0) goto Step10; + if (y < 0) { + goto Step10; + } v = v * (u - p2) * laml; goto Step50; Step40: y = (long)floor(xr - log(v) / lamr); - if (y > n) goto Step10; + if (y > n) { + goto Step10; + } v = v * (u - p3) * lamr; Step50: k = labs(y - m); - if ((k > 20) && (k < ((nrq) / 2.0 - 1))) goto Step52; + if ((k > 20) && (k < ((nrq) / 2.0 - 1))) { + goto Step52; + } s = r / q; a = s * (n + 1); F = 1.0; if (m < y) { - for (i = m + 1; i <= y; i++) { F *= (a / i - s); } + for (i = m + 1; i <= y; i++) { + F *= (a / i - s); + } } else if (m > y) { - for (i = y + 1; i <= m; i++) { F /= (a / i - s); } + for (i = y + 1; i <= m; i++) { + F /= (a / i - s); + } + } + if (v > F) { + goto Step10; } - if (v > F) goto Step10; goto Step60; Step52: rho = (k / (nrq)) * ((k * (k / 3.0 + 0.625) + 0.16666666666666666) / nrq + 0.5); t = -k * k / (2 * nrq); A = log(v); - if (A < (t - rho)) goto Step60; - if (A > (t + rho)) goto Step10; + if (A < (t - rho)) { + goto Step60; + } + if (A > (t + rho)) { + goto Step10; + } x1 = y + 1; f1 = m + 1; z = n + 1 - m; @@ -586,7 +654,9 @@ RANDUTIL_QUALIFIERS unsigned rk_binomial_btpe(rk_state* state, unsigned n, doubl goto Step10; } Step60: - if (p > 0.5) { y = n - y; } + if (p > 0.5) { + y = n - y; + } return (unsigned)y; } diff --git a/src/cunumeric/random/randutil/randomizer.h b/src/cupynumeric/random/randutil/randomizer.h similarity index 94% rename from src/cunumeric/random/randutil/randomizer.h rename to src/cupynumeric/random/randutil/randomizer.h index 4b9fc068db..796f28a47c 100644 --- a/src/cunumeric/random/randutil/randomizer.h +++ 
b/src/cupynumeric/random/randutil/randomizer.h @@ -32,7 +32,7 @@ namespace randutilimpl { template RANDUTIL_QUALIFIERS decltype(auto) engine_uniform(gen_t& gen) { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE std::uniform_real_distribution dis(0, 1); auto y = dis(gen); // returns [0, 1); @@ -52,7 +52,7 @@ RANDUTIL_QUALIFIERS decltype(auto) engine_uniform(gen_t& gen) template RANDUTIL_QUALIFIERS decltype(auto) engine_poisson(gen_t& gen, double lambda) { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE std::poisson_distribution dis(lambda); return dis(gen); #else @@ -63,7 +63,7 @@ RANDUTIL_QUALIFIERS decltype(auto) engine_poisson(gen_t& gen, double lambda) template RANDUTIL_QUALIFIERS decltype(auto) engine_normal(gen_t& gen) { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE std::normal_distribution dis(0, 1); return dis(gen); #else @@ -80,7 +80,7 @@ RANDUTIL_QUALIFIERS decltype(auto) engine_normal(gen_t& gen) template RANDUTIL_QUALIFIERS decltype(auto) engine_log_normal(gen_t& gen, element_t mean, element_t stddev) { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE std::lognormal_distribution dis{mean, stddev}; return dis(gen); #else @@ -97,7 +97,7 @@ RANDUTIL_QUALIFIERS decltype(auto) engine_log_normal(gen_t& gen, element_t mean, template RANDUTIL_QUALIFIERS decltype(auto) engine_rand(gen_t& gen) { -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE return std::rand(); #else return curand(&gen); diff --git a/src/cunumeric/random/randutil/randutil.h b/src/cupynumeric/random/randutil/randutil.h similarity index 99% rename from src/cunumeric/random/randutil/randutil.h rename to src/cupynumeric/random/randutil/randutil.h index e52e8a77ca..31130ce464 100644 --- a/src/cunumeric/random/randutil/randutil.h +++ b/src/cupynumeric/random/randutil/randutil.h @@ -18,7 +18,7 @@ #include // #include -#include 
"cunumeric/random/rnd_aliases.h" +#include "cupynumeric/random/rnd_aliases.h" typedef void* randutilGenerator_t; diff --git a/src/cunumeric/random/randutil/randutil_curand.h b/src/cupynumeric/random/randutil/randutil_curand.h similarity index 100% rename from src/cunumeric/random/randutil/randutil_curand.h rename to src/cupynumeric/random/randutil/randutil_curand.h diff --git a/src/cunumeric/random/randutil/randutil_impl.h b/src/cupynumeric/random/randutil/randutil_impl.h similarity index 100% rename from src/cunumeric/random/randutil/randutil_impl.h rename to src/cupynumeric/random/randutil/randutil_impl.h diff --git a/src/cunumeric/random/rnd_aliases.h b/src/cupynumeric/random/rnd_aliases.h similarity index 98% rename from src/cunumeric/random/rnd_aliases.h rename to src/cupynumeric/random/rnd_aliases.h index 89d3fbebd2..e232e2d61f 100644 --- a/src/cunumeric/random/rnd_aliases.h +++ b/src/cupynumeric/random/rnd_aliases.h @@ -16,7 +16,7 @@ #pragma once -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE // #pragma message("************ STL path *************") diff --git a/src/cunumeric/random/rnd_types.h b/src/cupynumeric/random/rnd_types.h similarity index 66% rename from src/cunumeric/random/rnd_types.h rename to src/cupynumeric/random/rnd_types.h index 0fcab9f489..dfd633792f 100644 --- a/src/cunumeric/random/rnd_types.h +++ b/src/cupynumeric/random/rnd_types.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/random/rnd_aliases.h" +#include "cupynumeric/random/rnd_aliases.h" #include -#ifdef CUNUMERIC_USE_STL_RANDOM_ENGINE +#ifdef CUPYNUMERIC_USE_STL_RANDOM_ENGINE #define CHECK_RND_ENGINE(expr) \ do { \ @@ -29,33 +29,33 @@ // #define randutil_check_curand randutil_check_status -namespace cunumeric { +namespace cupynumeric { legate::Logger& randutil_log(); void randutil_check_status(rnd_status_t error, std::string_view, int line); -static inline randRngType get_rndRngType(cunumeric::BitGeneratorType kind) +static inline 
randRngType get_rndRngType(cupynumeric::BitGeneratorType kind) { // for now, all generator types rerouted to STL // would use the MT19937 generator; perhaps, // this might become more flexible in the future; // switch (kind) { - case cunumeric::BitGeneratorType::DEFAULT: return randRngType::STL_MT_19937; - case cunumeric::BitGeneratorType::XORWOW: return randRngType::STL_MT_19937; - case cunumeric::BitGeneratorType::MRG32K3A: return randRngType::STL_MT_19937; - case cunumeric::BitGeneratorType::MTGP32: return randRngType::STL_MT_19937; - case cunumeric::BitGeneratorType::MT19937: return randRngType::STL_MT_19937; - case cunumeric::BitGeneratorType::PHILOX4_32_10: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::DEFAULT: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::XORWOW: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::MRG32K3A: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::MTGP32: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::MT19937: return randRngType::STL_MT_19937; + case cupynumeric::BitGeneratorType::PHILOX4_32_10: return randRngType::STL_MT_19937; default: LEGATE_ABORT("Unsupported random generator."); } return randRngType::RND_RNG_TEST; } -} // namespace cunumeric +} // namespace cupynumeric #else -#include "cunumeric/random/curand_help.h" +#include "cupynumeric/random/curand_help.h" #define CHECK_RND_ENGINE(...) CHECK_CURAND(__VA_ARGS__) #define get_rndRngType get_curandRngType diff --git a/src/cupynumeric/runtime.cc b/src/cupynumeric/runtime.cc new file mode 100644 index 0000000000..db0a63bf8d --- /dev/null +++ b/src/cupynumeric/runtime.cc @@ -0,0 +1,192 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "env_defaults.h" +#include "cupynumeric/runtime.h" + +#include "cupynumeric/ndarray.h" +#include "cupynumeric/unary/unary_red_util.h" + +#include +#include +#include + +namespace cupynumeric { + +/*static*/ CuPyNumericRuntime* CuPyNumericRuntime::runtime_; + +extern void bootstrapping_callback(Legion::Machine machine, + Legion::Runtime* runtime, + const std::set& local_procs); + +void initialize(int32_t argc, char** argv) { cupynumeric_perform_registration(); } + +CuPyNumericRuntime::CuPyNumericRuntime(legate::Runtime* legate_runtime, legate::Library library) + : legate_runtime_(legate_runtime), library_(library) +{ +} + +NDArray CuPyNumericRuntime::create_array(const legate::Type& type) +{ + auto store = legate_runtime_->create_store(type); + return NDArray(std::move(store)); +} + +NDArray CuPyNumericRuntime::create_array(std::vector shape, + const legate::Type& type, + bool optimize_scalar) +{ + auto store = legate_runtime_->create_store(legate::Shape{shape}, type, optimize_scalar); + return NDArray(std::move(store)); +} + +NDArray CuPyNumericRuntime::create_array(legate::LogicalStore&& store) +{ + return NDArray(std::move(store)); +} + +NDArray CuPyNumericRuntime::create_array(const legate::Type& type, int32_t dim) +{ + auto store = legate_runtime_->create_store(type, dim); + return NDArray(std::move(store)); +} + +legate::LogicalStore CuPyNumericRuntime::create_scalar_store(const Scalar& value) +{ + return legate_runtime_->create_store(value); +} + +legate::Type CuPyNumericRuntime::get_argred_type(const legate::Type& 
value_type) +{ + auto finder = argred_types_.find(value_type.code()); + if (finder != argred_types_.end()) { + return finder->second; + } + + auto argred_type = legate::struct_type({legate::int64(), value_type}, true /*align*/); + argred_types_.insert({value_type.code(), argred_type}); + return argred_type; +} + +legate::AutoTask CuPyNumericRuntime::create_task(CuPyNumericOpCode op_code) +{ + return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code}); +} + +legate::ManualTask CuPyNumericRuntime::create_task(CuPyNumericOpCode op_code, + const legate::tuple& launch_shape) +{ + return legate_runtime_->create_task(library_, legate::LocalTaskID{op_code}, launch_shape); +} + +void CuPyNumericRuntime::submit(legate::AutoTask&& task) +{ + legate_runtime_->submit(std::move(task)); +} + +void CuPyNumericRuntime::submit(legate::ManualTask&& task) +{ + legate_runtime_->submit(std::move(task)); +} + +uint32_t CuPyNumericRuntime::get_next_random_epoch() { return next_epoch_++; } + +/*static*/ CuPyNumericRuntime* CuPyNumericRuntime::get_runtime() { return runtime_; } + +/*static*/ void CuPyNumericRuntime::initialize(legate::Runtime* legate_runtime, + legate::Library library) +{ + runtime_ = new CuPyNumericRuntime(legate_runtime, library); +} + +namespace { + +std::uint32_t parse_value(const char* value_char) +{ + auto value_sv = std::string_view{value_char}; + + std::uint32_t result{}; + if (auto&& [_, ec] = std::from_chars(value_sv.begin(), value_sv.end(), result); + ec != std::errc{}) { + throw std::runtime_error{std::make_error_code(ec).message()}; + } + + return result; +} + +} // namespace + +bool is_in_test_mode() +{ + static const auto value = [] { + const auto* is_in_test_mode = std::getenv("LEGATE_TEST"); + return is_in_test_mode && static_cast(parse_value(is_in_test_mode)); + }(); + + return value; +} + +namespace { + +std::uint32_t extract_env(const char* env_name, + std::uint32_t default_value, + std::uint32_t test_value) +{ + if (const auto* 
env_value = std::getenv(env_name); env_value) { + return parse_value(env_value); + } + + if (is_in_test_mode()) { + return test_value; + } + + return default_value; +} + +} // namespace + +} // namespace cupynumeric + +extern "C" { + +unsigned cupynumeric_max_eager_volume() +{ + static const auto min_gpu_chunk = cupynumeric::extract_env( + "CUPYNUMERIC_MIN_GPU_CHUNK", MIN_GPU_CHUNK_DEFAULT, MIN_GPU_CHUNK_TEST); + static const auto min_cpu_chunk = cupynumeric::extract_env( + "CUPYNUMERIC_MIN_CPU_CHUNK", MIN_CPU_CHUNK_DEFAULT, MIN_CPU_CHUNK_TEST); + static const auto min_omp_chunk = cupynumeric::extract_env( + "CUPYNUMERIC_MIN_OMP_CHUNK", MIN_OMP_CHUNK_DEFAULT, MIN_OMP_CHUNK_TEST); + + auto machine = legate::get_machine(); + + if (machine.count(legate::mapping::TaskTarget::GPU) > 0) { + return min_gpu_chunk; + } + if (machine.count(legate::mapping::TaskTarget::OMP) > 0) { + return min_omp_chunk; + } + return min_cpu_chunk; +} + +unsigned cupynumeric_matmul_cache_size() +{ + static const auto max_cache_size = cupynumeric::extract_env( + "CUPYNUMERIC_MATMUL_CACHE_SIZE", MATMUL_CACHE_SIZE_DEFAULT, MATMUL_CACHE_SIZE_TEST); + return max_cache_size; +} + +} // extern "C" diff --git a/src/cunumeric/runtime.h b/src/cupynumeric/runtime.h similarity index 78% rename from src/cunumeric/runtime.h rename to src/cupynumeric/runtime.h index ce8fa49247..da2884467e 100644 --- a/src/cunumeric/runtime.h +++ b/src/cupynumeric/runtime.h @@ -20,16 +20,16 @@ #include "legate.h" -#include "cunumeric/cunumeric_c.h" -#include "cunumeric/typedefs.h" +#include "cupynumeric/cupynumeric_c.h" +#include "cupynumeric/typedefs.h" -namespace cunumeric { +namespace cupynumeric { class NDArray; -class CuNumericRuntime { +class CuPyNumericRuntime { private: - CuNumericRuntime(legate::Runtime* legate_runtime, legate::Library library); + CuPyNumericRuntime(legate::Runtime* legate_runtime, legate::Library library); public: NDArray create_array(const legate::Type& type); @@ -44,8 +44,8 @@ class 
CuNumericRuntime { legate::Type get_argred_type(const legate::Type& value_type); public: - legate::AutoTask create_task(CuNumericOpCode op_code); - legate::ManualTask create_task(CuNumericOpCode op_code, + legate::AutoTask create_task(CuPyNumericOpCode op_code); + legate::ManualTask create_task(CuPyNumericOpCode op_code, const legate::tuple& launch_shape); void submit(legate::AutoTask&& task); void submit(legate::ManualTask&& task); @@ -57,11 +57,11 @@ class CuNumericRuntime { legate::Library get_library() const { return library_; } public: - static CuNumericRuntime* get_runtime(); + static CuPyNumericRuntime* get_runtime(); static void initialize(legate::Runtime* legate_runtime, legate::Library library); private: - static CuNumericRuntime* runtime_; + static CuPyNumericRuntime* runtime_; private: legate::Runtime* legate_runtime_; @@ -70,4 +70,6 @@ class CuNumericRuntime { std::unordered_map argred_types_; }; -} // namespace cunumeric +[[nodiscard]] bool is_in_test_mode(); + +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_global.cc b/src/cupynumeric/scan/scan_global.cc similarity index 92% rename from src/cunumeric/scan/scan_global.cc rename to src/cupynumeric/scan/scan_global.cc index 965ce98ec6..e80a3c86e8 100644 --- a/src/cunumeric/scan/scan_global.cc +++ b/src/cupynumeric/scan/scan_global.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/scan/scan_global.h" -#include "cunumeric/scan/scan_global_template.inl" +#include "cupynumeric/scan/scan_global.h" +#include "cupynumeric/scan/scan_global_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -74,10 +74,10 @@ struct ScanGlobalImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { ScanGlobalTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git 
a/src/cunumeric/scan/scan_global.cu b/src/cupynumeric/scan/scan_global.cu similarity index 91% rename from src/cunumeric/scan/scan_global.cu rename to src/cupynumeric/scan/scan_global.cu index fd2fb603ee..6e05d6457a 100644 --- a/src/cunumeric/scan/scan_global.cu +++ b/src/cupynumeric/scan/scan_global.cu @@ -14,15 +14,15 @@ * */ -#include "cunumeric/scan/scan_global.h" -#include "cunumeric/scan/scan_global_template.inl" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/scan/scan_global.h" +#include "cupynumeric/scan/scan_global_template.inl" +#include "cupynumeric/utilities/thrust_util.h" #include -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -79,7 +79,7 @@ struct ScanGlobalImplBody { scalar_kernel<<>>( stride, func, &outptr[index], global_prefix); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -88,4 +88,4 @@ struct ScanGlobalImplBody { scan_global_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_global.h b/src/cupynumeric/scan/scan_global.h similarity index 77% rename from src/cunumeric/scan/scan_global.h rename to src/cupynumeric/scan/scan_global.h index 908520ccc3..cd94729c7d 100644 --- a/src/cunumeric/scan/scan_global.h +++ b/src/cupynumeric/scan/scan_global.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/scan/scan_util.h" -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/scan/scan_util.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ScanGlobalArgs { legate::PhysicalStore sum_vals; @@ -28,9 +28,10 @@ struct ScanGlobalArgs { const legate::DomainPoint& partition_index; }; -class ScanGlobalTask : public CuNumericTask { +class ScanGlobalTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SCAN_GLOBAL}; + static inline const 
auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SCAN_GLOBAL}}; public: static void cpu_variant(legate::TaskContext context); @@ -42,4 +43,4 @@ class ScanGlobalTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_global_omp.cc b/src/cupynumeric/scan/scan_global_omp.cc similarity index 94% rename from src/cunumeric/scan/scan_global_omp.cc rename to src/cupynumeric/scan/scan_global_omp.cc index f9f1355fa7..0d6fcdb918 100644 --- a/src/cunumeric/scan/scan_global_omp.cc +++ b/src/cupynumeric/scan/scan_global_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/scan/scan_global.h" -#include "cunumeric/scan/scan_global_template.inl" +#include "cupynumeric/scan/scan_global.h" +#include "cupynumeric/scan/scan_global_template.inl" #include #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -75,4 +75,4 @@ struct ScanGlobalImplBody { scan_global_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_global_template.inl b/src/cupynumeric/scan/scan_global_template.inl similarity index 95% rename from src/cunumeric/scan/scan_global_template.inl rename to src/cupynumeric/scan/scan_global_template.inl index 15fda91abf..34c39f6140 100644 --- a/src/cunumeric/scan/scan_global_template.inl +++ b/src/cupynumeric/scan/scan_global_template.inl @@ -14,10 +14,10 @@ * */ -#include "cunumeric/scan/scan_util.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/scan/scan_util.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -76,4 +76,4 @@ static void scan_global_template(TaskContext& context) op_dispatch(args.op_code, ScanGlobalDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local.cc b/src/cupynumeric/scan/scan_local.cc similarity index 92% rename from 
src/cunumeric/scan/scan_local.cc rename to src/cupynumeric/scan/scan_local.cc index f5387b05de..b0663fe836 100644 --- a/src/cunumeric/scan/scan_local.cc +++ b/src/cupynumeric/scan/scan_local.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/scan/scan_local.h" -#include "cunumeric/scan/scan_local_template.inl" -#include "cunumeric/unary/isnan.h" +#include "cupynumeric/scan/scan_local.h" +#include "cupynumeric/scan/scan_local_template.inl" +#include "cupynumeric/unary/isnan.h" #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -70,7 +70,7 @@ struct ScanLocalNanImplBody { struct convert_nan_func { VAL operator()(VAL x) const { - return cunumeric::is_nan(x) ? (VAL)ScanOp::nan_identity : x; + return cupynumeric::is_nan(x) ? (VAL)ScanOp::nan_identity : x; } }; @@ -116,10 +116,10 @@ struct ScanLocalNanImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { ScanLocalTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local.cu b/src/cupynumeric/scan/scan_local.cu similarity index 90% rename from src/cunumeric/scan/scan_local.cu rename to src/cupynumeric/scan/scan_local.cu index 2ded8c7056..fe7938b53c 100644 --- a/src/cunumeric/scan/scan_local.cu +++ b/src/cupynumeric/scan/scan_local.cu @@ -14,17 +14,17 @@ * */ -#include "cunumeric/scan/scan_local.h" -#include "cunumeric/scan/scan_local_template.inl" -#include "cunumeric/unary/isnan.h" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/scan/scan_local.h" +#include "cupynumeric/scan/scan_local_template.inl" +#include "cupynumeric/unary/isnan.h" +#include "cupynumeric/utilities/thrust_util.h" #include #include -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -75,7 +75,7 @@ 
struct ScanLocalImplBody { lazy_kernel<<<1, THREADS_PER_BLOCK, 0, stream>>>(&outptr[index + stride - 1], &sum_valsptr[sum_valp]); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -87,7 +87,7 @@ struct ScanLocalNanImplBody { struct convert_nan_func { __device__ VAL operator()(VAL x) { - return cunumeric::is_nan(x) ? (VAL)ScanOp::nan_identity : x; + return cupynumeric::is_nan(x) ? (VAL)ScanOp::nan_identity : x; } }; @@ -126,7 +126,7 @@ struct ScanLocalNanImplBody { lazy_kernel<<<1, THREADS_PER_BLOCK, 0, stream>>>(&outptr[index + stride - 1], &sum_valsptr[sum_valp]); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -135,4 +135,4 @@ struct ScanLocalNanImplBody { scan_local_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local.h b/src/cupynumeric/scan/scan_local.h similarity index 63% rename from src/cunumeric/scan/scan_local.h rename to src/cupynumeric/scan/scan_local.h index 30b3a2ab87..c143877010 100644 --- a/src/cunumeric/scan/scan_local.h +++ b/src/cupynumeric/scan/scan_local.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/scan/scan_util.h" -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/scan/scan_util.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ScanLocalArgs { legate::PhysicalStore out; @@ -29,9 +29,14 @@ struct ScanLocalArgs { bool nan_to_identity; }; -class ScanLocalTask : public CuNumericTask { +class ScanLocalTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SCAN_LOCAL}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SCAN_LOCAL}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static 
constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +48,4 @@ class ScanLocalTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local_omp.cc b/src/cupynumeric/scan/scan_local_omp.cc similarity index 93% rename from src/cunumeric/scan/scan_local_omp.cc rename to src/cupynumeric/scan/scan_local_omp.cc index 9e4e9f0639..d78a7c21ec 100644 --- a/src/cunumeric/scan/scan_local_omp.cc +++ b/src/cupynumeric/scan/scan_local_omp.cc @@ -14,9 +14,9 @@ * */ -#include "cunumeric/scan/scan_local.h" -#include "cunumeric/scan/scan_local_template.inl" -#include "cunumeric/unary/isnan.h" +#include "cupynumeric/scan/scan_local.h" +#include "cupynumeric/scan/scan_local_template.inl" +#include "cupynumeric/unary/isnan.h" #include #include @@ -24,7 +24,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -72,7 +72,7 @@ struct ScanLocalNanImplBody { struct convert_nan_func { VAL operator()(VAL x) const { - return cunumeric::is_nan(x) ? (VAL)ScanOp::nan_identity : x; + return cupynumeric::is_nan(x) ? 
(VAL)ScanOp::nan_identity : x; } }; @@ -116,4 +116,4 @@ struct ScanLocalNanImplBody { scan_local_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local_template.inl b/src/cupynumeric/scan/scan_local_template.inl similarity index 96% rename from src/cunumeric/scan/scan_local_template.inl rename to src/cupynumeric/scan/scan_local_template.inl index 889f9007fb..bfb9095f4e 100644 --- a/src/cunumeric/scan/scan_local_template.inl +++ b/src/cupynumeric/scan/scan_local_template.inl @@ -14,10 +14,10 @@ * */ -#include "cunumeric/scan/scan_util.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/scan/scan_util.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -104,4 +104,4 @@ static void scan_local_template(TaskContext& context) op_dispatch(args.op_code, args.nan_to_identity, ScanLocalDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_local_util.h b/src/cupynumeric/scan/scan_local_util.h similarity index 92% rename from src/cunumeric/scan/scan_local_util.h rename to src/cupynumeric/scan/scan_local_util.h index 38a0420159..660ad13b23 100644 --- a/src/cunumeric/scan/scan_local_util.h +++ b/src/cupynumeric/scan/scan_local_util.h @@ -16,15 +16,15 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" #include -namespace cunumeric { +namespace cupynumeric { enum class ScanCode : int { - PROD = CUNUMERIC_SCAN_PROD, - SUM = CUNUMERIC_SCAN_SUM, + PROD = CUPYNUMERIC_SCAN_PROD, + SUM = CUPYNUMERIC_SCAN_SUM, }; template @@ -67,4 +67,4 @@ struct ScanOp : thrust::multiplies> ScanOp() {} }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/scan/scan_util.h b/src/cupynumeric/scan/scan_util.h similarity index 94% rename from src/cunumeric/scan/scan_util.h rename to src/cupynumeric/scan/scan_util.h index 84e05fced5..6e32de4ac2 
100644 --- a/src/cunumeric/scan/scan_util.h +++ b/src/cupynumeric/scan/scan_util.h @@ -16,15 +16,15 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" #include -namespace cunumeric { +namespace cupynumeric { enum class ScanCode : int { - PROD = CUNUMERIC_SCAN_PROD, - SUM = CUNUMERIC_SCAN_SUM, + PROD = CUPYNUMERIC_SCAN_PROD, + SUM = CUPYNUMERIC_SCAN_SUM, }; template @@ -87,4 +87,4 @@ struct ScanOp { constexpr T operator()(const bool& lhs, const bool& rhs) const { return lhs && rhs; } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/argwhere.cc b/src/cupynumeric/search/argwhere.cc similarity index 87% rename from src/cunumeric/search/argwhere.cc rename to src/cupynumeric/search/argwhere.cc index 8367d6f1d2..93e616d3fa 100644 --- a/src/cunumeric/search/argwhere.cc +++ b/src/cupynumeric/search/argwhere.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/search/argwhere.h" -#include "cunumeric/search/argwhere_template.inl" +#include "cupynumeric/search/argwhere.h" +#include "cupynumeric/search/argwhere_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -62,7 +62,10 @@ struct ArgWhereImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ArgWhereTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ArgWhereTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/argwhere.cu b/src/cupynumeric/search/argwhere.cu similarity index 88% rename from src/cunumeric/search/argwhere.cu rename to src/cupynumeric/search/argwhere.cu index d8c999dbf7..6a38ac531c 100644 --- a/src/cunumeric/search/argwhere.cu +++ b/src/cupynumeric/search/argwhere.cu @@ -14,13 +14,13 @@ * */ -#include "cunumeric/search/argwhere.h" -#include "cunumeric/search/argwhere_template.inl" -#include 
"cunumeric/search/nonzero.cuh" +#include "cupynumeric/search/argwhere.h" +#include "cupynumeric/search/argwhere_template.inl" +#include "cupynumeric/search/nonzero.cuh" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -59,7 +59,7 @@ struct ArgWhereImplBody { auto offsets = create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); auto size = compute_offsets(input, pitches, rect, volume, offsets, stream); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); auto out = out_array.create_output_buffer(Point<2>(size, DIM), true); @@ -67,7 +67,7 @@ struct ArgWhereImplBody { const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; argwhere_kernel<<>>( volume, input, pitches, rect.lo, offsets, out); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } } }; @@ -77,4 +77,4 @@ struct ArgWhereImplBody { argwhere_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/argwhere.h b/src/cupynumeric/search/argwhere.h similarity index 63% rename from src/cunumeric/search/argwhere.h rename to src/cupynumeric/search/argwhere.h index 78d4a8fbf4..4567860862 100644 --- a/src/cunumeric/search/argwhere.h +++ b/src/cupynumeric/search/argwhere.h @@ -16,18 +16,23 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct ArgWhereArgs { legate::PhysicalStore out; legate::PhysicalStore in; }; -class ArgWhereTask : public CuNumericTask { +class ArgWhereTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_ARGWHERE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_ARGWHERE}}; + + static constexpr auto CPU_VARIANT_OPTIONS = 
legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +44,4 @@ class ArgWhereTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/argwhere_omp.cc b/src/cupynumeric/search/argwhere_omp.cc similarity index 93% rename from src/cunumeric/search/argwhere_omp.cc rename to src/cupynumeric/search/argwhere_omp.cc index cd8fbf3653..d40cf57edb 100644 --- a/src/cunumeric/search/argwhere_omp.cc +++ b/src/cupynumeric/search/argwhere_omp.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/search/argwhere.h" -#include "cunumeric/search/argwhere_template.inl" -#include "cunumeric/omp_help.h" +#include "cupynumeric/search/argwhere.h" +#include "cupynumeric/search/argwhere_template.inl" +#include "cupynumeric/omp_help.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -87,4 +87,4 @@ struct ArgWhereImplBody { argwhere_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/argwhere_template.inl b/src/cupynumeric/search/argwhere_template.inl similarity index 92% rename from src/cunumeric/search/argwhere_template.inl rename to src/cupynumeric/search/argwhere_template.inl index bcc6f4d014..6a0cd886c2 100644 --- a/src/cunumeric/search/argwhere_template.inl +++ b/src/cupynumeric/search/argwhere_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/search/argwhere.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/search/argwhere.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -56,4 +56,4 @@ static void argwhere_template(TaskContext& context) 
double_dispatch(args.in.dim(), args.in.code(), ArgWhereImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero.cc b/src/cupynumeric/search/nonzero.cc similarity index 87% rename from src/cunumeric/search/nonzero.cc rename to src/cupynumeric/search/nonzero.cc index f5e9c8a1bb..3b7cc2d3f8 100644 --- a/src/cunumeric/search/nonzero.cc +++ b/src/cupynumeric/search/nonzero.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/search/nonzero.h" -#include "cunumeric/search/nonzero_template.inl" +#include "cupynumeric/search/nonzero.h" +#include "cupynumeric/search/nonzero_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -65,7 +65,10 @@ struct NonzeroImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { NonzeroTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + NonzeroTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero.cu b/src/cupynumeric/search/nonzero.cu similarity index 87% rename from src/cunumeric/search/nonzero.cu rename to src/cupynumeric/search/nonzero.cu index 180f124b66..b68298368d 100644 --- a/src/cunumeric/search/nonzero.cu +++ b/src/cupynumeric/search/nonzero.cu @@ -14,11 +14,11 @@ * */ -#include "cunumeric/search/nonzero.h" -#include "cunumeric/search/nonzero_template.inl" -#include "cunumeric/search/nonzero.cuh" +#include "cupynumeric/search/nonzero.h" +#include "cupynumeric/search/nonzero_template.inl" +#include "cupynumeric/search/nonzero.cuh" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -74,8 +74,9 @@ struct NonzeroImplBody { { auto stream = get_cached_stream(); - auto offsets = create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); - auto size = compute_offsets(in, pitches, rect, volume, 
offsets, stream); + auto offsets = + create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM, sizeof(std::int64_t)); + auto size = compute_offsets(in, pitches, rect, volume, offsets, stream); std::vector> results; for (auto& output : outputs) { @@ -85,7 +86,7 @@ struct NonzeroImplBody { if (size > 0) { populate_nonzeros(in, pitches, rect, volume, results, offsets, stream); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -94,4 +95,4 @@ struct NonzeroImplBody { nonzero_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero.cuh b/src/cupynumeric/search/nonzero.cuh similarity index 94% rename from src/cunumeric/search/nonzero.cuh rename to src/cupynumeric/search/nonzero.cuh index e08d6267e9..5befb3b594 100644 --- a/src/cunumeric/search/nonzero.cuh +++ b/src/cupynumeric/search/nonzero.cuh @@ -16,10 +16,10 @@ #include -#include "cunumeric/cuda_help.h" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/cuda_help.h" +#include "cupynumeric/utilities/thrust_util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -78,8 +78,8 @@ int64_t compute_offsets(const AccessorRO& in, exclusive_sum(p_offsets, volume, stream); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); return size.read(stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero.h b/src/cupynumeric/search/nonzero.h similarity index 64% rename from src/cunumeric/search/nonzero.h rename to src/cupynumeric/search/nonzero.h index 78cf34a42e..f6d64307fa 100644 --- a/src/cunumeric/search/nonzero.h +++ b/src/cupynumeric/search/nonzero.h @@ -16,18 +16,23 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct NonzeroArgs { legate::PhysicalStore input; std::vector results; }; -class NonzeroTask : public 
CuNumericTask { +class NonzeroTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_NONZERO}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_NONZERO}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -39,4 +44,4 @@ class NonzeroTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero_omp.cc b/src/cupynumeric/search/nonzero_omp.cc similarity index 93% rename from src/cunumeric/search/nonzero_omp.cc rename to src/cupynumeric/search/nonzero_omp.cc index 1f85499d78..fbbeab14bd 100644 --- a/src/cunumeric/search/nonzero_omp.cc +++ b/src/cupynumeric/search/nonzero_omp.cc @@ -14,13 +14,13 @@ * */ -#include "cunumeric/search/nonzero.h" -#include "cunumeric/search/nonzero_template.inl" -#include "cunumeric/omp_help.h" +#include "cupynumeric/search/nonzero.h" +#include "cupynumeric/search/nonzero_template.inl" +#include "cupynumeric/omp_help.h" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -93,4 +93,4 @@ struct NonzeroImplBody { nonzero_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/search/nonzero_template.inl b/src/cupynumeric/search/nonzero_template.inl similarity index 92% rename from src/cunumeric/search/nonzero_template.inl rename to src/cupynumeric/search/nonzero_template.inl index 250bb88e5f..36c6fbc0f8 100644 --- a/src/cunumeric/search/nonzero_template.inl +++ b/src/cupynumeric/search/nonzero_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/search/nonzero.h" 
-#include "cunumeric/pitches.h" +#include "cupynumeric/search/nonzero.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -62,4 +62,4 @@ static void nonzero_template(TaskContext& context) double_dispatch(args.input.dim(), args.input.code(), NonzeroImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique.cc b/src/cupynumeric/set/unique.cc similarity index 84% rename from src/cunumeric/set/unique.cc rename to src/cupynumeric/set/unique.cc index b4dba93990..97bcd0810b 100644 --- a/src/cunumeric/set/unique.cc +++ b/src/cupynumeric/set/unique.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/set/unique.h" -#include "cunumeric/set/unique_template.inl" +#include "cupynumeric/set/unique.h" +#include "cupynumeric/set/unique_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -56,11 +56,10 @@ struct UniqueImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ - UniqueTask::register_variants( - {{LEGATE_GPU_VARIANT, legate::VariantOptions{}.with_concurrent(true)}}); -} +static const auto cupynumeric_reg_task_ = []() -> char { + UniqueTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique.cu b/src/cupynumeric/set/unique.cu similarity index 88% rename from src/cunumeric/set/unique.cu rename to src/cupynumeric/set/unique.cu index fa2b14155a..7866cc5fb2 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cupynumeric/set/unique.cu @@ -14,17 +14,17 @@ * */ -#include "cunumeric/set/unique.h" -#include "cunumeric/set/unique_template.inl" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/set/unique.h" +#include "cupynumeric/set/unique_template.inl" +#include "cupynumeric/utilities/thrust_util.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" #include #include 
#include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -59,14 +59,14 @@ static Piece tree_reduce(legate::PhysicalStore& output, { size_t remaining = num_ranks; size_t radix = 2; - auto all_sizes = create_buffer(num_ranks, Memory::Z_COPY_MEM); + auto all_sizes = create_buffer(num_ranks, Memory::Z_COPY_MEM, alignof(std::size_t)); while (remaining > 1) { // TODO: This could be point-to-point, as we don't need all the sizes, // but I suspect point-to-point can be slower... all_sizes[my_id] = my_piece.second; CHECK_NCCL(ncclAllGather(all_sizes.ptr(my_id), all_sizes.ptr(0), 1, ncclUint64, *comm, stream)); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); Piece other_piece; size_t offset = radix / 2; @@ -121,11 +121,11 @@ static Piece tree_reduce(legate::PhysicalStore& output, assert(my_piece.second <= buf_size); my_piece.first = output.create_output_buffer(buf_size); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync(my_piece.first.ptr(0), - p_merged, - sizeof(VAL) * my_piece.second, - cudaMemcpyDeviceToDevice, - stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync(my_piece.first.ptr(0), + p_merged, + sizeof(VAL) * my_piece.second, + cudaMemcpyDeviceToDevice, + stream)); merged.destroy(); } @@ -163,14 +163,14 @@ struct UniqueImplBody { if (volume > 0) { if (in.accessor.is_dense_arbitrary(rect)) { auto* src = in.ptr(rect.lo); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemcpyAsync(ptr, src, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); } else { const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; copy_into_buffer<<>>( ptr, in, rect.lo, pitches, volume); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); // Find unique values thrust::sort(DEFAULT_POLICY.on(stream), ptr, ptr + volume); @@ -183,7 +183,7 @@ struct UniqueImplBody { assert(end - ptr <= buf_size); result.first = output.create_output_buffer(buf_size); if 
(result.second > 0) { - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( result.first.ptr(0), ptr, sizeof(VAL) * result.second, cudaMemcpyDeviceToDevice, stream)); } @@ -193,7 +193,7 @@ struct UniqueImplBody { auto comm = comms[0].get(); result = tree_reduce(output, result, point[0], launch_domain.get_volume(), stream, comm); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); // Finally we pack the result output.bind_data(result.first, Point<1>(result.second)); @@ -205,4 +205,4 @@ struct UniqueImplBody { unique_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/set/unique.h b/src/cupynumeric/set/unique.h new file mode 100644 index 0000000000..9dca0f66a2 --- /dev/null +++ b/src/cupynumeric/set/unique.h @@ -0,0 +1,43 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +class UniqueTask : public CuPyNumericTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNIQUE}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_concurrent(true).with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + + public: + static void cpu_variant(legate::TaskContext context); +#if LEGATE_DEFINED(LEGATE_USE_OPENMP) + static void omp_variant(legate::TaskContext context); +#endif +#if LEGATE_DEFINED(LEGATE_USE_CUDA) + static void gpu_variant(legate::TaskContext context); +#endif +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_omp.cc b/src/cupynumeric/set/unique_omp.cc similarity index 94% rename from src/cunumeric/set/unique_omp.cc rename to src/cupynumeric/set/unique_omp.cc index 1f3646f5e0..e65783467e 100644 --- a/src/cunumeric/set/unique_omp.cc +++ b/src/cupynumeric/set/unique_omp.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/set/unique.h" -#include "cunumeric/set/unique_template.inl" +#include "cupynumeric/set/unique.h" +#include "cupynumeric/set/unique_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -79,4 +79,4 @@ struct UniqueImplBody { unique_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_reduce.cc b/src/cupynumeric/set/unique_reduce.cc similarity index 78% rename from src/cunumeric/set/unique_reduce.cc rename to src/cupynumeric/set/unique_reduce.cc index 095def2927..9e75929d65 100644 --- a/src/cunumeric/set/unique_reduce.cc +++ b/src/cupynumeric/set/unique_reduce.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/set/unique_reduce.h" -#include 
"cunumeric/set/unique_reduce_template.inl" +#include "cupynumeric/set/unique_reduce.h" +#include "cupynumeric/set/unique_reduce_template.inl" -namespace cunumeric { +namespace cupynumeric { /*static*/ void UniqueReduceTask::cpu_variant(TaskContext context) { @@ -26,10 +26,10 @@ namespace cunumeric { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { UniqueReduceTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cupynumeric/set/unique_reduce.h b/src/cupynumeric/set/unique_reduce.h new file mode 100644 index 0000000000..10b192f722 --- /dev/null +++ b/src/cupynumeric/set/unique_reduce.h @@ -0,0 +1,38 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cupynumeric/cupynumeric_task.h" + +namespace cupynumeric { + +class UniqueReduceTask : public CuPyNumericTask { + public: + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNIQUE_REDUCE}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + + public: + static void cpu_variant(legate::TaskContext context); +#if LEGATE_DEFINED(LEGATE_USE_OPENMP) + static void omp_variant(legate::TaskContext context); +#endif +}; + +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_reduce_omp.cc b/src/cupynumeric/set/unique_reduce_omp.cc similarity index 83% rename from src/cunumeric/set/unique_reduce_omp.cc rename to src/cupynumeric/set/unique_reduce_omp.cc index efb30b2694..2f683524a6 100644 --- a/src/cunumeric/set/unique_reduce_omp.cc +++ b/src/cupynumeric/set/unique_reduce_omp.cc @@ -14,16 +14,16 @@ * */ -#include "cunumeric/set/unique_reduce.h" -#include "cunumeric/set/unique_reduce_template.inl" +#include "cupynumeric/set/unique_reduce.h" +#include "cupynumeric/set/unique_reduce_template.inl" #include -namespace cunumeric { +namespace cupynumeric { /*static*/ void UniqueReduceTask::omp_variant(TaskContext context) { unique_reduce_template(context, thrust::omp::par); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_reduce_template.inl b/src/cupynumeric/set/unique_reduce_template.inl similarity index 94% rename from src/cunumeric/set/unique_reduce_template.inl rename to src/cupynumeric/set/unique_reduce_template.inl index f356ee5109..d8f9aed2b9 100644 --- a/src/cunumeric/set/unique_reduce_template.inl +++ b/src/cupynumeric/set/unique_reduce_template.inl @@ -17,15 +17,15 @@ #pragma once // Useful for IDEs -#include "cunumeric/set/unique_reduce.h" -#include "cunumeric/pitches.h" +#include 
"cupynumeric/set/unique_reduce.h" +#include "cupynumeric/pitches.h" #include #include #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -72,4 +72,4 @@ static void unique_reduce_template(TaskContext& context, const exe_pol_t& exe_po type_dispatch(output.type().code(), UniqueReduceImpl{}, output, inputs, exe_pol); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/set/unique_template.inl b/src/cupynumeric/set/unique_template.inl similarity index 93% rename from src/cunumeric/set/unique_template.inl rename to src/cupynumeric/set/unique_template.inl index e30980c320..a7e425f345 100644 --- a/src/cunumeric/set/unique_template.inl +++ b/src/cupynumeric/set/unique_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/set/unique.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/set/unique.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -64,4 +64,4 @@ static void unique_template(TaskContext& context) context.get_launch_domain()); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/slice.h b/src/cupynumeric/slice.h similarity index 93% rename from src/cunumeric/slice.h rename to src/cupynumeric/slice.h index f2cd46da36..38e6819d68 100644 --- a/src/cunumeric/slice.h +++ b/src/cupynumeric/slice.h @@ -18,10 +18,10 @@ #include -namespace cunumeric { +namespace cupynumeric { using slice = legate::Slice; constexpr auto open = legate::Slice::OPEN; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort.cuh b/src/cupynumeric/sort/cub_sort.cuh similarity index 97% rename from src/cunumeric/sort/cub_sort.cuh rename to src/cupynumeric/sort/cub_sort.cuh index 2ed55e7ef5..578e1384b9 100644 --- a/src/cunumeric/sort/cub_sort.cuh +++ b/src/cupynumeric/sort/cub_sort.cuh @@ -25,9 +25,9 @@ #include #include -#include "cunumeric/cuda_help.h" +#include 
"cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { namespace detail { using namespace legate; @@ -47,7 +47,7 @@ void cub_local_sort(const VAL* values_in, if (values_in == values_out) { keys_in = create_buffer(volume, Memory::Kind::GPU_FB_MEM); values_in_cub = keys_in.ptr(0); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( keys_in.ptr(0), values_out, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); } @@ -111,7 +111,7 @@ void cub_local_sort(const VAL* values_in, if (indices_in == indices_out) { idx_in = create_buffer(volume, Memory::Kind::GPU_FB_MEM); indices_in_cub = idx_in.ptr(0); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( idx_in.ptr(0), indices_out, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); } @@ -193,4 +193,4 @@ void cub_local_sort(const VAL* values_in, } } // namespace detail -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort.h b/src/cupynumeric/sort/cub_sort.h similarity index 98% rename from src/cunumeric/sort/cub_sort.h rename to src/cupynumeric/sort/cub_sort.h index cfd03a3853..fcadb5ca91 100644 --- a/src/cunumeric/sort/cub_sort.h +++ b/src/cupynumeric/sort/cub_sort.h @@ -16,7 +16,7 @@ #pragma once -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const bool* values_in, bool* values_out, @@ -114,4 +114,4 @@ void cub_local_sort(const double* values_in, const size_t sort_dim_size, cudaStream_t stream); -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_bool.cu b/src/cupynumeric/sort/cub_sort_bool.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_bool.cu rename to src/cupynumeric/sort/cub_sort_bool.cu index 3f951454b4..e3f9117b58 100644 --- a/src/cunumeric/sort/cub_sort_bool.cu +++ b/src/cupynumeric/sort/cub_sort_bool.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include 
"cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const bool* values_in, bool* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const bool* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_double.cu b/src/cupynumeric/sort/cub_sort_double.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_double.cu rename to src/cupynumeric/sort/cub_sort_double.cu index 03a02fef27..e644802838 100644 --- a/src/cunumeric/sort/cub_sort_double.cu +++ b/src/cupynumeric/sort/cub_sort_double.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const double* values_in, double* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const double* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_float.cu b/src/cupynumeric/sort/cub_sort_float.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_float.cu rename to src/cupynumeric/sort/cub_sort_float.cu index 1502ad795f..2edc8db40c 100644 --- a/src/cunumeric/sort/cub_sort_float.cu +++ b/src/cupynumeric/sort/cub_sort_float.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const float* values_in, float* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const float* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_half.cu b/src/cupynumeric/sort/cub_sort_half.cu similarity index 91% rename from 
src/cunumeric/sort/cub_sort_half.cu rename to src/cupynumeric/sort/cub_sort_half.cu index ef89de7379..d75bc4d189 100644 --- a/src/cunumeric/sort/cub_sort_half.cu +++ b/src/cupynumeric/sort/cub_sort_half.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const __half* values_in, __half* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const __half* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_int16.cu b/src/cupynumeric/sort/cub_sort_int16.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_int16.cu rename to src/cupynumeric/sort/cub_sort_int16.cu index b6a0b419e2..843c797556 100644 --- a/src/cunumeric/sort/cub_sort_int16.cu +++ b/src/cupynumeric/sort/cub_sort_int16.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const int16_t* values_in, int16_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const int16_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_int32.cu b/src/cupynumeric/sort/cub_sort_int32.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_int32.cu rename to src/cupynumeric/sort/cub_sort_int32.cu index f3af455476..5f682a5bd5 100644 --- a/src/cunumeric/sort/cub_sort_int32.cu +++ b/src/cupynumeric/sort/cub_sort_int32.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const int32_t* values_in, int32_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const int32_t* values_in, 
values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_int64.cu b/src/cupynumeric/sort/cub_sort_int64.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_int64.cu rename to src/cupynumeric/sort/cub_sort_int64.cu index cd16f992ff..99410c6051 100644 --- a/src/cunumeric/sort/cub_sort_int64.cu +++ b/src/cupynumeric/sort/cub_sort_int64.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const int64_t* values_in, int64_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const int64_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_int8.cu b/src/cupynumeric/sort/cub_sort_int8.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_int8.cu rename to src/cupynumeric/sort/cub_sort_int8.cu index c960dbb345..fc58527e23 100644 --- a/src/cunumeric/sort/cub_sort_int8.cu +++ b/src/cupynumeric/sort/cub_sort_int8.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const int8_t* values_in, int8_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const int8_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_uint16.cu b/src/cupynumeric/sort/cub_sort_uint16.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_uint16.cu rename to src/cupynumeric/sort/cub_sort_uint16.cu index 3058dd8794..231c71ad55 100644 --- a/src/cunumeric/sort/cub_sort_uint16.cu +++ b/src/cupynumeric/sort/cub_sort_uint16.cu @@ -14,9 +14,9 @@ * */ 
-#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const uint16_t* values_in, uint16_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const uint16_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_uint32.cu b/src/cupynumeric/sort/cub_sort_uint32.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_uint32.cu rename to src/cupynumeric/sort/cub_sort_uint32.cu index 49cb1556ff..b339c70439 100644 --- a/src/cunumeric/sort/cub_sort_uint32.cu +++ b/src/cupynumeric/sort/cub_sort_uint32.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const uint32_t* values_in, uint32_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const uint32_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_uint64.cu b/src/cupynumeric/sort/cub_sort_uint64.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_uint64.cu rename to src/cupynumeric/sort/cub_sort_uint64.cu index 9ed917f9b8..bfaeb2a0df 100644 --- a/src/cunumeric/sort/cub_sort_uint64.cu +++ b/src/cupynumeric/sort/cub_sort_uint64.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const uint64_t* values_in, uint64_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const uint64_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/cub_sort_uint8.cu 
b/src/cupynumeric/sort/cub_sort_uint8.cu similarity index 91% rename from src/cunumeric/sort/cub_sort_uint8.cu rename to src/cupynumeric/sort/cub_sort_uint8.cu index ca776af561..073164bce8 100644 --- a/src/cunumeric/sort/cub_sort_uint8.cu +++ b/src/cupynumeric/sort/cub_sort_uint8.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/cub_sort.cuh" +#include "cupynumeric/sort/cub_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void cub_local_sort(const uint8_t* values_in, uint8_t* values_out, @@ -30,4 +30,4 @@ void cub_local_sort(const uint8_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/searchsorted.cc b/src/cupynumeric/sort/searchsorted.cc similarity index 93% rename from src/cunumeric/sort/searchsorted.cc rename to src/cupynumeric/sort/searchsorted.cc index 6ea3d85c18..6d56046901 100644 --- a/src/cunumeric/sort/searchsorted.cc +++ b/src/cupynumeric/sort/searchsorted.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/sort/searchsorted.h" -#include "cunumeric/sort/searchsorted_template.inl" +#include "cupynumeric/sort/searchsorted.h" +#include "cupynumeric/sort/searchsorted_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -79,10 +79,10 @@ struct SearchSortedImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { SearchSortedTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/searchsorted.cu b/src/cupynumeric/sort/searchsorted.cu similarity index 95% rename from src/cunumeric/sort/searchsorted.cu rename to src/cupynumeric/sort/searchsorted.cu index 2c7488dc6e..d358c89ae0 100644 --- a/src/cunumeric/sort/searchsorted.cu +++ b/src/cupynumeric/sort/searchsorted.cu @@ -14,13 +14,13 @@ * */ -#include 
"cunumeric/sort/searchsorted.h" -#include "cunumeric/sort/searchsorted_template.inl" +#include "cupynumeric/sort/searchsorted.h" +#include "cupynumeric/sort/searchsorted_template.inl" #include -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -106,7 +106,7 @@ struct SearchSortedImplBody { searchsorted_kernel_max<<>>( output_reduction, input, input_v, rect_values.lo, pitches, volume, num_values, offset); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -115,4 +115,4 @@ struct SearchSortedImplBody { searchsorted_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/searchsorted.h b/src/cupynumeric/sort/searchsorted.h similarity index 80% rename from src/cunumeric/sort/searchsorted.h rename to src/cupynumeric/sort/searchsorted.h index a55b0ddf63..a1b149eac8 100644 --- a/src/cunumeric/sort/searchsorted.h +++ b/src/cupynumeric/sort/searchsorted.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct SearchSortedArgs { legate::PhysicalStore input_base; @@ -29,9 +29,10 @@ struct SearchSortedArgs { bool is_index_space; }; -class SearchSortedTask : public CuNumericTask { +class SearchSortedTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SEARCHSORTED}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SEARCHSORTED}}; public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +44,4 @@ class SearchSortedTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/searchsorted_omp.cc b/src/cupynumeric/sort/searchsorted_omp.cc similarity index 95% 
rename from src/cunumeric/sort/searchsorted_omp.cc rename to src/cupynumeric/sort/searchsorted_omp.cc index b19b09f77d..7130b7d1e7 100644 --- a/src/cunumeric/sort/searchsorted_omp.cc +++ b/src/cupynumeric/sort/searchsorted_omp.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/sort/searchsorted.h" -#include "cunumeric/sort/searchsorted_template.inl" +#include "cupynumeric/sort/searchsorted.h" +#include "cupynumeric/sort/searchsorted_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -81,4 +81,4 @@ struct SearchSortedImplBody { searchsorted_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/searchsorted_template.inl b/src/cupynumeric/sort/searchsorted_template.inl similarity index 95% rename from src/cunumeric/sort/searchsorted_template.inl rename to src/cupynumeric/sort/searchsorted_template.inl index 9c910808f8..0bc3906a19 100644 --- a/src/cunumeric/sort/searchsorted_template.inl +++ b/src/cupynumeric/sort/searchsorted_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/sort/searchsorted.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/sort/searchsorted.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -82,4 +82,4 @@ static void searchsorted_template(TaskContext& context) std::max(1, args.input_values.dim()), args.input_base.code(), SearchSortedImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort.cc b/src/cupynumeric/sort/sort.cc similarity index 84% rename from src/cunumeric/sort/sort.cc rename to src/cupynumeric/sort/sort.cc index 985fdaa1ac..620356c33b 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cupynumeric/sort/sort.cc @@ -17,14 +17,14 @@ #include #include -#include "cunumeric/sort/sort.h" -#include "cunumeric/sort/sort_cpu.inl" -#include "cunumeric/sort/sort_template.inl" +#include 
"cupynumeric/sort/sort.h" +#include "cupynumeric/sort/sort_cpu.inl" +#include "cupynumeric/sort/sort_template.inl" #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -73,12 +73,10 @@ struct SortImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ - auto options = legate::VariantOptions{}.with_concurrent(true); - SortTask::register_variants( - {{LEGATE_CPU_VARIANT, options}, {LEGATE_GPU_VARIANT, options}, {LEGATE_OMP_VARIANT, options}}); -} +static const auto cupynumeric_reg_task_ = []() -> char { + SortTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort.cu b/src/cupynumeric/sort/sort.cu similarity index 87% rename from src/cunumeric/sort/sort.cu rename to src/cupynumeric/sort/sort.cu index 0c23890802..1109f78da1 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cupynumeric/sort/sort.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/sort/sort.h" -#include "cunumeric/sort/sort_template.inl" -#include "cunumeric/sort/cub_sort.h" -#include "cunumeric/sort/thrust_sort.h" -#include "cunumeric/utilities/thrust_allocator.h" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/sort/sort.h" +#include "cupynumeric/sort/sort_template.inl" +#include "cupynumeric/sort/cub_sort.h" +#include "cupynumeric/sort/thrust_sort.h" +#include "cupynumeric/utilities/thrust_allocator.h" +#include "cupynumeric/utilities/thrust_util.h" #include #include @@ -33,14 +33,25 @@ #include -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" // above this threshold segment sort will be performed // by cub::DeviceSegmentedRadixSort instead of thrust::(stable_)sort // with tuple keys (not available for complex) #define SEGMENT_THRESHOLD_RADIX_SORT 400 -namespace cunumeric { +namespace cupynumeric { + +namespace { + +template +legate::Buffer create_non_empty_buffer( + size_t size, 
legate::Memory::Kind kind = legate::Memory::Kind::NO_MEMKIND) +{ + return legate::create_buffer(std::max(size, size_t{1}), kind, alignof(T)); +} + +} // namespace template struct support_cub : std::true_type {}; @@ -484,7 +495,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) } } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC assert(target_offset == (segment_id + 1) * segment_size_l); #endif } @@ -612,22 +623,22 @@ SegmentMergePiece> merge_all_buffers( size_t merged_size = 0; size_t num_sort_ranks = merge_buffers.size(); Buffer target_offsets = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); // loop comparably small -> no init kernel for (size_t i = 0; i < num_sort_ranks; ++i) { target_offsets[i] = merged_size; merged_size += merge_buffers[i].size; } - result.values = create_buffer(merged_size); - result.indices = create_buffer(argsort ? merged_size : 0); - result.segments = create_buffer(segmented ? merged_size : 0); + result.values = create_non_empty_buffer(merged_size); + result.indices = create_non_empty_buffer(argsort ? merged_size : 0); + result.segments = create_non_empty_buffer(segmented ? 
merged_size : 0); result.size = merged_size; // copy data into result { Buffer val_buffers_ptr = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); for (size_t r = 0; r < num_sort_ranks; r++) { val_buffers_ptr[r] = merge_buffers[r].values.ptr(0); } @@ -640,17 +651,17 @@ SegmentMergePiece> merge_all_buffers( val_buffers_ptr, target_offsets, result.values, merged_size, num_sort_ranks); if (argsort) { Buffer idc_buffers_ptr = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); for (size_t r = 0; r < num_sort_ranks; r++) { idc_buffers_ptr[r] = merge_buffers[r].indices.ptr(0); } combine_buffers_no_sort<<>>( idc_buffers_ptr, target_offsets, result.indices, merged_size, num_sort_ranks); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() idc_buffers_ptr.destroy(); } else { - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() } val_buffers_ptr.destroy(); target_offsets.destroy(); @@ -672,7 +683,7 @@ SegmentMergePiece> merge_all_buffers( local_sort( p_values, p_values, p_indices, p_indices, merged_size, merged_size, true, stream); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); return result; } else { // maybe k-way merge is more efficient here... @@ -684,12 +695,12 @@ SegmentMergePiece> merge_all_buffers( SegmentMergePiece source1 = merge_buffers[pos]; SegmentMergePiece source2 = merge_buffers[pos + stride]; auto merged_size = source1.size + source2.size; - auto merged_values = create_buffer(merged_size); - auto merged_indices = create_buffer(argsort ? merged_size : 0); - auto merged_segments = create_buffer(segmented ? 
merged_size : 0); - auto p_merged_values = merged_values.ptr(0); - auto p_values1 = source1.values.ptr(0); - auto p_values2 = source2.values.ptr(0); + auto merged_values = create_non_empty_buffer(merged_size); + auto merged_indices = create_non_empty_buffer(argsort ? merged_size : 0); + auto merged_segments = create_non_empty_buffer(segmented ? merged_size : 0); + auto p_merged_values = merged_values.ptr(0); + auto p_values1 = source1.values.ptr(0); + auto p_values2 = source2.values.ptr(0); if (segmented) { auto p_merged_segments = merged_segments.ptr(0); @@ -773,7 +784,7 @@ SegmentMergePiece> merge_all_buffers( } SegmentMergePiece result = merge_buffers[0]; merge_buffers.clear(); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); return result; } } @@ -816,7 +827,8 @@ void rebalance_data(SegmentMergePiece& merge_buffer, { // compute diff for each segment const size_t num_segments_l_aligned = get_16b_aligned_count(num_segments_l, sizeof(size_t)); - auto segment_diff = create_buffer(num_segments_l_aligned, legate::Memory::GPU_FB_MEM); + auto segment_diff = + create_non_empty_buffer(num_segments_l_aligned, legate::Memory::GPU_FB_MEM); { // start kernel to search from merge_buffer.segments const size_t num_blocks = (num_segments_l + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; @@ -833,7 +845,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, merge_buffer.values.destroy(); } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC { size_t reduce = thrust::reduce(exec_policy, segment_diff.ptr(0), segment_diff.ptr(0) + num_segments_l, 0); @@ -843,8 +855,8 @@ void rebalance_data(SegmentMergePiece& merge_buffer, #endif // allocate target - Buffer segment_diff_buffers = - create_buffer(num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); + Buffer segment_diff_buffers = create_non_empty_buffer( + num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); // communicate segment diffs CHECK_NCCL(ncclGroupStart()); @@ 
-861,8 +873,8 @@ void rebalance_data(SegmentMergePiece& merge_buffer, CHECK_NCCL(ncclGroupEnd()); // copy to transpose structure [segments][ranks] - auto segment_diff_2d = - create_buffer(num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); + auto segment_diff_2d = create_non_empty_buffer(num_segments_l_aligned * num_sort_ranks, + legate::Memory::GPU_FB_MEM); // Transpose { @@ -875,7 +887,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, num_sort_ranks); } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC { for (size_t segment = 0; segment < num_segments_l; ++segment) { assert(0 == thrust::reduce(exec_policy, @@ -901,40 +913,40 @@ void rebalance_data(SegmentMergePiece& merge_buffer, edge case --> send more than whole line should not happen due to sample choice! */ // 2 (signed) arrays - left/right for every segment - auto send_left = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); - auto send_right = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto send_left = create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto send_right = create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); // compute data to send.... 
auto segment_diff_2d_scan = - create_buffer(num_segments_l * num_sort_ranks, legate::Memory::GPU_FB_MEM); + create_non_empty_buffer(num_segments_l * num_sort_ranks, legate::Memory::GPU_FB_MEM); thrust::device_ptr segment_diff_2d_ptr(segment_diff_2d.ptr(0)); thrust::device_ptr segment_diff_2d_scan_ptr(segment_diff_2d_scan.ptr(0)); thrust::inclusive_scan(exec_policy, segment_diff_2d_ptr, segment_diff_2d_ptr + num_segments_l * num_sort_ranks, segment_diff_2d_scan_ptr); - CUNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(send_right.ptr(0), - sizeof(int64_t), - segment_diff_2d_scan.ptr(0) + my_sort_rank, - num_sort_ranks * sizeof(int64_t), - sizeof(int64_t), - num_segments_l, - cudaMemcpyDeviceToDevice, - stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(send_right.ptr(0), + sizeof(int64_t), + segment_diff_2d_scan.ptr(0) + my_sort_rank, + num_sort_ranks * sizeof(int64_t), + sizeof(int64_t), + num_segments_l, + cudaMemcpyDeviceToDevice, + stream)); thrust::reverse_iterator::iterator> iter_in( segment_diff_2d_ptr + num_segments_l * num_sort_ranks); thrust::reverse_iterator::iterator> iter_out( segment_diff_2d_scan_ptr + num_segments_l * num_sort_ranks); thrust::inclusive_scan( exec_policy, iter_in, iter_in + num_segments_l * num_sort_ranks, iter_out); - CUNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(send_left.ptr(0), - sizeof(int64_t), - segment_diff_2d_scan.ptr(0) + my_sort_rank, - num_sort_ranks * sizeof(int64_t), - sizeof(int64_t), - num_segments_l, - cudaMemcpyDeviceToDevice, - stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(send_left.ptr(0), + sizeof(int64_t), + segment_diff_2d_scan.ptr(0) + my_sort_rank, + num_sort_ranks * sizeof(int64_t), + sizeof(int64_t), + num_segments_l, + cudaMemcpyDeviceToDevice, + stream)); segment_diff_2d.destroy(); segment_diff_2d_scan.destroy(); @@ -970,24 +982,35 @@ void rebalance_data(SegmentMergePiece& merge_buffer, send_right_data.size = send_right_size; recv_right_data.size = recv_right_size; if (argsort) { - send_left_data.indices = 
create_buffer(send_left_size, legate::Memory::GPU_FB_MEM); - recv_left_data.indices = create_buffer(recv_left_size, legate::Memory::GPU_FB_MEM); - send_right_data.indices = create_buffer(send_right_size, legate::Memory::GPU_FB_MEM); - recv_right_data.indices = create_buffer(recv_right_size, legate::Memory::GPU_FB_MEM); + send_left_data.indices = + create_non_empty_buffer(send_left_size, legate::Memory::GPU_FB_MEM); + recv_left_data.indices = + create_non_empty_buffer(recv_left_size, legate::Memory::GPU_FB_MEM); + send_right_data.indices = + create_non_empty_buffer(send_right_size, legate::Memory::GPU_FB_MEM); + recv_right_data.indices = + create_non_empty_buffer(recv_right_size, legate::Memory::GPU_FB_MEM); } else { - send_left_data.values = create_buffer(send_left_size, legate::Memory::GPU_FB_MEM); - recv_left_data.values = create_buffer(recv_left_size, legate::Memory::GPU_FB_MEM); - send_right_data.values = create_buffer(send_right_size, legate::Memory::GPU_FB_MEM); - recv_right_data.values = create_buffer(recv_right_size, legate::Memory::GPU_FB_MEM); + send_left_data.values = + create_non_empty_buffer(send_left_size, legate::Memory::GPU_FB_MEM); + recv_left_data.values = + create_non_empty_buffer(recv_left_size, legate::Memory::GPU_FB_MEM); + send_right_data.values = + create_non_empty_buffer(send_right_size, legate::Memory::GPU_FB_MEM); + recv_right_data.values = + create_non_empty_buffer(recv_right_size, legate::Memory::GPU_FB_MEM); } Buffer segment_diff_pos; { // need scan of segment_diff // need scan of (positive!) 
send_left, send_right - segment_diff_pos = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); - auto send_left_pos = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); - auto send_right_pos = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + segment_diff_pos = + create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto send_left_pos = + create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto send_right_pos = + create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); { thrust::device_ptr segment_diff_ptr(segment_diff.ptr(0)); thrust::device_ptr segment_diff_pos_ptr(segment_diff_pos.ptr(0)); @@ -1138,8 +1161,10 @@ void rebalance_data(SegmentMergePiece& merge_buffer, // merge data into target { // need scan of (negative!) send_left, send_right - auto recv_left_pos = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); - auto recv_right_pos = create_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto recv_left_pos = + create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); + auto recv_right_pos = + create_non_empty_buffer(num_segments_l, legate::Memory::GPU_FB_MEM); { thrust::device_ptr recv_left_ptr(send_left.ptr(0)); thrust::device_ptr recv_left_pos_ptr(recv_left_pos.ptr(0)); @@ -1212,7 +1237,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, recv_left_data.values.destroy(); recv_right_data.values.destroy(); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } } @@ -1258,17 +1283,17 @@ void sample_sort_nccl_nd( // sort ranks. Note that if segment_size_l>0 && volume==0 means that we have // a full sort group being empty, this should not affect local sort rank size. { - auto worker_count_d = create_buffer(1, legate::Memory::GPU_FB_MEM); + auto worker_count_d = create_non_empty_buffer(1, legate::Memory::GPU_FB_MEM); size_t worker_count = (segment_size_l > 0 ? 
1 : 0); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( worker_count_d.ptr(0), &worker_count, sizeof(int32_t), cudaMemcpyHostToDevice, stream)); context.concurrent_task_barrier(); CHECK_NCCL(ncclAllReduce( worker_count_d.ptr(0), worker_count_d.ptr(0), 1, ncclInt32, ncclSum, *comm, stream)); context.concurrent_task_barrier(); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( &worker_count, worker_count_d.ptr(0), sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); if (worker_count < num_ranks) { const size_t number_sort_groups = num_ranks / num_sort_ranks; num_sort_ranks = worker_count / number_sort_groups; @@ -1280,10 +1305,10 @@ void sample_sort_nccl_nd( if (is_unbound_1d_storage) { // we need to return an empty buffer here if (argsort) { - auto buffer = create_buffer(0, legate::Memory::GPU_FB_MEM); + auto buffer = create_non_empty_buffer(0, legate::Memory::GPU_FB_MEM); output_array_unbound.bind_data(buffer, Point<1>(0)); } else { - auto buffer = create_buffer(0, legate::Memory::GPU_FB_MEM); + auto buffer = create_non_empty_buffer(0, legate::Memory::GPU_FB_MEM); output_array_unbound.bind_data(buffer, Point<1>(0)); } } @@ -1302,7 +1327,8 @@ void sample_sort_nccl_nd( size_t num_samples_l = num_samples_per_segment_l * num_segments_l; size_t num_samples_per_segment_g = num_samples_per_segment_l * num_sort_ranks; size_t num_samples_g = num_samples_per_segment_g * num_segments_l; - auto samples = create_buffer>(num_samples_g, legate::Memory::GPU_FB_MEM); + auto samples = + create_non_empty_buffer>(num_samples_g, legate::Memory::GPU_FB_MEM); size_t offset = num_samples_l * my_sort_rank; { @@ -1316,7 +1342,7 @@ void sample_sort_nccl_nd( offset, num_sort_ranks, my_sort_rank); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } // AllGather does not work here as not all 
have the same amount! @@ -1324,15 +1350,16 @@ void sample_sort_nccl_nd( { // allocate receive buffer const size_t aligned_count = get_16b_aligned_count(num_samples_l, sizeof(SegmentSample)); - auto send_buffer = create_buffer>(aligned_count, legate::Memory::GPU_FB_MEM); - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync(send_buffer.ptr(0), - samples.ptr(offset), - sizeof(SegmentSample) * num_samples_l, - cudaMemcpyDeviceToDevice, - stream)); + auto send_buffer = + create_non_empty_buffer>(aligned_count, legate::Memory::GPU_FB_MEM); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync(send_buffer.ptr(0), + samples.ptr(offset), + sizeof(SegmentSample) * num_samples_l, + cudaMemcpyDeviceToDevice, + stream)); - auto recv_buffer = - create_buffer>(aligned_count * num_sort_ranks, legate::Memory::GPU_FB_MEM); + auto recv_buffer = create_non_empty_buffer>(aligned_count * num_sort_ranks, + legate::Memory::GPU_FB_MEM); CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_sort_ranks; r++) { @@ -1356,11 +1383,11 @@ void sample_sort_nccl_nd( // copy back for (size_t r = 0; r < num_sort_ranks; r++) { if (r != my_sort_rank) { - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync(samples.ptr(num_samples_l * r), - recv_buffer.ptr(aligned_count * r), - sizeof(SegmentSample) * num_samples_l, - cudaMemcpyDeviceToDevice, - stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync(samples.ptr(num_samples_l * r), + recv_buffer.ptr(aligned_count * r), + sizeof(SegmentSample) * num_samples_l, + cudaMemcpyDeviceToDevice, + stream)); } } @@ -1368,7 +1395,7 @@ void sample_sort_nccl_nd( send_buffer.destroy(); recv_buffer.destroy(); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -1395,7 +1422,7 @@ void sample_sort_nccl_nd( // select splitters / positions based on samples (on device) // the indexing is split_positions[segments][positions] const size_t num_splitters = (num_sort_ranks - 1) * 
num_segments_l; - auto split_positions = create_buffer(num_splitters, legate::Memory::GPU_FB_MEM); + auto split_positions = create_non_empty_buffer(num_splitters, legate::Memory::GPU_FB_MEM); { const size_t num_blocks = (num_splitters + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; extract_split_positions_segments<<>>( @@ -1413,12 +1440,12 @@ void sample_sort_nccl_nd( // segment_blocks[r][segment]->position of data in segment for process r // perform blocksize wide scan on size_send[r][block*blocksize] within warp Buffer segment_blocks = - create_buffer(num_segments_l * num_sort_ranks, legate::Memory::GPU_FB_MEM); + create_non_empty_buffer(num_segments_l * num_sort_ranks, legate::Memory::GPU_FB_MEM); // initialize sizes to send const size_t num_segments_l_aligned = get_16b_aligned_count(num_segments_l + 1, sizeof(size_t)); - Buffer size_send = - create_buffer(num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); + Buffer size_send = create_non_empty_buffer( + num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); { const size_t num_send_parts = num_sort_ranks * num_segments_l; @@ -1434,7 +1461,7 @@ void sample_sort_nccl_nd( compute_scan_per_rank<<>>( segment_blocks.ptr(0), size_send.ptr(0), num_segments_l, num_segments_l_aligned); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } // cleanup intermediate data structures @@ -1446,8 +1473,8 @@ void sample_sort_nccl_nd( ///////////////////////////////////////////////////////////////////////////////////////////////// // all2all exchange send/receive sizes - Buffer size_recv = - create_buffer(num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); + Buffer size_recv = create_non_empty_buffer( + num_segments_l_aligned * num_sort_ranks, legate::Memory::GPU_FB_MEM); CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_sort_ranks; r++) { CHECK_NCCL(ncclSend(size_send.ptr(r * num_segments_l_aligned), @@ -1467,29 +1494,29 @@ void 
sample_sort_nccl_nd( // we need the amount of data to transfer on the host --> get it Buffer size_send_total = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); Buffer size_recv_total = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); { - CUNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(size_send_total.ptr(0), - 1 * sizeof(size_t), - size_send.ptr(num_segments_l), - num_segments_l_aligned * sizeof(size_t), - sizeof(int64_t), - num_sort_ranks, - cudaMemcpyDeviceToHost, - stream)); - CUNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(size_recv_total.ptr(0), - 1 * sizeof(size_t), - size_recv.ptr(num_segments_l), - num_segments_l_aligned * sizeof(size_t), - sizeof(int64_t), - num_sort_ranks, - cudaMemcpyDeviceToHost, - stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(size_send_total.ptr(0), + 1 * sizeof(size_t), + size_send.ptr(num_segments_l), + num_segments_l_aligned * sizeof(size_t), + sizeof(int64_t), + num_sort_ranks, + cudaMemcpyDeviceToHost, + stream)); + CUPYNUMERIC_CHECK_CUDA(cudaMemcpy2DAsync(size_recv_total.ptr(0), + 1 * sizeof(size_t), + size_recv.ptr(num_segments_l), + num_segments_l_aligned * sizeof(size_t), + sizeof(int64_t), + num_sort_ranks, + cudaMemcpyDeviceToHost, + stream)); // need to sync as we share values in between host/device - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); } // copy values into aligned send buffer @@ -1497,16 +1524,17 @@ void sample_sort_nccl_nd( std::vector> idc_send_buffers(num_sort_ranks); { for (size_t r = 0; r < num_sort_ranks; r++) { - val_send_buffers[r] = create_buffer(size_send_total[r], legate::Memory::GPU_FB_MEM); + val_send_buffers[r] = + create_non_empty_buffer(size_send_total[r], legate::Memory::GPU_FB_MEM); if (argsort) { idc_send_buffers[r] = - create_buffer(size_send_total[r], 
legate::Memory::GPU_FB_MEM); + create_non_empty_buffer(size_send_total[r], legate::Memory::GPU_FB_MEM); } } { Buffer val_send_buffers_ptr = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); for (size_t r = 0; r < num_sort_ranks; r++) { val_send_buffers_ptr[r] = val_send_buffers[r].ptr(0); } @@ -1526,7 +1554,7 @@ void sample_sort_nccl_nd( if (argsort) { Buffer idc_send_buffers_ptr = - create_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); + create_non_empty_buffer(num_sort_ranks, legate::Memory::Z_COPY_MEM); for (size_t r = 0; r < num_sort_ranks; r++) { idc_send_buffers_ptr[r] = idc_send_buffers[r].ptr(0); } @@ -1540,13 +1568,13 @@ void sample_sort_nccl_nd( segment_size_l, my_rank, num_sort_ranks); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() idc_send_buffers_ptr.destroy(); } else { - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // needed before Z-copy destroy() } val_send_buffers_ptr.destroy(); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } local_sorted.values.destroy(); @@ -1566,7 +1594,8 @@ void sample_sort_nccl_nd( // initialize segment information if (num_segments_l > 1) { - merge_buffers[r].segments = create_buffer(size, legate::Memory::GPU_FB_MEM); + merge_buffers[r].segments = + create_non_empty_buffer(size, legate::Memory::GPU_FB_MEM); // 0 1 2 1 3 // counts per segment to receive // 0 1 3 4 7 // 0 1 2 3 4 5 6 @@ -1576,7 +1605,7 @@ void sample_sort_nccl_nd( size_recv.ptr(r * num_segments_l_aligned), size_recv.ptr(r * num_segments_l_aligned) + num_segments_l + 1, size_recv.ptr(r * num_segments_l_aligned)); - CUNUMERIC_CHECK_CUDA( + CUPYNUMERIC_CHECK_CUDA( cudaMemsetAsync(merge_buffers[r].segments.ptr(0), 0, 
size * sizeof(size_t), stream)); const size_t num_blocks = (num_segments_l + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; assert(sizeof(unsigned long long int) == @@ -1592,15 +1621,16 @@ void sample_sort_nccl_nd( merge_buffers[r].segments.ptr(0)); } - merge_buffers[r].values = create_buffer(size, legate::Memory::GPU_FB_MEM); + merge_buffers[r].values = create_non_empty_buffer(size, legate::Memory::GPU_FB_MEM); if (argsort) { - merge_buffers[r].indices = create_buffer(size, legate::Memory::GPU_FB_MEM); + merge_buffers[r].indices = + create_non_empty_buffer(size, legate::Memory::GPU_FB_MEM); } else { - merge_buffers[r].indices = create_buffer(0, legate::Memory::GPU_FB_MEM); + merge_buffers[r].indices = create_non_empty_buffer(0, legate::Memory::GPU_FB_MEM); } } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } // communicate all2all (in sort dimension) @@ -1655,7 +1685,7 @@ void sample_sort_nccl_nd( idc_send_buffers[r].destroy(); } } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); ///////////////////////////////////////////////////////////////////////////////////////////////// /////////////// Part 4: merge data @@ -1744,13 +1774,14 @@ struct SortImplBody { VAL* values_ptr = nullptr; if (argsort) { // make a buffer for input - auto input_copy = create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); + auto input_copy = create_non_empty_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); local_sorted.values = input_copy; values_ptr = input_copy.ptr(0); // initialize indices if (need_distributed_sort) { - auto indices_buffer = create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); + auto indices_buffer = + create_non_empty_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); indices_ptr = indices_buffer.ptr(0); local_sorted.indices = indices_buffer; local_sorted.size = volume; @@ -1775,18 +1806,19 @@ struct SortImplBody { } else { // initialize output if (need_distributed_sort) { - auto input_copy = 
create_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); - values_ptr = input_copy.ptr(0); - local_sorted.values = input_copy; - local_sorted.indices = create_buffer(0, legate::Memory::Kind::GPU_FB_MEM); - local_sorted.size = volume; + auto input_copy = create_non_empty_buffer(volume, legate::Memory::Kind::GPU_FB_MEM); + values_ptr = input_copy.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = + create_non_empty_buffer(0, legate::Memory::Kind::GPU_FB_MEM); + local_sorted.size = volume; } else { AccessorWO output = output_array.write_accessor(rect); assert(rect.empty() || output.accessor.is_dense_row_major(rect)); values_ptr = output.ptr(rect.lo); } } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); if (volume > 0) { // sort data (locally) @@ -1799,7 +1831,7 @@ struct SortImplBody { stable, stream); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); if (need_distributed_sort) { if (is_index_space) { @@ -1854,7 +1886,7 @@ struct SortImplBody { local_sorted.values.destroy(); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -1863,4 +1895,4 @@ struct SortImplBody { sort_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort.h b/src/cupynumeric/sort/sort.h similarity index 80% rename from src/cunumeric/sort/sort.h rename to src/cupynumeric/sort/sort.h index 722f2e8cc3..e719c553e2 100644 --- a/src/cunumeric/sort/sort.h +++ b/src/cupynumeric/sort/sort.h @@ -16,11 +16,11 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" #include -namespace cunumeric { +namespace cupynumeric { struct SortArgs { legate::PhysicalStore input; @@ -92,9 +92,16 @@ struct modulusWithOffset : public thrust::binary_function { +class SortTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SORT}; + static inline const auto 
TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SORT}}; + + static constexpr auto CPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_concurrent(true).with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = + legate::VariantOptions{}.with_concurrent(true).with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = + legate::VariantOptions{}.with_concurrent(true).with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -106,4 +113,4 @@ class SortTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort_cpu.inl b/src/cupynumeric/sort/sort_cpu.inl similarity index 93% rename from src/cunumeric/sort/sort_cpu.inl rename to src/cupynumeric/sort/sort_cpu.inl index a8043a3131..cdcba6cebb 100644 --- a/src/cunumeric/sort/sort_cpu.inl +++ b/src/cupynumeric/sort/sort_cpu.inl @@ -17,8 +17,8 @@ #pragma once // Useful for IDEs -#include "cunumeric/sort/sort.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/sort/sort.h" +#include "cupynumeric/pitches.h" #include "legate/comm/coll.h" #include @@ -32,7 +32,18 @@ #include #include -namespace cunumeric { +namespace cupynumeric { + +namespace { + +template +legate::Buffer create_non_empty_buffer(size_t size) +{ + return legate::create_buffer( + std::max(size, size_t{1}), legate::Memory::Kind::NO_MEMKIND, alignof(T)); +} + +} // namespace using namespace legate; @@ -96,7 +107,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, { // compute diff for each segment - auto segment_diff = create_buffer(num_segments_l); + auto segment_diff = create_non_empty_buffer(num_segments_l); { if (num_segments_l > 1) { auto* p_segments = merge_buffer.segments.ptr(0); @@ -120,7 +131,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, merge_buffer.values.destroy(); } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC { size_t reduce = thrust::reduce(exec, segment_diff.ptr(0), 
segment_diff.ptr(0) + num_segments_l, 0); @@ -130,7 +141,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, #endif // allocate target - auto segment_diff_buffers = create_buffer(num_segments_l * num_sort_ranks); + auto segment_diff_buffers = create_non_empty_buffer(num_segments_l * num_sort_ranks); { // using alltoallv to mimic allgather on subset @@ -163,7 +174,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, } // copy to transpose structure [segments][ranks] (not in-place for now) - auto segment_diff_2d = create_buffer(num_segments_l * num_sort_ranks); + auto segment_diff_2d = create_non_empty_buffer(num_segments_l * num_sort_ranks); { int pos = 0; for (size_t segment = 0; segment < num_segments_l; ++segment) { @@ -174,7 +185,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, segment_diff_buffers.destroy(); } -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC { for (size_t segment = 0; segment < num_segments_l; ++segment) { assert(0 == thrust::reduce(exec, @@ -199,11 +210,11 @@ void rebalance_data(SegmentMergePiece& merge_buffer, edge case --> send more than whole line should not happen due to sample choice! */ // 2 (signed) arrays - left/right for every segment - auto send_left = create_buffer(num_segments_l); - auto send_right = create_buffer(num_segments_l); + auto send_left = create_non_empty_buffer(num_segments_l); + auto send_right = create_non_empty_buffer(num_segments_l); // compute data to send.... 
- auto segment_diff_2d_scan = create_buffer(num_segments_l * num_sort_ranks); + auto segment_diff_2d_scan = create_non_empty_buffer(num_segments_l * num_sort_ranks); auto* segment_diff_2d_ptr = segment_diff_2d.ptr(0); auto* segment_diff_2d_scan_ptr = segment_diff_2d_scan.ptr(0); @@ -249,11 +260,11 @@ void rebalance_data(SegmentMergePiece& merge_buffer, recv_leftright_data.size = recv_left_size + recv_right_size; if (argsort) { - send_leftright_data.indices = create_buffer(send_leftright_data.size); - recv_leftright_data.indices = create_buffer(recv_leftright_data.size); + send_leftright_data.indices = create_non_empty_buffer(send_leftright_data.size); + recv_leftright_data.indices = create_non_empty_buffer(recv_leftright_data.size); } else { - send_leftright_data.values = create_buffer(send_leftright_data.size); - recv_leftright_data.values = create_buffer(recv_leftright_data.size); + send_leftright_data.values = create_non_empty_buffer(send_leftright_data.size); + recv_leftright_data.values = create_non_empty_buffer(recv_leftright_data.size); } // copy into send buffer @@ -523,8 +534,8 @@ void sample_sort_nd( size_t num_samples_l = num_samples_per_segment_l * num_segments_l; size_t num_samples_per_segment_g = num_samples_per_segment_l * num_sort_ranks; size_t num_samples_g = num_samples_per_segment_g * num_segments_l; - auto samples_l = create_buffer>(num_samples_l); - auto samples_g = create_buffer>(num_samples_g); + auto samples_l = create_non_empty_buffer>(num_samples_l); + auto samples_g = create_non_empty_buffer>(num_samples_g); auto* p_samples = samples_l.ptr(0); auto* local_values = local_sorted.values.ptr(0); @@ -619,10 +630,10 @@ void sample_sort_nd( // segment_blocks[r][segment]->global position in data for segment and r // perform blocksize wide scan on size_send[r][block*blocksize] within warp - auto segment_blocks = create_buffer(num_sort_ranks * num_segments_l); + auto segment_blocks = create_non_empty_buffer(num_sort_ranks * num_segments_l); // 
initialize sizes to send [r][segment] - auto size_send = create_buffer(num_sort_ranks * (num_segments_l + 1)); + auto size_send = create_non_empty_buffer(num_sort_ranks * (num_segments_l + 1)); auto p_size_send = size_send.ptr(0); std::fill(p_size_send, p_size_send + num_sort_ranks * (num_segments_l + 1), 0); @@ -670,7 +681,7 @@ void sample_sort_nd( // cleanup intermediate data structures samples_g.destroy(); -#ifdef DEBUG_CUNUMERIC +#ifdef DEBUG_CUPYNUMERIC { size_t total_send = 0; for (size_t r = 0; r < num_sort_ranks; ++r) { @@ -685,7 +696,7 @@ void sample_sort_nd( ///////////////////////////////////////////////////////////////////////////////////////////////// // all2all exchange send/receive sizes [r][segment] - auto size_recv = create_buffer(num_sort_ranks * (num_segments_l + 1)); + auto size_recv = create_non_empty_buffer(num_sort_ranks * (num_segments_l + 1)); { // workaround - using alltoallv @@ -714,11 +725,13 @@ } // copy values into send buffer - auto val_send_buffer = create_buffer(volume); - auto idc_send_buffer = create_buffer(argsort ? volume : 0); + auto val_send_buffer = create_non_empty_buffer(volume); + auto idc_send_buffer = create_non_empty_buffer(argsort ? volume : 0); auto* local_indices = local_sorted.indices.ptr(0); - auto positions = create_buffer(num_sort_ranks); + // This line is particularly problematic, as the following line makes an out-of-bounds access when + the buffer is empty + auto positions = create_non_empty_buffer(num_sort_ranks); positions[0] = 0; for (size_t sort_rank = 1; sort_rank < num_sort_ranks; ++sort_rank) { positions[sort_rank] = @@ -759,7 +772,7 @@ void sample_sort_nd( total_receive += size_recv[sort_rank * (num_segments_l + 1) + num_segments_l]; } - merge_buffer.segments = create_buffer(num_segments_l > 1 ?
total_receive : 0); if (num_segments_l > 1) { auto* p_segments = merge_buffer.segments.ptr(0); // initialize segment information @@ -774,8 +787,8 @@ void sample_sort_nd( assert(start_pos == total_receive); } - merge_buffer.values = create_buffer(total_receive); - merge_buffer.indices = create_buffer(argsort ? total_receive : 0); + merge_buffer.values = create_non_empty_buffer(total_receive); + merge_buffer.indices = create_non_empty_buffer(argsort ? total_receive : 0); merge_buffer.size = total_receive; } @@ -944,13 +957,14 @@ struct SortImplBodyCpu { VAL* values_ptr = nullptr; if (argsort) { // make a buffer for input - auto input_copy = create_buffer(volume); + auto input_copy = create_buffer(volume, legate::Memory::Kind::NO_MEMKIND, alignof(VAL)); local_sorted.values = input_copy; values_ptr = input_copy.ptr(0); // initialize indices if (need_distributed_sort) { - auto indices_buffer = create_buffer(volume); + auto indices_buffer = + create_buffer(volume, legate::Memory::Kind::NO_MEMKIND, alignof(int64_t)); indices_ptr = indices_buffer.ptr(0); local_sorted.indices = indices_buffer; local_sorted.size = volume; @@ -975,11 +989,13 @@ struct SortImplBodyCpu { } else { // initialize output if (need_distributed_sort) { - auto input_copy = create_buffer(volume); - values_ptr = input_copy.ptr(0); - local_sorted.values = input_copy; - local_sorted.indices = create_buffer(0); - local_sorted.size = volume; + auto input_copy = + create_buffer(volume, legate::Memory::Kind::NO_MEMKIND, alignof(VAL)); + values_ptr = input_copy.ptr(0); + local_sorted.values = input_copy; + local_sorted.indices = + create_buffer(0, legate::Memory::Kind::NO_MEMKIND, alignof(int64_t)); + local_sorted.size = volume; } else { AccessorWO output = output_array.write_accessor(rect); assert(rect.empty() || output.accessor.is_dense_row_major(rect)); @@ -1050,4 +1066,4 @@ struct SortImplBodyCpu { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort_omp.cc 
b/src/cupynumeric/sort/sort_omp.cc similarity index 93% rename from src/cunumeric/sort/sort_omp.cc rename to src/cupynumeric/sort/sort_omp.cc index 373ecfe095..47e687f640 100644 --- a/src/cunumeric/sort/sort_omp.cc +++ b/src/cupynumeric/sort/sort_omp.cc @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/sort.h" -#include "cunumeric/sort/sort_cpu.inl" -#include "cunumeric/sort/sort_template.inl" +#include "cupynumeric/sort/sort.h" +#include "cupynumeric/sort/sort_cpu.inl" +#include "cupynumeric/sort/sort_template.inl" #include #include @@ -25,7 +25,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -72,4 +72,4 @@ struct SortImplBody { sort_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/sort_template.inl b/src/cupynumeric/sort/sort_template.inl similarity index 97% rename from src/cunumeric/sort/sort_template.inl rename to src/cupynumeric/sort/sort_template.inl index 7963310152..a3c8f169fe 100644 --- a/src/cunumeric/sort/sort_template.inl +++ b/src/cupynumeric/sort/sort_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/sort/sort.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/sort/sort.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -112,4 +112,4 @@ static void sort_template(TaskContext& context) args.input.dim(), args.input.code(), SortImpl{}, args, context, context.communicators()); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort.cuh b/src/cupynumeric/sort/thrust_sort.cuh similarity index 94% rename from src/cunumeric/sort/thrust_sort.cuh rename to src/cupynumeric/sort/thrust_sort.cuh index df643573fa..c14428b2c0 100644 --- a/src/cunumeric/sort/thrust_sort.cuh +++ b/src/cupynumeric/sort/thrust_sort.cuh @@ -17,17 +17,17 @@ #pragma once #include "legate/data/buffer.h" -#include 
"cunumeric/utilities/thrust_allocator.h" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/utilities/thrust_allocator.h" +#include "cupynumeric/utilities/thrust_util.h" #include #include #include #include -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { namespace detail { using namespace legate; @@ -48,12 +48,12 @@ void thrust_local_sort(const VAL* values_in, if (values_in != values_out) { // not in-place --> need a copy - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( values_out, values_in, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); } if (indices_in != indices_out) { // not in-place --> need a copy - CUNUMERIC_CHECK_CUDA(cudaMemcpyAsync( + CUPYNUMERIC_CHECK_CUDA(cudaMemcpyAsync( indices_out, values_in, sizeof(int64_t) * volume, cudaMemcpyDeviceToDevice, stream)); } @@ -123,4 +123,4 @@ void thrust_local_sort(const VAL* values_in, } } // namespace detail -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort.h b/src/cupynumeric/sort/thrust_sort.h similarity index 99% rename from src/cunumeric/sort/thrust_sort.h rename to src/cupynumeric/sort/thrust_sort.h index 0dcd12a90c..4e578c24b4 100644 --- a/src/cunumeric/sort/thrust_sort.h +++ b/src/cupynumeric/sort/thrust_sort.h @@ -16,7 +16,7 @@ #pragma once -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const bool* values_in, bool* values_out, @@ -144,4 +144,4 @@ void thrust_local_sort(const complex* values_in, const bool stable, cudaStream_t stream); -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_bool.cu b/src/cupynumeric/sort/thrust_sort_bool.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_bool.cu rename to src/cupynumeric/sort/thrust_sort_bool.cu index 3406171eb8..e5b1358e95 100644 --- a/src/cunumeric/sort/thrust_sort_bool.cu +++ 
b/src/cupynumeric/sort/thrust_sort_bool.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const bool* values_in, bool* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const bool* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_complex128.cu b/src/cupynumeric/sort/thrust_sort_complex128.cu similarity index 92% rename from src/cunumeric/sort/thrust_sort_complex128.cu rename to src/cupynumeric/sort/thrust_sort_complex128.cu index 978afa6918..97c1a5d41a 100644 --- a/src/cunumeric/sort/thrust_sort_complex128.cu +++ b/src/cupynumeric/sort/thrust_sort_complex128.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const complex* values_in, complex* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const complex* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_complex64.cu b/src/cupynumeric/sort/thrust_sort_complex64.cu similarity index 92% rename from src/cunumeric/sort/thrust_sort_complex64.cu rename to src/cupynumeric/sort/thrust_sort_complex64.cu index 15a6072255..9f5f959e85 100644 --- a/src/cunumeric/sort/thrust_sort_complex64.cu +++ b/src/cupynumeric/sort/thrust_sort_complex64.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const complex* values_in, complex* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const complex* values_in, values_in, values_out, 
indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_double.cu b/src/cupynumeric/sort/thrust_sort_double.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_double.cu rename to src/cupynumeric/sort/thrust_sort_double.cu index 0b3d54db1d..ea783fa2a6 100644 --- a/src/cunumeric/sort/thrust_sort_double.cu +++ b/src/cupynumeric/sort/thrust_sort_double.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const double* values_in, double* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const double* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_float.cu b/src/cupynumeric/sort/thrust_sort_float.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_float.cu rename to src/cupynumeric/sort/thrust_sort_float.cu index a32af26012..182ecdb3ef 100644 --- a/src/cunumeric/sort/thrust_sort_float.cu +++ b/src/cupynumeric/sort/thrust_sort_float.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const float* values_in, float* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const float* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_half.cu b/src/cupynumeric/sort/thrust_sort_half.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_half.cu rename to src/cupynumeric/sort/thrust_sort_half.cu index 86467247e2..a582d919b6 100644 --- 
a/src/cunumeric/sort/thrust_sort_half.cu +++ b/src/cupynumeric/sort/thrust_sort_half.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const __half* values_in, __half* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const __half* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_int16.cu b/src/cupynumeric/sort/thrust_sort_int16.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_int16.cu rename to src/cupynumeric/sort/thrust_sort_int16.cu index d0f80ac6d5..d4f14ccffe 100644 --- a/src/cunumeric/sort/thrust_sort_int16.cu +++ b/src/cupynumeric/sort/thrust_sort_int16.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const int16_t* values_in, int16_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const int16_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_int32.cu b/src/cupynumeric/sort/thrust_sort_int32.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_int32.cu rename to src/cupynumeric/sort/thrust_sort_int32.cu index 8217b19c18..7398e16213 100644 --- a/src/cunumeric/sort/thrust_sort_int32.cu +++ b/src/cupynumeric/sort/thrust_sort_int32.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const int32_t* values_in, int32_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const int32_t* values_in, values_in, values_out, indices_in, 
indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_int64.cu b/src/cupynumeric/sort/thrust_sort_int64.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_int64.cu rename to src/cupynumeric/sort/thrust_sort_int64.cu index 238bc828d0..a4a2faf9ef 100644 --- a/src/cunumeric/sort/thrust_sort_int64.cu +++ b/src/cupynumeric/sort/thrust_sort_int64.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const int64_t* values_in, int64_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const int64_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_int8.cu b/src/cupynumeric/sort/thrust_sort_int8.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_int8.cu rename to src/cupynumeric/sort/thrust_sort_int8.cu index 8ce4fbcff4..b9d7aa06b1 100644 --- a/src/cunumeric/sort/thrust_sort_int8.cu +++ b/src/cupynumeric/sort/thrust_sort_int8.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const int8_t* values_in, int8_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const int8_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_uint16.cu b/src/cupynumeric/sort/thrust_sort_uint16.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_uint16.cu rename to src/cupynumeric/sort/thrust_sort_uint16.cu index 31d0db9b19..569f9be78d 100644 --- a/src/cunumeric/sort/thrust_sort_uint16.cu +++ 
b/src/cupynumeric/sort/thrust_sort_uint16.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const uint16_t* values_in, uint16_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const uint16_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_uint32.cu b/src/cupynumeric/sort/thrust_sort_uint32.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_uint32.cu rename to src/cupynumeric/sort/thrust_sort_uint32.cu index 318a1e991b..571c5b5006 100644 --- a/src/cunumeric/sort/thrust_sort_uint32.cu +++ b/src/cupynumeric/sort/thrust_sort_uint32.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const uint32_t* values_in, uint32_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const uint32_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_uint64.cu b/src/cupynumeric/sort/thrust_sort_uint64.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_uint64.cu rename to src/cupynumeric/sort/thrust_sort_uint64.cu index e457cfb9b3..d63a8acaf9 100644 --- a/src/cunumeric/sort/thrust_sort_uint64.cu +++ b/src/cupynumeric/sort/thrust_sort_uint64.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const uint64_t* values_in, uint64_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const uint64_t* values_in, values_in, values_out, indices_in, indices_out, 
volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/sort/thrust_sort_uint8.cu b/src/cupynumeric/sort/thrust_sort_uint8.cu similarity index 91% rename from src/cunumeric/sort/thrust_sort_uint8.cu rename to src/cupynumeric/sort/thrust_sort_uint8.cu index 873d51796a..9d99fc2eff 100644 --- a/src/cunumeric/sort/thrust_sort_uint8.cu +++ b/src/cupynumeric/sort/thrust_sort_uint8.cu @@ -14,9 +14,9 @@ * */ -#include "cunumeric/sort/thrust_sort.cuh" +#include "cupynumeric/sort/thrust_sort.cuh" -namespace cunumeric { +namespace cupynumeric { void thrust_local_sort(const uint8_t* values_in, uint8_t* values_out, @@ -31,4 +31,4 @@ void thrust_local_sort(const uint8_t* values_in, values_in, values_out, indices_in, indices_out, volume, sort_dim_size, stable, stream); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/bincount.cc b/src/cupynumeric/stat/bincount.cc similarity index 87% rename from src/cunumeric/stat/bincount.cc rename to src/cupynumeric/stat/bincount.cc index b18c95d8db..0db2c105cc 100644 --- a/src/cunumeric/stat/bincount.cc +++ b/src/cupynumeric/stat/bincount.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/stat/bincount.h" -#include "cunumeric/stat/bincount_template.inl" +#include "cupynumeric/stat/bincount.h" +#include "cupynumeric/stat/bincount_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -58,7 +58,10 @@ struct BincountImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { BincountTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + BincountTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/bincount.cu b/src/cupynumeric/stat/bincount.cu similarity index 96% rename from src/cunumeric/stat/bincount.cu rename to src/cupynumeric/stat/bincount.cu 
index 314b0f00ec..fa2c79eaf1 100644 --- a/src/cunumeric/stat/bincount.cu +++ b/src/cupynumeric/stat/bincount.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/stat/bincount.h" -#include "cunumeric/stat/bincount_template.inl" +#include "cupynumeric/stat/bincount.h" +#include "cupynumeric/stat/bincount_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __device__ inline void _bincount(int32_t* bins, @@ -183,7 +183,7 @@ struct BincountImplBody { bincount_kernel_rd_global <<>>(lhs, rhs, volume, rect.lo); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } void operator()(AccessorRD, false, 1> lhs, @@ -212,7 +212,7 @@ struct BincountImplBody { weighted_bincount_kernel_rd_global <<>>(lhs, rhs, weights, volume, rect.lo); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -221,4 +221,4 @@ struct BincountImplBody { bincount_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/bincount.h b/src/cupynumeric/stat/bincount.h similarity index 80% rename from src/cunumeric/stat/bincount.h rename to src/cupynumeric/stat/bincount.h index 2571737150..f737a67857 100644 --- a/src/cunumeric/stat/bincount.h +++ b/src/cupynumeric/stat/bincount.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct BincountArgs { legate::PhysicalStore lhs{nullptr}; @@ -27,9 +27,10 @@ struct BincountArgs { bool has_weights; }; -class BincountTask : public CuNumericTask { +class BincountTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_BINCOUNT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_BINCOUNT}}; public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +42,4 
@@ class BincountTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/bincount_omp.cc b/src/cupynumeric/stat/bincount_omp.cc similarity index 96% rename from src/cunumeric/stat/bincount_omp.cc rename to src/cupynumeric/stat/bincount_omp.cc index a0fe21817b..c3cb67ee4a 100644 --- a/src/cunumeric/stat/bincount_omp.cc +++ b/src/cupynumeric/stat/bincount_omp.cc @@ -14,12 +14,12 @@ * */ -#include "cunumeric/stat/bincount.h" -#include "cunumeric/stat/bincount_template.inl" +#include "cupynumeric/stat/bincount.h" +#include "cupynumeric/stat/bincount_template.inl" #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -111,4 +111,4 @@ struct BincountImplBody { bincount_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/bincount_template.inl b/src/cupynumeric/stat/bincount_template.inl similarity index 85% rename from src/cunumeric/stat/bincount_template.inl rename to src/cupynumeric/stat/bincount_template.inl index 02706fa8f4..66bc9c723c 100644 --- a/src/cunumeric/stat/bincount_template.inl +++ b/src/cupynumeric/stat/bincount_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/stat/bincount.h" +#include "cupynumeric/stat/bincount.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -42,12 +42,12 @@ struct BincountImpl { auto rhs = args.rhs.read_accessor(rect); if (args.has_weights) { auto weights = args.weights.read_accessor(rect); - auto lhs = - args.lhs.reduce_accessor, KIND != VariantKind::GPU, 1>(lhs_rect); + auto lhs = args.lhs.reduce_accessor < SumReduction, KIND != VariantKind::GPU, + 1 > (lhs_rect); BincountImplBody()(lhs, rhs, weights, rect, lhs_rect); } else { - auto lhs = - args.lhs.reduce_accessor, KIND != VariantKind::GPU, 1>(lhs_rect); + auto lhs = args.lhs.reduce_accessor < SumReduction, KIND != VariantKind::GPU, + 1 > (lhs_rect); 
BincountImplBody()(lhs, rhs, rect, lhs_rect); } } @@ -78,4 +78,4 @@ static void bincount_template(TaskContext& context) type_dispatch(args.rhs.code(), BincountImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram.cc b/src/cupynumeric/stat/histogram.cc similarity index 86% rename from src/cunumeric/stat/histogram.cc rename to src/cupynumeric/stat/histogram.cc index c8c19fd117..e76155ef71 100644 --- a/src/cunumeric/stat/histogram.cc +++ b/src/cupynumeric/stat/histogram.cc @@ -14,11 +14,11 @@ * */ -#include "cunumeric/stat/histogram.h" -#include "cunumeric/stat/histogram_template.inl" +#include "cupynumeric/stat/histogram.h" +#include "cupynumeric/stat/histogram_template.inl" -#include "cunumeric/stat/histogram_cpu.h" -#include "cunumeric/stat/histogram_impl.h" +#include "cupynumeric/stat/histogram_cpu.h" +#include "cupynumeric/stat/histogram_impl.h" #include #include @@ -30,7 +30,7 @@ #include #endif -namespace cunumeric { +namespace cupynumeric { using namespace legate; template @@ -69,10 +69,10 @@ struct HistogramImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { HistogramTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram.cu b/src/cupynumeric/stat/histogram.cu similarity index 86% rename from src/cunumeric/stat/histogram.cu rename to src/cupynumeric/stat/histogram.cu index ece9cca26a..ff8912daad 100644 --- a/src/cunumeric/stat/histogram.cu +++ b/src/cupynumeric/stat/histogram.cu @@ -14,15 +14,15 @@ * */ -#include "cunumeric/stat/histogram.h" -#include "cunumeric/stat/histogram_template.inl" +#include "cupynumeric/stat/histogram.h" +#include "cupynumeric/stat/histogram_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -#include "cunumeric/stat/histogram.cuh" -#include 
"cunumeric/stat/histogram_impl.h" +#include "cupynumeric/stat/histogram.cuh" +#include "cupynumeric/stat/histogram_impl.h" -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/utilities/thrust_util.h" #include @@ -36,7 +36,7 @@ #include #endif -namespace cunumeric { +namespace cupynumeric { template struct HistogramImplBody { @@ -74,4 +74,4 @@ struct HistogramImplBody { histogram_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram.cuh b/src/cupynumeric/stat/histogram.cuh similarity index 93% rename from src/cunumeric/stat/histogram.cuh rename to src/cupynumeric/stat/histogram.cuh index 1d5ec1b38b..94c63587c7 100644 --- a/src/cunumeric/stat/histogram.cuh +++ b/src/cupynumeric/stat/histogram.cuh @@ -38,11 +38,11 @@ #include #endif -#include "cunumeric/utilities/thrust_util.h" +#include "cupynumeric/utilities/thrust_util.h" -#include "cunumeric/stat/histogram_gen.h" +#include "cupynumeric/stat/histogram_gen.h" -namespace cunumeric { +namespace cupynumeric { namespace detail { // device specialization: @@ -108,8 +108,8 @@ template struct sync_policy_t>> { sync_policy_t() {} - void operator()(cudaStream_t stream) { CUNUMERIC_CHECK_CUDA_STREAM(stream); } + void operator()(cudaStream_t stream) { CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; } // namespace detail -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram.h b/src/cupynumeric/stat/histogram.h similarity index 65% rename from src/cunumeric/stat/histogram.h rename to src/cupynumeric/stat/histogram.h index f2ee746566..f086e351b5 100644 --- a/src/cunumeric/stat/histogram.h +++ b/src/cupynumeric/stat/histogram.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct HistogramArgs { legate::PhysicalStore result; @@ -27,9 +27,14 @@ struct HistogramArgs { legate::PhysicalStore weights; }; 
-class HistogramTask : public CuNumericTask { +class HistogramTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_HISTOGRAM}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_HISTOGRAM}}; + + static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto OMP_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +46,4 @@ class HistogramTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram_cpu.h b/src/cupynumeric/stat/histogram_cpu.h similarity index 95% rename from src/cunumeric/stat/histogram_cpu.h rename to src/cupynumeric/stat/histogram_cpu.h index 28677e52a1..c162737d69 100644 --- a/src/cunumeric/stat/histogram_cpu.h +++ b/src/cupynumeric/stat/histogram_cpu.h @@ -30,13 +30,13 @@ #include #if LEGATE_DEFINED(LEGATE_USE_CUDA) and LEGATE_DEFINED(LEGATE_NVCC) -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" #else #define cudaStream_t std::int32_t #endif -#include "cunumeric/stat/histogram_gen.h" +#include "cupynumeric/stat/histogram_gen.h" -namespace cunumeric { +namespace cupynumeric { namespace detail { // host specialization: @@ -93,4 +93,4 @@ struct sync_policy_t @@ -177,4 +177,4 @@ void histogram_wrapper(exe_policy_t exe_pol, } } // namespace detail -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram_omp.cc b/src/cupynumeric/stat/histogram_omp.cc similarity index 94% rename from src/cunumeric/stat/histogram_omp.cc rename to src/cupynumeric/stat/histogram_omp.cc index edae0fa769..26078ebbb2 100644 --- a/src/cunumeric/stat/histogram_omp.cc +++ b/src/cupynumeric/stat/histogram_omp.cc 
@@ -14,14 +14,14 @@ * */ -#include "cunumeric/stat/histogram.h" -#include "cunumeric/stat/histogram_template.inl" +#include "cupynumeric/stat/histogram.h" +#include "cupynumeric/stat/histogram_template.inl" #define _USE_THRUST_ #ifdef _USE_THRUST_ -#include "cunumeric/stat/histogram_cpu.h" -#include "cunumeric/stat/histogram_impl.h" +#include "cupynumeric/stat/histogram_cpu.h" +#include "cupynumeric/stat/histogram_impl.h" #endif #include @@ -30,7 +30,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { using namespace legate; template @@ -118,4 +118,4 @@ struct HistogramImplBody { histogram_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/stat/histogram_template.inl b/src/cupynumeric/stat/histogram_template.inl similarity index 96% rename from src/cunumeric/stat/histogram_template.inl rename to src/cupynumeric/stat/histogram_template.inl index 4fc1a69e03..a132acd929 100644 --- a/src/cunumeric/stat/histogram_template.inl +++ b/src/cupynumeric/stat/histogram_template.inl @@ -17,9 +17,9 @@ #pragma once // Useful for IDEs -#include "cunumeric/stat/histogram.h" +#include "cupynumeric/stat/histogram.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -75,4 +75,4 @@ static void histogram_template(TaskContext& context) type_dispatch(args.src.code(), HistogramImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ternary/where.cc b/src/cupynumeric/ternary/where.cc similarity index 87% rename from src/cunumeric/ternary/where.cc rename to src/cupynumeric/ternary/where.cc index 3ee46c2c35..f74929c258 100644 --- a/src/cunumeric/ternary/where.cc +++ b/src/cupynumeric/ternary/where.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/ternary/where.h" -#include "cunumeric/ternary/where_template.inl" +#include "cupynumeric/ternary/where.h" +#include "cupynumeric/ternary/where_template.inl" -namespace cunumeric { +namespace cupynumeric { using 
namespace legate; @@ -59,7 +59,10 @@ struct WhereImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { WhereTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + WhereTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ternary/where.cu b/src/cupynumeric/ternary/where.cu similarity index 92% rename from src/cunumeric/ternary/where.cu rename to src/cupynumeric/ternary/where.cu index c665d40045..b810124b2c 100644 --- a/src/cunumeric/ternary/where.cu +++ b/src/cupynumeric/ternary/where.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/ternary/where.h" -#include "cunumeric/ternary/where_template.inl" +#include "cupynumeric/ternary/where.h" +#include "cupynumeric/ternary/where_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -71,7 +71,7 @@ struct WhereImplBody { generic_kernel<<>>( volume, out, mask, in1, in2, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -80,4 +80,4 @@ struct WhereImplBody { where_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ternary/where.h b/src/cupynumeric/ternary/where.h similarity index 80% rename from src/cunumeric/ternary/where.h rename to src/cupynumeric/ternary/where.h index 702c90adea..03a9dbf20c 100644 --- a/src/cunumeric/ternary/where.h +++ b/src/cupynumeric/ternary/where.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct WhereArgs { legate::PhysicalStore out; @@ -27,9 +27,9 @@ struct WhereArgs { legate::PhysicalStore in2; }; -class WhereTask : public CuNumericTask { +class 
WhereTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_WHERE}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_WHERE}}; public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +41,4 @@ class WhereTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ternary/where_omp.cc b/src/cupynumeric/ternary/where_omp.cc similarity index 92% rename from src/cunumeric/ternary/where_omp.cc rename to src/cupynumeric/ternary/where_omp.cc index 6a8928ee42..e7ff77bc2a 100644 --- a/src/cunumeric/ternary/where_omp.cc +++ b/src/cupynumeric/ternary/where_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/ternary/where.h" -#include "cunumeric/ternary/where_template.inl" +#include "cupynumeric/ternary/where.h" +#include "cupynumeric/ternary/where_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -59,4 +59,4 @@ struct WhereImplBody { where_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/ternary/where_template.inl b/src/cupynumeric/ternary/where_template.inl similarity index 94% rename from src/cunumeric/ternary/where_template.inl rename to src/cupynumeric/ternary/where_template.inl index 1e3d2001a0..61d1fda87f 100644 --- a/src/cunumeric/ternary/where_template.inl +++ b/src/cupynumeric/ternary/where_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/ternary/where.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/ternary/where.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -70,4 +70,4 @@ static void where_template(TaskContext& context) double_dispatch(dim, args.out.code(), WhereImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/transform/flip.cc 
b/src/cupynumeric/transform/flip.cc similarity index 84% rename from src/cunumeric/transform/flip.cc rename to src/cupynumeric/transform/flip.cc index 17dd7b0896..5b45cea010 100644 --- a/src/cunumeric/transform/flip.cc +++ b/src/cupynumeric/transform/flip.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/transform/flip.h" -#include "cunumeric/transform/flip_template.inl" +#include "cupynumeric/transform/flip.h" +#include "cupynumeric/transform/flip_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -49,7 +49,10 @@ struct FlipImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { FlipTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + FlipTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/transform/flip.cu b/src/cupynumeric/transform/flip.cu similarity index 87% rename from src/cunumeric/transform/flip.cu rename to src/cupynumeric/transform/flip.cu index ab7bb0111e..b56c29d9f4 100644 --- a/src/cunumeric/transform/flip.cu +++ b/src/cupynumeric/transform/flip.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/transform/flip.h" -#include "cunumeric/transform/flip_template.inl" +#include "cupynumeric/transform/flip.h" +#include "cupynumeric/transform/flip_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -59,14 +59,14 @@ struct FlipImplBody { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; auto num_axes = axes.size(); - auto gpu_axes = create_buffer(num_axes, Memory::Kind::Z_COPY_MEM); + auto gpu_axes = create_buffer(num_axes, Memory::Kind::Z_COPY_MEM, sizeof(int32_t)); for (uint32_t idx = 0; idx < num_axes; ++idx) { gpu_axes[idx] = axes[idx]; } auto stream = get_cached_stream(); flip_kernel<<>>( 
volume, out, in, pitches, rect, gpu_axes, num_axes); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -75,4 +75,4 @@ struct FlipImplBody { flip_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/transform/flip.h b/src/cupynumeric/transform/flip.h similarity index 74% rename from src/cunumeric/transform/flip.h rename to src/cupynumeric/transform/flip.h index 18b6ec1128..f9692128eb 100644 --- a/src/cunumeric/transform/flip.h +++ b/src/cupynumeric/transform/flip.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { struct FlipArgs { legate::PhysicalStore in; @@ -26,9 +26,11 @@ struct FlipArgs { legate::Span axes; }; -class FlipTask : public CuNumericTask { +class FlipTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_FLIP}; + static inline const auto TASK_CONFIG = legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_FLIP}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -40,4 +42,4 @@ class FlipTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/transform/flip_omp.cc b/src/cupynumeric/transform/flip_omp.cc similarity index 91% rename from src/cunumeric/transform/flip_omp.cc rename to src/cupynumeric/transform/flip_omp.cc index eb7e640136..fce3b1d7a6 100644 --- a/src/cunumeric/transform/flip_omp.cc +++ b/src/cupynumeric/transform/flip_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/transform/flip.h" -#include "cunumeric/transform/flip_template.inl" +#include "cupynumeric/transform/flip.h" +#include "cupynumeric/transform/flip_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -50,4 +50,4 @@ struct 
FlipImplBody { flip_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/transform/flip_template.inl b/src/cupynumeric/transform/flip_template.inl similarity index 92% rename from src/cunumeric/transform/flip_template.inl rename to src/cupynumeric/transform/flip_template.inl index b4956622b4..d74ec8a95a 100644 --- a/src/cunumeric/transform/flip_template.inl +++ b/src/cupynumeric/transform/flip_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/transform/flip.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/transform/flip.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,4 +57,4 @@ static void flip_template(TaskContext& context) double_dispatch(args.in.dim(), args.in.code(), FlipImpl{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/typedefs.h b/src/cupynumeric/typedefs.h similarity index 93% rename from src/cunumeric/typedefs.h rename to src/cupynumeric/typedefs.h index 840cf875aa..ad9ea1229f 100644 --- a/src/cunumeric/typedefs.h +++ b/src/cupynumeric/typedefs.h @@ -20,9 +20,9 @@ #include "legate.h" -namespace cunumeric { +namespace cupynumeric { using Array = legate::PhysicalStore; using Scalar = legate::Scalar; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert.cc b/src/cupynumeric/unary/convert.cc similarity index 87% rename from src/cunumeric/unary/convert.cc rename to src/cupynumeric/unary/convert.cc index 78ece02ebb..fb08696c64 100644 --- a/src/cunumeric/unary/convert.cc +++ b/src/cupynumeric/unary/convert.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/convert.h" -#include "cunumeric/unary/convert_template.inl" +#include "cupynumeric/unary/convert.h" +#include "cupynumeric/unary/convert_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,7 +57,10 @@ struct 
ConvertImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ConvertTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + ConvertTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert.cu b/src/cupynumeric/unary/convert.cu similarity index 91% rename from src/cunumeric/unary/convert.cu rename to src/cupynumeric/unary/convert.cu index 564473a4e0..903ffc929b 100644 --- a/src/cunumeric/unary/convert.cu +++ b/src/cupynumeric/unary/convert.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/unary/convert.h" -#include "cunumeric/unary/convert_template.inl" +#include "cupynumeric/unary/convert.h" +#include "cupynumeric/unary/convert_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ -68,7 +68,7 @@ struct ConvertImplBody { generic_kernel<<>>( volume, func, out, in, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -77,4 +77,4 @@ struct ConvertImplBody { convert_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert.h b/src/cupynumeric/unary/convert.h similarity index 76% rename from src/cunumeric/unary/convert.h rename to src/cupynumeric/unary/convert.h index 3d29d0bf7c..835b0d1cb7 100644 --- a/src/cunumeric/unary/convert.h +++ b/src/cupynumeric/unary/convert.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/convert_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/convert_util.h" -namespace cunumeric { +namespace cupynumeric { struct ConvertArgs { legate::PhysicalStore out; @@ -27,9 +27,10 @@ struct ConvertArgs { ConvertCode nan_op; }; 
-class ConvertTask : public CuNumericTask { +class ConvertTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_CONVERT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_CONVERT}}; public: static void cpu_variant(legate::TaskContext context); @@ -41,4 +42,4 @@ class ConvertTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert_omp.cc b/src/cupynumeric/unary/convert_omp.cc similarity index 92% rename from src/cunumeric/unary/convert_omp.cc rename to src/cupynumeric/unary/convert_omp.cc index d0823daf34..6273e99f28 100644 --- a/src/cunumeric/unary/convert_omp.cc +++ b/src/cupynumeric/unary/convert_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/convert.h" -#include "cunumeric/unary/convert_template.inl" +#include "cupynumeric/unary/convert.h" +#include "cupynumeric/unary/convert_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -57,4 +57,4 @@ struct ConvertImplBody { convert_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert_template.inl b/src/cupynumeric/unary/convert_template.inl similarity index 95% rename from src/cunumeric/unary/convert_template.inl rename to src/cupynumeric/unary/convert_template.inl index d78f13b4db..c252112050 100644 --- a/src/cunumeric/unary/convert_template.inl +++ b/src/cupynumeric/unary/convert_template.inl @@ -17,11 +17,11 @@ #pragma once // Useful for IDEs -#include "cunumeric/unary/convert.h" -#include "cunumeric/pitches.h" -#include "cunumeric/unary/convert_util.h" +#include "cupynumeric/unary/convert.h" +#include "cupynumeric/pitches.h" +#include "cupynumeric/unary/convert_util.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -106,4 +106,4 @@ static void convert_template(TaskContext& context) 
type_dispatch(args.in.code(), SourceTypeDispatch{}, args); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/convert_util.h b/src/cupynumeric/unary/convert_util.h similarity index 81% rename from src/cunumeric/unary/convert_util.h rename to src/cupynumeric/unary/convert_util.h index 08951f6b12..ef2a1ce534 100644 --- a/src/cunumeric/unary/convert_util.h +++ b/src/cupynumeric/unary/convert_util.h @@ -16,15 +16,15 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/isnan.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/isnan.h" -namespace cunumeric { +namespace cupynumeric { enum class ConvertCode : int { - NOOP = CUNUMERIC_CONVERT_NAN_NOOP, - PROD = CUNUMERIC_CONVERT_NAN_PROD, - SUM = CUNUMERIC_CONVERT_NAN_SUM, + NOOP = CUPYNUMERIC_CONVERT_NAN_NOOP, + PROD = CUPYNUMERIC_CONVERT_NAN_PROD, + SUM = CUPYNUMERIC_CONVERT_NAN_SUM, }; template @@ -112,7 +112,7 @@ struct ConvertOp { legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast(1) : static_cast(src); + return cupynumeric::is_nan(src) ? static_cast(1) : static_cast(src); } template { !legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast(1) : static_cast(src.real()); + return cupynumeric::is_nan(src) ? static_cast(1) : static_cast(src.real()); } }; @@ -131,15 +131,15 @@ struct ConvertOp { template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast<__half>(1) - : static_cast<__half>(static_cast(src)); + return cupynumeric::is_nan(src) ? static_cast<__half>(1) + : static_cast<__half>(static_cast(src)); } template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? 
static_cast<__half>(1) - : static_cast<__half>(static_cast(src.real())); + return cupynumeric::is_nan(src) ? static_cast<__half>(1) + : static_cast<__half>(static_cast(src.real())); } }; @@ -149,8 +149,8 @@ struct ConvertOp { constexpr DST operator()(const __half& src) const { - return cunumeric::is_nan(src) ? static_cast(1) - : static_cast(static_cast(src)); + return cupynumeric::is_nan(src) ? static_cast(1) + : static_cast(static_cast(src)); } }; @@ -164,7 +164,7 @@ struct ConvertOp { legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast(0) : static_cast(src); + return cupynumeric::is_nan(src) ? static_cast(0) : static_cast(src); } template { !legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast(0) : static_cast(src.real()); + return cupynumeric::is_nan(src) ? static_cast(0) : static_cast(src.real()); } }; @@ -183,15 +183,15 @@ struct ConvertOp { template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast<__half>(0) - : static_cast<__half>(static_cast(src)); + return cupynumeric::is_nan(src) ? static_cast<__half>(0) + : static_cast<__half>(static_cast(src)); } template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { - return cunumeric::is_nan(src) ? static_cast<__half>(0) - : static_cast<__half>(static_cast(src.real())); + return cupynumeric::is_nan(src) ? static_cast<__half>(0) + : static_cast<__half>(static_cast(src.real())); } }; @@ -201,9 +201,9 @@ struct ConvertOp { constexpr DST operator()(const __half& src) const { - return cunumeric::is_nan(src) ? static_cast(0) - : static_cast(static_cast(src)); + return cupynumeric::is_nan(src) ? 
static_cast(0) + : static_cast(static_cast(src)); } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/isnan.h b/src/cupynumeric/unary/isnan.h similarity index 95% rename from src/cunumeric/unary/isnan.h rename to src/cupynumeric/unary/isnan.h index 1f3dacd253..9808b55153 100644 --- a/src/cunumeric/unary/isnan.h +++ b/src/cupynumeric/unary/isnan.h @@ -18,7 +18,7 @@ #include "legate/utilities/typedefs.h" -namespace cunumeric { +namespace cupynumeric { template ::value>* = nullptr> constexpr bool is_nan(const T& x) @@ -44,4 +44,4 @@ __CUDA_HD__ inline bool is_nan(const __half& x) return isnan(x); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cupynumeric/unary/scalar_unary_red.cc similarity index 78% rename from src/cunumeric/unary/scalar_unary_red.cc rename to src/cupynumeric/unary/scalar_unary_red.cc index 94640035af..02c9a86585 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cupynumeric/unary/scalar_unary_red.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/scalar_unary_red.h" -#include "cunumeric/unary/scalar_unary_red_template.inl" +#include "cupynumeric/unary/scalar_unary_red.h" +#include "cupynumeric/unary/scalar_unary_red_template.inl" -namespace cunumeric { +namespace cupynumeric { /*static*/ void ScalarUnaryRedTask::cpu_variant(TaskContext context) { @@ -26,10 +26,10 @@ namespace cunumeric { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +const auto cupynumeric_reg_task_ = []() -> char { ScalarUnaryRedTask::register_variants(); -} + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cupynumeric/unary/scalar_unary_red.cu similarity index 73% rename from src/cunumeric/unary/scalar_unary_red.cu rename to src/cupynumeric/unary/scalar_unary_red.cu index 71521be73c..0dfc12d3ab 100644 --- 
a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cupynumeric/unary/scalar_unary_red.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/scalar_unary_red.h" -#include "cunumeric/unary/scalar_unary_red_template.inl" -#include "cunumeric/execution_policy/reduction/scalar_reduction.cuh" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/scalar_unary_red.h" +#include "cupynumeric/unary/scalar_unary_red_template.inl" +#include "cupynumeric/execution_policy/reduction/scalar_reduction.cuh" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -28,4 +28,4 @@ using namespace legate; scalar_unary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cupynumeric/unary/scalar_unary_red.h similarity index 73% rename from src/cunumeric/unary/scalar_unary_red.h rename to src/cupynumeric/unary/scalar_unary_red.h index 0941896ab5..20aa30eb2b 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cupynumeric/unary/scalar_unary_red.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/unary_red_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/unary_red_util.h" -namespace cunumeric { +namespace cupynumeric { struct ScalarUnaryRedArgs { legate::PhysicalStore out; @@ -31,9 +31,12 @@ struct ScalarUnaryRedArgs { }; // Unary reduction task that produces scalar results -class ScalarUnaryRedTask : public CuNumericTask { +class ScalarUnaryRedTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_SCALAR_UNARY_RED}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_SCALAR_UNARY_RED}}; + + static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); public: static void cpu_variant(legate::TaskContext context); @@ -45,4 
+48,4 @@ class ScalarUnaryRedTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cupynumeric/unary/scalar_unary_red_omp.cc similarity index 76% rename from src/cunumeric/unary/scalar_unary_red_omp.cc rename to src/cupynumeric/unary/scalar_unary_red_omp.cc index 1bba055b3e..f711580ab2 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cupynumeric/unary/scalar_unary_red_omp.cc @@ -14,15 +14,15 @@ * */ -#include "cunumeric/unary/scalar_unary_red.h" -#include "cunumeric/unary/scalar_unary_red_template.inl" -#include "cunumeric/execution_policy/reduction/scalar_reduction_omp.h" +#include "cupynumeric/unary/scalar_unary_red.h" +#include "cupynumeric/unary/scalar_unary_red_template.inl" +#include "cupynumeric/execution_policy/reduction/scalar_reduction_omp.h" -namespace cunumeric { +namespace cupynumeric { /*static*/ void ScalarUnaryRedTask::omp_variant(TaskContext context) { scalar_unary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cupynumeric/unary/scalar_unary_red_template.inl similarity index 95% rename from src/cunumeric/unary/scalar_unary_red_template.inl rename to src/cupynumeric/unary/scalar_unary_red_template.inl index 762b10ddfe..20d20bb50d 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cupynumeric/unary/scalar_unary_red_template.inl @@ -18,13 +18,13 @@ // Useful for IDEs #include -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/scalar_unary_red.h" -#include "cunumeric/unary/unary_red_util.h" -#include "cunumeric/pitches.h" -#include "cunumeric/execution_policy/reduction/scalar_reduction.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/scalar_unary_red.h" +#include "cupynumeric/unary/unary_red_util.h" +#include "cupynumeric/pitches.h" +#include 
"cupynumeric/execution_policy/reduction/scalar_reduction.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -219,4 +219,4 @@ static void scalar_unary_red_template(TaskContext& context) op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args, has_where); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op.cc b/src/cupynumeric/unary/unary_op.cc similarity index 93% rename from src/cunumeric/unary/unary_op.cc rename to src/cupynumeric/unary/unary_op.cc index 547e75ec08..0341e8e094 100644 --- a/src/cunumeric/unary/unary_op.cc +++ b/src/cupynumeric/unary/unary_op.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/unary_op.h" -#include "cunumeric/unary/unary_op_template.inl" +#include "cupynumeric/unary/unary_op.h" +#include "cupynumeric/unary/unary_op_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -113,7 +113,10 @@ struct MultiOutUnaryOpImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { UnaryOpTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + UnaryOpTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op.cu b/src/cupynumeric/unary/unary_op.cu similarity index 95% rename from src/cunumeric/unary/unary_op.cu rename to src/cupynumeric/unary/unary_op.cu index 88838f958b..31b2eb3b5b 100644 --- a/src/cunumeric/unary/unary_op.cu +++ b/src/cupynumeric/unary/unary_op.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/unary/unary_op.h" -#include "cunumeric/unary/unary_op_template.inl" +#include "cupynumeric/unary/unary_op.h" +#include "cupynumeric/unary/unary_op_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) @@ 
-95,7 +95,7 @@ struct UnaryOpImplBody { generic_kernel<<>>( volume, func, out, in, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -117,7 +117,7 @@ struct PointCopyImplBody { } else { generic_copy_kernel<<>>(volume, out, in, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -183,7 +183,7 @@ struct MultiOutUnaryOpImplBody { generic_kernel_multiout<<>>( volume, func, lhs, rhs1, rhs2, pitches, rect); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -192,4 +192,4 @@ struct MultiOutUnaryOpImplBody { unary_op_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op.h b/src/cupynumeric/unary/unary_op.h similarity index 71% rename from src/cunumeric/unary/unary_op.h rename to src/cupynumeric/unary/unary_op.h index 973ed916bb..89b40c77ed 100644 --- a/src/cunumeric/unary/unary_op.h +++ b/src/cupynumeric/unary/unary_op.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/unary_op_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/unary_op_util.h" -namespace cunumeric { +namespace cupynumeric { struct UnaryOpArgs { legate::PhysicalStore in; @@ -35,9 +35,10 @@ struct MultiOutUnaryOpArgs { UnaryOpCode op_code; }; -class UnaryOpTask : public CuNumericTask { +class UnaryOpTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNARY_OP}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNARY_OP}}; public: static void cpu_variant(legate::TaskContext context); @@ -112,52 +113,52 @@ constexpr decltype(auto) double_dispatch(int dim, int point_dim, Functor f, Fnar switch (dim) { #if LEGATE_MAX_DIM >= 1 case 1: { - return cunumeric::inner_type_dispatch_fn<1>{}(point_dim, f, std::forward(args)...); + return 
cupynumeric::inner_type_dispatch_fn<1>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 2 case 2: { - return cunumeric::inner_type_dispatch_fn<2>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<2>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 3 case 3: { - return cunumeric::inner_type_dispatch_fn<3>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<3>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 4 case 4: { - return cunumeric::inner_type_dispatch_fn<4>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<4>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 5 case 5: { - return cunumeric::inner_type_dispatch_fn<5>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<5>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 6 case 6: { - return cunumeric::inner_type_dispatch_fn<6>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<6>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 7 case 7: { - return cunumeric::inner_type_dispatch_fn<7>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<7>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 8 case 8: { - return cunumeric::inner_type_dispatch_fn<8>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<8>{}(point_dim, f, std::forward(args)...); } #endif #if LEGATE_MAX_DIM >= 9 case 9: { - return cunumeric::inner_type_dispatch_fn<9>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<9>{}(point_dim, f, std::forward(args)...); } #endif } assert(false); - return cunumeric::inner_type_dispatch_fn<1>{}(point_dim, f, std::forward(args)...); + return cupynumeric::inner_type_dispatch_fn<1>{}(point_dim, f, 
std::forward(args)...); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op_omp.cc b/src/cupynumeric/unary/unary_op_omp.cc similarity index 96% rename from src/cunumeric/unary/unary_op_omp.cc rename to src/cupynumeric/unary/unary_op_omp.cc index fad475fce9..4c42344cd7 100644 --- a/src/cunumeric/unary/unary_op_omp.cc +++ b/src/cupynumeric/unary/unary_op_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/unary_op.h" -#include "cunumeric/unary/unary_op_template.inl" +#include "cupynumeric/unary/unary_op.h" +#include "cupynumeric/unary/unary_op_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -117,4 +117,4 @@ struct MultiOutUnaryOpImplBody { unary_op_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op_template.inl b/src/cupynumeric/unary/unary_op_template.inl similarity index 96% rename from src/cunumeric/unary/unary_op_template.inl rename to src/cupynumeric/unary/unary_op_template.inl index 3642f0b8b0..0df142530c 100644 --- a/src/cunumeric/unary/unary_op_template.inl +++ b/src/cupynumeric/unary/unary_op_template.inl @@ -17,10 +17,10 @@ #pragma once // Useful for IDEs -#include "cunumeric/unary/unary_op.h" -#include "cunumeric/pitches.h" +#include "cupynumeric/unary/unary_op.h" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -172,7 +172,7 @@ struct UnaryOpDispatch { auto dim = std::max(args.in.dim(), 1); if ((OP_CODE == UnaryOpCode::COPY) && (args.in.code() == Type::Code::FIXED_ARRAY)) { auto type = args.in.type().as_fixed_array_type(); - cunumeric::double_dispatch(dim, type.num_elements(), UnaryCopyImpl{}, args); + cupynumeric::double_dispatch(dim, type.num_elements(), UnaryCopyImpl{}, args); } else { auto code = OP_CODE == UnaryOpCode::GETARG ? 
args.out.code() : args.in.code(); legate::double_dispatch(dim, code, UnaryOpImpl{}, args); @@ -215,4 +215,4 @@ static void unary_op_template(TaskContext& context) } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_op_util.h b/src/cupynumeric/unary/unary_op_util.h similarity index 85% rename from src/cunumeric/unary/unary_op_util.h rename to src/cupynumeric/unary/unary_op_util.h index 4bcd6ec8f6..bb9d617c48 100644 --- a/src/cunumeric/unary/unary_op_util.h +++ b/src/cupynumeric/unary/unary_op_util.h @@ -16,9 +16,9 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" #ifdef __NVCC__ #include "thrust/complex.h" @@ -29,58 +29,58 @@ #include #include -namespace cunumeric { +namespace cupynumeric { enum class UnaryOpCode : int { - ABSOLUTE = CUNUMERIC_UOP_ABSOLUTE, - ANGLE = CUNUMERIC_UOP_ANGLE, - ARCCOS = CUNUMERIC_UOP_ARCCOS, - ARCCOSH = CUNUMERIC_UOP_ARCCOSH, - ARCSIN = CUNUMERIC_UOP_ARCSIN, - ARCSINH = CUNUMERIC_UOP_ARCSINH, - ARCTAN = CUNUMERIC_UOP_ARCTAN, - ARCTANH = CUNUMERIC_UOP_ARCTANH, - CBRT = CUNUMERIC_UOP_CBRT, - CEIL = CUNUMERIC_UOP_CEIL, - CLIP = CUNUMERIC_UOP_CLIP, - CONJ = CUNUMERIC_UOP_CONJ, - COPY = CUNUMERIC_UOP_COPY, - COS = CUNUMERIC_UOP_COS, - COSH = CUNUMERIC_UOP_COSH, - DEG2RAD = CUNUMERIC_UOP_DEG2RAD, - EXP = CUNUMERIC_UOP_EXP, - EXP2 = CUNUMERIC_UOP_EXP2, - EXPM1 = CUNUMERIC_UOP_EXPM1, - FLOOR = CUNUMERIC_UOP_FLOOR, - FREXP = CUNUMERIC_UOP_FREXP, - GETARG = CUNUMERIC_UOP_GETARG, - IMAG = CUNUMERIC_UOP_IMAG, - INVERT = CUNUMERIC_UOP_INVERT, - ISFINITE = CUNUMERIC_UOP_ISFINITE, - ISINF = CUNUMERIC_UOP_ISINF, - ISNAN = CUNUMERIC_UOP_ISNAN, - LOG = CUNUMERIC_UOP_LOG, - LOG10 = CUNUMERIC_UOP_LOG10, - LOG1P = CUNUMERIC_UOP_LOG1P, - LOG2 = CUNUMERIC_UOP_LOG2, - LOGICAL_NOT = CUNUMERIC_UOP_LOGICAL_NOT, - MODF = CUNUMERIC_UOP_MODF, - NEGATIVE = 
CUNUMERIC_UOP_NEGATIVE, - POSITIVE = CUNUMERIC_UOP_POSITIVE, - RAD2DEG = CUNUMERIC_UOP_RAD2DEG, - REAL = CUNUMERIC_UOP_REAL, - RECIPROCAL = CUNUMERIC_UOP_RECIPROCAL, - RINT = CUNUMERIC_UOP_RINT, - ROUND = CUNUMERIC_UOP_ROUND, - SIGN = CUNUMERIC_UOP_SIGN, - SIGNBIT = CUNUMERIC_UOP_SIGNBIT, - SIN = CUNUMERIC_UOP_SIN, - SINH = CUNUMERIC_UOP_SINH, - SQRT = CUNUMERIC_UOP_SQRT, - SQUARE = CUNUMERIC_UOP_SQUARE, - TAN = CUNUMERIC_UOP_TAN, - TANH = CUNUMERIC_UOP_TANH, - TRUNC = CUNUMERIC_UOP_TRUNC, + ABSOLUTE = CUPYNUMERIC_UOP_ABSOLUTE, + ANGLE = CUPYNUMERIC_UOP_ANGLE, + ARCCOS = CUPYNUMERIC_UOP_ARCCOS, + ARCCOSH = CUPYNUMERIC_UOP_ARCCOSH, + ARCSIN = CUPYNUMERIC_UOP_ARCSIN, + ARCSINH = CUPYNUMERIC_UOP_ARCSINH, + ARCTAN = CUPYNUMERIC_UOP_ARCTAN, + ARCTANH = CUPYNUMERIC_UOP_ARCTANH, + CBRT = CUPYNUMERIC_UOP_CBRT, + CEIL = CUPYNUMERIC_UOP_CEIL, + CLIP = CUPYNUMERIC_UOP_CLIP, + CONJ = CUPYNUMERIC_UOP_CONJ, + COPY = CUPYNUMERIC_UOP_COPY, + COS = CUPYNUMERIC_UOP_COS, + COSH = CUPYNUMERIC_UOP_COSH, + DEG2RAD = CUPYNUMERIC_UOP_DEG2RAD, + EXP = CUPYNUMERIC_UOP_EXP, + EXP2 = CUPYNUMERIC_UOP_EXP2, + EXPM1 = CUPYNUMERIC_UOP_EXPM1, + FLOOR = CUPYNUMERIC_UOP_FLOOR, + FREXP = CUPYNUMERIC_UOP_FREXP, + GETARG = CUPYNUMERIC_UOP_GETARG, + IMAG = CUPYNUMERIC_UOP_IMAG, + INVERT = CUPYNUMERIC_UOP_INVERT, + ISFINITE = CUPYNUMERIC_UOP_ISFINITE, + ISINF = CUPYNUMERIC_UOP_ISINF, + ISNAN = CUPYNUMERIC_UOP_ISNAN, + LOG = CUPYNUMERIC_UOP_LOG, + LOG10 = CUPYNUMERIC_UOP_LOG10, + LOG1P = CUPYNUMERIC_UOP_LOG1P, + LOG2 = CUPYNUMERIC_UOP_LOG2, + LOGICAL_NOT = CUPYNUMERIC_UOP_LOGICAL_NOT, + MODF = CUPYNUMERIC_UOP_MODF, + NEGATIVE = CUPYNUMERIC_UOP_NEGATIVE, + POSITIVE = CUPYNUMERIC_UOP_POSITIVE, + RAD2DEG = CUPYNUMERIC_UOP_RAD2DEG, + REAL = CUPYNUMERIC_UOP_REAL, + RECIPROCAL = CUPYNUMERIC_UOP_RECIPROCAL, + RINT = CUPYNUMERIC_UOP_RINT, + ROUND = CUPYNUMERIC_UOP_ROUND, + SIGN = CUPYNUMERIC_UOP_SIGN, + SIGNBIT = CUPYNUMERIC_UOP_SIGNBIT, + SIN = CUPYNUMERIC_UOP_SIN, + SINH = CUPYNUMERIC_UOP_SINH, + SQRT = 
CUPYNUMERIC_UOP_SQRT, + SQUARE = CUPYNUMERIC_UOP_SQUARE, + TAN = CUPYNUMERIC_UOP_TAN, + TANH = CUPYNUMERIC_UOP_TANH, + TRUNC = CUPYNUMERIC_UOP_TRUNC, }; template @@ -240,7 +240,6 @@ struct UnaryOp { !std::is_integral<_T>::value>* = nullptr> constexpr _T operator()(const _T& x) const { - using std::fabs; return static_cast<_T>(fabs(x)); } }; @@ -279,11 +278,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::acos; - return acos(x); - } + constexpr decltype(auto) operator()(const T& x) const { return acos(x); } }; template @@ -293,11 +288,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::acosh; - return acosh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return acosh(x); } }; template <> @@ -309,7 +300,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::acosh; return __half{acosh(static_cast(x))}; } }; @@ -321,11 +311,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::asin; - return asin(x); - } + constexpr decltype(auto) operator()(const T& x) const { return asin(x); } }; template @@ -335,11 +321,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::asinh; - return asinh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return asinh(x); } }; template <> @@ -351,7 +333,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::asinh; return __half{asinh(static_cast(x))}; } }; @@ -363,11 +344,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::atan; - return atan(x); - } + constexpr decltype(auto) operator()(const T& x) const { return atan(x); } }; template @@ -377,11 +354,7 
@@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::atanh; - return atanh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return atanh(x); } }; template <> @@ -393,7 +366,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::atanh; return __half{atanh(static_cast(x))}; } }; @@ -405,11 +377,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::cbrt; - return cbrt(x); - } + constexpr decltype(auto) operator()(const T& x) const { return cbrt(x); } }; template <> @@ -421,7 +389,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::cbrt; return __half{cbrt(static_cast(x))}; } }; @@ -433,11 +400,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::ceil; - return ceil(x); - } + constexpr decltype(auto) operator()(const T& x) const { return ceil(x); } }; template @@ -494,11 +457,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::cos; - return cos(x); - } + constexpr decltype(auto) operator()(const T& x) const { return cos(x); } }; template @@ -508,11 +467,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::cosh; - return cosh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return cosh(x); } }; template <> @@ -524,7 +479,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::cosh; return __half{cosh(static_cast(x))}; } }; @@ -559,11 +513,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::exp; - return exp(x); - } + constexpr decltype(auto) operator()(const T& x) const { 
return exp(x); } }; template @@ -576,12 +526,13 @@ struct UnaryOp { template ::value>* = nullptr> constexpr T operator()(const T& x) const { - return std::exp2(x); + return exp2(x); } template ::value>* = nullptr> constexpr T operator()(const T& x) const { + // we can keep using std:: here since CUDA version will use thrust:: using std::exp; using std::log; #ifdef __NVCC__ @@ -602,7 +553,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::exp2; return __half{exp2(static_cast(x))}; } }; @@ -617,13 +567,14 @@ struct UnaryOp { template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { - using std::expm1; return expm1(x); } template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { + // CUDA's "exp" function does not directly support complex numbers, + // so using one from std using std::exp; return exp(x) - T(1); } @@ -638,7 +589,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::expm1; return __half{expm1(static_cast(x))}; } }; @@ -650,11 +600,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::floor; - return floor(x); - } + constexpr decltype(auto) operator()(const T& x) const { return floor(x); } }; template @@ -704,13 +650,13 @@ struct UnaryOp { template ::value>* = nullptr> __CUDA_HD__ bool operator()(const T& x) const { - return std::isfinite(x); + return isfinite(x); } template __CUDA_HD__ bool operator()(const complex<_T>& x) const { - return std::isfinite(x.imag()) && std::isfinite(x.real()); + return isfinite(x.imag()) && isfinite(x.real()); } __CUDA_HD__ bool operator()(const __half& x) const { return isfinite(static_cast(x)); } @@ -732,13 +678,13 @@ struct UnaryOp { template ::value>* = nullptr> __CUDA_HD__ bool operator()(const T& x) const { - return std::isinf(x); + return isinf(x); } template __CUDA_HD__ bool operator()(const complex<_T>& x) 
const { - return std::isinf(x.imag()) || std::isinf(x.real()); + return isinf(x.imag()) || isinf(x.real()); } __CUDA_HD__ bool operator()(const __half& x) const { return isinf(static_cast(x)); } @@ -760,14 +706,13 @@ struct UnaryOp { template ::value>* = nullptr> __CUDA_HD__ bool operator()(const T& x) const { - using std::isnan; return isnan(x); } template __CUDA_HD__ bool operator()(const complex<_T>& x) const { - return std::isnan(x.imag()) || std::isnan(x.real()); + return isnan(x.imag()) || isnan(x.real()); } __CUDA_HD__ bool operator()(const __half& x) const { return isnan(x); } @@ -781,11 +726,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::log; - return log(x); - } + constexpr decltype(auto) operator()(const T& x) const { return log(x); } }; template @@ -796,11 +737,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::log10; - return log10(x); - } + constexpr decltype(auto) operator()(const T& x) const { return log10(x); } }; template <> @@ -812,8 +749,7 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::log10; - return __half{log10(static_cast(x))}; + return __half{log10f(static_cast(x))}; } }; @@ -828,14 +764,12 @@ struct UnaryOp { template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { - using std::log1p; return log1p(x); } template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { - using std::log; return log(T(1) + x); } }; @@ -849,8 +783,7 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::log1p; - return __half{log1p(static_cast(x))}; + return __half{log1pf(static_cast(x))}; } }; @@ -865,14 +798,12 @@ struct UnaryOp { template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { - using std::log2; return log2(x); } template ::value>* = 
nullptr> constexpr decltype(auto) operator()(const T& x) const { - using std::log; return log(x) / log(T{2}); } }; @@ -886,8 +817,7 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::sinh; - return __half{log2(static_cast(x))}; + return __half{log2f(static_cast(x))}; } }; @@ -991,13 +921,13 @@ struct UnaryOp { template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { - return _T(std::rint(x.real()), std::rint(x.imag())); + return _T(rint(x.real()), rint(x.imag())); } template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { - return std::rint(x); + return rint(x); } }; @@ -1010,7 +940,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::rint; return __half{rint(static_cast(x))}; } }; @@ -1032,17 +961,17 @@ struct UnaryOp { { if constexpr (legate::is_complex_type::value) { if (decimals < 0) { - return T{static_cast(std::rint(x.real() / factor) * factor), - static_cast(std::rint(x.imag() / factor) * factor)}; + return T{static_cast(rint(x.real() / factor) * factor), + static_cast(rint(x.imag() / factor) * factor)}; } else { - return T{static_cast(std::rint(x.real() * factor) / factor), - static_cast(std::rint(x.imag() * factor) / factor)}; + return T{static_cast(rint(x.real() * factor) / factor), + static_cast(rint(x.imag() * factor) / factor)}; } } else { if (decimals < 0) { - return static_cast(std::rint(x / factor) * factor); + return static_cast(rint(x / factor) * factor); } else { - return static_cast(std::rint(x * factor) / factor); + return static_cast(rint(x * factor) / factor); } } } @@ -1067,9 +996,10 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { if (decimals < 0) { - return static_cast<__half>(std::rint(static_cast(x) / factor) * factor); + return static_cast<__half>(rint(static_cast(x) / factor) * factor); } else { - return static_cast<__half>(std::rint(static_cast(x) * factor) / factor); + __half 
fh = static_cast<__half>(factor); + return static_cast<__half>(rint(static_cast(x * fh)) / factor); } } @@ -1103,11 +1033,11 @@ struct UnaryOp { template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { - if (x.real() != 0) { - return _T(detail::sign(x.real()), 0); - } else { - return _T(detail::sign(x.imag()), 0); + auto magnitude = abs(x); // Magnitude of the complex number + if (magnitude == 0) { + return _T(0, 0); // Return 0 if the input is 0 } + return x / magnitude; // Normalize to unit magnitude } template ::value>* = nullptr> @@ -1139,6 +1069,8 @@ struct UnaryOp { constexpr bool operator()(const T& x) const { + // the signbit function is not directly supported by CUDA , + // so using one from std using std::signbit; return signbit(x); } @@ -1153,6 +1085,8 @@ struct UnaryOp { __CUDA_HD__ bool operator()(const __half& x) const { + // the signbit function is not directly supported by CUDA , + // so using one from std using std::signbit; return std::signbit(static_cast(x)); } @@ -1165,11 +1099,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::sin; - return sin(x); - } + constexpr decltype(auto) operator()(const T& x) const { return sin(x); } }; template @@ -1179,11 +1109,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::sinh; - return sinh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return sinh(x); } }; template <> @@ -1195,7 +1121,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::sinh; return __half{sinh(static_cast(x))}; } }; @@ -1227,11 +1152,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::sqrt; - return sqrt(x); - } + constexpr decltype(auto) operator()(const T& x) const { return sqrt(x); } }; template @@ -1241,11 +1162,7 @@ 
struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::tan; - return tan(x); - } + constexpr decltype(auto) operator()(const T& x) const { return tan(x); } }; template @@ -1255,11 +1172,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::tanh; - return tanh(x); - } + constexpr decltype(auto) operator()(const T& x) const { return tanh(x); } }; template @@ -1269,11 +1182,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - constexpr decltype(auto) operator()(const T& x) const - { - using std::trunc; - return trunc(x); - } + constexpr decltype(auto) operator()(const T& x) const { return trunc(x); } }; template <> @@ -1285,7 +1194,6 @@ struct UnaryOp { __CUDA_HD__ __half operator()(const __half& x) const { - using std::trunc; return __half{trunc(static_cast(x))}; } }; @@ -1302,11 +1210,7 @@ struct MultiOutUnaryOp { using RHS2 = int32_t; using LHS = RHS1; - __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const - { - using std::frexp; - return frexp(rhs1, rhs2); - } + __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const { return frexp(rhs1, rhs2); } }; template <> @@ -1318,7 +1222,6 @@ struct MultiOutUnaryOp { __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const { - using std::frexp; return static_cast<__half>(frexp(static_cast(rhs1), rhs2)); } }; @@ -1330,11 +1233,7 @@ struct MultiOutUnaryOp { using RHS2 = RHS1; using LHS = RHS1; - __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const - { - using std::modf; - return modf(rhs1, rhs2); - } + __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const { return modf(rhs1, rhs2); } }; template <> @@ -1346,7 +1245,6 @@ struct MultiOutUnaryOp { __CUDA_HD__ LHS operator()(const RHS1& rhs1, RHS2* rhs2) const { - using std::modf; float tmp; float result = modf(static_cast(rhs1), &tmp); *rhs2 = static_cast<__half>(tmp); @@ -1354,4 
+1252,4 @@ struct MultiOutUnaryOp { } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red.cc b/src/cupynumeric/unary/unary_red.cc similarity index 87% rename from src/cunumeric/unary/unary_red.cc rename to src/cupynumeric/unary/unary_red.cc index 827f27a9c9..0d11ceeb5b 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cupynumeric/unary/unary_red.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/unary_red.h" -#include "cunumeric/unary/unary_red_template.inl" +#include "cupynumeric/unary/unary_red.h" +#include "cupynumeric/unary/unary_red_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -56,7 +56,10 @@ struct UnaryRedImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { UnaryRedTask::register_variants(); } +static const auto cupynumeric_reg_task_ = []() -> char { + UnaryRedTask::register_variants(); + return 0; +}(); } // namespace -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red.cu b/src/cupynumeric/unary/unary_red.cu similarity index 98% rename from src/cunumeric/unary/unary_red.cu rename to src/cupynumeric/unary/unary_red.cu index f245305e88..55f3616cbf 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cupynumeric/unary/unary_red.cu @@ -14,12 +14,12 @@ * */ -#include "cunumeric/unary/unary_red.h" -#include "cunumeric/unary/unary_red_template.inl" +#include "cupynumeric/unary/unary_red.h" +#include "cupynumeric/unary/unary_red_template.inl" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { template static constexpr T div_and_ceil(T value, T divider) @@ -353,7 +353,7 @@ struct UnaryRedImplBody { blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); Kernel<<>>( lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); - CUNUMERIC_CHECK_CUDA_STREAM(stream); + 
CUPYNUMERIC_CHECK_CUDA_STREAM(stream); } }; @@ -362,4 +362,4 @@ struct UnaryRedImplBody { unary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red.h b/src/cupynumeric/unary/unary_red.h similarity index 77% rename from src/cunumeric/unary/unary_red.h rename to src/cupynumeric/unary/unary_red.h index b86868d71b..7aca22baa2 100644 --- a/src/cunumeric/unary/unary_red.h +++ b/src/cupynumeric/unary/unary_red.h @@ -16,10 +16,10 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/unary/unary_red_util.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/unary/unary_red_util.h" -namespace cunumeric { +namespace cupynumeric { struct UnaryRedArgs { legate::PhysicalStore lhs; @@ -29,9 +29,10 @@ struct UnaryRedArgs { UnaryRedCode op_code; }; -class UnaryRedTask : public CuNumericTask { +class UnaryRedTask : public CuPyNumericTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{CUNUMERIC_UNARY_RED}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CUPYNUMERIC_UNARY_RED}}; public: static void cpu_variant(legate::TaskContext context); @@ -43,4 +44,4 @@ class UnaryRedTask : public CuNumericTask { #endif }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cupynumeric/unary/unary_red_omp.cc similarity index 95% rename from src/cunumeric/unary/unary_red_omp.cc rename to src/cupynumeric/unary/unary_red_omp.cc index 8ed659aee2..d5d885766f 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cupynumeric/unary/unary_red_omp.cc @@ -14,10 +14,10 @@ * */ -#include "cunumeric/unary/unary_red.h" -#include "cunumeric/unary/unary_red_template.inl" +#include "cupynumeric/unary/unary_red.h" +#include "cupynumeric/unary/unary_red_template.inl" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -112,4 +112,4 @@ struct UnaryRedImplBody { 
unary_red_template(context); } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cupynumeric/unary/unary_red_template.inl similarity index 89% rename from src/cunumeric/unary/unary_red_template.inl rename to src/cupynumeric/unary/unary_red_template.inl index 35bc589746..975c716c3b 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cupynumeric/unary/unary_red_template.inl @@ -17,13 +17,13 @@ #pragma once // Useful for IDEs -#include "cunumeric/unary/unary_red.h" -#include "cunumeric/unary/unary_red_util.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/pitches.h" +#include "cupynumeric/unary/unary_red.h" +#include "cupynumeric/unary/unary_red_util.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/pitches.h" -namespace cunumeric { +namespace cupynumeric { using namespace legate; @@ -50,7 +50,7 @@ struct UnaryRedImpl { auto rhs = args.rhs.read_accessor(rect); - auto lhs = args.lhs.reduce_accessor(rect); + auto lhs = args.lhs.reduce_accessor < typename OP::OP, KIND != VariantKind::GPU, DIM > (rect); AccessorRO where; if constexpr (HAS_WHERE) { @@ -95,4 +95,4 @@ static void unary_red_template(TaskContext& context) } } -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cupynumeric/unary/unary_red_util.h similarity index 94% rename from src/cunumeric/unary/unary_red_util.h rename to src/cupynumeric/unary/unary_red_util.h index 3dafba0bf6..09162507d2 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cupynumeric/unary/unary_red_util.h @@ -16,32 +16,32 @@ #pragma once -#include "cunumeric/cunumeric_task.h" -#include "cunumeric/arg.h" -#include "cunumeric/arg.inl" -#include "cunumeric/unary/isnan.h" +#include "cupynumeric/cupynumeric_task.h" +#include "cupynumeric/arg.h" +#include "cupynumeric/arg.inl" +#include "cupynumeric/unary/isnan.h" -namespace cunumeric 
{ +namespace cupynumeric { enum class UnaryRedCode : int { - ALL = CUNUMERIC_RED_ALL, - ANY = CUNUMERIC_RED_ANY, - ARGMAX = CUNUMERIC_RED_ARGMAX, - ARGMIN = CUNUMERIC_RED_ARGMIN, - CONTAINS = CUNUMERIC_RED_CONTAINS, - COUNT_NONZERO = CUNUMERIC_RED_COUNT_NONZERO, - MAX = CUNUMERIC_RED_MAX, - MIN = CUNUMERIC_RED_MIN, - NANARGMAX = CUNUMERIC_RED_NANARGMAX, - NANARGMIN = CUNUMERIC_RED_NANARGMIN, - NANMAX = CUNUMERIC_RED_NANMAX, - NANMIN = CUNUMERIC_RED_NANMIN, - NANPROD = CUNUMERIC_RED_NANPROD, - NANSUM = CUNUMERIC_RED_NANSUM, - PROD = CUNUMERIC_RED_PROD, - SUM = CUNUMERIC_RED_SUM, - SUM_SQUARES = CUNUMERIC_RED_SUM_SQUARES, - VARIANCE = CUNUMERIC_RED_VARIANCE + ALL = CUPYNUMERIC_RED_ALL, + ANY = CUPYNUMERIC_RED_ANY, + ARGMAX = CUPYNUMERIC_RED_ARGMAX, + ARGMIN = CUPYNUMERIC_RED_ARGMIN, + CONTAINS = CUPYNUMERIC_RED_CONTAINS, + COUNT_NONZERO = CUPYNUMERIC_RED_COUNT_NONZERO, + MAX = CUPYNUMERIC_RED_MAX, + MIN = CUPYNUMERIC_RED_MIN, + NANARGMAX = CUPYNUMERIC_RED_NANARGMAX, + NANARGMIN = CUPYNUMERIC_RED_NANARGMIN, + NANMAX = CUPYNUMERIC_RED_NANMAX, + NANMIN = CUPYNUMERIC_RED_NANMIN, + NANPROD = CUPYNUMERIC_RED_NANPROD, + NANSUM = CUPYNUMERIC_RED_NANSUM, + PROD = CUPYNUMERIC_RED_PROD, + SUM = CUPYNUMERIC_RED_SUM, + SUM_SQUARES = CUPYNUMERIC_RED_SUM_SQUARES, + VARIANCE = CUPYNUMERIC_RED_VARIANCE }; template @@ -608,4 +608,4 @@ struct UnaryRedOp { using OP = _RED_OP::OP; }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/utilities/repartition.cc b/src/cupynumeric/utilities/repartition.cc similarity index 98% rename from src/cunumeric/utilities/repartition.cc rename to src/cupynumeric/utilities/repartition.cc index a27e28955e..9b009939ad 100644 --- a/src/cunumeric/utilities/repartition.cc +++ b/src/cupynumeric/utilities/repartition.cc @@ -16,7 +16,7 @@ #include "repartition.h" -namespace cunumeric { +namespace cupynumeric { std::tuple elements_for_rank_in_dimension( size_t dim_length, size_t offset_id, size_t proc_id, size_t num_dim_procs, size_t 
tilesize) @@ -68,4 +68,4 @@ std::tuple elements_for_rank_in_dimension( return {num_elements, offset_elements}; } -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/utilities/repartition.cu b/src/cupynumeric/utilities/repartition.cu similarity index 98% rename from src/cunumeric/utilities/repartition.cu rename to src/cupynumeric/utilities/repartition.cu index 917c84be98..ec5d2ea1d3 100644 --- a/src/cunumeric/utilities/repartition.cu +++ b/src/cupynumeric/utilities/repartition.cu @@ -16,9 +16,9 @@ #include "repartition.h" -#include "cunumeric/cuda_help.h" +#include "cupynumeric/cuda_help.h" -namespace cunumeric { +namespace cupynumeric { using namespace Legion; using namespace legate; @@ -426,7 +426,7 @@ std::tuple, size_t, size_t> repartition_matrix_2dbc(const VAL* input assert(total_send_elements == volume); // TODO / OPTIMIZE - // in case we have the global partition information of the cuNumeric block partition + // in case we have the global partition information of the cuPyNumeric block partition // we can compute receive buffers instead and skip this all2all // same applies for inverse operation @@ -439,7 +439,7 @@ std::tuple, size_t, size_t> repartition_matrix_2dbc(const VAL* input recv_info.ptr(r * stored_size_per_rank), stored_size_per_rank, ncclUint64, r, *comm, stream)); } CHECK_NCCL(ncclGroupEnd()); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // need Z-copy synchronized to Host + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // need Z-copy synchronized to Host // allocate send/recv buffer std::vector> send_buffers; @@ -499,11 +499,11 @@ std::tuple, size_t, size_t> repartition_matrix_2dbc(const VAL* input p_c, tile_r, tile_c); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); send_buffers_ptr.destroy(); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); // 
all2all data CHECK_NCCL(ncclGroupStart()); @@ -550,11 +550,11 @@ std::tuple, size_t, size_t> repartition_matrix_2dbc(const VAL* input tile_c, (size_t)nccl_rank, num_ranks); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); recv_buffers_ptr.destroy(); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); recv_info.destroy(); for (auto&& buf : recv_buffers) { @@ -610,7 +610,7 @@ void repartition_matrix_block( offsets[2 * local_rank + 1] = num_target_cols > 0 ? target_offset_c + num_target_cols : 0; CHECK_NCCL( ncclAllGather(offsets.ptr(2 * local_rank), offsets.ptr(0), 2, ncclUint64, *comm, stream)); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); // re-arrange so that all row offsets come first for (size_t i = 1; i < num_ranks; i += 2) { @@ -697,7 +697,7 @@ void repartition_matrix_block( // Assumptions: // a. local_rank == nccl_rank == 2dbc-id (col-major) - // b. local_rank interpreted row-major (cuNumeric) should match offsets in offset mappings + // b. local_rank interpreted row-major (cuPyNumeric) should match offsets in offset mappings // c. 
offsets for ranks outside valid bounds are not considered size_t rank_r_rm = local_rank / target_p_c; size_t rank_c_rm = local_rank % target_p_c; @@ -805,7 +805,7 @@ void repartition_matrix_block( tile_r, tile_c); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); send_buffers_ptr.destroy(); } // we can destroy the input once we distributed data into the buffers @@ -904,11 +904,11 @@ void repartition_matrix_block( p_c, tile_r, tile_c); - CUNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); + CUPYNUMERIC_CHECK_CUDA(cudaStreamSynchronize(stream)); recv_buffers_ptr.destroy(); } - CUNUMERIC_CHECK_CUDA_STREAM(stream); + CUPYNUMERIC_CHECK_CUDA_STREAM(stream); // cleanup offsets_r.destroy(); @@ -1354,4 +1354,4 @@ template void repartition_matrix_block>( size_t, comm::Communicator); -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/utilities/repartition.h b/src/cupynumeric/utilities/repartition.h similarity index 94% rename from src/cunumeric/utilities/repartition.h rename to src/cupynumeric/utilities/repartition.h index d513cb7727..febbb81c9f 100644 --- a/src/cunumeric/utilities/repartition.h +++ b/src/cupynumeric/utilities/repartition.h @@ -17,9 +17,9 @@ #pragma once #include "legate.h" -#include "cunumeric/cunumeric_task.h" +#include "cupynumeric/cupynumeric_task.h" -namespace cunumeric { +namespace cupynumeric { enum BlockInfo { TOTAL_SIZE, // # values send @@ -79,7 +79,7 @@ void repartition_matrix_block( VAL* target, size_t target_volume, size_t target_lld, - // cuNumeric process grid layout (needs to match communicator size) + // cuPyNumeric process grid layout (needs to match communicator size) size_t num_target_rows, size_t num_target_cols, bool target_row_major, @@ -92,4 +92,4 @@ void repartition_matrix_block( [[nodiscard]] std::tuple elements_for_rank_in_dimension( size_t dim_length, size_t offset_id, size_t proc_id, 
size_t num_dim_procs, size_t tilesize); -} // namespace cunumeric \ No newline at end of file +} // namespace cupynumeric \ No newline at end of file diff --git a/src/cunumeric/utilities/thrust_allocator.h b/src/cupynumeric/utilities/thrust_allocator.h similarity index 95% rename from src/cunumeric/utilities/thrust_allocator.h rename to src/cupynumeric/utilities/thrust_allocator.h index 465b74974f..934df71f8a 100644 --- a/src/cunumeric/utilities/thrust_allocator.h +++ b/src/cupynumeric/utilities/thrust_allocator.h @@ -18,7 +18,7 @@ #include "legate.h" -namespace cunumeric { +namespace cupynumeric { class ThrustAllocator : public legate::ScopedAllocator { public: @@ -34,4 +34,4 @@ class ThrustAllocator : public legate::ScopedAllocator { void deallocate(char* ptr, size_t n) { ScopedAllocator::deallocate(ptr); } }; -} // namespace cunumeric +} // namespace cupynumeric diff --git a/src/cunumeric/utilities/thrust_util.h b/src/cupynumeric/utilities/thrust_util.h similarity index 100% rename from src/cunumeric/utilities/thrust_util.h rename to src/cupynumeric/utilities/thrust_util.h diff --git a/src/env_defaults.h b/src/env_defaults.h index ec63f96027..0307aaa9ac 100644 --- a/src/env_defaults.h +++ b/src/env_defaults.h @@ -14,7 +14,7 @@ * */ -// These values are copied manually in cunumeric.settings and there is a Python +// These values are copied manually in cupynumeric.settings and there is a Python // unit test that will maintain that these values and the Python settings // values agree. If these values are modified, the corresponding Python values // must also be updated. 
diff --git a/test.py b/test.py index e8111d5824..27752fee0c 100755 --- a/test.py +++ b/test.py @@ -18,29 +18,46 @@ import sys -import legate.tester -from legate.tester import CustomTest +from legate.tester import CustomTest, FeatureType from legate.tester.config import Config +from legate.tester.project import Project from legate.tester.test_plan import TestPlan from legate.tester.test_system import TestSystem +from legate.util.types import EnvDict + + +class CPNProject(Project): + def custom_files(self) -> list[CustomTest]: + return [ + CustomTest("examples/quantiles.py"), + CustomTest("examples/sort.py"), + CustomTest("tests/integration/test_argsort.py"), + CustomTest("tests/integration/test_msort.py"), + CustomTest("tests/integration/test_nanpercentiles.py"), + CustomTest("tests/integration/test_nanquantiles.py"), + CustomTest("tests/integration/test_partition.py"), + CustomTest("tests/integration/test_percentiles.py"), + CustomTest("tests/integration/test_quantiles.py"), + CustomTest("tests/integration/test_sort_complex.py"), + CustomTest("tests/integration/test_sort.py"), + CustomTest("tests/integration/test_unique.py"), + ] + + def stage_env(self, feature: FeatureType) -> EnvDict: + match feature: + case "eager": + return { + "CUPYNUMERIC_FORCE_THUNK": "eager", + "CUPYNUMERIC_MIN_CPU_CHUNK": "2000000000", + "CUPYNUMERIC_MIN_OMP_CHUNK": "2000000000", + "CUPYNUMERIC_MIN_GPU_CHUNK": "2000000000", + } + case _: + return {} -legate.tester.CUSTOM_FILES = [ - CustomTest("examples/quantiles.py"), - CustomTest("examples/sort.py"), - CustomTest("tests/integration/test_argsort.py"), - CustomTest("tests/integration/test_msort.py"), - CustomTest("tests/integration/test_nanpercentiles.py"), - CustomTest("tests/integration/test_nanquantiles.py"), - CustomTest("tests/integration/test_partition.py"), - CustomTest("tests/integration/test_percentiles.py"), - CustomTest("tests/integration/test_quantiles.py"), - CustomTest("tests/integration/test_sort_complex.py"), - 
CustomTest("tests/integration/test_sort.py"), - CustomTest("tests/integration/test_unique.py"), -] if __name__ == "__main__": - config = Config(sys.argv) + config = Config(sys.argv, project=CPNProject()) system = TestSystem(dry_run=config.dry_run) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index c19bc0514a..89d6d8c907 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -20,7 +20,7 @@ project(cpp_tests VERSION 0.1 LANGUAGES C CXX) if(PROJECT_IS_TOP_LEVEL) # To catch people trying to build the tests from within tests/cpp instead of top-level - message(FATAL_ERROR "Error: Tests can only be built as part of the main library build. Please re-run cmake from top-level directory (\${CMAKE_SOURCE_DIR}) with -Dcunumeric_BUILD_TESTS=ON" + message(FATAL_ERROR "Error: Tests can only be built as part of the main library build. Please re-run cmake from top-level directory (\${CMAKE_SOURCE_DIR}) with -Dcupynumeric_BUILD_TESTS=ON" ) endif() @@ -37,8 +37,8 @@ include(${rapids-cmake-dir}/cpm/gtest.cmake) # BUILD_EXPORT_SET and INSTALL_EXPORT_SET are crucial, otherwise gtest does not get # installed -rapids_cpm_gtest(BUILD_EXPORT_SET cunumeric-exports - INSTALL_EXPORT_SET cunumeric-exports) +rapids_cpm_gtest(BUILD_EXPORT_SET cupynumeric-exports + INSTALL_EXPORT_SET cupynumeric-exports) file(GLOB main_SRC ${PROJECT_SOURCE_DIR}/main.cc) file(GLOB integration_SRC ${PROJECT_SOURCE_DIR}/integration/*.cc) @@ -50,7 +50,7 @@ endif() add_executable(cpp_tests ${main_SRC} ${tasks_SRC} ${integration_SRC} ${unit_SRC}) -target_link_libraries(cpp_tests PRIVATE legate::legate cunumeric::cunumeric GTest::gtest) +target_link_libraries(cpp_tests PRIVATE legate::legate cupynumeric::cupynumeric GTest::gtest) if(Legion_USE_CUDA) target_link_libraries(cpp_tests PRIVATE NCCL::NCCL) endif() diff --git a/tests/cpp/integration/common_utils.cc b/tests/cpp/integration/common_utils.cc index 107309f399..94b8cb132b 100644 --- a/tests/cpp/integration/common_utils.cc +++ 
b/tests/cpp/integration/common_utils.cc @@ -18,7 +18,7 @@ #include #include -namespace cunumeric { +namespace cupynumeric { template void show_array(NDArray& a) @@ -67,9 +67,9 @@ void debug_array(NDArray a, bool show_data) } } -} // namespace cunumeric +} // namespace cupynumeric -using namespace cunumeric; +using namespace cupynumeric; // unit test for common_utils namespace { @@ -153,7 +153,7 @@ TEST(Utils, test_ndarray_warn_and_convert) auto y = x._warn_and_convert(legate::int32()); debug_array(x); debug_array(y); - cunumeric_log().warning() << "Just a test!"; + cupynumeric_log().warning() << "Just a test!"; } TEST(Utils, test_wrap_indices_and_clip_indices) diff --git a/tests/cpp/integration/common_utils.h b/tests/cpp/integration/common_utils.h index e919c50c17..2078c4c556 100644 --- a/tests/cpp/integration/common_utils.h +++ b/tests/cpp/integration/common_utils.h @@ -27,11 +27,11 @@ #include #include "legate.h" -#include "cunumeric.h" -#include "cunumeric/runtime.h" +#include "cupynumeric.h" +#include "cupynumeric/runtime.h" #include "util.inl" -namespace cunumeric { +namespace cupynumeric { void debug_array(NDArray a, bool show_data = true); @@ -66,7 +66,7 @@ NDArray mk_array(std::vector const& values, std::vector shape = {}) } else { auto a1 = zeros({out.size()}, out.type()); assign_values(a1, values); - auto runtime = CuNumericRuntime::get_runtime(); + auto runtime = CuPyNumericRuntime::get_runtime(); auto a2 = runtime->create_array(std::move(a1.get_store().delinearize(0, shape))); out.assign(a2); } @@ -74,14 +74,14 @@ NDArray mk_array(std::vector const& values, std::vector shape = {}) } template -void check_and_wrap(NDArray& a, const std::vector& values, std::vector& shape) +void check_and_wrap(NDArray& a, const std::vector& values, std::vector& shape) { if (shape.empty() && values.size() > 1) { shape.push_back(values.size()); } ASSERT_EQ(a.size(), values.size()); ASSERT_EQ(a.shape(), shape); - ASSERT_EQ(a.type().code(), legate::type_code_of); + 
ASSERT_EQ(a.type().code(), legate::type_code_of_v); if (a.dim() > 1) { a = a._wrap(a.size()); @@ -89,7 +89,7 @@ void check_and_wrap(NDArray& a, const std::vector& values, std::vector -void check_array(NDArray a, const std::vector& values, std::vector shape = {}) +void check_array(NDArray a, const std::vector& values, std::vector shape = {}) { check_and_wrap(a, values, shape); if (a.size() == 0) { @@ -111,8 +111,8 @@ void check_array(NDArray a, const std::vector& values, std::vector sh template void check_array_near(NDArray a, const std::vector& values, - std::vector shape = {}, - double abs_error = 1.e-8) + std::vector shape = {}, + double abs_error = 1.e-8) { check_and_wrap(a, values, shape); if (a.size() == 0) { @@ -127,14 +127,14 @@ void check_array_near(NDArray a, auto acc = a.get_read_accessor(); for (size_t i = 0; i < values.size(); ++i) { - EXPECT_NEAR(acc[i], values[i], abs_error) << err_msg(i); + ASSERT_NEAR(acc[i], values[i], abs_error) << err_msg(i); } } template struct PrintArray { template - void operator()(cunumeric::NDArray array) + void operator()(cupynumeric::NDArray array) { auto acc = array.get_read_accessor(); auto& shape = array.shape(); @@ -177,7 +177,7 @@ void debug_vector(const std::vector& vec) template std::vector mk_seq_vector(std::vector shape, T a = 1, T b = 0) { - size_t size = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); + size_t size = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies<>()); std::vector v(size); std::generate(v.begin(), v.end(), [a, x = b]() mutable { return x += a; }); return v; @@ -193,4 +193,22 @@ std::vector as_type_vector(std::vector const& in) return out; } -} // namespace cunumeric +template +std::vector to_vector(NDArray a) +{ + std::vector result; + if (a.size() == 0) { + return result; + } + if (a.dim() > 1) { + a = a._wrap(a.size()); + } + auto acc = a.get_read_accessor(); + result.reserve(a.size()); + for (size_t i = 0; i < a.size(); ++i) { + 
result.push_back(acc[i]); + } + return result; +} + +} // namespace cupynumeric diff --git a/tests/cpp/integration/test_amax.cc b/tests/cpp/integration/test_amax.cc new file mode 100644 index 0000000000..3a50f66779 --- /dev/null +++ b/tests/cpp/integration/test_amax.cc @@ -0,0 +1,339 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include "common_utils.h" + +template +void test_amax(const std::vector& in_array, + const std::vector& shape, + const std::vector& expect_result, + const std::vector& expect_shape, + std::vector axis = {}, + std::optional dtype = std::nullopt, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional initial = std::nullopt, + std::optional where = std::nullopt) +{ + auto array = cupynumeric::mk_array(in_array, shape); + + if (!out.has_value()) { + auto result = cupynumeric::amax(array, axis, dtype, std::nullopt, keepdims, initial, where); + cupynumeric::check_array(result, expect_result, expect_shape); + } else { + cupynumeric::amax(array, axis, dtype, out, keepdims, initial, where); + cupynumeric::check_array(out.value(), expect_result, expect_shape); + } +} + +template +void test_amax_each_axis(const std::vector& arr, + const std::vector& shape, + std::map>& expect_results, + std::map>& expect_shapes, + bool keepdims = false, + std::optional initial = std::nullopt) +{ + int32_t dim = shape.size(); + auto df = std::nullopt; + for (int32_t axis = -dim + 1; axis 
< dim; ++axis) { + auto index = axis < 0 ? axis + dim : axis; + auto exp = expect_results[index]; + auto exp_shape = expect_shapes[index]; + auto axes = {axis}; + test_amax(arr, shape, exp, exp_shape, axes, df, df, keepdims, initial, df); + } +} + +void test_amax_basic() +{ + typedef std::map> IntResult; + typedef std::map> DoubleResult; + typedef std::map> ShapeResult; + + // Test int type - dim=1 + std::vector arr1 = {-1, 4, 5, 2, 0}; + std::vector shape1 = {5}; + ShapeResult exp_shape1 = {{0, {}}}; + ShapeResult exp_shape1_k = {{0, {1}}}; + IntResult exp1 = {{0, {5}}}; + test_amax_each_axis(arr1, shape1, exp1, exp_shape1); + test_amax_each_axis(arr1, shape1, exp1, exp_shape1_k, true); + + // Test int type - dim=2 + std::vector arr2 = {1, 0, 0, 5, 3, 2}; + std::vector shape2 = {3, 2}; + ShapeResult exp_shape2 = {{0, {2}}, {1, {3}}}; + ShapeResult exp_shape2_k = {{0, {1, 2}}, {1, {3, 1}}}; + IntResult exp2 = {{0, {3, 5}}, {1, {1, 5, 3}}}; + test_amax_each_axis(arr2, shape2, exp2, exp_shape2); + test_amax_each_axis(arr2, shape2, exp2, exp_shape2_k, true); + + // Test int type - dim=3 + std::vector arr3 = {0, 11, 2, 3, -4, 0, -6, 7}; + std::vector shape3 = {2, 2, 2}; + ShapeResult exp_shape3 = {{0, {2, 2}}, {1, {2, 2}}, {2, {2, 2}}}; + ShapeResult exp_shape3_k = {{0, {1, 2, 2}}, {1, {2, 1, 2}}, {2, {2, 2, 1}}}; + IntResult exp3 = {{0, {0, 11, 2, 7}}, {1, {2, 11, -4, 7}}, {2, {11, 3, 0, 7}}}; + test_amax_each_axis(arr3, shape3, exp3, exp_shape3); + test_amax_each_axis(arr3, shape3, exp3, exp_shape3_k, true); + + // Test float type - dim=3 + std::vector arr4 = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape4 = {3, 1, 3}; + ShapeResult exp_shape4 = {{0, {1, 3}}, {1, {3, 3}}, {2, {3, 1}}}; + ShapeResult exp_shape4_k = {{0, {1, 1, 3}}, {1, {3, 1, 3}}, {2, {3, 1, 1}}}; + DoubleResult exp4 = {{0, {0.0, 2.999, 10.0}}, {1, arr4}, {2, {10.0, 2.999, 3.0}}}; + test_amax_each_axis(arr4, shape4, exp4, exp_shape4); + test_amax_each_axis(arr4, shape4, 
exp4, exp_shape4_k, true); +} + +void test_amax_initial_input() +{ + typedef std::map> IntResult; + typedef std::map> DoubleResult; + typedef std::map> ShapeResult; + + std::vector arr1 = {0, 11, 2, 3, -4, 0, -6, 7}; + std::vector shape1 = {2, 2, 2}; + ShapeResult exp_shape1 = {{0, {2, 2}}, {1, {2, 2}}, {2, {2, 2}}}; + ShapeResult exp_shape1_k = {{0, {1, 2, 2}}, {1, {2, 1, 2}}, {2, {2, 2, 1}}}; + // use initial in each axis + auto initial1 = legate::Scalar(6); + IntResult exp1 = {{0, {6, 11, 6, 7}}, {1, {6, 11, 6, 7}}, {2, {11, 6, 6, 7}}}; + test_amax_each_axis(arr1, shape1, exp1, exp_shape1, false, initial1); + test_amax_each_axis(arr1, shape1, exp1, exp_shape1_k, true, initial1); + + std::vector arr2 = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape2 = {3, 3}; + ShapeResult exp_shape2 = {{0, {3}}, {1, {3}}}; + ShapeResult exp_shape2_k = {{0, {1, 3}}, {1, {3, 1}}}; + auto initial2 = legate::Scalar(2.9999); + DoubleResult exp2 = {{0, {2.9999, 2.9999, 10.0}}, {1, {10.0, 2.9999, 3.0}}}; + test_amax_each_axis(arr2, shape2, exp2, exp_shape2, false, initial2); + test_amax_each_axis(arr2, shape2, exp2, exp_shape2_k, true, initial2); +} + +void test_amax_dtype_input() +{ + // int to float + std::vector arr1 = {-1, 4, 5, 2, 0}; + std::vector shape1 = {5}; + std::vector exp_shape1 = {}; + auto dtype1 = legate::float64(); + std::vector exp1 = {5.0}; + test_amax(arr1, shape1, exp1, exp_shape1, {}, dtype1); + + // float to int + std::vector arr2 = {0.0, -0.99, 10.1, -5.6, 2.999, 1.51}; + std::vector shape2 = {3, 2}; + std::vector exp_shape2 = {}; + auto dtype2 = legate::int32(); + std::vector exp2 = {10}; + test_amax(arr2, shape2, exp2, exp_shape2, {}, dtype2); +} + +void test_amax_axis_input() +{ + std::vector arr = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape = {3, 1, 3}; + + std::vector axis = {-1, 0, 1}; + std::vector exp_shape = {}; + std::vector exp = {10.0}; + test_amax(arr, shape, exp, exp_shape, axis); +} + 
+void test_amax_out_input() +{ + // Test out input with dim-1 and different datatype + std::vector arr = {-1, 4, 5, 2, 0, 3}; + std::vector shape1 = {6}; + std::vector exp_shape1 = {}; + auto df = std::nullopt; + auto out1 = cupynumeric::zeros(exp_shape1, legate::int32()); + auto out1_1 = cupynumeric::zeros(exp_shape1, legate::float64()); + std::vector exp1 = {5}; + std::vector exp1_1 = {5.0}; + test_amax(arr, shape1, exp1, exp_shape1, {}, df, out1); + test_amax(arr, shape1, exp1_1, exp_shape1, {}, df, out1_1); + + // Test out input with axis, keepdims and initial params + std::vector shape2 = {2, 3}; + std::vector exp_shape2 = {2}; + std::vector exp_shape2_k = {2, 1}; + auto out2 = cupynumeric::zeros(exp_shape2, legate::int32()); + auto out2_k = cupynumeric::zeros(exp_shape2_k, legate::int32()); + std::vector axis = {-1}; + auto ini = legate::Scalar(2); + std::vector exp2 = {5, 3}; + test_amax(arr, shape2, exp2, exp_shape2, axis, df, out2); + test_amax(arr, shape2, exp2, exp_shape2_k, axis, df, out2_k, true); + + test_amax(arr, shape2, exp2, exp_shape2, axis, df, out2, false, ini); + test_amax(arr, shape2, exp2, exp_shape2_k, axis, df, out2_k, true, ini); +} + +void test_amax_max_dim() +{ + std::vector arr = {14, 10, 3, 12, 5, 13, 2, 4, 16, 8, 9, 7, 6, 11, 1, 15}; + std::vector axis = {-1}; +#if LEGATE_MAX_DIM >= 4 + std::vector shape_4d = {2, 2, 2, 2}; + std::vector exp_shape_4d = {2, 2, 2}; + std::vector exp_4d = {14, 12, 13, 4, 16, 9, 11, 15}; + test_amax(arr, shape_4d, exp_4d, exp_shape_4d, axis); +#endif + +#if LEGATE_MAX_DIM >= 5 + std::vector shape_5d = {1, 2, 2, 1, 4}; + std::vector exp_shape_5d = {1, 2, 2, 1}; + std::vector exp_5d = {14, 13, 16, 15}; + test_amax(arr, shape_5d, exp_5d, exp_shape_5d, axis); +#endif + +#if LEGATE_MAX_DIM >= 6 + std::vector shape_6d = {2, 1, 1, 2, 2, 2}; + std::vector exp_shape_6d = {2, 1, 1, 2, 2}; + std::vector exp_6d = {14, 12, 13, 4, 16, 9, 11, 15}; + test_amax(arr, shape_6d, exp_6d, exp_shape_6d, axis); +#endif + +#if 
LEGATE_MAX_DIM >= 7 + std::vector shape_7d = {2, 1, 1, 2, 1, 1, 4}; + std::vector exp_shape_7d = {2, 1, 1, 2, 1, 1}; + std::vector exp_7d = {14, 13, 16, 15}; + test_amax(arr, shape_7d, exp_7d, exp_shape_7d, axis); +#endif +} + +void test_amax_large_array() +{ + const int32_t count = 100000; + std::vector shape = {count}; + std::vector exp_shape = {}; + + // Test int type for large array + std::vector arr1(count); + for (int32_t i = 0; i < count; i++) { + arr1[i] = i + 1; + } + std::vector exp1 = {count}; + test_amax(arr1, shape, exp1, exp_shape); + + // Test float type + std::vector arr2(count); + for (int32_t i = 0; i < count; i++) { + arr2[i] = i + 1.1; + } + std::vector exp2 = {count + 0.1}; + test_amax(arr2, shape, exp2, exp_shape); +} + +void test_amax_scalar_array() +{ + std::vector arr = {10}; + std::vector shape = {}; + std::vector exp = {10}; + auto out = cupynumeric::zeros(shape, legate::int32()); + auto df = std::nullopt; + test_amax(arr, shape, exp, shape); + test_amax(arr, shape, exp, shape, {}, df, out); + + // Test with initial + auto initial = legate::Scalar(11); + std::vector exp1 = {11}; + test_amax(arr, shape, exp1, shape, {}, df, df, false, initial); +} + +void test_amax_invalid_array() +{ + // Test zero size array + std::vector arr1 = {}; + std::vector shape1 = {0}; + auto arr_emp = cupynumeric::mk_array(arr1, shape1); + EXPECT_THROW(cupynumeric::amax(arr_emp), std::invalid_argument); + + // Test complex array (not supported now) + std::vector> arr2 = {complex(0, 1), complex(1, 1)}; + std::vector shape2 = {2}; + auto arr_comp = cupynumeric::mk_array>(arr2, shape2); + EXPECT_THROW(cupynumeric::amax(arr_comp), std::runtime_error); +} + +void test_amax_invalid_axis() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + + // Test out-of-bound + std::vector axis1 = {-4, 3}; + std::vector axis2 = {0, 3}; + EXPECT_THROW(cupynumeric::amax(array, axis1), std::invalid_argument); + 
EXPECT_THROW(cupynumeric::amax(array, axis2), std::invalid_argument); + + // Test repeated axes + std::vector axis3 = {1, 1}; + std::vector axis4 = {-1, 2}; + EXPECT_THROW(cupynumeric::amax(array, axis3), std::invalid_argument); + EXPECT_THROW(cupynumeric::amax(array, axis4), std::invalid_argument); + + // Not reduce to one value (valid but not supported now) + std::vector axis5 = {0, 1}; + EXPECT_THROW(cupynumeric::amax(array, axis5), std::runtime_error); +} + +void test_amax_invalid_shape() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + auto df = std::nullopt; + + std::vector out_shape1 = {1}; + auto out1 = cupynumeric::zeros(out_shape1, legate::int32()); + EXPECT_THROW(cupynumeric::amax(array, {}, df, out1), std::invalid_argument); + + std::vector out_shape2 = {2}; + std::vector axis2 = {1}; + auto out2 = cupynumeric::zeros(out_shape2, legate::int32()); + EXPECT_THROW(cupynumeric::amax(array, axis2, df, out2), std::invalid_argument); +} + +void test_amax_invalid_dtype() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + + // Test invalid dtype + auto dtype = legate::point_type(2); + EXPECT_THROW(cupynumeric::amax(array, {}, dtype), std::invalid_argument); +} + +// void cpp_test() +TEST(Amax, BasicTest) { test_amax_basic(); } +TEST(Amax, InitialInput) { test_amax_initial_input(); } +TEST(Amax, DtypeInput) { test_amax_dtype_input(); } +TEST(Amax, AxisInput) { test_amax_axis_input(); } +TEST(Amax, OutInput) { test_amax_out_input(); } +TEST(Amax, MaxDim) { test_amax_max_dim(); } +TEST(Amax, LargeArray) { test_amax_large_array(); } +TEST(Amax, ScalarArray) { test_amax_scalar_array(); } +TEST(Amax, InvalidArray) { test_amax_invalid_array(); } +TEST(Amax, InvalidAxis) { test_amax_invalid_axis(); } +TEST(Amax, InvalidShape) { test_amax_invalid_shape(); } +TEST(Amax, InvalidDtype) { test_amax_invalid_dtype(); } diff 
--git a/tests/cpp/integration/test_amin.cc b/tests/cpp/integration/test_amin.cc new file mode 100644 index 0000000000..22e135a472 --- /dev/null +++ b/tests/cpp/integration/test_amin.cc @@ -0,0 +1,339 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include "common_utils.h" + +template +void test_amin(const std::vector& in_array, + const std::vector& shape, + const std::vector& expect_result, + const std::vector& expect_shape, + std::vector axis = {}, + std::optional dtype = std::nullopt, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional initial = std::nullopt, + std::optional where = std::nullopt) +{ + auto array = cupynumeric::mk_array(in_array, shape); + + if (!out.has_value()) { + auto result = cupynumeric::amin(array, axis, dtype, std::nullopt, keepdims, initial, where); + cupynumeric::check_array(result, expect_result, expect_shape); + } else { + cupynumeric::amin(array, axis, dtype, out, keepdims, initial, where); + cupynumeric::check_array(out.value(), expect_result, expect_shape); + } +} + +template +void test_amin_each_axis(const std::vector& arr, + const std::vector& shape, + std::map>& expect_results, + std::map>& expect_shapes, + bool keepdims = false, + std::optional initial = std::nullopt) +{ + int32_t dim = shape.size(); + auto df = std::nullopt; + for (int32_t axis = -dim + 1; axis < dim; ++axis) { + auto index = axis < 0 ? 
axis + dim : axis; + auto exp = expect_results[index]; + auto exp_shape = expect_shapes[index]; + auto axes = {axis}; + test_amin(arr, shape, exp, exp_shape, axes, df, df, keepdims, initial, df); + } +} + +void test_amin_basic() +{ + typedef std::map> IntResult; + typedef std::map> DoubleResult; + typedef std::map> ShapeResult; + + // Test int type - dim=1 + std::vector arr1 = {-1, 4, 5, 2, 0}; + std::vector shape1 = {5}; + ShapeResult exp_shape1 = {{0, {}}}; + ShapeResult exp_shape1_k = {{0, {1}}}; + IntResult exp1 = {{0, {-1}}}; + test_amin_each_axis(arr1, shape1, exp1, exp_shape1); + test_amin_each_axis(arr1, shape1, exp1, exp_shape1_k, true); + + // Test int type - dim=2 + std::vector arr2 = {1, 0, 0, 5, 3, 2}; + std::vector shape2 = {3, 2}; + ShapeResult exp_shape2 = {{0, {2}}, {1, {3}}}; + ShapeResult exp_shape2_k = {{0, {1, 2}}, {1, {3, 1}}}; + IntResult exp2 = {{0, {0, 0}}, {1, {0, 0, 2}}}; + test_amin_each_axis(arr2, shape2, exp2, exp_shape2); + test_amin_each_axis(arr2, shape2, exp2, exp_shape2_k, true); + + // Test int type - dim=3 + std::vector arr3 = {0, 11, 2, 3, -4, 0, -6, 7}; + std::vector shape3 = {2, 2, 2}; + ShapeResult exp_shape3 = {{0, {2, 2}}, {1, {2, 2}}, {2, {2, 2}}}; + ShapeResult exp_shape3_k = {{0, {1, 2, 2}}, {1, {2, 1, 2}}, {2, {2, 2, 1}}}; + IntResult exp3 = {{0, {-4, 0, -6, 3}}, {1, {0, 3, -6, 0}}, {2, {0, 2, -4, -6}}}; + test_amin_each_axis(arr3, shape3, exp3, exp_shape3); + test_amin_each_axis(arr3, shape3, exp3, exp_shape3_k, true); + + // Test float type - dim=3 + std::vector arr4 = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape4 = {3, 1, 3}; + ShapeResult exp_shape4 = {{0, {1, 3}}, {1, {3, 3}}, {2, {3, 1}}}; + ShapeResult exp_shape4_k = {{0, {1, 1, 3}}, {1, {3, 1, 3}}, {2, {3, 1, 1}}}; + DoubleResult exp4 = {{0, {-5.0, -0.99, 1.51}}, {1, arr4}, {2, {-0.99, -5.0, -1.0}}}; + test_amin_each_axis(arr4, shape4, exp4, exp_shape4); + test_amin_each_axis(arr4, shape4, exp4, exp_shape4_k, true); +} + +void 
test_amin_initial_input() +{ + typedef std::map> IntResult; + typedef std::map> DoubleResult; + typedef std::map> ShapeResult; + + std::vector arr1 = {0, 11, 2, 3, -4, 0, -6, 7}; + std::vector shape1 = {2, 2, 2}; + ShapeResult exp_shape1 = {{0, {2, 2}}, {1, {2, 2}}, {2, {2, 2}}}; + ShapeResult exp_shape1_k = {{0, {1, 2, 2}}, {1, {2, 1, 2}}, {2, {2, 2, 1}}}; + // use initial in each axis + auto initial1 = legate::Scalar(-1); + IntResult exp1 = {{0, {-4, -1, -6, -1}}, {1, {-1, -1, -6, -1}}, {2, {-1, -1, -4, -6}}}; + test_amin_each_axis(arr1, shape1, exp1, exp_shape1, false, initial1); + test_amin_each_axis(arr1, shape1, exp1, exp_shape1_k, true, initial1); + + std::vector arr2 = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape2 = {3, 3}; + ShapeResult exp_shape2 = {{0, {3}}, {1, {3}}}; + ShapeResult exp_shape2_k = {{0, {1, 3}}, {1, {3, 1}}}; + auto initial2 = legate::Scalar(0.0); + DoubleResult exp2 = {{0, {-5.0, -0.99, 0.0}}, {1, {-0.99, -5.0, -1.0}}}; + test_amin_each_axis(arr2, shape2, exp2, exp_shape2, false, initial2); + test_amin_each_axis(arr2, shape2, exp2, exp_shape2_k, true, initial2); +} + +void test_amin_dtype_input() +{ + // int to float + std::vector arr1 = {-1, 4, 5, 2, 0}; + std::vector shape1 = {5}; + std::vector exp_shape1 = {}; + auto dtype1 = legate::float64(); + std::vector exp1 = {-1.0}; + test_amin(arr1, shape1, exp1, exp_shape1, {}, dtype1); + + // float to int + std::vector arr2 = {0.0, -0.99, 10.1, -5.6, 2.999, 1.51}; + std::vector shape2 = {3, 2}; + std::vector exp_shape2 = {}; + auto dtype2 = legate::int32(); + std::vector exp2 = {-5}; + test_amin(arr2, shape2, exp2, exp_shape2, {}, dtype2); +} + +void test_amin_axis_input() +{ + std::vector arr = {0.0, -0.99, 10.0, -5.0, 2.999, 1.51, -1.0, 2.99, 3.0}; + std::vector shape = {3, 1, 3}; + + std::vector axis = {-1, 0, 1}; + std::vector exp_shape = {}; + std::vector exp = {-5.0}; + test_amin(arr, shape, exp, exp_shape, axis); +} + +void test_amin_out_input() +{ + // 
Test out input with dim-1 and different datatype + std::vector arr = {-1, 4, 5, 2, 0, 3}; + std::vector shape1 = {6}; + std::vector exp_shape1 = {}; + auto df = std::nullopt; + auto out1 = cupynumeric::zeros(exp_shape1, legate::int32()); + auto out1_1 = cupynumeric::zeros(exp_shape1, legate::float64()); + std::vector exp1 = {-1}; + std::vector exp1_1 = {-1.0}; + test_amin(arr, shape1, exp1, exp_shape1, {}, df, out1); + test_amin(arr, shape1, exp1_1, exp_shape1, {}, df, out1_1); + + // Test out input with axis, keepdims and initial params + std::vector shape2 = {2, 3}; + std::vector exp_shape2 = {2}; + std::vector exp_shape2_k = {2, 1}; + auto out2 = cupynumeric::zeros(exp_shape2, legate::int32()); + auto out2_k = cupynumeric::zeros(exp_shape2_k, legate::int32()); + std::vector axis = {-1}; + auto ini = legate::Scalar(2); + std::vector exp2 = {-1, 0}; + test_amin(arr, shape2, exp2, exp_shape2, axis, df, out2); + test_amin(arr, shape2, exp2, exp_shape2_k, axis, df, out2_k, true); + + test_amin(arr, shape2, exp2, exp_shape2, axis, df, out2, false, ini); + test_amin(arr, shape2, exp2, exp_shape2_k, axis, df, out2_k, true, ini); +} + +void test_amin_max_dim() +{ + std::vector arr = {14, 10, 3, 12, 5, 13, 2, 4, 16, 8, 9, 7, 6, 11, 1, 15}; + std::vector axis = {-1}; +#if LEGATE_MAX_DIM >= 4 + std::vector shape_4d = {2, 2, 2, 2}; + std::vector exp_shape_4d = {2, 2, 2}; + std::vector exp_4d = {10, 3, 5, 2, 8, 7, 6, 1}; + test_amin(arr, shape_4d, exp_4d, exp_shape_4d, axis); +#endif + +#if LEGATE_MAX_DIM >= 5 + std::vector shape_5d = {1, 2, 2, 1, 4}; + std::vector exp_shape_5d = {1, 2, 2, 1}; + std::vector exp_5d = {3, 2, 7, 1}; + test_amin(arr, shape_5d, exp_5d, exp_shape_5d, axis); +#endif + +#if LEGATE_MAX_DIM >= 6 + std::vector shape_6d = {2, 1, 1, 2, 2, 2}; + std::vector exp_shape_6d = {2, 1, 1, 2, 2}; + std::vector exp_6d = {10, 3, 5, 2, 8, 7, 6, 1}; + test_amin(arr, shape_6d, exp_6d, exp_shape_6d, axis); +#endif + +#if LEGATE_MAX_DIM >= 7 + std::vector shape_7d = {2, 
1, 1, 2, 1, 1, 4}; + std::vector exp_shape_7d = {2, 1, 1, 2, 1, 1}; + std::vector exp_7d = {3, 2, 7, 1}; + test_amin(arr, shape_7d, exp_7d, exp_shape_7d, axis); +#endif +} + +void test_amin_large_array() +{ + const int32_t count = 100000; + std::vector shape = {count}; + std::vector exp_shape = {}; + + // Test int type for large array + std::vector arr1(count); + for (int32_t i = 0; i < count; i++) { + arr1[i] = i + 1; + } + std::vector exp1 = {1}; + test_amin(arr1, shape, exp1, exp_shape); + + // Test float type + std::vector arr2(count); + for (int32_t i = 0; i < count; i++) { + arr2[i] = i + 1.1; + } + std::vector exp2 = {1.1}; + test_amin(arr2, shape, exp2, exp_shape); +} + +void test_amin_scalar_array() +{ + std::vector arr = {10}; + std::vector shape = {}; + std::vector exp = {10}; + auto out = cupynumeric::zeros(shape, legate::int32()); + auto df = std::nullopt; + test_amin(arr, shape, exp, shape); + test_amin(arr, shape, exp, shape, {}, df, out); + + // Test with initial + auto initial = legate::Scalar(9); + std::vector exp1 = {9}; + test_amin(arr, shape, exp1, shape, {}, df, df, false, initial); +} + +void test_amin_invalid_array() +{ + // Test zero size array + std::vector arr1 = {}; + std::vector shape1 = {0}; + auto arr_emp = cupynumeric::mk_array(arr1, shape1); + EXPECT_THROW(cupynumeric::amin(arr_emp), std::invalid_argument); + + // Test complex array (not supported now) + std::vector> arr2 = {complex(0, 1), complex(1, 1)}; + std::vector shape2 = {2}; + auto arr_comp = cupynumeric::mk_array>(arr2, shape2); + EXPECT_THROW(cupynumeric::amin(arr_comp), std::runtime_error); +} + +void test_amin_invalid_axis() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + + // Test out-of-bound + std::vector axis1 = {-4, 3}; + std::vector axis2 = {0, 3}; + EXPECT_THROW(cupynumeric::amin(array, axis1), std::invalid_argument); + EXPECT_THROW(cupynumeric::amin(array, axis2), 
std::invalid_argument); + + // Test repeated axes + std::vector axis3 = {1, 1}; + std::vector axis4 = {-1, 2}; + EXPECT_THROW(cupynumeric::amin(array, axis3), std::invalid_argument); + EXPECT_THROW(cupynumeric::amin(array, axis4), std::invalid_argument); + + // Not reduce to one value (valid but not supported now) + std::vector axis5 = {0, 1}; + EXPECT_THROW(cupynumeric::amin(array, axis5), std::runtime_error); +} + +void test_amin_invalid_shape() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + auto df = std::nullopt; + + std::vector out_shape1 = {1}; + auto out1 = cupynumeric::zeros(out_shape1, legate::int32()); + EXPECT_THROW(cupynumeric::amin(array, {}, df, out1), std::invalid_argument); + + std::vector out_shape2 = {2}; + std::vector axis2 = {1}; + auto out2 = cupynumeric::zeros(out_shape2, legate::int32()); + EXPECT_THROW(cupynumeric::amin(array, axis2, df, out2), std::invalid_argument); +} + +void test_amin_invalid_dtype() +{ + std::vector arr = {1, 2, 3, 4, 5, 6}; + std::vector shape = {1, 3, 2}; + auto array = cupynumeric::mk_array(arr, shape); + + // Test invalid dtype + auto dtype = legate::point_type(2); + EXPECT_THROW(cupynumeric::amin(array, {}, dtype), std::invalid_argument); +} + +// void cpp_test() +TEST(Amin, BasicTest) { test_amin_basic(); } +TEST(Amin, InitialInput) { test_amin_initial_input(); } +TEST(Amin, DtypeInput) { test_amin_dtype_input(); } +TEST(Amin, AxisInput) { test_amin_axis_input(); } +TEST(Amin, OutInput) { test_amin_out_input(); } +TEST(Amin, MaxDim) { test_amin_max_dim(); } +TEST(Amin, LargeArray) { test_amin_large_array(); } +TEST(Amin, ScalarArray) { test_amin_scalar_array(); } +TEST(Amin, InvalidArray) { test_amin_invalid_array(); } +TEST(Amin, InvalidAxis) { test_amin_invalid_axis(); } +TEST(Amin, InvalidShape) { test_amin_invalid_shape(); } +TEST(Amin, InvalidDtype) { test_amin_invalid_dtype(); } diff --git a/tests/cpp/integration/test_arange.cc 
b/tests/cpp/integration/test_arange.cc index d6a5fb7300..3152d58497 100644 --- a/tests/cpp/integration/test_arange.cc +++ b/tests/cpp/integration/test_arange.cc @@ -18,7 +18,7 @@ #include #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" TEST(ArangeType, ImplicitInt64) @@ -34,7 +34,7 @@ TEST(ArangeType, ImplicitInt64) 1567891032462, 1567891032463, 1567891032464}; - auto arr = cunumeric::arange(start, stop); + auto arr = cupynumeric::arange(start, stop); check_array_eq(arr, exp.data(), exp.size()); } @@ -42,7 +42,7 @@ TEST(ArangeType, ImplicitInt32) { int32_t stop = 10; std::array exp = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto arr = cunumeric::arange(stop); + auto arr = cupynumeric::arange(stop); check_array_eq(arr, exp.data(), exp.size()); } @@ -51,7 +51,7 @@ TEST(ArangeType, ImplicitFloat64) double start = 1.5; double stop = 10.5; std::array exp = {1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5}; - auto arr = cunumeric::arange(start, (std::optional)stop); + auto arr = cupynumeric::arange(start, (std::optional)stop); check_array_eq(arr, exp.data(), exp.size()); } @@ -60,7 +60,7 @@ TEST(ArangeType, ImplicitFloat32) float start = 1.5; float stop = 10.5; std::array exp = {1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5}; - auto arr = cunumeric::arange(start, (std::optional)stop); + auto arr = cupynumeric::arange(start, (std::optional)stop); check_array_eq(arr, exp.data(), exp.size()); } @@ -69,7 +69,7 @@ TEST(ArangeType, ExplicitInt32) float start = 1.5; float stop = 10.5; std::array exp = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto arr = cunumeric::arange(start, stop); + auto arr = cupynumeric::arange(start, stop); check_array_eq(arr, exp.data(), exp.size()); } @@ -78,7 +78,7 @@ TEST(ArangeScalar, Float32) float start = 1.5; float stop = 10.5; std::array exp = {1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5}; - auto arr = cunumeric::arange(legate::Scalar(start), legate::Scalar(stop)); + auto arr = 
cupynumeric::arange(legate::Scalar(start), legate::Scalar(stop)); check_array_eq(arr, exp.data(), exp.size()); } @@ -86,6 +86,6 @@ TEST(ArangeErrors, ScalarTypeMismatch) { float start = 1.5; int32_t stop = 10; - EXPECT_THROW(cunumeric::arange(legate::Scalar(start), legate::Scalar(stop)), + EXPECT_THROW(cupynumeric::arange(legate::Scalar(start), legate::Scalar(stop)), std::invalid_argument); } diff --git a/tests/cpp/integration/test_argsort.cc b/tests/cpp/integration/test_argsort.cc index 62b3ececaa..17c394887c 100644 --- a/tests/cpp/integration/test_argsort.cc +++ b/tests/cpp/integration/test_argsort.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" auto get_argsort_expect_result() @@ -211,7 +211,7 @@ void test_argsort(std::array& in_array, std::optional axis, bool test_only_stable = false) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); @@ -224,7 +224,7 @@ void test_argsort(std::array& in_array, algos = {"mergesort", "stable"}; } for (auto algo = algos.begin(); algo < algos.end(); ++algo) { - auto B1 = cunumeric::argsort(A1, axis, *algo); + auto B1 = cupynumeric::argsort(A1, axis, *algo); if (in_array.size() != 0) { check_array_eq(B1, expect.data(), expect.size()); } @@ -244,7 +244,6 @@ void argsort_basic_axis_impl( auto test_shape = test_shapes[i]; int32_t dim = test_shape.size(); for (int32_t axis = -dim + 1; axis < dim; ++axis) { - std::cout << "Axis is: " << axis << std::endl; auto expect_val = expect_result[i][axis]; if (dim == 1) { test_argsort( @@ -457,14 +456,14 @@ void argsort_single_item_array() void argsort_negative_test() { - auto in_ar1 = cunumeric::zeros({2, 3}, legate::int32()); + auto in_ar1 = cupynumeric::zeros({2, 3}, legate::int32()); // Test invalid input sort axis - EXPECT_THROW(cunumeric::argsort(in_ar1, 2, "quicksort"), 
std::invalid_argument); - EXPECT_THROW(cunumeric::argsort(in_ar1, -3, "quicksort"), std::invalid_argument); + EXPECT_THROW(cupynumeric::argsort(in_ar1, 2, "quicksort"), std::invalid_argument); + EXPECT_THROW(cupynumeric::argsort(in_ar1, -3, "quicksort"), std::invalid_argument); // Test invalid input algorithm - EXPECT_THROW(cunumeric::argsort(in_ar1, 0, "negative"), std::invalid_argument); + EXPECT_THROW(cupynumeric::argsort(in_ar1, 0, "negative"), std::invalid_argument); } // void cpp_test() diff --git a/tests/cpp/integration/test_argwhere.cc b/tests/cpp/integration/test_argwhere.cc index 9627e6eb78..1ebc9c985a 100644 --- a/tests/cpp/integration/test_argwhere.cc +++ b/tests/cpp/integration/test_argwhere.cc @@ -20,10 +20,10 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "common_utils.h" -using namespace cunumeric; +using namespace cupynumeric; namespace { std::vector> get_in_shapes_basic() @@ -268,7 +268,7 @@ TEST(Argwhere, EmptyArray) {0, 0}, {0, 0}, {0, 0}, - {0, 0} // This is shape of cunumeric output array + {0, 0} // This is shape of cupynumeric output array }; assert(in_shapes.size() == exp_shapes.size()); @@ -289,7 +289,7 @@ TEST(Argwhere, Scalar) std::vector exp_shape2 = {1, 0}; auto A2 = zeros({}, legate::float64()); A2.fill(legate::Scalar(static_cast(1))); - auto B2 = cunumeric::argwhere(A2); + auto B2 = cupynumeric::argwhere(A2); EXPECT_EQ(B2.size(), 0); EXPECT_EQ(B2.type(), legate::int64()); EXPECT_EQ(B2.shape(), exp_shape2); diff --git a/tests/cpp/integration/test_bincount.cc b/tests/cpp/integration/test_bincount.cc index d4f4dc7f23..8cbd0953ae 100644 --- a/tests/cpp/integration/test_bincount.cc +++ b/tests/cpp/integration/test_bincount.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" void bincount_test() @@ -28,76 +28,76 @@ void bincount_test() // case: x, no w, min_length=0. 
out NDArray type is int64_t if no weights std::array exp1 = {0, 1, 1, 2, 0, 1, 1}; std::array in_x1 = {1, 2, 3, 3, 5, 6}; - auto A1 = cunumeric::zeros({6}, legate::int32()); + auto A1 = cupynumeric::zeros({6}, legate::int32()); assign_values_to_array(A1, in_x1.data(), in_x1.size()); - auto B1 = cunumeric::bincount(A1); + auto B1 = cupynumeric::bincount(A1); check_array_eq(B1, exp1.data(), exp1.size()); // case: x, w, min_length=0. std::array exp2 = {0, 1, 1.2, 2, 0, 1, 0.1}; std::array in_w2 = {1, 1.2, 1, 1, 1, 0.1}; - auto w2 = cunumeric::zeros({6}, legate::float64()); + auto w2 = cupynumeric::zeros({6}, legate::float64()); assign_values_to_array(w2, in_w2.data(), in_w2.size()); - auto B2 = cunumeric::bincount(A1, w2); + auto B2 = cupynumeric::bincount(A1, w2); check_array_eq(B2, exp2.data(), exp2.size()); // case: x, no w, min_length=8. out NDArray type is int64_t if no weights std::array exp3 = {0, 1, 1, 2, 0, 1, 1, 0}; - auto B3 = cunumeric::bincount(A1, std::nullopt, 8); + auto B3 = cupynumeric::bincount(A1, std::nullopt, 8); check_array_eq(B3, exp3.data(), exp3.size()); // case: x of length 1, no w, min_length=0 std::array exp4 = {0, 0, 0, 0, 0, 1}; - auto A4 = cunumeric::full({1}, cunumeric::Scalar(5)); + auto A4 = cupynumeric::full({1}, cupynumeric::Scalar(5)); // If we use another way to initialize A4 of length 1 as below, it would rasie error. Seems a lock // issue. In this way, if A4 is not of length 1, it pass. int64_t in_x4[1] = {5}; auto A4 = - // cunumeric::zeros({1}, legate::int64()); assign_values_to_array(A4, (void *)in_x4, + // cupynumeric::zeros({1}, legate::int64()); assign_values_to_array(A4, (void *)in_x4, // sizeof(in_x4)/sizeof(int64_t)); cpp_tests: legion/runtime/realm/runtime_impl.cc:2755: // Realm::RegionInstanceImpl* Realm::RuntimeImpl::get_instance_impl(Realm::ID): Assertion `0 && // "invalid instance handle"' failed. 
- auto B4 = cunumeric::bincount(A4); + auto B4 = cupynumeric::bincount(A4); check_array_eq(B4, exp4.data(), exp4.size()); // case: x of length 1, w of length 1, min_length=0 std::array exp5 = {0, 0, 0, 0, 0, 1.3}; - auto w5 = cunumeric::full({1}, cunumeric::Scalar(1.3)); - auto B5 = cunumeric::bincount(A4, w5); + auto w5 = cupynumeric::full({1}, cupynumeric::Scalar(1.3)); + auto B5 = cupynumeric::bincount(A4, w5); check_array_eq(B5, exp5.data(), exp5.size()); // case: x of length 1, w of length 1, min_length=8 std::array exp6 = {0, 0, 0, 0, 0, 1.3, 0, 0}; - auto B6 = cunumeric::bincount(A4, w5, 8); + auto B6 = cupynumeric::bincount(A4, w5, 8); check_array_eq(B6, exp6.data(), exp6.size()); } void bincount_negative_test() { // case: x.size() == 0 - auto A1 = cunumeric::full({0}, cunumeric::Scalar(5)); - EXPECT_THROW(cunumeric::bincount(A1), std::invalid_argument); + auto A1 = cupynumeric::full({0}, cupynumeric::Scalar(5)); + EXPECT_THROW(cupynumeric::bincount(A1), std::invalid_argument); // case: x.dim() != 1 - auto A2 = cunumeric::full({1, 1}, cunumeric::Scalar(5)); - EXPECT_THROW(cunumeric::bincount(A2), std::invalid_argument); + auto A2 = cupynumeric::full({1, 1}, cupynumeric::Scalar(5)); + EXPECT_THROW(cupynumeric::bincount(A2), std::invalid_argument); // case: x.type() is not int - auto A3 = cunumeric::full({3}, cunumeric::Scalar(1.3)); - EXPECT_THROW(cunumeric::bincount(A3), std::invalid_argument); + auto A3 = cupynumeric::full({3}, cupynumeric::Scalar(1.3)); + EXPECT_THROW(cupynumeric::bincount(A3), std::invalid_argument); // case: x.shape() != w.shape() - auto A4 = cunumeric::zeros({6}, legate::int32()); - auto w4 = cunumeric::zeros({4}, legate::int32()); - EXPECT_THROW(cunumeric::bincount(A4, w4), std::invalid_argument); + auto A4 = cupynumeric::zeros({6}, legate::int32()); + auto w4 = cupynumeric::zeros({4}, legate::int32()); + EXPECT_THROW(cupynumeric::bincount(A4, w4), std::invalid_argument); // case: w.type() is not convertible to float64 - auto w5 = 
cunumeric::zeros({6}, legate::complex64()); - EXPECT_THROW(cunumeric::bincount(A4, w5), std::invalid_argument); + auto w5 = cupynumeric::zeros({6}, legate::complex64()); + EXPECT_THROW(cupynumeric::bincount(A4, w5), std::invalid_argument); // case: x is negative std::array in_x = {1, 2, -3, 4, 5, 6}; - auto A7 = cunumeric::zeros({6}, legate::int32()); + auto A7 = cupynumeric::zeros({6}, legate::int32()); assign_values_to_array(A7, in_x.data(), in_x.size()); - EXPECT_THROW(cunumeric::bincount(A7), std::invalid_argument); + EXPECT_THROW(cupynumeric::bincount(A7), std::invalid_argument); } // void cpp_test() diff --git a/tests/cpp/integration/test_convolve.cc b/tests/cpp/integration/test_convolve.cc index fa93172b9d..fe9ca7e323 100644 --- a/tests/cpp/integration/test_convolve.cc +++ b/tests/cpp/integration/test_convolve.cc @@ -17,7 +17,7 @@ #include "common_utils.h" #include -using namespace cunumeric; +using namespace cupynumeric; namespace { @@ -95,7 +95,6 @@ TEST(Convolve, test_int) auto v = mk_array(v_in, shape_v); auto out = convolve(a, v); check_array(out, out_gt, shape_a); - debug_array(out, false); } } @@ -106,7 +105,6 @@ TEST(Convolve, test_double) auto v = mk_array(as_type_vector(v_in), shape_v); auto out = convolve(a, v); check_array(out, as_type_vector(out_gt), shape_a); - debug_array(out, false); } } @@ -125,7 +123,6 @@ TEST(Convolve, test_ndim) if (ndim <= 3) { auto out = convolve(a, v); check_array(out, a_in, shape); - debug_array(out, false); } else { EXPECT_ANY_THROW(convolve(a, v)); } diff --git a/tests/cpp/integration/test_diagonal.cc b/tests/cpp/integration/test_diagonal.cc index cd7e1986ac..300cf8e852 100644 --- a/tests/cpp/integration/test_diagonal.cc +++ b/tests/cpp/integration/test_diagonal.cc @@ -15,7 +15,7 @@ */ #include -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template @@ -27,9 +27,9 @@ void diagonal_test(std::array input, int32_t axis2 = 1, bool extract = true) { - auto a_input = cunumeric::zeros(in_shape); + 
auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::diagonal(a_input, offset, axis1, axis2, extract); + auto a_output = cupynumeric::diagonal(a_input, offset, axis1, axis2, extract); check_array_eq(a_output, exp.data(), exp.size()); } @@ -45,9 +45,9 @@ TEST(Diagonal, Singleton) 0., 0., 0., 0., 5., 0., 0., 0., 0., 0., 0., 6.}; std::vector in_shape = {6}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, false); + auto a_output = cupynumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, false); check_array_eq(a_output, exp.data(), exp.size()); } @@ -55,8 +55,8 @@ TEST(Diagonal, SingletonExtract) { std::vector in_shape = {6}; - auto a_input = cunumeric::zeros(in_shape); - EXPECT_THROW(cunumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, true), + auto a_input = cupynumeric::zeros(in_shape); + EXPECT_THROW(cupynumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, true), std::invalid_argument); } @@ -64,8 +64,8 @@ TEST(Diagonal, SingletonAxes) { std::vector in_shape = {6}; - auto a_input = cunumeric::zeros(in_shape); - EXPECT_THROW(cunumeric::diagonal(a_input, 0, 0, 1, false), std::invalid_argument); + auto a_input = cupynumeric::zeros(in_shape); + EXPECT_THROW(cupynumeric::diagonal(a_input, 0, 0, 1, false), std::invalid_argument); } TEST(Diagonal, Defaults) @@ -78,9 +78,9 @@ TEST(Diagonal, Defaults) std::array exp = {9, 2, 6}; std::vector in_shape = {3, 3}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::diagonal(a_input); + auto a_output = cupynumeric::diagonal(a_input); check_array_eq(a_output, exp.data(), exp.size()); } @@ -90,8 +90,8 @@ TEST(Diagonal, 
EmptyArray) const size_t exp_dim = 2; std::array exp = {}; - auto a_input = cunumeric::array({0}, legate::int32()); - auto a_output = cunumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, false); + auto a_input = cupynumeric::array({0}, legate::int32()); + auto a_output = cupynumeric::diagonal(a_input, 0, std::nullopt, std::nullopt, false); check_array_eq(a_output, exp.data(), exp.size()); } @@ -141,10 +141,10 @@ TEST(Diagonal, InvalidAxes) std::array input = {1.3, 2, 3.6, 4, 5, 6}; std::vector in_shape = {2, 3}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - EXPECT_THROW(cunumeric::diagonal(a_input, 0, 2, 6, true), std::invalid_argument); - EXPECT_THROW(cunumeric::diagonal(a_input, 0, 1, 1, true), std::invalid_argument); + EXPECT_THROW(cupynumeric::diagonal(a_input, 0, 2, 6, true), std::invalid_argument); + EXPECT_THROW(cupynumeric::diagonal(a_input, 0, 1, 1, true), std::invalid_argument); } TEST(Diagonal, InvalidOffset) @@ -154,9 +154,9 @@ TEST(Diagonal, InvalidOffset) std::array input = {1.3, 2, 3.6, 4, 5, 6}; std::vector in_shape = {2, 3}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - EXPECT_THROW(cunumeric::diagonal(a_input, 3), std::invalid_argument); + EXPECT_THROW(cupynumeric::diagonal(a_input, 3), std::invalid_argument); } TEST(Diagonal, IntArray) @@ -169,9 +169,9 @@ TEST(Diagonal, IntArray) std::array exp = {1, 5}; std::vector in_shape = {2, 3}; - auto a_input = cunumeric::zeros(in_shape, legate::int32()); + auto a_input = cupynumeric::zeros(in_shape, legate::int32()); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::diagonal(a_input); + auto a_output = cupynumeric::diagonal(a_input); check_array_eq(a_output, exp.data(), exp.size()); } @@ -201,15 +201,15 @@ template void trace_test(std::array 
input, std::array exp, std::vector in_shape, - int32_t offset = 0, - int32_t axis1 = 0, - int32_t axis2 = 1, - std::optional type = std::nullopt, - std::optional out = std::nullopt) + int32_t offset = 0, + int32_t axis1 = 0, + int32_t axis2 = 1, + std::optional type = std::nullopt, + std::optional out = std::nullopt) { - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::trace(a_input, offset, axis1, axis2, type, out); + auto a_output = cupynumeric::trace(a_input, offset, axis1, axis2, type, out); check_array_eq(a_output, exp.data(), exp.size()); } @@ -258,9 +258,9 @@ TEST(Trace, IntArray) std::array input = {9, 7, 5, 3, 2, 6, 4, 1}; std::array exp = {15}; std::vector in_shape = {2, 4, 1}; - auto a_input = cunumeric::zeros(in_shape, legate::int32()); + auto a_input = cupynumeric::zeros(in_shape, legate::int32()); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::trace(a_input, 0, 0, 1); + auto a_output = cupynumeric::trace(a_input, 0, 0, 1); check_array_eq(a_output, exp.data(), exp.size()); } @@ -274,9 +274,9 @@ TEST(Trace, TypeInt) std::array exp = {12}; std::vector in_shape = {2, 4, 1}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::trace(a_input, 0, 0, 1, legate::int32()); + auto a_output = cupynumeric::trace(a_input, 0, 0, 1, legate::int32()); check_array_eq(a_output, exp.data(), exp.size()); } @@ -291,10 +291,10 @@ TEST(Trace, OutType) std::vector in_shape = {2, 4, 1}; std::vector out_shape = {1}; - auto a_input = cunumeric::zeros(in_shape); - auto a_output = cunumeric::zeros(out_shape, legate::int32()); + auto a_input = cupynumeric::zeros(in_shape); + auto a_output = cupynumeric::zeros(out_shape, legate::int32()); assign_values_to_array(a_input, input.data(), 
input.size()); - cunumeric::trace(a_input, 0, 0, 1, std::nullopt, a_output); + cupynumeric::trace(a_input, 0, 0, 1, std::nullopt, a_output); check_array_eq(a_output, exp.data(), exp.size()); } @@ -305,7 +305,7 @@ TEST(Trace, InvalidArray) std::array input = {9, 7, 0.5, 1.3, 2, 3.6, 4, 5}; std::vector in_shape = {8}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - EXPECT_THROW(cunumeric::trace(a_input), std::invalid_argument); + EXPECT_THROW(cupynumeric::trace(a_input), std::invalid_argument); } diff --git a/tests/cpp/integration/test_dot.cc b/tests/cpp/integration/test_dot.cc index f38f646b7d..8d09defca8 100644 --- a/tests/cpp/integration/test_dot.cc +++ b/tests/cpp/integration/test_dot.cc @@ -14,60 +14,339 @@ * */ -#include -#include #include #include #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "common_utils.h" -using namespace cunumeric; +std::vector calc_c_shape_scalar(const std::vector& a_shape, + const std::vector& b_shape) +{ + assert(a_shape.size() == 0 || b_shape.size() == 0); + + std::vector c_shape; + if (a_shape.size() == 0 && b_shape.size() == 0) { + c_shape = {}; + } else { + c_shape = a_shape.size() == 0 ? b_shape : a_shape; + } + return c_shape; +} + +template +std::vector calc_result_scalar(const std::vector& vec_a, + const std::vector& a_shape, + const std::vector& vec_b, + const std::vector& b_shape, + const std::vector& c_shape) +{ + if (a_shape.size() == 0) { + assert(vec_a.size() == 1); + } + if (b_shape.size() == 0) { + assert(vec_b.size() == 1); + } + + std::vector vec_c; + if (a_shape.size() == 0 && b_shape.size() == 0) { + vec_c.push_back(vec_a[0] * vec_b[0]); + return vec_c; + } + + auto vec_scalar = a_shape.size() == 0 ? vec_a : vec_b; + auto vec_non_scalar = a_shape.size() == 0 ? 
vec_b : vec_a; + for (int i = 0; i < vec_non_scalar.size(); i++) { + vec_c.push_back(vec_non_scalar[i] * vec_scalar[0]); + } + return vec_c; +} + +std::vector calc_c_shape_b_is_vector(const std::vector& a_shape, + const std::vector& b_shape) +{ + assert(a_shape[a_shape.size() - 1] == b_shape[b_shape.size() - 1]); + std::vector c_shape; + int a_size_in_c = 1; + for (int i = 0; i < a_shape.size() - 1; i++) { + c_shape.push_back(a_shape[i]); + a_size_in_c *= a_shape[i]; + } + return c_shape; +} + +template +std::vector calc_result_b_is_vector(const std::vector& vec_a, + const std::vector& a_shape, + const std::vector& vec_b, + const std::vector& b_shape, + const std::vector& c_shape) +{ + int a_size_in_c = 1; + for (int i = 0; i < a_shape.size() - 1; i++) { + a_size_in_c *= a_shape[i]; + } + + std::vector vec_c; + auto x = a_shape[a_shape.size() - 1]; + int offset_a = 0; + for (int i_a = 0; i_a < a_size_in_c; i_a++) { + T sum = 0; + for (int j = 0; j < x; j++) { + sum += vec_a[offset_a + j] * vec_b[j]; + } + vec_c.push_back(sum); + offset_a += x; + } + return vec_c; +} + +std::vector calc_c_shape_contract(const std::vector& a_shape, + const std::vector& b_shape) +{ + std::vector c_shape = {}; + for (int i = 0; i < a_shape.size() - 1; i++) { + c_shape.push_back(a_shape[i]); + } + for (int i = 0; i < b_shape.size() - 2; i++) { + c_shape.push_back(b_shape[i]); + } + c_shape.push_back(b_shape[b_shape.size() - 1]); + return c_shape; +} + +template +std::vector calc_result_contract(const std::vector& vec_a, + const std::vector& a_shape, + const std::vector& vec_b, + const std::vector& b_shape, + const std::vector& c_shape) +{ + int a_size_in_c = 1, b_size_in_c = 1; + for (int i = 0; i < a_shape.size() - 1; i++) { + a_size_in_c *= a_shape[i]; + } + for (int i = 0; i < b_shape.size() - 2; i++) { + b_size_in_c *= b_shape[i]; + } + b_size_in_c *= b_shape[b_shape.size() - 1]; + + std::vector vec_c; + assert(a_shape[a_shape.size() - 1] == b_shape[b_shape.size() - 2]); -namespace 
{ + auto x = a_shape[a_shape.size() - 1]; + auto m = b_shape[b_shape.size() - 1]; + int offset_a = 0; + for (int i_a = 0; i_a < a_size_in_c; i_a++) { + int offset_b = 0, b_i = 0; + for (int i_b = 0; i_b < b_size_in_c; i_b++) { + T sum = 0; + for (int j = 0; j < x; j++) { + sum += vec_a[offset_a + j] * vec_b[offset_b + j * m]; + } + vec_c.push_back(sum); + if (++b_i >= m) { + offset_b = offset_b + m * x - m + 1; + b_i = 0; + } else { + offset_b += 1; + } + } + offset_a += x; + } + return vec_c; +} template -auto test_standard(uint64_t m, uint64_t n, uint64_t k, legate::Type leg_type) +void verify_dot_output(cupynumeric::NDArray A, cupynumeric::NDArray B, cupynumeric::NDArray C) { - std::vector data_a(m * k); - std::vector data_b(n * k); - std::iota(data_a.begin(), data_a.end(), 0); - std::iota(data_b.begin(), data_b.end(), 0.0); + auto vec_a = cupynumeric::to_vector(A); + auto vec_b = cupynumeric::to_vector(B); + std::vector vec_c; + auto a_shape = A.shape(); + auto b_shape = B.shape(); + std::vector vec_c_shape = {}; - auto A = cunumeric::zeros({m, k}, leg_type); - auto B = cunumeric::zeros({k, n}, leg_type); + if (A.dim() == 0 || B.dim() == 0) { + vec_c_shape = calc_c_shape_scalar(a_shape, b_shape); + vec_c = calc_result_scalar(vec_a, a_shape, vec_b, b_shape, vec_c_shape); + } else if (B.dim() == 1 && A.dim() >= 1) { + vec_c_shape = calc_c_shape_b_is_vector(a_shape, b_shape); + vec_c = calc_result_b_is_vector(vec_a, a_shape, vec_b, b_shape, vec_c_shape); + } else { + vec_c_shape = calc_c_shape_contract(a_shape, b_shape); + vec_c = calc_result_contract(vec_a, a_shape, vec_b, b_shape, vec_c_shape); + } - assign_values_to_array(A, data_a.data(), m * k); - assign_values_to_array(B, data_b.data(), n * k); + auto leg_type = legate::primitive_type(legate::type_code_of_v); + if (leg_type == legate::float32() || leg_type == legate::float64()) { + double abs_error = 1.e-4; + cupynumeric::check_array_near(C, vec_c, vec_c_shape, abs_error); + } +} + +template +void 
test_contract_full(std::vector a_shape, std::vector b_shape) +{ + auto leg_type = legate::primitive_type(legate::type_code_of_v); + if (leg_type == legate::float64()) { + auto A = a_shape.size() == 0 ? cupynumeric::mk_array({10}) : cupynumeric::random(a_shape); + auto B = b_shape.size() == 0 ? cupynumeric::mk_array({10}) : cupynumeric::random(b_shape); + auto C = cupynumeric::dot(A, B); + verify_dot_output(A, B, C); + } else { + auto A = a_shape.size() == 0 ? cupynumeric::mk_array({10}) + : cupynumeric::random(a_shape).as_type(leg_type); + auto B = b_shape.size() == 0 ? cupynumeric::mk_array({10}) + : cupynumeric::random(b_shape).as_type(leg_type); + auto C = cupynumeric::dot(A, B); + if (leg_type == legate::float32()) { + verify_dot_output(A, B, C); + } + } +} + +template +void test_contract_standard(std::vector a_shape, std::vector b_shape) +{ + auto A = + a_shape.size() == 0 + ? cupynumeric::mk_array({10}) + : cupynumeric::random(a_shape).as_type(legate::primitive_type(legate::type_code_of_v)); + auto B = + b_shape.size() == 0 + ? 
cupynumeric::mk_array({10}) + : cupynumeric::random(b_shape).as_type(legate::primitive_type(legate::type_code_of_v)); + auto C = cupynumeric::dot(A, B); - auto C = dot(A, B); - std::vector exp_shape = {m, n}; + auto leg_type = legate::primitive_type(legate::type_code_of_v); + std::vector vec_c_shape = {}; + if (A.dim() == 0 || B.dim() == 0) { + vec_c_shape = calc_c_shape_scalar(a_shape, b_shape); + } else if (B.dim() == 1 && A.dim() >= 1) { + vec_c_shape = calc_c_shape_b_is_vector(a_shape, b_shape); + } else { + vec_c_shape = calc_c_shape_contract(a_shape, b_shape); + } EXPECT_EQ(C.type(), leg_type); - EXPECT_EQ(C.shape(), exp_shape); + EXPECT_EQ(C.shape(), vec_c_shape); } -TEST(Dot, Standard) +template +void test_contract_full_all(void) +{ + test_contract_full({}, {}); // 0x0 + test_contract_full({}, + { + 3, + }); // 0x1 + test_contract_full({3}, {}); // 1x0 + test_contract_full( + { + 3, + }, + { + 3, + }); // 1x1 + test_contract_full( + { + 3, + }, + {3, 4}); // 1x2 + test_contract_full({2, 3}, + { + 3, + }); // 2x1 + test_contract_full( + { + 3, + }, + {2, 3, 4}); // 1x3 + test_contract_full({2, 3, 4}, + { + 4, + }); // 3x1 + test_contract_full({2, 3}, {3, 4}); // 2x2 + test_contract_full({2, 3}, {5, 3, 4}); // 2x3 + test_contract_full({2, 3, 4}, {4, 2}); // 3x2 +#if LEGATE_MAX_DIM >= 5 + test_contract_full({2, 3, 4}, {5, 4, 7}); // 3x3 + test_contract_full({2, 3}, {5, 2, 3, 4}); // 2x4 +#endif +} + +template +void test_contract_standard_all(void) { - test_standard(124, 95, 30, legate::float32()); - test_standard(124, 95, 30, legate::float64()); + test_contract_standard({}, {}); // 0x0 + test_contract_standard({}, + { + 3, + }); // 0x1 + test_contract_standard({3}, {}); // 1x0 + test_contract_standard( + { + 3, + }, + { + 3, + }); // 1x1 + test_contract_standard( + { + 3, + }, + {3, 4}); // 1x2 + test_contract_standard({2, 3}, + { + 3, + }); // 2x1 + test_contract_standard( + { + 3, + }, + {2, 3, 4}); // 1x3 + test_contract_standard({2, 3, 4}, + { + 4, + }); 
// 3x1 + test_contract_standard({2, 3}, {3, 4}); // 2x2 + test_contract_standard({2, 3}, {5, 3, 4}); // 2x3 + test_contract_standard({2, 3, 4}, {4, 2}); // 3x2 +#if LEGATE_MAX_DIM >= 5 + test_contract_standard({2, 3, 4}, {5, 4, 7}); // 3x3 + test_contract_standard({2, 3}, {5, 2, 3, 4}); // 2x4 +#endif } -TEST(Dot, Complex) +TEST(Dot, MMStandard) { - test_standard>(124, 95, 30, legate::complex64()); - test_standard>(124, 95, 30, legate::complex128()); + test_contract_full({124, 30}, {30, 95}); + test_contract_full({124, 30}, {30, 95}); } -TEST(Dot, Large) +TEST(Dot, MMComplex) { - // activate tiling (m,n) and/or batching (k) - test_standard(513, 12, 4, legate::float32()); - test_standard(12, 518, 30, legate::float32()); - test_standard(513, 513, 30, legate::float32()); - test_standard(512, 512, 4097, legate::float64()); - test_standard(1024, 1024, 4097, legate::float64()); + test_contract_standard>({124, 30}, {30, 95}); + test_contract_standard>({124, 30}, {30, 95}); } -} // namespace +TEST(Dot, MMLarge) +{ + test_contract_full({513, 4}, {4, 12}); + test_contract_full({12, 30}, {30, 518}); + test_contract_full({513, 30}, {30, 513}); + test_contract_full({512, 4097}, {4097, 512}); + // test_contract_full({1024, 4097}, {4097, 1024}); # There is not enough space because + // Legate is reserving 67125248 of the available 268435456 bytes (minus the eager pool allocation) + // for the following LogicalStores +} + +TEST(Dot, AllFloat) { test_contract_full_all(); } + +TEST(Dot, AllDouble) { test_contract_full_all(); } + +TEST(Dot, AllComplex64) { test_contract_standard_all>(); } + +TEST(Dot, AllComplex128) { test_contract_standard_all>(); } diff --git a/tests/cpp/integration/test_eye.cc b/tests/cpp/integration/test_eye.cc index a60c6693ff..fef0925c5e 100644 --- a/tests/cpp/integration/test_eye.cc +++ b/tests/cpp/integration/test_eye.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template @@ -77,13 
+77,13 @@ auto test_eye_3_2(std::vector& k_vals, std::optional type std::vector expect_shape = {3, 2}; for (auto k : k_vals) { if (type.has_value()) { - auto result = cunumeric::eye(3, 2, k, type.value()); + auto result = cupynumeric::eye(3, 2, k, type.value()); EXPECT_EQ(result.type(), type.value()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; check_array_eq(result, expect.data(), expect.size()); } else { - auto result = cunumeric::eye(3, 2, k); + auto result = cupynumeric::eye(3, 2, k); EXPECT_EQ(result.type(), legate::float64()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; @@ -99,13 +99,13 @@ auto test_eye_3_3(std::vector& k_vals, std::optional type std::vector expect_shape = {3, 3}; for (auto k : k_vals) { if (type.has_value()) { - auto result = cunumeric::eye(3, 3, k, type.value()); + auto result = cupynumeric::eye(3, 3, k, type.value()); EXPECT_EQ(result.type(), type.value()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; check_array_eq(result, expect.data(), expect.size()); } else { - auto result = cunumeric::eye(3, 3, k); + auto result = cupynumeric::eye(3, 3, k); EXPECT_EQ(result.type(), legate::float64()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; @@ -121,13 +121,13 @@ auto test_eye_3_4(std::vector& k_vals, std::optional type std::vector expect_shape = {3, 4}; for (auto k : k_vals) { if (type.has_value()) { - auto result = cunumeric::eye(3, 4, k, type.value()); + auto result = cupynumeric::eye(3, 4, k, type.value()); EXPECT_EQ(result.type(), type.value()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; check_array_eq(result, expect.data(), expect.size()); } else { - auto result = cunumeric::eye(3, 4, k); + auto result = cupynumeric::eye(3, 4, k); EXPECT_EQ(result.type(), legate::float64()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; @@ -145,13 +145,13 @@ auto 
test_eye_square_3(std::optional> k_vals = std::nullopt if (k_vals.has_value()) { for (auto k : k_vals.value()) { if (type.has_value()) { - auto result = cunumeric::eye(3, std::nullopt, k, type.value()); + auto result = cupynumeric::eye(3, std::nullopt, k, type.value()); EXPECT_EQ(result.type(), type.value()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[k]; check_array_eq(result, expect.data(), expect.size()); } else { - auto result = cunumeric::eye(3, std::nullopt, k); + auto result = cupynumeric::eye(3, std::nullopt, k); EXPECT_EQ(result.type(), legate::float64()); auto expect = expect_result[k]; check_array_eq(result, expect.data(), expect.size()); @@ -159,13 +159,13 @@ auto test_eye_square_3(std::optional> k_vals = std::nullopt } } else { if (type.has_value()) { - auto result = cunumeric::eye(3, std::nullopt, 0, type.value()); + auto result = cupynumeric::eye(3, std::nullopt, 0, type.value()); EXPECT_EQ(result.type(), type.value()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[0]; check_array_eq(result, expect.data(), expect.size()); } else { - auto result = cunumeric::eye(3); + auto result = cupynumeric::eye(3); EXPECT_EQ(result.type(), legate::float64()); EXPECT_EQ(result.shape(), expect_shape); auto expect = expect_result[0]; @@ -214,14 +214,14 @@ void eye_square() void eye_input_zero() { // Test n=0 - auto result1 = cunumeric::eye(0); + auto result1 = cupynumeric::eye(0); std::vector expect_shape1 = {0, 0}; EXPECT_EQ(result1.type(), legate::float64()); EXPECT_EQ(result1.size(), 0); EXPECT_EQ(result1.shape(), expect_shape1); // Test m=0 - auto result2 = cunumeric::eye(3, 0); + auto result2 = cupynumeric::eye(3, 0); std::vector expect_shape2 = {3, 0}; EXPECT_EQ(result2.type(), legate::float64()); EXPECT_EQ(result2.size(), 0); @@ -233,7 +233,7 @@ void eye_large_array() const size_t n_or_m = 1000; // Test 1000 * 1000 array - auto result1 = cunumeric::eye(n_or_m); + auto result1 = cupynumeric::eye(n_or_m); 
std::vector expect_shape1 = {n_or_m, n_or_m}; std::array expect_result1; expect_result1.fill(0); @@ -246,7 +246,7 @@ void eye_large_array() // Test 3 * 1000 array const size_t n = 3; - auto result2 = cunumeric::eye(n, n_or_m, 0, legate::int32()); + auto result2 = cupynumeric::eye(n, n_or_m, 0, legate::int32()); std::vector expect_shape2 = {n, n_or_m}; std::array expect_result2; expect_result2.fill(0); @@ -259,7 +259,7 @@ void eye_large_array() // Test 1000 * 3 array const size_t m = 3; - auto result3 = cunumeric::eye(n_or_m, m, 0, legate::complex64()); + auto result3 = cupynumeric::eye(n_or_m, m, 0, legate::complex64()); std::vector expect_shape3 = {n_or_m, m}; std::array, n_or_m * m> expect_result3; expect_result3.fill(0); @@ -276,16 +276,16 @@ void eye_large_array() void eye_negative() { // Test bad n - EXPECT_THROW(cunumeric::eye(-1), std::invalid_argument); - EXPECT_THROW(cunumeric::eye(-1, 3), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(-1), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(-1, 3), std::invalid_argument); // Test bad m - EXPECT_THROW(cunumeric::eye(3, -1), std::invalid_argument); - EXPECT_THROW(cunumeric::eye(-1, -1), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(3, -1), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(-1, -1), std::invalid_argument); // Test bad dtype - EXPECT_THROW(cunumeric::eye(3, std::nullopt, 0, legate::binary_type(2)), std::invalid_argument); - EXPECT_THROW(cunumeric::eye(3, std::nullopt, 0, legate::point_type(2)), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(3, std::nullopt, 0, legate::binary_type(2)), std::invalid_argument); + EXPECT_THROW(cupynumeric::eye(3, std::nullopt, 0, legate::point_type(2)), std::invalid_argument); } // void cpp_test() diff --git a/tests/cpp/integration/test_fill.cc b/tests/cpp/integration/test_fill.cc index 27800d488b..76daaab284 100644 --- a/tests/cpp/integration/test_fill.cc +++ b/tests/cpp/integration/test_fill.cc @@ -18,7 +18,7 @@ #include 
#include -using namespace cunumeric; +using namespace cupynumeric; namespace { diff --git a/tests/cpp/integration/test_flip.cc b/tests/cpp/integration/test_flip.cc index 80226665e1..f20afe7a17 100644 --- a/tests/cpp/integration/test_flip.cc +++ b/tests/cpp/integration/test_flip.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" auto get_flip_expect_result_int() @@ -330,7 +330,7 @@ void test_flip(std::array& in_array, std::vector shape, std::optional> axis = std::nullopt) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); @@ -339,7 +339,7 @@ void test_flip(std::array& in_array, } } - auto B1 = cunumeric::flip(A1, axis); + auto B1 = cupynumeric::flip(A1, axis); check_array_eq(B1, expect.data(), expect.size()); } @@ -636,23 +636,23 @@ void flip_single_item_array() void flip_negative_test() { - auto in_array = cunumeric::zeros({2, 3}, legate::int32()); + auto in_array = cupynumeric::zeros({2, 3}, legate::int32()); // Test axis out-of-bound auto axes1 = {12}; - EXPECT_THROW(cunumeric::flip(in_array, axes1), std::invalid_argument); + EXPECT_THROW(cupynumeric::flip(in_array, axes1), std::invalid_argument); // Test axis out-of-bound negative auto axes2 = {-12}; - EXPECT_THROW(cunumeric::flip(in_array, axes2), std::invalid_argument); + EXPECT_THROW(cupynumeric::flip(in_array, axes2), std::invalid_argument); // Test axis repeated axis auto axes3 = {1, 1}; - EXPECT_THROW(cunumeric::flip(in_array, axes3), std::invalid_argument); + EXPECT_THROW(cupynumeric::flip(in_array, axes3), std::invalid_argument); // Test axis out-of-bound multiple auto axes4 = {1, 2}; - EXPECT_THROW(cunumeric::flip(in_array, axes4), std::invalid_argument); + EXPECT_THROW(cupynumeric::flip(in_array, axes4), std::invalid_argument); } // void cpp_test() diff --git 
a/tests/cpp/integration/test_logical.cc b/tests/cpp/integration/test_logical.cc index cbd7ce7e91..f1c1908abc 100644 --- a/tests/cpp/integration/test_logical.cc +++ b/tests/cpp/integration/test_logical.cc @@ -19,7 +19,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template & in_array, std::array& expect_result, legate::Type leg_type, std::vector shape, - std::optional> axis = std::nullopt, - std::optional out = std::nullopt, - std::optional keepdims = std::nullopt, - std::optional where = std::nullopt) + std::vector axis = {}, + std::optional out = std::nullopt, + bool keepdims = false, + std::optional where = std::nullopt) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); @@ -47,10 +47,10 @@ void test_all(std::array& in_array, } if (!out.has_value()) { - auto B1 = cunumeric::all(A1, axis, std::nullopt, keepdims, where); + auto B1 = cupynumeric::all(A1, axis, std::nullopt, keepdims, where); check_array_eq(B1, expect_result.data(), expect_result.size()); } else { - cunumeric::all(A1, axis, out, keepdims, where); + cupynumeric::all(A1, axis, out, keepdims, where); check_array_eq(out.value(), expect_result.data(), expect_result.size()); } } @@ -222,29 +222,29 @@ void test_all_where_input() // Test where with multiple bool values std::array where_in1 = {true, false}; - auto where_array1 = cunumeric::zeros({2}, legate::bool_()); + auto where_array1 = cupynumeric::zeros({2}, legate::bool_()); assign_values_to_array(where_array1, where_in1.data(), where_in1.size()); std::array expect_val1 = {true}; test_all( - in_array, expect_val1, legate::bool_(), shape, std::nullopt, std::nullopt, false, where_array1); + in_array, expect_val1, legate::bool_(), shape, {}, std::nullopt, false, where_array1); // Test where with single bool value std::array where_in2 = {true}; - auto 
where_array2 = cunumeric::zeros({1}, legate::bool_()); + auto where_array2 = cupynumeric::zeros({1}, legate::bool_()); assign_values_to_array(where_array2, where_in2.data(), where_in2.size()); std::array expect_val2 = {false}; test_all( - in_array, expect_val2, legate::bool_(), shape, std::nullopt, std::nullopt, false, where_array2); + in_array, expect_val2, legate::bool_(), shape, {}, std::nullopt, false, where_array2); std::array where_in3 = {false}; - auto where_array3 = cunumeric::zeros({1}, legate::bool_()); + auto where_array3 = cupynumeric::zeros({1}, legate::bool_()); assign_values_to_array(where_array3, where_in3.data(), where_in3.size()); std::array expect_val3 = {true}; test_all( - in_array, expect_val3, legate::bool_(), shape, std::nullopt, std::nullopt, false, where_array3); + in_array, expect_val3, legate::bool_(), shape, {}, std::nullopt, false, where_array3); } void test_all_out_input() @@ -254,21 +254,21 @@ void test_all_out_input() std::vector out_shape = {2, 2}; std::vector axis = {0}; - auto out1 = cunumeric::zeros(out_shape, legate::int32()); + auto out1 = cupynumeric::zeros(out_shape, legate::int32()); std::array expect_val1 = {0, 1, 1, 1}; test_all(in_array, expect_val1, legate::int32(), shape, axis, out1); - auto out2 = cunumeric::zeros(out_shape, legate::float64()); + auto out2 = cupynumeric::zeros(out_shape, legate::float64()); std::array expect_val2 = {0.0, 1.0, 1.0, 1.0}; test_all(in_array, expect_val2, legate::int32(), shape, axis, out2); - auto out3 = cunumeric::zeros(out_shape, legate::complex64()); + auto out3 = cupynumeric::zeros(out_shape, legate::complex64()); std::array, 4> expect_val3 = { complex(0, 0), complex(1, 0), complex(1, 0), complex(1, 0)}; test_all, 8, 4, 3, 2>( in_array, expect_val3, legate::int32(), shape, axis, out3); - auto out4 = cunumeric::zeros(out_shape, legate::bool_()); + auto out4 = cupynumeric::zeros(out_shape, legate::bool_()); std::array expect_val4 = {false, true, true, true}; test_all(in_array, 
expect_val4, legate::int32(), shape, axis, out4); } @@ -375,66 +375,65 @@ void test_all_invalid_axis() { std::array in_array = {5, 10, 0, 100}; std::vector shape = {1, 2, 2}; - auto array = cunumeric::zeros(shape, legate::int32()); + auto array = cupynumeric::zeros(shape, legate::int32()); assign_values_to_array(array, in_array.data(), in_array.size()); // Test out-of-bound std::vector axis1 = {-4, 3}; - EXPECT_THROW(cunumeric::all(array, axis1), std::invalid_argument); + EXPECT_THROW(cupynumeric::all(array, axis1), std::invalid_argument); std::vector axis2 = {0, 3}; - EXPECT_THROW(cunumeric::all(array, axis2), std::invalid_argument); + EXPECT_THROW(cupynumeric::all(array, axis2), std::invalid_argument); // Test repeated axes std::vector axis3 = {1, 1}; - EXPECT_THROW(cunumeric::all(array, axis3), std::invalid_argument); + EXPECT_THROW(cupynumeric::all(array, axis3), std::invalid_argument); std::vector axis4 = {-1, 2}; - EXPECT_THROW(cunumeric::all(array, axis4), std::invalid_argument); + EXPECT_THROW(cupynumeric::all(array, axis4), std::invalid_argument); } void test_all_invalid_shape() { std::array in_array = {5, 10, 0, 100}; std::vector shape = {1, 2, 2}; - auto array = cunumeric::zeros(shape, legate::int32()); + auto array = cupynumeric::zeros(shape, legate::int32()); assign_values_to_array(array, in_array.data(), in_array.size()); std::vector out_shape1 = {1}; - auto out1 = cunumeric::zeros(out_shape1, legate::int32()); - EXPECT_THROW(cunumeric::all(array, std::nullopt, out1), std::invalid_argument); + auto out1 = cupynumeric::zeros(out_shape1, legate::int32()); + EXPECT_THROW(cupynumeric::all(array, {}, out1), std::invalid_argument); std::vector out_shape2 = {2}; std::vector axis2 = {1}; - auto out2 = cunumeric::zeros(out_shape2, legate::int32()); - EXPECT_THROW(cunumeric::all(array, axis2, out2), std::invalid_argument); + auto out2 = cupynumeric::zeros(out_shape2, legate::int32()); + EXPECT_THROW(cupynumeric::all(array, axis2, out2), std::invalid_argument); 
std::vector out_shape3 = {2, 2}; std::vector axis3 = {1}; - auto out3 = cunumeric::zeros(out_shape3, legate::int32()); - EXPECT_THROW(cunumeric::all(array, axis3, out3), std::invalid_argument); + auto out3 = cupynumeric::zeros(out_shape3, legate::int32()); + EXPECT_THROW(cupynumeric::all(array, axis3, out3), std::invalid_argument); } void test_all_invalid_where() { std::array in_array = {5, 10, 0, 100}; std::vector shape = {1, 2, 2}; - auto array = cunumeric::zeros(shape, legate::int32()); + auto array = cupynumeric::zeros(shape, legate::int32()); assign_values_to_array(array, in_array.data(), in_array.size()); // Test where with invalid type std::array in_where1 = {0, 1, 0, 1}; - auto where1 = cunumeric::zeros(shape, legate::int32()); + auto where1 = cupynumeric::zeros(shape, legate::int32()); assign_values_to_array(where1, in_where1.data(), in_where1.size()); - EXPECT_THROW(cunumeric::all(array, std::nullopt, std::nullopt, false, where1), - std::invalid_argument); + EXPECT_THROW(cupynumeric::all(array, {}, std::nullopt, false, where1), std::invalid_argument); // Test where with invalid shape std::vector where_shape = {2, 2, 1}; std::array in_where2 = {false, true, false, true}; - auto where2 = cunumeric::zeros(where_shape, legate::bool_()); + auto where2 = cupynumeric::zeros(where_shape, legate::bool_()); assign_values_to_array(where2, in_where2.data(), in_where2.size()); - EXPECT_THROW(cunumeric::all(array, std::nullopt, std::nullopt, false, where2), std::exception); + EXPECT_THROW(cupynumeric::all(array, {}, std::nullopt, false, where2), std::exception); } // void cpp_test() diff --git a/tests/cpp/integration/test_moveaxis.cc b/tests/cpp/integration/test_moveaxis.cc index ad0db4dfa9..4fde5db796 100644 --- a/tests/cpp/integration/test_moveaxis.cc +++ b/tests/cpp/integration/test_moveaxis.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template @@ -31,9 +31,9 @@ static void 
moveaxis_int32_test(std::vector input, std::vector source, std::vector destination) { - auto a_input = cunumeric::zeros(in_shape, legate::int32()); + auto a_input = cupynumeric::zeros(in_shape, legate::int32()); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::moveaxis(a_input, source, destination); + auto a_output = cupynumeric::moveaxis(a_input, source, destination); check_array_eq(a_output, exp.data(), exp.size()); EXPECT_EQ(a_output.shape(), out_shape); } @@ -43,8 +43,8 @@ static void moveaxis_int32_test_2(std::vector in_shape, std::vector source, std::vector destination) { - auto a_input = cunumeric::zeros(in_shape, legate::int32()); - auto a_output = cunumeric::moveaxis(a_input, source, destination); + auto a_input = cupynumeric::zeros(in_shape, legate::int32()); + auto a_output = cupynumeric::moveaxis(a_input, source, destination); EXPECT_EQ(a_output.shape(), out_shape); } @@ -66,25 +66,25 @@ TEST(MoveAxis, SpecialArrays) // test single element array { std::vector input{99}; - auto a = cunumeric::zeros({1}, legate::int32()); + auto a = cupynumeric::zeros({1}, legate::int32()); a.fill(legate::Scalar(input[0])); - auto a_out = cunumeric::moveaxis(a, {0}, {-1}); + auto a_out = cupynumeric::moveaxis(a, {0}, {-1}); check_array_eq(a_out, input.data(), input.size()); EXPECT_EQ(a_out.shape(), a.shape()); } { std::vector input{-100}; - auto a = cunumeric::zeros({1, 1}, legate::int32()); + auto a = cupynumeric::zeros({1, 1}, legate::int32()); a.fill(legate::Scalar(input[0])); - auto a_out = cunumeric::moveaxis(a, {0, 1}, {-1, -2}); + auto a_out = cupynumeric::moveaxis(a, {0, 1}, {-1, -2}); check_array_eq(a_out, input.data(), input.size()); EXPECT_EQ(a_out.shape(), a.shape()); } // test empty array { - auto a = cunumeric::zeros({0}, legate::int32()); - auto a_out = cunumeric::moveaxis(a, {0}, {-1}); + auto a = cupynumeric::zeros({0}, legate::int32()); + auto a_out = cupynumeric::moveaxis(a, {0}, {-1}); EXPECT_EQ(a_out.shape(), 
a.shape()); } } @@ -129,22 +129,22 @@ TEST(MoveAxis, With_empty_array) TEST(MoveAxisErrors, Repeated_axis) { - auto x = cunumeric::zeros({3, 4, 5}, legate::int32()); - EXPECT_THROW(cunumeric::moveaxis(x, {0, 0}, {1, 0}), std::invalid_argument); - EXPECT_THROW(cunumeric::moveaxis(x, {0, 1}, {0, -3}), std::invalid_argument); + auto x = cupynumeric::zeros({3, 4, 5}, legate::int32()); + EXPECT_THROW(cupynumeric::moveaxis(x, {0, 0}, {1, 0}), std::invalid_argument); + EXPECT_THROW(cupynumeric::moveaxis(x, {0, 1}, {0, -3}), std::invalid_argument); } TEST(MoveAxisErrors, Axis_out_of_bound) { - auto x = cunumeric::zeros({3, 4, 5}, legate::int32()); - EXPECT_THROW(cunumeric::moveaxis(x, {0, 3}, {0, 1}), std::invalid_argument); - EXPECT_THROW(cunumeric::moveaxis(x, {0, 1}, {0, -4}), std::invalid_argument); - EXPECT_THROW(cunumeric::moveaxis(x, {4}, {0}), std::invalid_argument); - EXPECT_THROW(cunumeric::moveaxis(x, {0}, {-4}), std::invalid_argument); + auto x = cupynumeric::zeros({3, 4, 5}, legate::int32()); + EXPECT_THROW(cupynumeric::moveaxis(x, {0, 3}, {0, 1}), std::invalid_argument); + EXPECT_THROW(cupynumeric::moveaxis(x, {0, 1}, {0, -4}), std::invalid_argument); + EXPECT_THROW(cupynumeric::moveaxis(x, {4}, {0}), std::invalid_argument); + EXPECT_THROW(cupynumeric::moveaxis(x, {0}, {-4}), std::invalid_argument); } TEST(MoveAxisErrors, Axis_with_different_length) { - auto x = cunumeric::zeros({3, 4, 5}, legate::int32()); - EXPECT_THROW(cunumeric::moveaxis(x, {0}, {1, 0}), std::invalid_argument); + auto x = cupynumeric::zeros({3, 4, 5}, legate::int32()); + EXPECT_THROW(cupynumeric::moveaxis(x, {0}, {1, 0}), std::invalid_argument); } diff --git a/tests/cpp/integration/test_msort.cc b/tests/cpp/integration/test_msort.cc index 6516532450..0fa004bb8a 100644 --- a/tests/cpp/integration/test_msort.cc +++ b/tests/cpp/integration/test_msort.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" auto 
get_msort_expect_result_int() @@ -193,17 +193,16 @@ void test_msort(std::array& in_array, legate::Type leg_type, std::vector shape) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); } else { assign_values_to_array(A1, in_array.data(), in_array.size()); } - print_array(A1); } - auto B1 = cunumeric::msort(A1); + auto B1 = cupynumeric::msort(A1); check_array_eq(B1, expect.data(), expect.size()); } diff --git a/tests/cpp/integration/test_nonzero.cc b/tests/cpp/integration/test_nonzero.cc index 34516d840f..b1d1fb9f52 100644 --- a/tests/cpp/integration/test_nonzero.cc +++ b/tests/cpp/integration/test_nonzero.cc @@ -120,8 +120,8 @@ void test_nonzero(const std::vector& in_array, const std::vector>& expect, const std::vector& shape) { - auto array = cunumeric::mk_array(in_array, shape); - auto result_vec = cunumeric::nonzero(array); + auto array = cupynumeric::mk_array(in_array, shape); + auto result_vec = cupynumeric::nonzero(array); size_t result_size = result_vec.size(); ASSERT_EQ(result_size, expect.size()); std::vector expect_shape = {}; @@ -133,7 +133,7 @@ void test_nonzero(const std::vector& in_array, } } for (size_t i = 0; i < result_size; ++i) { - cunumeric::check_array(result_vec[i], expect[i], expect_shape); + cupynumeric::check_array(result_vec[i], expect[i], expect_shape); } } diff --git a/tests/cpp/integration/test_put.cc b/tests/cpp/integration/test_put.cc index ec2ec4fb02..d0fd3db97d 100644 --- a/tests/cpp/integration/test_put.cc +++ b/tests/cpp/integration/test_put.cc @@ -17,7 +17,7 @@ #include "common_utils.h" #include -using namespace cunumeric; +using namespace cupynumeric; namespace { template diff --git a/tests/cpp/integration/test_repartition.cc b/tests/cpp/integration/test_repartition.cc index 2d094576a6..09d20e9adb 100644 --- a/tests/cpp/integration/test_repartition.cc +++ 
b/tests/cpp/integration/test_repartition.cc @@ -18,9 +18,9 @@ #include #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" -#include "cunumeric/utilities/repartition.h" +#include "cupynumeric/utilities/repartition.h" namespace repartition_test { @@ -35,19 +35,13 @@ enum TaskIDs { template struct CheckRepartitionTask : public legate::LegateTask> { - static constexpr auto TASK_ID = - legate::LocalTaskID{CHECK_REPARTITION_TASK + I_ROW_MAJOR * 2 + O_ROW_MAJOR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{CHECK_REPARTITION_TASK + I_ROW_MAJOR * 2 + O_ROW_MAJOR}}; static void gpu_variant(legate::TaskContext context); }; class RepartitionLayoutMapper : public legate::mapping::Mapper { - legate::mapping::TaskTarget task_target( - const legate::mapping::Task& /*task*/, - const std::vector& options) override - { - return options.front(); - } std::vector store_mappings( const legate::mapping::Task& task, const std::vector& options) override @@ -83,6 +77,11 @@ class RepartitionLayoutMapper : public legate::mapping::Mapper { { return legate::Scalar{}; } + std::optional allocation_pool_size( + const legate::mapping::Task& /*task*/, legate::mapping::StoreTarget /*memory_kind*/) override + { + return std::nullopt; + } }; int get_rank_row_major(legate::Domain domain, legate::DomainPoint index_point) @@ -120,17 +119,17 @@ void repartition_2dbc_test(legate::AccessorRO input, size_t input_lld = in_rect.empty() ? 1 : (in_rect.hi[in_row_major ? 1 : 0] - in_rect.lo[in_row_major ? 
1 : 0] + 1); - auto [buffer_2dbc, volume_2dbc, lld_2dbc] = cunumeric::repartition_matrix_2dbc(input_ptr, - input_volume, - in_row_major, - input_offset_r, - input_offset_c, - input_lld, - proc_r, - proc_c, - tile_r, - tile_c, - comm); + auto [buffer_2dbc, volume_2dbc, lld_2dbc] = cupynumeric::repartition_matrix_2dbc(input_ptr, + input_volume, + in_row_major, + input_offset_r, + input_offset_c, + input_lld, + proc_r, + proc_c, + tile_r, + tile_c, + comm); int32_t* output_ptr = output.ptr(out_rect.lo); size_t output_volume = out_rect.volume(); @@ -151,23 +150,23 @@ void repartition_2dbc_test(legate::AccessorRO input, std::cerr << stringStream.str(); } - cunumeric::repartition_matrix_block(buffer_2dbc, - volume_2dbc, - lld_2dbc, - local_rank, - proc_r, - proc_c, - tile_r, - tile_c, - output_ptr, - output_volume, - output_lld, - num_rows, - num_cols, - out_row_major, - output_offset_r, - output_offset_c, - comm); + cupynumeric::repartition_matrix_block(buffer_2dbc, + volume_2dbc, + lld_2dbc, + local_rank, + proc_r, + proc_c, + tile_r, + tile_c, + output_ptr, + output_volume, + output_lld, + num_rows, + num_cols, + out_row_major, + output_offset_r, + output_offset_c, + comm); } #endif @@ -272,8 +271,8 @@ void run_test_aligned_default_launch(std::vector& data_shape, // generate data size_t volume = data_shape[0] * data_shape[1]; - auto data_input = cunumeric::zeros(data_shape, legate::int32()); - auto data_output = cunumeric::zeros(data_shape, legate::int32()); + auto data_input = cupynumeric::zeros(data_shape, legate::int32()); + auto data_output = cupynumeric::zeros(data_shape, legate::int32()); if (volume != 0) { if (volume == 1) { data_input.fill(legate::Scalar(0)); diff --git a/tests/cpp/integration/test_repeat.cc b/tests/cpp/integration/test_repeat.cc index 8ab064a973..dc89a1448b 100644 --- a/tests/cpp/integration/test_repeat.cc +++ b/tests/cpp/integration/test_repeat.cc @@ -19,7 +19,7 @@ #include #include -using namespace cunumeric; +using namespace cupynumeric; 
namespace { @@ -115,7 +115,7 @@ TEST(Repeat, test_array_empty_repeats_valid) } } -// numpy fail, cunumeric pass +// numpy fail, cupynumeric pass TEST(Repeat, test_array_empty_repeats_invalid_negative) { std::vector> repeats_list{{3, 4}, {1, 2, 3}}; diff --git a/tests/cpp/integration/test_reshape.cc b/tests/cpp/integration/test_reshape.cc index 6a254bbe00..640c92c183 100644 --- a/tests/cpp/integration/test_reshape.cc +++ b/tests/cpp/integration/test_reshape.cc @@ -16,7 +16,7 @@ #include "common_utils.h" -using namespace cunumeric; +using namespace cupynumeric; namespace { @@ -45,7 +45,7 @@ TEST_F(Reshape_TestSquare, test_shape) { for (auto shape : SQUARE_CASES) { auto a = arange(100).reshape({10, 10}); - check_array(reshape(a, shape), a_gt, as_type_vector(shape)); + check_array(reshape(a, shape), a_gt, as_type_vector(shape)); } { auto a = arange(100).reshape({10, 10}); @@ -61,7 +61,7 @@ TEST_F(Reshape_TestSquare, test_shape_mode) if (order == "F") { EXPECT_THROW(reshape(a, shape, order), std::invalid_argument); } else { - check_array(reshape(a, shape, order), a_gt, as_type_vector(shape)); + check_array(reshape(a, shape, order), a_gt, as_type_vector(shape)); } } { @@ -115,7 +115,7 @@ TEST_F(Reshape_TestRect, test_shape) { for (auto shape : RECT_CASES) { auto a = mk_array(a_gt, {5, 4, 10}); - check_array(reshape(a, shape), a_gt, as_type_vector(shape)); + check_array(reshape(a, shape), a_gt, as_type_vector(shape)); } { auto a = mk_array(a_gt, {5, 4, 10}); @@ -131,7 +131,7 @@ TEST_F(Reshape_TestRect, test_shape_mode) if (order == "F") { EXPECT_THROW(reshape(a, shape, order), std::invalid_argument); } else { - check_array(reshape(a, shape, order), a_gt, as_type_vector(shape)); + check_array(reshape(a, shape, order), a_gt, as_type_vector(shape)); } } { @@ -166,13 +166,13 @@ TEST(Reshape, test_reshape_empty_array) }; auto a = mk_array({}, {0, 1}); for (auto shape : shape_list) { - check_array(reshape(a, shape), {}, as_type_vector(shape)); + check_array(reshape(a, shape), 
{}, as_type_vector(shape)); } } TEST(Reshape, test_reshape_same_shape) { - std::vector shape{1, 2, 3}; + std::vector shape{1, 2, 3}; auto a_gt = mk_seq_vector(shape); auto a = mk_array(a_gt, shape); auto a_out = reshape(a, as_type_vector(shape)); diff --git a/tests/cpp/integration/test_sort.cc b/tests/cpp/integration/test_sort.cc index 8c4c5c3e86..9a5a76a47a 100644 --- a/tests/cpp/integration/test_sort.cc +++ b/tests/cpp/integration/test_sort.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" auto get_expect_result_int() @@ -498,7 +498,7 @@ void test_sort(std::array& in_array, std::vector shape, std::optional axis) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); @@ -508,7 +508,7 @@ void test_sort(std::array& in_array, } std::vector algos = {"quicksort", "mergesort", "heapsort", "stable"}; for (auto algo = algos.begin(); algo < algos.end(); ++algo) { - auto B1 = cunumeric::sort(A1, axis, *algo); + auto B1 = cupynumeric::sort(A1, axis, *algo); if (in_array.size() != 0) { check_array_eq(B1, expect.data(), expect.size()); } @@ -615,14 +615,14 @@ void sort_single_item_array() void sort_negative_test() { - auto in_ar1 = cunumeric::zeros({2, 3}, legate::int32()); + auto in_ar1 = cupynumeric::zeros({2, 3}, legate::int32()); // Test invalid input sort axis - EXPECT_THROW(cunumeric::sort(in_ar1, 2, "quicksort"), std::invalid_argument); - EXPECT_THROW(cunumeric::sort(in_ar1, -3, "quicksort"), std::invalid_argument); + EXPECT_THROW(cupynumeric::sort(in_ar1, 2, "quicksort"), std::invalid_argument); + EXPECT_THROW(cupynumeric::sort(in_ar1, -3, "quicksort"), std::invalid_argument); // Test invalid input algorithm - EXPECT_THROW(cunumeric::sort(in_ar1, 0, "negative"), std::invalid_argument); + EXPECT_THROW(cupynumeric::sort(in_ar1, 0, "negative"), std::invalid_argument); 
} // void cpp_test() diff --git a/tests/cpp/integration/test_sort_complex.cc b/tests/cpp/integration/test_sort_complex.cc index e2f237da5a..53c36c3758 100644 --- a/tests/cpp/integration/test_sort_complex.cc +++ b/tests/cpp/integration/test_sort_complex.cc @@ -20,7 +20,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template @@ -197,7 +197,7 @@ void test_sort_complex(std::array& in_array, legate::Type leg_type, std::vector shape) { - auto A1 = cunumeric::zeros(shape, leg_type); + auto A1 = cupynumeric::zeros(shape, leg_type); if (in_array.size() != 0) { if (in_array.size() == 1) { A1.fill(legate::Scalar(in_array[0])); @@ -205,7 +205,7 @@ void test_sort_complex(std::array& in_array, assign_values_to_array(A1, in_array.data(), in_array.size()); } } - auto B1 = cunumeric::sort_complex(A1); + auto B1 = cupynumeric::sort_complex(A1); if (in_array.size() != 0) { check_array_eq(B1, expect.data(), expect.size()); } diff --git a/tests/cpp/integration/test_squeeze.cc b/tests/cpp/integration/test_squeeze.cc new file mode 100644 index 0000000000..1313afbdc3 --- /dev/null +++ b/tests/cpp/integration/test_squeeze.cc @@ -0,0 +1,212 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "common_utils.h" +#include + +using namespace cupynumeric; +namespace { + +typedef std::vector, std::vector>> VEC_SHAPE_AXES; + +std::vector squeeze_result( + const std::vector& shape, + std::optional const>> axes = std::nullopt) +{ + std::vector result; + if (!axes.has_value()) { + for (int i = 0; i < shape.size(); i++) { + if (shape[i] != 1) { + result.push_back(shape[i]); + } + } + } else { + auto computed_axes = normalize_axis_vector(axes.value(), shape.size()); + for (int i = 0; i < shape.size(); i++) { + auto flag = true; + if (shape[i] == 1) { + for (int j = 0; j < computed_axes.size(); j++) { + if (computed_axes[j] == i) { + flag = false; + break; + } + } + } + if (flag) { + result.push_back(shape[i]); + } + } + } + return result; +} + +void test_squeeze( + const std::vector& shape, + std::optional const>> axes = std::nullopt) +{ + auto vec_a = mk_seq_vector(shape); + auto arr_a = mk_array(vec_a, shape); + auto x = axes.has_value() ? squeeze(arr_a, axes) : squeeze(arr_a); + auto result_shape = squeeze_result(shape, axes); + check_array(x, vec_a, result_shape); +} + +static constexpr int32_t DIM = 5; +std::vector> SIZES = { + {}, + { + 0, + }, + {1}, + {DIM}, + {0, 1}, + {1, 0}, + {1, 1}, + {1, DIM}, + {DIM, 1}, + {DIM, DIM}, + {1, 0, 0}, + {1, 1, 0}, + {1, 0, 1}, + {1, 1, 1}, + {DIM, 1, 1}, + {1, DIM, 1}, + {1, 1, DIM}, + {DIM, DIM, DIM}, +}; + +VEC_SHAPE_AXES gen_shape_axes_all() +{ + VEC_SHAPE_AXES shape_axes; + for (auto shape : SIZES) { + std::vector axes; + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == 1) { + axes.push_back(i); + } + } + shape_axes.push_back({shape, axes}); + } + return shape_axes; +} + +VEC_SHAPE_AXES gen_shape_axes_single() +{ + VEC_SHAPE_AXES shape_axes; + for (auto shape : SIZES) { + std::vector axes; + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == 1) { + axes.push_back(i); + } + } + for (int i = 0; i < axes.size(); i++) { + shape_axes.push_back({shape, {axes[i]}}); + } + } + return 
shape_axes; +} + +VEC_SHAPE_AXES gen_shape_axes_negative() +{ + VEC_SHAPE_AXES shape_axes; + for (auto shape : SIZES) { + std::vector axes; + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == 1) { + axes.push_back(i - shape.size()); + } + } + if (axes.size() > 0) { + shape_axes.push_back({shape, axes}); + } + } + return shape_axes; +} + +TEST(Squeeze, Basic) +{ + for (auto shape : SIZES) { + test_squeeze(shape); + } +} + +TEST(Squeeze, AxesAll) +{ + auto SHAPE_AXES = gen_shape_axes_all(); + for (auto [shape, axes] : SHAPE_AXES) { + test_squeeze(shape, axes); + } +} + +TEST(Squeeze, AxesSingle) +{ + auto SHAPE_AXES = gen_shape_axes_single(); + for (auto [shape, axes] : SHAPE_AXES) { + test_squeeze(shape, axes); + } +} + +TEST(Squeeze, AxesNegative) +{ + auto SHAPE_AXES = gen_shape_axes_negative(); + for (auto [shape, axes] : SHAPE_AXES) { + test_squeeze(shape, axes); + } +} + +TEST(Squeeze, InvalidAxesNotEqualToOne) +{ + std::vector shape = {1, 2, 1}; + std::vector> vec_axes = {{ + 1, + }, + {0, 1}}; + auto vec_a = mk_seq_vector(shape); + auto arr_a = mk_array(vec_a, shape); + for (auto axes : vec_axes) { + EXPECT_THROW(squeeze(arr_a, axes), std::invalid_argument); + } +} + +TEST(Squeeze, InvalidAxesOutOfBound) +{ + std::vector shape = {1, 2, 1}; + std::vector> vec_axes = {{ + 3, + }, + {0, 3}, + {-4}, + {-4, 0}}; + auto vec_a = mk_seq_vector(shape); + auto arr_a = mk_array(vec_a, shape); + for (auto axes : vec_axes) { + EXPECT_THROW(squeeze(arr_a, axes), std::invalid_argument); + } +} + +TEST(Squeeze, InvalidAxesDuplicate) +{ + std::vector shape = {1, 2, 1}; + std::vector> vec_axes = {{0, -3}, {-1, 0, 2}}; + auto vec_a = mk_seq_vector(shape); + auto arr_a = mk_array(vec_a, shape); + for (auto axes : vec_axes) { + EXPECT_THROW(squeeze(arr_a, axes), std::invalid_argument); + } +} + +} // namespace diff --git a/tests/cpp/integration/test_swapaxes.cc b/tests/cpp/integration/test_swapaxes.cc index fbadbc96c1..15edf5787d 100644 --- 
a/tests/cpp/integration/test_swapaxes.cc +++ b/tests/cpp/integration/test_swapaxes.cc @@ -20,67 +20,67 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" void swapaxes_test() { // Test small { - auto A = cunumeric::zeros({3, 3}, legate::int32()); + auto A = cupynumeric::zeros({3, 3}, legate::int32()); EXPECT_EQ(A.shape(), (std::vector{3, 3})); - auto B = cunumeric::swapaxes(A, 0, 1); + auto B = cupynumeric::swapaxes(A, 0, 1); EXPECT_EQ(B.shape(), (std::vector{3, 3})); } // Test tall { - auto A_tall = cunumeric::zeros({300, 3}, legate::int32()); + auto A_tall = cupynumeric::zeros({300, 3}, legate::int32()); EXPECT_EQ(A_tall.shape(), (std::vector{300, 3})); - auto B_tall = cunumeric::swapaxes(A_tall, 0, 1); + auto B_tall = cupynumeric::swapaxes(A_tall, 0, 1); EXPECT_EQ(B_tall.shape(), (std::vector{3, 300})); } // Test wide { - auto A_wide = cunumeric::zeros({3, 300}, legate::int32()); + auto A_wide = cupynumeric::zeros({3, 300}, legate::int32()); EXPECT_EQ(A_wide.shape(), (std::vector{3, 300})); - auto B_wide = cunumeric::swapaxes(A_wide, 0, 1); + auto B_wide = cupynumeric::swapaxes(A_wide, 0, 1); EXPECT_EQ(B_wide.shape(), (std::vector{300, 3})); } // Test big { - auto A_big = cunumeric::zeros({300, 300}, legate::int32()); + auto A_big = cupynumeric::zeros({300, 300}, legate::int32()); EXPECT_EQ(A_big.shape(), (std::vector{300, 300})); - auto B_big = cunumeric::swapaxes(A_big, 0, 1); + auto B_big = cupynumeric::swapaxes(A_big, 0, 1); EXPECT_EQ(B_big.shape(), (std::vector{300, 300})); } // Test 3-dim array with different swap axes { - auto A = cunumeric::zeros({3, 4, 5}, legate::int32()); + auto A = cupynumeric::zeros({3, 4, 5}, legate::int32()); EXPECT_EQ(A.shape(), (std::vector{3, 4, 5})); - auto B1 = cunumeric::swapaxes(A, 0, 0); + auto B1 = cupynumeric::swapaxes(A, 0, 0); EXPECT_EQ(B1.shape(), (std::vector{3, 4, 5})); - auto B2 = cunumeric::swapaxes(A, -3, 1); + auto B2 = cupynumeric::swapaxes(A, -3, 1); 
EXPECT_EQ(B2.shape(), (std::vector{4, 3, 5})); - auto B3 = cunumeric::swapaxes(A, 0, 2); + auto B3 = cupynumeric::swapaxes(A, 0, 2); EXPECT_EQ(B3.shape(), (std::vector{5, 4, 3})); - auto B4 = cunumeric::swapaxes(A, -3, -2); + auto B4 = cupynumeric::swapaxes(A, -3, -2); EXPECT_EQ(B4.shape(), (std::vector{4, 3, 5})); } // Test empty array { - auto A = cunumeric::zeros({0}, legate::int32()); + auto A = cupynumeric::zeros({0}, legate::int32()); EXPECT_EQ(A.shape(), (std::vector{0})); - auto B = cunumeric::swapaxes(A, 0, 0); + auto B = cupynumeric::swapaxes(A, 0, 0); EXPECT_EQ(B.shape(), (std::vector{0})); } } @@ -88,13 +88,13 @@ void swapaxes_test() void swapaxes_negative_test() { // Test out-of-bound1 - auto A = cunumeric::zeros({3, 3}, legate::int32()); - EXPECT_THROW(cunumeric::swapaxes(A, 3, 0), std::invalid_argument); - EXPECT_THROW(cunumeric::swapaxes(A, 0, 3), std::invalid_argument); + auto A = cupynumeric::zeros({3, 3}, legate::int32()); + EXPECT_THROW(cupynumeric::swapaxes(A, 3, 0), std::invalid_argument); + EXPECT_THROW(cupynumeric::swapaxes(A, 0, 3), std::invalid_argument); // Test out-of-bound2 - EXPECT_THROW(cunumeric::swapaxes(A, -4, 0), std::invalid_argument); - EXPECT_THROW(cunumeric::swapaxes(A, 0, -4), std::invalid_argument); + EXPECT_THROW(cupynumeric::swapaxes(A, -4, 0), std::invalid_argument); + EXPECT_THROW(cupynumeric::swapaxes(A, 0, -4), std::invalid_argument); } // void cpp_test() diff --git a/tests/cpp/integration/test_transpose.cc b/tests/cpp/integration/test_transpose.cc index 0124588c60..6047fe094a 100644 --- a/tests/cpp/integration/test_transpose.cc +++ b/tests/cpp/integration/test_transpose.cc @@ -16,7 +16,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" #include "util.inl" template @@ -26,15 +26,15 @@ void transpose_int32_test(std::array input, std::vector out_shape, std::optional> axes = std::nullopt) { - auto a_input = cunumeric::zeros(in_shape, legate::int32()); + auto a_input = 
cupynumeric::zeros(in_shape, legate::int32()); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::array(out_shape, legate::int32()); + auto a_output = cupynumeric::array(out_shape, legate::int32()); if (axes) { - a_output = cunumeric::transpose(a_input, axes.value()); + a_output = cupynumeric::transpose(a_input, axes.value()); } else { - a_output = cunumeric::transpose(a_input); + a_output = cupynumeric::transpose(a_input); } check_array_eq(a_output, exp.data(), exp.size()); EXPECT_EQ(a_output.shape(), out_shape); @@ -114,9 +114,9 @@ TEST(Transpose, DefaultType) std::vector in_shape = {2, 3}; std::vector out_shape = {3, 2}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - auto a_output = cunumeric::transpose(a_input); + auto a_output = cupynumeric::transpose(a_input); check_array_eq(a_output, exp.data(), exp.size()); EXPECT_EQ(a_output.shape(), out_shape); } @@ -129,10 +129,11 @@ TEST(TransposeErrors, InvalidAxes) std::vector in_shape = {2, 3}; std::vector out_shape = {3, 2}; - auto a_input = cunumeric::zeros(in_shape); + auto a_input = cupynumeric::zeros(in_shape); assign_values_to_array(a_input, input.data(), input.size()); - EXPECT_THROW(cunumeric::transpose(a_input, (std::vector){0, 1, 2}), + EXPECT_THROW(cupynumeric::transpose(a_input, (std::vector){0, 1, 2}), + std::invalid_argument); + EXPECT_THROW(cupynumeric::transpose(a_input, (std::vector){1}), std::invalid_argument); + EXPECT_THROW(cupynumeric::transpose(a_input, (std::vector){3, 4}), std::invalid_argument); - EXPECT_THROW(cunumeric::transpose(a_input, (std::vector){1}), std::invalid_argument); - EXPECT_THROW(cunumeric::transpose(a_input, (std::vector){3, 4}), std::invalid_argument); } diff --git a/tests/cpp/integration/test_trilu.cc b/tests/cpp/integration/test_trilu.cc index c83f51bc6c..29e7915152 100644 --- a/tests/cpp/integration/test_trilu.cc +++ 
b/tests/cpp/integration/test_trilu.cc @@ -17,7 +17,7 @@ #include "common_utils.h" #include -using namespace cunumeric; +using namespace cupynumeric; namespace { diff --git a/tests/cpp/integration/test_unique.cc b/tests/cpp/integration/test_unique.cc index b7aa1bacab..18cc112812 100644 --- a/tests/cpp/integration/test_unique.cc +++ b/tests/cpp/integration/test_unique.cc @@ -18,7 +18,7 @@ #include #include -using namespace cunumeric; +using namespace cupynumeric; namespace { @@ -49,8 +49,6 @@ TEST(Unique, test_basic) auto x = mk_array(x_in); auto x_out = unique(x); check_array(x_out, x_gt); - debug_array(x); - debug_array(x_out); } } @@ -64,9 +62,9 @@ TEST(Unique, test_scalar) } template -std::vector mk_random_vector(std::vector shape, std::function gen) +std::vector mk_random_vector(std::vector shape, std::function gen) { - size_t size = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); + size_t size = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); std::vector v(size); std::generate(v.begin(), v.end(), gen); return v; @@ -77,7 +75,7 @@ static int randint(int low, int high) { return rand() % (high - low) + low; } TEST(Unique, test_ndim) { srand(111); - std::vector shape; + std::vector shape; size_t size = 1; for (int32_t ndim = 1; ndim <= LEGATE_MAX_DIM; ++ndim) { shape.emplace_back(4); @@ -88,8 +86,6 @@ TEST(Unique, test_ndim) auto x = mk_array(x_in, shape); auto x_out = unique(x); check_array(x_out, x_gt); - debug_array(x, false); - debug_array(x_out); } } diff --git a/tests/cpp/integration/test_where.cc b/tests/cpp/integration/test_where.cc new file mode 100644 index 0000000000..642959884b --- /dev/null +++ b/tests/cpp/integration/test_where.cc @@ -0,0 +1,273 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include + +#include +#include "legate.h" +#include "cupynumeric.h" +#include "common_utils.h" + +using namespace cupynumeric; + +template +void test_where_basic(std::vector in_a, + std::vector>& exp_vec, + std::vector in_shape) +{ + auto A = mk_array(in_a, in_shape); + auto B = where(A); + assert(exp_vec.size() == B.size()); + for (size_t i = 0; i < B.size(); i++) { + auto exp_arr = exp_vec[i]; + std::vector exp_shape = {exp_arr.size()}; + check_array(B[i], exp_arr, exp_shape); + } +} + +template +void test_where_full( + NDArray A, NDArray X, NDArray Y, std::vector exp_arr, std::vector exp_shape) +{ + auto B = where(A, X, Y); + check_array(B, exp_arr, exp_shape); +} + +TEST(Where, Basic) +{ + std::vector in_a = {-1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0, 1}; + std::vector> test_shapes = {{18}, {6, 3}, {3, 2, 3}}; + + std::vector exp_vec1_1 = {0, 1, 2, 3, 5, 6, 7, 9, 11, 15, 17}; + std::vector> exp_vec1 = {exp_vec1_1}; + test_where_basic(in_a, exp_vec1, test_shapes[0]); + + std::vector exp_vec2_1 = {0, 0, 0, 1, 1, 2, 2, 3, 3, 5, 5}; + std::vector exp_vec2_2 = {0, 1, 2, 0, 2, 0, 1, 0, 2, 0, 2}; + std::vector> exp_vec2 = {exp_vec2_1, exp_vec2_2}; + test_where_basic(in_a, exp_vec2, test_shapes[1]); + + std::vector exp_vec3_1 = {0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2}; + std::vector exp_vec3_2 = {0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1}; + std::vector exp_vec3_3 = {0, 1, 2, 0, 2, 0, 1, 0, 2, 0, 2}; + std::vector> exp_vec3 = {exp_vec3_1, exp_vec3_2, exp_vec3_3}; + test_where_basic(in_a, exp_vec3, test_shapes[2]); +} + 
+TEST(Where, Condition) +{ + std::vector shape = {2, 2}; + auto X = mk_array({1, 2, 3, 4}, shape); + auto Y = mk_array({9, 8, 7, 6}, shape); + + auto A1 = mk_array({true, false, true, true}, shape); + test_where_full(A1, X, Y, {1, 8, 3, 4}, shape); + + auto A2 = mk_array({true, false}, {1, 2}); + test_where_full(A2, X, Y, {1, 8, 3, 6}, shape); + + auto A3 = mk_array({true, false}, + { + 2, + }); + test_where_full(A3, X, Y, {1, 8, 3, 6}, shape); + + auto A4 = mk_array({0.0, 1.0, 0, -2}, shape); + test_where_full(A4, X, Y, {9, 2, 7, 4}, shape); +} + +TEST(Where, Type) +{ + std::vector shape = {2, 2}; + auto A = mk_array({true, false, true, true}, shape); + auto X_BOOL = mk_array({true, false, true, false}, shape); + auto X_INT = mk_array({1, 2, 3, 4}, shape); + auto X_FLOAT = mk_array({1, 2, 3, 4}, shape); + auto X_COMPLEX128 = mk_array>({1, 2, 3, 4}, shape); + auto Y_BOOL = mk_array({false, true, true, false}, shape); + auto Y_INT = mk_array({9, 8, 7, 6}, shape); + auto Y_FLOAT = mk_array({9, 8, 7, 6}, shape); + auto Y_COMPLEX128 = mk_array>({9, 8, 7, 6}, shape); + + test_where_full(A, X_BOOL, Y_BOOL, {true, true, true, false}, shape); + + test_where_full(A, X_BOOL, Y_INT, {1, 8, 1, 0}, shape); + test_where_full(A, X_INT, Y_INT, {1, 8, 3, 4}, shape); + test_where_full(A, Y_INT, X_BOOL, {9, 0, 7, 6}, shape); + + test_where_full(A, X_BOOL, Y_FLOAT, {1, 8, 1, 0}, shape); + test_where_full(A, X_INT, Y_FLOAT, {1, 8, 3, 4}, shape); + test_where_full(A, X_FLOAT, Y_FLOAT, {1, 8, 3, 4}, shape); + test_where_full(A, Y_FLOAT, X_BOOL, {9, 0, 7, 6}, shape); + test_where_full(A, Y_FLOAT, X_INT, {9, 2, 7, 6}, shape); + + test_where_full>(A, X_BOOL, Y_COMPLEX128, {1, 8, 1, 0}, shape); + test_where_full>(A, X_INT, Y_COMPLEX128, {1, 8, 3, 4}, shape); + test_where_full>(A, X_FLOAT, Y_COMPLEX128, {1, 8, 3, 4}, shape); + test_where_full>(A, X_COMPLEX128, Y_COMPLEX128, {1, 8, 3, 4}, shape); + test_where_full>(A, Y_COMPLEX128, X_BOOL, {9, 0, 7, 6}, shape); + test_where_full>(A, 
Y_COMPLEX128, X_INT, {9, 2, 7, 6}, shape); + test_where_full>(A, Y_COMPLEX128, X_FLOAT, {9, 2, 7, 6}, shape); +} + +TEST(Where, BroadcastShape) +{ + auto X = mk_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {3, 3}); + auto Y = mk_array({10, 20, 30}, {1, 3}); + + auto A1 = mk_array({false}, {1}); + test_where_full(A1, X, Y, {10, 20, 30, 10, 20, 30, 10, 20, 30}, {3, 3}); + + auto A2 = mk_array({false, true, true}, {3}); + test_where_full(A2, X, Y, {10, 2, 3, 10, 5, 6, 10, 8, 9}, {3, 3}); + + auto A3 = mk_array({false, true, true}, {1, 3}); + test_where_full(A3, X, Y, {10, 2, 3, 10, 5, 6, 10, 8, 9}, {3, 3}); + + auto A4 = mk_array({false, true, true, true, false, false, true, false, false}, {3, 3}); + test_where_full(A4, X, Y, {10, 2, 3, 4, 20, 30, 7, 20, 30}, {3, 3}); + + auto A5 = mk_array({false, + true, + true, + true, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + false, + true, + false, + false}, + {2, 3, 3}); + test_where_full( + A5, X, Y, {10, 2, 3, 4, 20, 30, 7, 20, 30, 10, 2, 3, 4, 20, 30, 7, 20, 30}, {2, 3, 3}); +} + +TEST(Where, EmptyAndScalar) +{ + auto A = mk_array({true}, + { + 1, + }); + auto A_SCALAR = mk_array({false}, {}); + auto A_EMPTY = mk_array({}, + { + 0, + }); + auto X = mk_array({10}, + { + 1, + }); + auto Y = mk_array({20}, + { + 1, + }); + auto X_SCALAR = mk_array({10}, {}); + auto Y_SCALAR = mk_array({20}, {}); + auto EMPTY = mk_array({}, + { + 0, + }); + + auto B1 = where(A_EMPTY, X, Y); + check_array(B1, + {}, + { + 0, + }); + + auto B2 = where(A_EMPTY, X_SCALAR, Y_SCALAR); + check_array(B2, + {}, + { + 0, + }); + + auto B3 = where(A, EMPTY, Y_SCALAR); + check_array(B3, + {}, + { + 0, + }); + + auto B4 = where(A, EMPTY, EMPTY); + check_array(B4, + {}, + { + 0, + }); + + auto B5 = where(A_EMPTY, EMPTY, EMPTY); + check_array(B5, + {}, + { + 0, + }); + + auto B6 = where(A_SCALAR, X, Y_SCALAR); + check_array(B6, + {20}, + { + 1, + }); + + auto B7 = where(A_SCALAR, X_SCALAR, Y_SCALAR); + check_array(B7, {20}, 
{}); + + auto B8 = where(A, X_SCALAR, Y_SCALAR); + check_array(B8, + {10}, + { + 1, + }); + + auto B9 = where(A, X_SCALAR, Y); + check_array(B9, + {10}, + { + 1, + }); +} + +TEST(Where, InvalidShape) +{ + auto A = mk_array({false, true, true, true, false, false, true, false, false}, {3, 3}); + auto X = mk_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {3, 3}); + + auto Y1 = mk_array({10, 20}, + { + 2, + }); + auto Y2 = mk_array({10, 20}, {1, 2}); + auto Y3 = mk_array({10, 20, 30, 40}, {4, 1}); + auto Y4 = mk_array({}, + { + 0, + }); + + for (auto Y : {Y1, Y2, Y3, Y4}) { + EXPECT_THROW(where(A, X, Y), std::exception); + } +} diff --git a/tests/cpp/integration/test_window.cc b/tests/cpp/integration/test_window.cc index f25cdf81bf..456c53a072 100644 --- a/tests/cpp/integration/test_window.cc +++ b/tests/cpp/integration/test_window.cc @@ -17,21 +17,21 @@ #include #include "common_utils.h" -using namespace cunumeric; +using namespace cupynumeric; namespace { struct windows_case { int64_t input; std::vector expected_values; - std::vector expected_shape; + std::vector expected_shape; }; struct kaiser_case { int64_t input; double beta_input; std::vector expected_values; - std::vector expected_shape; + std::vector expected_shape; }; class NormalInput : public ::testing::Test, public ::testing::WithParamInterface {}; @@ -157,7 +157,7 @@ TEST_P(BartlettTest, Basic) { auto& [input, expected_values, expected_shape] = GetParam(); - auto result = cunumeric::bartlett(input); + auto result = cupynumeric::bartlett(input); check_array_near(result, expected_values, expected_shape); } @@ -165,7 +165,7 @@ TEST_P(BlackmanTest, Basic) { auto& [input, expected_values, expected_shape] = GetParam(); - auto result = cunumeric::blackman(input); + auto result = cupynumeric::blackman(input); check_array_near(result, expected_values, expected_shape); } @@ -173,7 +173,7 @@ TEST_P(HammingTest, Basic) { auto& [input, expected_values, expected_shape] = GetParam(); - auto result = cunumeric::hamming(input); + 
auto result = cupynumeric::hamming(input); check_array_near(result, expected_values, expected_shape); } @@ -181,7 +181,7 @@ TEST_P(HanningTest, Basic) { auto& [input, expected_values, expected_shape] = GetParam(); - auto result = cunumeric::hanning(input); + auto result = cupynumeric::hanning(input); check_array_near(result, expected_values, expected_shape); } @@ -189,8 +189,8 @@ TEST_P(KaiserTest, Basic) { auto& [input, beta_input, expected_values, expected_shape] = GetParam(); - auto result = cunumeric::kaiser(input, beta_input); + auto result = cupynumeric::kaiser(input, beta_input); check_array_near(result, expected_values, expected_shape); } -} // namespace \ No newline at end of file +} // namespace diff --git a/tests/cpp/integration/test_zeros.cc b/tests/cpp/integration/test_zeros.cc index 861bf0f6bc..e0cc1b838d 100644 --- a/tests/cpp/integration/test_zeros.cc +++ b/tests/cpp/integration/test_zeros.cc @@ -16,7 +16,7 @@ #include "common_utils.h" -using namespace cunumeric; +using namespace cupynumeric; using Code = legate::Type::Code; namespace { diff --git a/tests/cpp/integration/util.inl b/tests/cpp/integration/util.inl index 4a18896284..232ba31b23 100644 --- a/tests/cpp/integration/util.inl +++ b/tests/cpp/integration/util.inl @@ -98,34 +98,35 @@ std::string to_string(legate::AccessorRO acc, } template -std::string check_array_eq(legate::AccessorRO acc, - T* values_ptr, - const std::vector& shape, - legate::Rect rect) +void check_array_eq(legate::AccessorRO acc, + T* values_ptr, + const std::vector& shape, + legate::Rect rect) { - std::stringstream ss; - auto index = 0; - auto size = shape.size(); - ss << "size: " << size << "\n"; for (legate::PointInRectIterator itr(rect, false); itr.valid(); ++itr) { - auto q = *itr; - ss << std::left << std::setprecision(3); - ss << std::setw(13) << "Array value: " << std::setw(10); - print_value(ss, acc[q]) << ", "; - ss << std::setw(16) << "Expected value: " << std::setw(10); - print_value(ss, acc[q]) << ", "; - if 
(size > 0) { - ss << std::setw(8) << "index: ["; - for (uint32_t i = 0; i < size - 1; ++i) { - ss << q[i] << ","; + auto q = *itr; + auto value = acc[q]; + auto expect = values_ptr[index++]; + if (value != expect) { + std::stringstream ss; + auto size = shape.size(); + ss << "size: " << size << "\n"; + ss << std::left << std::setprecision(3); + ss << std::setw(13) << "Array value: " << std::setw(10); + print_value(ss, value) << ", "; + ss << std::setw(16) << "Expected value: " << std::setw(10); + print_value(ss, expect) << ", "; + if (size > 0) { + ss << std::setw(8) << "index: ["; + for (uint32_t i = 0; i < size - 1; ++i) { + ss << q[i] << ","; + } + ss << q[size - 1] << "]\n"; } - ss << q[size - 1] << "]\n"; + FAIL() << ss.str(); } - EXPECT_EQ(acc[q], values_ptr[index++]); } - - return ss.str(); } template @@ -145,10 +146,7 @@ struct check_array_eq_fn { const std::vector& shape, legate::Rect rect) { - auto string_result = check_array_eq(acc, values_ptr, shape, rect); - if (rect.volume() <= 256) { - std::cerr << string_result << std::endl; - } + check_array_eq(acc, values_ptr, shape, rect); } }; @@ -175,7 +173,7 @@ struct copy_array_fn { }; template -void print_array(cunumeric::NDArray array) +void print_array(cupynumeric::NDArray array) { auto acc = array.get_read_accessor(); auto& shape = array.shape(); @@ -186,7 +184,7 @@ void print_array(cunumeric::NDArray array) } template -void check_array_eq(cunumeric::NDArray array, T* values_ptr, size_t length) +void check_array_eq(cupynumeric::NDArray array, T* values_ptr, size_t length) { assert(array.size() == length); if (length == 0) { @@ -202,7 +200,7 @@ void check_array_eq(cunumeric::NDArray array, T* values_ptr, size_t length) } template -void assign_values_to_array(cunumeric::NDArray array, T* values_ptr, size_t length) +void assign_values_to_array(cupynumeric::NDArray array, T* values_ptr, size_t length) { assert(array.size() == length); if (length == 0) { @@ -217,7 +215,7 @@ void 
assign_values_to_array(cunumeric::NDArray array, T* values_ptr, size_t leng } template -std::vector assign_array_to_values(cunumeric::NDArray array) +std::vector assign_array_to_values(cupynumeric::NDArray array) { std::vector result(array.size()); if (array.size() > 0) { @@ -233,7 +231,7 @@ std::vector assign_array_to_values(cunumeric::NDArray array) } template -void check_array_eq(cunumeric::NDArray array1, cunumeric::NDArray array2) +void check_array_eq(cupynumeric::NDArray array1, cupynumeric::NDArray array2) { assert(array1.size() == array2.size()); if (array1.size() == 0) { diff --git a/tests/cpp/main.cc b/tests/cpp/main.cc index 97211a77c8..1dd56f6c34 100644 --- a/tests/cpp/main.cc +++ b/tests/cpp/main.cc @@ -16,7 +16,7 @@ #include #include "legate.h" -#include "cunumeric.h" +#include "cupynumeric.h" class Environment : public ::testing::Environment { public: @@ -25,7 +25,7 @@ class Environment : public ::testing::Environment { void SetUp() override { EXPECT_EQ(legate::start(argc_, argv_), 0); - cunumeric::initialize(argc_, argv_); + cupynumeric::initialize(argc_, argv_); } void TearDown() override { EXPECT_EQ(legate::finish(), 0); } diff --git a/tests/cpp/run.py b/tests/cpp/run.py index e3c775e0a3..f83ce1c469 100755 --- a/tests/cpp/run.py +++ b/tests/cpp/run.py @@ -29,7 +29,7 @@ "PYTHON", "UCX_", "NCCL_", - "CUNUMERIC_", + "CUPYNUMERIC_", "NVIDIA_", ) @@ -95,14 +95,14 @@ def is_launcher_var(name: str) -> bool: def main(): - CUNUMERIC_DIR = Path(__file__).resolve().parent.parent.parent + CUPYNUMERIC_DIR = Path(__file__).resolve().parent.parent.parent parser = argparse.ArgumentParser(description="Run Legate cpp tests.") parser.add_argument( "--binary-path", dest="binary_path", required=False, default=str( - CUNUMERIC_DIR / "build" / "tests" / "cpp" / "bin" / "cpp_tests" + CUPYNUMERIC_DIR / "build" / "tests" / "cpp" / "bin" / "cpp_tests" ), help="Path to binary under test.", ) @@ -110,7 +110,7 @@ def main(): "--log-path", dest="log_path", required=False, - 
default=str(CUNUMERIC_DIR / "build" / "results.log"), + default=str(CUPYNUMERIC_DIR / "build" / "results.log"), help="Path to output log file.", ) parser.add_argument( diff --git a/tests/integration/test_0d_store.py b/tests/integration/test_0d_store.py index a2d22fab83..7e0ef278e7 100644 --- a/tests/integration/test_0d_store.py +++ b/tests/integration/test_0d_store.py @@ -17,7 +17,7 @@ import pytest -import cunumeric as num +import cupynumeric as num SIZE = 3 diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py index 90751500de..29aa538399 100644 --- a/tests/integration/test_advanced_indexing.py +++ b/tests/integration/test_advanced_indexing.py @@ -17,8 +17,9 @@ import pytest from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE, TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.fixture @@ -120,7 +121,7 @@ def mk_deferred_array(lib, shape): def gen_args(): - for arr_ndim in range(1, LEGATE_MAX_DIM + 1): + for arr_ndim in ONE_MAX_DIM_RANGE[:-1]: for idx_ndim in range(1, arr_ndim + 1): for zero_dim in range(arr_ndim): yield arr_ndim, idx_ndim, zero_dim @@ -919,7 +920,7 @@ def test(): # we do less than LEGATE_MAX_DIM becasue the dimension will be increased by # 1 when passig 2d index array - for ndim in range(2, LEGATE_MAX_DIM): + for ndim in TWO_MAX_DIM_RANGE[:-1]: a_shape = tuple(np.random.randint(2, 5) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) diff --git a/tests/integration/test_allclose.py b/tests/integration/test_allclose.py index 9270c77d3a..130a12b3ca 100755 --- a/tests/integration/test_allclose.py +++ b/tests/integration/test_allclose.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num SCALARS_TRUE_DEFAULT = ( (0, -1e-8), @@ -150,7 +150,7 @@ def test_array_false(shape): def test_broadcast_true1(shape_b): # 
for all cases, # In Numpy, it pass - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'Store' object has no attribute '_broadcast' len_scalars = len(SCALARS_TRUE_DEFAULT) @@ -186,7 +186,7 @@ def test_broadcast_true1(shape_b): def test_broadcast_true2(shape_b): # for all cases, # In Numpy, it pass - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'Store' object has no attribute '_broadcast' shape_a = (3,) size_a = np.prod(shape_a) @@ -219,7 +219,7 @@ def test_broadcast_true2(shape_b): def test_equal_nan_basic(arr, equal_nan): # If equal_nan is True, # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError res_np = np.allclose(arr, arr, equal_nan=equal_nan) res_num = num.allclose(arr, arr, equal_nan=equal_nan) assert res_np == res_num @@ -257,7 +257,7 @@ def test_empty_array(a, b): def test_scalar_broadcasting(a, b): # for all cases, # In Numpy, it pass - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'Store' object has no attribute '_broadcast' res_np = np.allclose(a, b) res_num = num.allclose(a, b) diff --git a/tests/integration/test_amax_amin.py b/tests/integration/test_amax_amin.py index ee85b2e2b0..2c99f35cef 100755 --- a/tests/integration/test_amax_amin.py +++ b/tests/integration/test_amax_amin.py @@ -15,16 +15,16 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE, TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num FUNCS = ("amax", "amin") @pytest.mark.parametrize("initial", (None, -2, 0, 0.5, 2)) @pytest.mark.parametrize("keepdims", [True, False]) -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("func_name", FUNCS) def test_basic(func_name, ndim, keepdims, initial): shape = (5,) * ndim @@ -54,7 +54,7 @@ def 
test_basic(func_name, ndim, keepdims, initial): def test_src_dt(func_name, keepdims, src_dt): # For src_dt=np.complex128, # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError ndim = 3 shape = (5,) * ndim in_np = np.random.randint(-5, 5, size=shape).astype(src_dt) @@ -71,7 +71,7 @@ def test_src_dt(func_name, keepdims, src_dt): @pytest.mark.parametrize("initial", (None, -2, 0, 0.5, 2)) @pytest.mark.parametrize("keepdims", [True, False]) -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("func_name", FUNCS) def test_axis(func_name, ndim, keepdims, initial): shape = (5,) * ndim @@ -97,7 +97,7 @@ def test_axis(func_name, ndim, keepdims, initial): @pytest.mark.parametrize("func_name", FUNCS) def test_axis_tuple(func_name, keepdims, axes): # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError shape = (3, 4, 5) in_np = np.random.randint(-5, 5, size=shape) in_num = num.array(in_np) @@ -151,7 +151,7 @@ def test_out_dim1(func_name, keepdims): @pytest.mark.parametrize("initial", (None, -2, 0, 0.5, 2)) @pytest.mark.parametrize("keepdims", [True, False]) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) @pytest.mark.parametrize("func_name", FUNCS) def test_out(func_name, ndim, keepdims, initial): shape = (5,) * ndim @@ -189,7 +189,7 @@ def test_out(func_name, ndim, keepdims, initial): def test_out_with_dtype(func_name, keepdims, out_dt): # For out_dt=np.complex128 # In Numpy, it pass - # In cuNumeric, it raises KeyError + # In cuPyNumeric, it raises KeyError ndim = 3 shape = (5,) * ndim in_np = np.random.randint(-5, 5, size=shape) @@ -216,7 +216,7 @@ def test_out_with_dtype(func_name, keepdims, out_dt): @pytest.mark.parametrize("func_name", FUNCS) def test_where(func_name): # In Numpy, it pass - # In 
cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError shape = (3, 4, 5) in_np = np.random.randint(-5, 5, size=shape) in_num = num.array(in_np) diff --git a/tests/integration/test_angle.py b/tests/integration/test_angle.py index 16a81bc5ce..76ec6658b0 100644 --- a/tests/integration/test_angle.py +++ b/tests/integration/test_angle.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestAngleErrors: @@ -51,7 +51,7 @@ def test_pure_real_and_imaginary(self): assert np.array_equal(num.angle(5j), np.angle(5j)) assert np.array_equal(num.angle(-5j), np.angle(-5j)) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("in_type", (int, float, complex)) @pytest.mark.parametrize("deg", (False, True)) def test_basic(self, ndim, in_type, deg): diff --git a/tests/integration/test_append.py b/tests/integration/test_append.py index bece5e7f85..5693072165 100644 --- a/tests/integration/test_append.py +++ b/tests/integration/test_append.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num DIM = 10 diff --git a/tests/integration/test_arg_reduce.py b/tests/integration/test_arg_reduce.py index 17c491fbec..4e9e24f486 100644 --- a/tests/integration/test_arg_reduce.py +++ b/tests/integration/test_arg_reduce.py @@ -15,10 +15,14 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM -from utils.utils import AxisError +from utils.utils import ( + MAX_DIM_RANGE, + ONE_MAX_DIM_RANGE, + TWO_MAX_DIM_RANGE, + AxisError, +) -import cunumeric as num +import cupynumeric as num ARG_FUNCS = ("argmax", "argmin") @@ -104,7 +108,7 @@ class TestArgMaxAndArgMin: """ 
@pytest.mark.parametrize("func_name", ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_argmax_and_argmin_basic(self, func_name, ndim, keepdims): shape = (5,) * ndim @@ -120,7 +124,7 @@ def test_argmax_and_argmin_basic(self, func_name, ndim, keepdims): ) @pytest.mark.parametrize("func_name", ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_argmax_and_argmin_axis(self, func_name, ndim, keepdims): shape = (5,) * ndim @@ -175,7 +179,7 @@ def test_argmax_and_argmin_out_1dim(self, func_name, keepdims): assert np.array_equal(res_np, res_num) @pytest.mark.parametrize("func_name", ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_argmax_and_argmin_out(self, func_name, ndim, keepdims): shape = (5,) * ndim diff --git a/tests/integration/test_argsort.py b/tests/integration/test_argsort.py index 07d165eefb..89fcc2d7a6 100644 --- a/tests/integration/test_argsort.py +++ b/tests/integration/test_argsort.py @@ -16,9 +16,9 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num -# cunumeric.argsort(a: ndarray, axis: int = -1, kind: SortType = 'quicksort', +# cupynumeric.argsort(a: ndarray, axis: int = -1, kind: SortType = 'quicksort', # order: Optional = None) → ndarray # ndarray.argsort(axis=-1, kind=None, order=None) @@ -93,8 +93,8 @@ def test_structured_array_order(self): res_np = np.argsort(a_np, order="height") res_num = num.argsort(a_num, order="height") - # cuNumeric raises AssertionError in - # function cunumeric/cunumeric/eager.py:to_deferred_array + # cuPyNumeric raises AssertionError in + # function 
cupynumeric/cupynumeric/eager.py:to_deferred_array # if self.deferred is None: # if self.parent is None: # @@ -124,7 +124,7 @@ def test_sort_type_invalid(self): res_num = num.argsort(arr_num, kind="negative") # Numpy raises "ValueError: sort kind must be one of 'quick', # 'heap', or 'stable' (got 'negative')" - # cuNumeric passed. The code basically supports ‘stable’ + # cuPyNumeric passed. The code basically supports ‘stable’ # or not ‘stable’. assert np.array_equal(res_num, res_np) @@ -151,7 +151,7 @@ def test_basic_axis_sort_type(self, size, sort_type): @pytest.mark.parametrize("sort_type", UNSTABLE_SORT_TYPES) def test_basic_axis_sort_type_unstable(self, size, sort_type): # have to guarantee unique values in input - # see https://github.com/nv-legate/cunumeric/issues/782 + # see https://github.com/nv-legate/cupynumeric/issues/782 arr_np = np.arange(np.prod(size)) np.random.shuffle(arr_np) arr_np = arr_np.reshape(size) @@ -188,7 +188,7 @@ def test_arr_basic_axis_sort(self, size, sort_type): @pytest.mark.parametrize("sort_type", UNSTABLE_SORT_TYPES) def test_arr_basic_axis_sort_unstable(self, size, sort_type): # have to guarantee unique values in input - # see https://github.com/nv-legate/cunumeric/issues/782 + # see https://github.com/nv-legate/cupynumeric/issues/782 arr_np = np.arange(np.prod(size)) np.random.shuffle(arr_np) arr_np = arr_np.reshape(size) diff --git a/tests/integration/test_array.py b/tests/integration/test_array.py index 43854a42df..6029f8a1c9 100755 --- a/tests/integration/test_array.py +++ b/tests/integration/test_array.py @@ -17,7 +17,7 @@ import pytest from legate.core import LEGATE_MAX_DIM -import cunumeric as num +import cupynumeric as num SCALARS = ( 0, @@ -61,7 +61,7 @@ def test_array_basic(obj): @pytest.mark.parametrize("obj", UNSUPPORTED_OBJECTS) def test_array_unsupported(obj): - with pytest.raises(TypeError, match="cuNumeric does not support dtype"): + with pytest.raises(TypeError, match="cuPyNumeric does not support dtype"): 
num.array(obj) @@ -153,7 +153,7 @@ def test_asarray_basic(obj): @pytest.mark.parametrize("obj", UNSUPPORTED_OBJECTS) def test_asarray_unsupported(obj): - with pytest.raises(TypeError, match="cuNumeric does not support dtype"): + with pytest.raises(TypeError, match="cuPyNumeric does not support dtype"): num.array(obj) diff --git a/tests/integration/test_array_creation.py b/tests/integration/test_array_creation.py index 65e9b3c821..a015522dad 100644 --- a/tests/integration/test_array_creation.py +++ b/tests/integration/test_array_creation.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_array(): @@ -195,6 +195,11 @@ def test_func_like(fn, x_np, dtype, shape): @pytest.mark.parametrize("x_np, dtype", DATA_ARGS) @pytest.mark.parametrize("shape", SHAPE_ARG) def test_full_like(x_np, dtype, value, shape): + if np.dtype(dtype).itemsize == 1 and value > 255: + with pytest.raises(OverflowError): + num.full_like(x_np, value, dtype=dtype, shape=shape) + return + shape = shape if shape is None else x_np.reshape(shape).shape x = num.array(x_np) diff --git a/tests/integration/test_array_dunders.py b/tests/integration/test_array_dunders.py index 83e4c2a5ec..c7a3a53a93 100644 --- a/tests/integration/test_array_dunders.py +++ b/tests/integration/test_array_dunders.py @@ -17,7 +17,7 @@ import pytest from numpy.lib import NumpyVersion -import cunumeric as num +import cupynumeric as num arr_np = np.eye(4) vec_np = np.arange(4).astype(np.float64) diff --git a/tests/integration/test_array_equal.py b/tests/integration/test_array_equal.py index bb298eec57..71d2c88f3c 100755 --- a/tests/integration/test_array_equal.py +++ b/tests/integration/test_array_equal.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize( @@ -86,7 +86,7 @@ def test_equal_values_with_different_dtype(dtype1, dtype2): def test_equal_nan_basic(arr, equal_nan): # If equal_nan is 
True, # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError res_np = np.array_equal(arr, arr, equal_nan=equal_nan) res_num = num.array_equal(arr, arr, equal_nan=equal_nan) assert res_np == res_num @@ -98,7 +98,7 @@ def test_equal_nan_basic(arr, equal_nan): def test_equal_nan_complex_values(equal_nan): # If equal_nan is True, # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError a = np.array([1, 1 + 1j]) b = a.copy() a.real = np.nan diff --git a/tests/integration/test_array_fallback.py b/tests/integration/test_array_fallback.py index 46e74a8faf..a20032ad6c 100644 --- a/tests/integration/test_array_fallback.py +++ b/tests/integration/test_array_fallback.py @@ -15,10 +15,10 @@ import pytest -import cunumeric as num +import cupynumeric as num -# ref: https://github.com/nv-legate/cunumeric/pull/430 +# ref: https://github.com/nv-legate/cupynumeric/pull/430 def test_unimplemented_method_self_fallback(): ones = num.ones((10,)) ones.mean() @@ -27,7 +27,7 @@ def test_unimplemented_method_self_fallback(): # to verify a behaviour of unimplemented ndarray method wrappers. 
If std # becomes implemeneted in the future, this assertion will start to fail, # and a new (unimplemented) ndarray method should be found to replace it - assert not ones.std._cunumeric.implemented + assert not ones.std._cupynumeric_metadata.implemented ones.std() diff --git a/tests/integration/test_array_split.py b/tests/integration/test_array_split.py index 6b73c16813..29c52d1d41 100644 --- a/tests/integration/test_array_split.py +++ b/tests/integration/test_array_split.py @@ -19,7 +19,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num DIM = 20 diff --git a/tests/integration/test_astype.py b/tests/integration/test_astype.py index fe5aa3068f..da0ed1c7a1 100644 --- a/tests/integration/test_astype.py +++ b/tests/integration/test_astype.py @@ -17,7 +17,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num TEST_VECTOR = [0, 0, 1, 2, 3, 0, 1, 2, 3] ALL_BUT_COMPLEX = ["?", "b", "h", "i", "l", "B", "H", "I", "L", "e", "f", "d"] @@ -120,14 +120,14 @@ def test_complex_negative(src_dtype): out_np = in_np.astype(to_dtype("?")) out_num = in_num.astype(to_dtype("?")) - # Numpy and cuNumeric have different performance. - # For complex data 0.+1.j, Numpy set as True, cuNumeric set as False. + # Numpy and cuPyNumeric have different performance. + # For complex data 0.+1.j, Numpy set as True, cuPyNumeric set as False. 
assert np.array_equal(out_num, out_np) def test_default_copy_value(): # it was decided to explicitly diverge from the numpy default value in - # https://github.com/nv-legate/cunumeric.internal/issues/421 + # https://github.com/nv-legate/cupynumeric.internal/issues/421 a = num.array([]) assert inspect.signature(a.astype).parameters["copy"].default is False diff --git a/tests/integration/test_atleast_nd.py b/tests/integration/test_atleast_nd.py index cac98ad722..f499e7f84f 100644 --- a/tests/integration/test_atleast_nd.py +++ b/tests/integration/test_atleast_nd.py @@ -15,14 +15,13 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM -from utils.utils import check_module_function +from utils.utils import MAX_DIM_RANGE, check_module_function -import cunumeric as num +import cupynumeric as num DIM = 10 -SIZE_CASES = list((DIM,) * ndim for ndim in range(LEGATE_MAX_DIM + 1)) +SIZE_CASES = list((DIM,) * ndim for ndim in MAX_DIM_RANGE) SIZE_CASES += [ (0,), # empty array @@ -34,7 +33,7 @@ @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_1d(size): a = [np.arange(np.prod(size)).reshape(size)] - print_msg = f"np & cunumeric.atleast_1d(size={size})" + print_msg = f"np & cupynumeric.atleast_1d(size={size})" check_module_function("atleast_1d", a, {}, print_msg) @@ -46,7 +45,7 @@ def test_atleast_1d_scalar(): @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_2d(size): a = [np.arange(np.prod(size)).reshape(size)] - print_msg = f"np & cunumeric.atleast_2d(size={size})" + print_msg = f"np & cupynumeric.atleast_2d(size={size})" check_module_function("atleast_2d", a, {}, print_msg) @@ -58,7 +57,7 @@ def test_atleast_2d_scalar(): @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_3d(size): a = [np.arange(np.prod(size)).reshape(size)] - print_msg = f"np & cunumeric.atleast_3d(size={size})" + print_msg = f"np & cupynumeric.atleast_3d(size={size})" check_module_function("atleast_3d", a, {}, print_msg) @@ 
-73,7 +72,7 @@ def test_atleast_nd(dim): a = list(np.arange(np.prod(size)).reshape(size) for size in SIZE_CASES) scalar = 10.0 a.append(scalar) - print_msg = f"np & cunumeric.atleast_{dim}d(size={SIZE_CASES})" + print_msg = f"np & cupynumeric.atleast_{dim}d(size={SIZE_CASES})" check_module_function(f"atleast_{dim}d", a, {}, print_msg) diff --git a/tests/integration/test_average.py b/tests/integration/test_average.py index e8ff4934da..6d7f8df943 100644 --- a/tests/integration/test_average.py +++ b/tests/integration/test_average.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num axes = [None, 0, 1, 2, (0, 1, 2)] diff --git a/tests/integration/test_binary_op_broadcast.py b/tests/integration/test_binary_op_broadcast.py index d779d5c343..303fe7856e 100644 --- a/tests/integration/test_binary_op_broadcast.py +++ b/tests/integration/test_binary_op_broadcast.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num N = 20 diff --git a/tests/integration/test_binary_op_complex.py b/tests/integration/test_binary_op_complex.py index b9263bd170..48d12e7d78 100644 --- a/tests/integration/test_binary_op_complex.py +++ b/tests/integration/test_binary_op_complex.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num xn = np.array([1 + 4j, 2 + 5j, 3 + 6j], np.complex64) yn = np.array([4 + 7j, 5 + 8j, 6 + 9j], np.complex64) diff --git a/tests/integration/test_binary_op_typing.py b/tests/integration/test_binary_op_typing.py index e78d432634..d4612b5671 100644 --- a/tests/integration/test_binary_op_typing.py +++ b/tests/integration/test_binary_op_typing.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def value_type(obj): @@ -87,7 +87,7 @@ def generate_array_array_cases(): # the code somewhat compatible with NumPy for cases where Python scalars # are passed. 
# -# If anyone can do a better job than me and finally make cuNumeric +# If anyone can do a better job than me and finally make cuPyNumeric # implement the same typing rules, please put these tests back. def generate_array_scalar_cases(): for idx, lhs_type in enumerate(TYPES): @@ -129,7 +129,7 @@ def test_array_array(lhs_np, rhs_np, lhs_num, rhs_num): print(f"LHS {lhs_np}") print(f"RHS {rhs_np}") - print(f"NumPy type: {out_np.dtype}, cuNumeric type: {out_num.dtype}") + print(f"NumPy type: {out_np.dtype}, cuPyNumeric type: {out_num.dtype}") assert out_np.dtype == out_num.dtype @@ -145,7 +145,7 @@ def test_array_scalar(lhs_np, rhs_np, lhs_num, rhs_num): print(f"LHS {lhs_np}") print(f"RHS {rhs_np}") - print(f"NumPy type: {out_np.dtype}, cuNumeric type: {out_num.dtype}") + print(f"NumPy type: {out_np.dtype}, cuPyNumeric type: {out_num.dtype}") assert out_np.dtype == out_num.dtype diff --git a/tests/integration/test_binary_ufunc.py b/tests/integration/test_binary_ufunc.py index c27f20d7df..9ae99edd6f 100644 --- a/tests/integration/test_binary_ufunc.py +++ b/tests/integration/test_binary_ufunc.py @@ -20,7 +20,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num def check_result(op, in_np, out_np, out_num): @@ -33,7 +33,7 @@ def check_result(op, in_np, out_np, out_num): getattr(out_np, "dtype", None) == getattr(out_num, "dtype", None) ) if not result: - print(f"cunumeric.{op} failed the test") + print(f"cupynumeric.{op} failed the test") print("Inputs:") for arr in in_np: print(arr) @@ -42,7 +42,7 @@ def check_result(op, in_np, out_np, out_num): print("NumPy output:") print(out_np) print(f"dtype: {out_np.dtype}") - print("cuNumeric output:") + print("cuPyNumeric output:") print(out_num) print(f"dtype: {out_num.dtype}") assert False @@ -70,7 +70,7 @@ def check_op(op, in_np, out_dtype="D"): check_result(op, in_np, out_np, out_num) - # Ask cuNumeric to produce outputs to NumPy ndarrays + # Ask cuPyNumeric to produce 
outputs to NumPy ndarrays out_num = np.empty(out_np.shape, dtype=out_dtype) op_num(*in_num, out=out_num) @@ -297,7 +297,7 @@ def test_bit_ops_arr_scalar(op) -> None: check_op(op, (arrs[0], scalars[0])) check_op(op, (arrs[0], scalars[1])) check_op(op, (arrs[0], scalars[2])) - # Cunumeric << and >> have problems with python integers: + # cuPyNumeric << and >> have problems with python integers: # check_op(op, (scalars[0], arrs[0])) check_op(op, (scalars[1], arrs[0])) check_op(op, (scalars[2], arrs[0])) diff --git a/tests/integration/test_bincount.py b/tests/integration/test_bincount.py index d382d1a702..9f61f43ad6 100644 --- a/tests/integration/test_bincount.py +++ b/tests/integration/test_bincount.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num N = 8000 MAX_VAL = 9 diff --git a/tests/integration/test_bits.py b/tests/integration/test_bits.py index 40882706ee..f8e13da18a 100644 --- a/tests/integration/test_bits.py +++ b/tests/integration/test_bits.py @@ -16,16 +16,16 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestPackbits(object): def test_none_arr(self): # Numpy raises "TypeError: # Expected an input array of integer or boolean data type" - # For cuNumeric raises: + # For cuPyNumeric raises: # > if a.dtype.kind not in ("u", "i", "b"): # E AttributeError: 'NoneType' object has no attribute 'dtype' with pytest.raises(AttributeError): @@ -50,7 +50,7 @@ def test_bitorder_negative(self, bitorder): in_num = num.random.randint(low=0, high=2, size=shape, dtype="i") # when bitorder is 1 or True, Numpy raises # "TypeError: pack() argument 3 must be str". - # while cuNumeric raises valueError. + # while cuPyNumeric raises valueError. 
with pytest.raises(ValueError): num.packbits(in_num, bitorder=bitorder) @@ -64,7 +64,7 @@ def test_arr(self, arr, dtype, bitorder): out_num = num.packbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", ("B", "i", "?")) @pytest.mark.parametrize("bitorder", ("little", "big")) def test_common(self, ndim, dtype, bitorder): @@ -76,7 +76,7 @@ def test_common(self, ndim, dtype, bitorder): out_num = num.packbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", ("B", "i", "?")) @pytest.mark.parametrize("bitorder", ("little", "big")) def test_axis(self, ndim, dtype, bitorder): @@ -94,7 +94,7 @@ class TestUnpackbits(object): def test_none_arr(self): # Numpy raises "TypeError: # TypeError: Expected an input array of unsigned byte data type - # For cuNumeric raises: + # For cuPyNumeric raises: # > if a.dtype != "B": # E AttributeError: 'NoneType' object has no attribute 'dtype' with pytest.raises(AttributeError): @@ -121,7 +121,7 @@ def test_bitorder_negative(self, bitorder): in_num = num.array(in_np) # when bitorder is 1 or True, Numpy raises # "TypeError: unpack() argument 4 must be str". - # while cuNumeric raises valueError. + # while cuPyNumeric raises valueError. 
with pytest.raises(ValueError): num.unpackbits(in_num, bitorder=bitorder) @@ -145,7 +145,7 @@ def test_arr(self, arr, bitorder): out_num = num.unpackbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("bitorder", ("little", "big")) def test_common(self, ndim, bitorder): shape = (5,) * ndim @@ -157,7 +157,7 @@ def test_common(self, ndim, bitorder): assert np.array_equal(out_np, out_num) @pytest.mark.parametrize("count", (-9, 4, -1, 0, 4, 8, 9)) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("bitorder", ("little", "big")) def test_count(self, ndim, count, bitorder): shape = (5,) * ndim @@ -168,7 +168,7 @@ def test_count(self, ndim, count, bitorder): out_num = num.unpackbits(in_num, count=count, bitorder=bitorder) assert np.array_equal(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("bitorder", ("little", "big")) def test_axis(self, ndim, bitorder): shape = (5,) * ndim @@ -180,7 +180,7 @@ def test_axis(self, ndim, bitorder): out_num = num.unpackbits(in_num, axis=axis, bitorder=bitorder) assert np.array_equal(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("bitorder", ("little", "big")) @pytest.mark.parametrize("count", (-2, 0, 2, 5)) def test_axis_count(self, ndim, bitorder, count): @@ -198,7 +198,7 @@ def test_axis_count(self, ndim, bitorder, count): assert np.array_equal(out_np, out_num) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("bitorder", ("little", "big")) 
@pytest.mark.parametrize("dtype", ("B", "i", "?")) def test_pack_unpack(ndim, bitorder, dtype): diff --git a/tests/integration/test_block.py b/tests/integration/test_block.py index 326b18e518..7692af1f3f 100644 --- a/tests/integration/test_block.py +++ b/tests/integration/test_block.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num def _deepen(depth, x): @@ -58,7 +58,7 @@ def test_block_simple_row_wise(self): arg = [a_2d, b_2d] print_msg = ( - f"np & cunumeric.block([array({a_2d.shape}), " + f"np & cupynumeric.block([array({a_2d.shape}), " f"array({b_2d.shape})])" ) check_module_function("block", [arg], {}, print_msg) @@ -69,7 +69,7 @@ def test_block_simple_column_wise(self): arg = [[a_2d], [b_2d]] print_msg = ( - f"np & cunumeric.block([[array({a_2d.shape})], " + f"np & cupynumeric.block([[array({a_2d.shape})], " f"[array({b_2d.shape})]])" ) check_module_function("block", [arg], {}, print_msg) @@ -80,7 +80,7 @@ def test_block_with_1d_arrays_multiple_rows(self): arg = [[a, b], [a, b]] print_msg = ( - f"np & cunumeric.block([[array({a.shape}), array({b.shape})], " + f"np & cupynumeric.block([[array({a.shape}), array({b.shape})], " f"[array({a.shape}), array({b.shape})]])" ) check_module_function("block", [arg], {}, print_msg, check_type=False) @@ -91,7 +91,7 @@ def test_block_mixed_1d_and_2d(self): arg = [[a_2d], [b_1d]] print_msg = ( - f"np & cunumeric.block([[array({a_2d.shape})], " + f"np & cupynumeric.block([[array({a_2d.shape})], " f"[array({b_1d.shape})]])" ) check_module_function("block", [arg], {}, print_msg) @@ -112,7 +112,7 @@ def test_block_complicated(self): [zero_2d], ] - print_msg = "np & cunumeric.block()" + print_msg = "np & cupynumeric.block()" check_module_function("block", [arg], {}, print_msg) def test_nested(self): @@ -164,7 +164,7 @@ def test_3d(self): ], ] - print_msg = "np & cunumeric.block()" + print_msg = "np & cupynumeric.block()" 
check_module_function("block", [arg], {}, print_msg, check_type=False) @@ -197,11 +197,11 @@ def test_mismatched_shape_3(self): def test_no_lists(self): # numpy: pass, output is np.array(1) - # cunumeric: raises TypeError, cunumeric doesn't support 0-D array + # cupynumeric: raises TypeError, cupynumeric doesn't support 0-D array # assert np.array_equal(num.block(1), np.array(1)) # numpy: pass, output is np.eye(3) - # cunumeric: pass, output is 1-D array: [1, 0, 0, 0, 1, 0, 0, 0, 1] + # cupynumeric: pass, output is 1-D array: [1, 0, 0, 0, 1, 0, 0, 0, 1] # assert np.array_equal(num.block(np.eye(3)), np.eye(3)) np.array_equal(num.block(num.eye(3)), [1, 0, 0, 0, 1, 0, 0, 0, 1]) @@ -235,7 +235,7 @@ def test_tuple(self): # TypeError: arrays is a tuple. Only lists can be used # to arrange blocks,and np.block does not allow implicit # conversion from tuple to ndarray. - # cunumeric: pass + # cupynumeric: pass np.array_equal(num.block(([1, 2], [3, 4])), [1, 2, 3, 4]) np.array_equal(num.block([(1, 2), (3, 4)]), [1, 2, 3, 4]) @@ -246,7 +246,7 @@ def test_different_ndims(self): c = 3 * np.ones((1, 1, 3)) # numpy: pass, output is np.array([[[1., 2., 2., 3., 3., 3.]]]) - # cunumeric: raises ValueError + # cupynumeric: raises ValueError with pytest.raises(ValueError, match=msg): num.block([a, b, c]) @@ -259,7 +259,7 @@ def test_different_ndims_depths(self): # numpy: pass,output is np.array([[[1., 2., 2.], # [3., 3., 3.], # [3., 3., 3.]]]) - # cunumeric: raises ValueError + # cupynumeric: raises ValueError with pytest.raises(ValueError, match=msg): num.block([[a, b], [c]]) diff --git a/tests/integration/test_broadcast.py b/tests/integration/test_broadcast.py index a051054aaf..586746189e 100644 --- a/tests/integration/test_broadcast.py +++ b/tests/integration/test_broadcast.py @@ -17,9 +17,9 @@ import pytest from legate.core import LEGATE_MAX_DIM -import cunumeric as num +import cupynumeric as num -DIM_CASES = [5, 40] +DIM_CASES = [5, 20] def _check_result(print_msg, err_arrs): @@ 
-29,11 +29,11 @@ def _check_result(print_msg, err_arrs): print_output += ( f"Attr, {err_arr[0]}\n" f"numpy result: {err_arr[1]}\n" - f"cunumeric_result: {err_arr[2]}\n" + f"cupynumeric_result: {err_arr[2]}\n" ) assert False, ( f"{print_output}" - f"cunumeric and numpy shows" + f"cupynumeric and numpy shows" f" different result\n" ) else: @@ -135,7 +135,7 @@ def _check(*args, params: list, routine: str): def gen_shapes(dim): base = (dim,) result = [base] - for i in range(1, LEGATE_MAX_DIM): + for i in range(1, min(4, LEGATE_MAX_DIM)): base = base + (1,) if i % 2 == 0 else base + (dim,) result.append(base) return result diff --git a/tests/integration/test_cholesky.py b/tests/integration/test_cholesky.py index e0f9d260e5..7f2dd5d102 100644 --- a/tests/integration/test_cholesky.py +++ b/tests/integration/test_cholesky.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num SIZES = [8, 9, 255, 512, 1024] diff --git a/tests/integration/test_clip.py b/tests/integration/test_clip.py index f583398dbb..fd9a317352 100644 --- a/tests/integration/test_clip.py +++ b/tests/integration/test_clip.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestClipErrors: @@ -28,7 +28,7 @@ def test_none_array(self): with pytest.raises(expected_exc): np.clip(None, a_min=0, a_max=0) with pytest.raises(expected_exc): - # cunumeric raises + # cupynumeric raises # AttributeError: 'NoneType' object has no attribute 'clip' num.clip(None, a_min=0, a_max=0) @@ -41,7 +41,7 @@ def test_value_none(self): # ValueError: One of max or min must be given np.clip(array, a_min=None, a_max=None) with pytest.raises(expected_exc): - # cunumeric raises: + # cupynumeric raises: # TypeError: int() argument must be a string, # a bytes-like object or a real number, 
not 'NoneType' num.clip(array, a_min=None, a_max=None) @@ -74,6 +74,22 @@ def test_empty_array(): assert np.array_equal(res_np, res_num) +def test_bool() -> None: + np.clip(True, a_min=1, a_max=1) + # Numpy returns 1 + # See https://github.com/nv-legate/cunumeric.internal/issues/491 + msg = r"Expected bytes or NumPy ndarray, but got " + with pytest.raises(ValueError, match=msg): + num.clip(True, a_min=1, a_max=1) + + +@pytest.mark.parametrize("v", (True, False)) +def test_bool_None(v: bool) -> None: + # Different Numpy versions error variously with both bounds None + res = num.clip(v, a_min=None, a_max=None) + assert np.array_equal(res, np.asarray(v)) + + @pytest.mark.xfail def test_amin_amax(): array = np.arange(0, 10) @@ -95,7 +111,7 @@ def test_amin_value(amin): # res_np is not match res_num # in Numpy, when one of a_min of a_max is float, # all data are marked as float, - # while in cunumeric, all datas are int. + # while in cupynumeric, all datas are int. # for example, amin = 5 # array = array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # res_np = array([5., 5., 5., 5., 5., 5., 6., 7., 8., 8.5]) @@ -111,7 +127,7 @@ def test_amin_complex(): # res_np = array([5. +5.j, 5. +5.j, 5. +5.j, 5. +5.j, 5. +5.j, # 5. +5.j, 6. +0.j, 7. +0.j, 8. 
+0.j, 8.5+0.j]) res_num = num.clip(array, a_min=amin, a_max=8.5) - # cunumeric raises: + # cupynumeric raises: # TypeError: int() argument must be a string, a bytes-like object # or a real number, not 'complex' assert np.array_equal(res_np, res_num) @@ -153,7 +169,7 @@ def test_out_np_array(): assert np.array_equal(out_np, out_num) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_basic(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -167,7 +183,7 @@ def test_basic(ndim): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_out(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) diff --git a/tests/integration/test_complex_ops.py b/tests/integration/test_complex_ops.py index e00de22e2c..20ffe77fd8 100644 --- a/tests/integration/test_complex_ops.py +++ b/tests/integration/test_complex_ops.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num ARRAYS = ( [1, 2, 3], @@ -69,7 +69,7 @@ def test_non_complex_array(dtype): def test_scalar(val): # e.g., np.array_equal(1.1, array(1.1)) # In numpy, it returns val as a scalar - # In cunumeric, it returns a 0-dim array(val) + # In cupynumeric, it returns a 0-dim array(val) assert np.array_equal(np.real(val), num.real(val)) assert np.array_equal(np.imag(val), num.imag(val)) @@ -79,7 +79,7 @@ def test_scalar(val): @pytest.mark.parametrize("real_val", ([7, 8, 9], 9)) def test_assignment(real_val, imag_val): # In numpy, x_np.real = real_val pass - # In cunumeric, it rasies AttributeError: can't set attribute + # In cupynumeric, it rasies AttributeError: can't set attribute arr = [1 + 4j, 2 + 5j, 3 + 6j] x_np = np.array(arr) x_num = num.array(x_np) diff --git a/tests/integration/test_compress.py b/tests/integration/test_compress.py index af466f7bf1..ce5a3a3652 100644 --- 
a/tests/integration/test_compress.py +++ b/tests/integration/test_compress.py @@ -15,16 +15,16 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.mark.xfail def test_none_array(): res_np = np.compress([0], None) # numpy return [] - # cuNumeric raises: + # cuPyNumeric raises: # AttributeError: 'NoneType' object has no attribute 'compress' res_num = num.compress([0], None) assert np.array_equal(res_np, res_num) @@ -33,7 +33,7 @@ def test_none_array(): @pytest.mark.xfail def test_empty_array(): res_np = np.compress([0], []) # numpy return [] - # cuNumeric raises: ValueError: + # cuPyNumeric raises: ValueError: # Shape mismatch: condition contains entries that are out of bounds res_num = num.compress([0], []) assert np.array_equal(res_np, res_num) @@ -79,14 +79,14 @@ def test_dtype_out1(): # for Numpy, it will raise TypeError: # "Cannot cast array data from dtype('float64') to dtype('int64') # according to the rule 'safe'". - # cuNumeric passed. + # cuPyNumeric passed. 
np.compress([True, True, True, True], a, out=out_np) num.compress([True, True, True, True], b, out=out_num) assert np.array_equal(out_np, out_num) def test_dtype_out2(): - # both Numpy and cuNumeric turn float into int + # both Numpy and cuPyNumeric turn float into int a = np.random.random((4,)) * 10 b = num.array(a) out_np = np.random.randint(1, 10, (4,)) @@ -104,7 +104,7 @@ def test_out_parameter(): out_num = np.random.randint(1, 5, (4,)) np.compress([True, True, True, True], a, 0, out_np) num.compress([True, True, True, True], b, 0, out_num) - # for cuNumeric, the last parameter 'out', + # for cuPyNumeric, the last parameter 'out', # it should be written as 'out=out_num' # otherwise it raises error assert np.array_equal(out_num, out_np) @@ -120,7 +120,7 @@ def test_bool_condition(): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_basic(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -134,7 +134,7 @@ def test_ndim_basic(ndim): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_axis(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -149,7 +149,7 @@ def test_ndim_axis(ndim): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_out(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) diff --git a/tests/integration/test_concatenate_stack.py b/tests/integration/test_concatenate_stack.py index d59fd47ce3..f476f19792 100644 --- a/tests/integration/test_concatenate_stack.py +++ b/tests/integration/test_concatenate_stack.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def run_test(arr, routine, input_size): @@ -56,8 +56,8 @@ def 
run_test(arr, routine, input_size): assert is_equal, ( f"Failed, {print_msg}\n" f"numpy result: {err_arr[0]}, {b.shape}\n" - f"cunumeric_result: {err_arr[1]}, {c.shape}\n" - f"cunumeric and numpy shows" + f"cupynumeric_result: {err_arr[1]}, {c.shape}\n" + f"cupynumeric and numpy shows" f" different result\n" f"array({arr})," f"routine: {routine}," @@ -65,7 +65,7 @@ def run_test(arr, routine, input_size): ) print( f"Passed, {print_msg}, np: ({b.shape}, {b.dtype})" - f", cunumeric: ({c.shape}, {c.dtype}" + f", cupynumeric: ({c.shape}, {c.dtype}" ) diff --git a/tests/integration/test_contains.py b/tests/integration/test_contains.py index 08ab23dc8c..0e392e51ba 100644 --- a/tests/integration/test_contains.py +++ b/tests/integration/test_contains.py @@ -18,7 +18,7 @@ import pytest from utils.generators import mk_seq_array -import cunumeric as num +import cupynumeric as num DIM = 128 NO_EMPTY_SIZES = [ diff --git a/tests/integration/test_convolve.py b/tests/integration/test_convolve.py index 687f11d62e..5410781425 100644 --- a/tests/integration/test_convolve.py +++ b/tests/integration/test_convolve.py @@ -13,16 +13,15 @@ # limitations under the License. 
# -import os - import numpy as np import pytest import scipy.signal as sig from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num +from cupynumeric.runtime import runtime -CUDA_TEST = os.environ.get("LEGATE_NEED_CUDA") == "1" +CUDA_TEST = runtime.num_gpus > 0 SHAPES = [(100,), (10, 10), (10, 10, 10), (32, 2, 32)] FILTER_SHAPES = [(5,), (3, 5), (3, 5, 3), (32, 1, 32)] @@ -80,7 +79,7 @@ def test_none(): expected_exc = TypeError with pytest.raises(expected_exc): num.convolve(None, None, mode="same") - # cuNumeric raises AttributeError + # cuPyNumeric raises AttributeError with pytest.raises(expected_exc): np.convolve(None, None, mode="same") @@ -166,7 +165,7 @@ def test_modes(mode): arr1 = num.random.random(shape) arr2 = num.random.random(shape) out_num = num.convolve(arr1, arr2, mode=mode) - # when mode!="same", cunumeric raises + # when mode!="same", cupynumeric raises # NotImplementedError: Need to implement other convolution modes out_np = np.convolve(arr1, arr2, mode=mode) assert allclose(out_num, out_np) @@ -188,11 +187,29 @@ def test_ndim(ndim): arr1 = num.random.random(shape) arr2 = num.random.random(shape) out_num = num.convolve(arr1, arr2, mode="same") - # cunumeric raises, NotImplementedError: 4-D arrays are not yet supported + # cupynumeric raises NotImplementedError: 4-D arrays are not yet supported out_np = np.convolve(arr1, arr2, mode="same") assert allclose(out_num, out_np) +@pytest.mark.parametrize( + "method", + ("auto", "direct", "fft"), +) +def test_methods(method): + shape = (5,) * 2 + arr1 = num.random.random(shape) + arr2 = num.random.random(shape) + out_num = num.convolve(arr1, arr2, mode="same", method=method) + out_np = sig.convolve(arr1, arr2, mode="same", method=method) + assert allclose(out_num, out_np) + + +def test_invalid_method(): + with pytest.raises(ValueError): + num.convolve([], [], mode="same", method="test") + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_copy.py 
b/tests/integration/test_copy.py index 76efb4f834..a0ff1b8ae5 100644 --- a/tests/integration/test_copy.py +++ b/tests/integration/test_copy.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_corner_quantiles.py b/tests/integration/test_corner_quantiles.py index 152bc5d84b..8538c06e02 100644 --- a/tests/integration/test_corner_quantiles.py +++ b/tests/integration/test_corner_quantiles.py @@ -19,7 +19,7 @@ from legate.core import LEGATE_MAX_DIM from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -116,8 +116,8 @@ def test_quantiles_w_output(str_method, axes, qs_arr, keepdims): q_out = num.zeros((*qs_arr.shape, *remaining_shape), dtype=float) # np_q_out = np.zeros((*qs_arr.shape, *remaining_shape), dtype=float) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) num.quantile( arr, qs_arr, axis=axes, out=q_out, method=str_method, keepdims=keepdims ) @@ -189,8 +189,8 @@ def test_quantiles_axis_none(str_method, qin_arr, keepdims): else: qs_arr = np.array(qin_arr) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.quantile( arr, qs_arr, diff --git a/tests/integration/test_data_interface.py b/tests/integration/test_data_interface.py index 7214aa0f1d..77ffaff713 100644 --- a/tests/integration/test_data_interface.py +++ b/tests/integration/test_data_interface.py @@ -15,8 +15,8 @@ import pytest -import cunumeric as num -from cunumeric._utils.array import SUPPORTED_DTYPES +import cupynumeric as num +from cupynumeric._utils.array import SUPPORTED_DTYPES DTYPES = SUPPORTED_DTYPES.keys() diff --git a/tests/integration/test_diag_indices.py b/tests/integration/test_diag_indices.py index 03659d44bb..d1fddb9557 100644 --- a/tests/integration/test_diag_indices.py +++ 
b/tests/integration/test_diag_indices.py @@ -15,9 +15,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE, TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize("n", [10, -10.5, -1]) @@ -27,7 +27,7 @@ def test_diag_indices_default_ndim(n): assert np.array_equal(a_np, a_num) -@pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_diag_indices_basic(ndim): a_np = np.diag_indices(10, ndim) a_num = num.diag_indices(10, ndim) @@ -72,7 +72,7 @@ def test_none_ndim(self): @pytest.mark.parametrize("size", [(5,), (0,)], ids=str) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) def test_diag_indices_from_basic(size, ndim): shape = size * ndim a = np.ones(shape, dtype=int) diff --git a/tests/integration/test_diff.py b/tests/integration/test_diff.py index 0644f5c9cc..8eeafd4cf5 100644 --- a/tests/integration/test_diff.py +++ b/tests/integration/test_diff.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize( @@ -37,6 +37,8 @@ ((5,), 6, 0, None, None), ((5, 5), 5, 1, None, None), ((5, 5), 6, 1, None, None), + ((5, 5), 6, 1, np.array(2), None), + ((5, 5), 6, 1, None, np.array(2)), ], ) def test_diff(args): @@ -54,12 +56,32 @@ def test_diff(args): assert allclose(res_np, res_cn) -def test_diff_nzero(): +def test_diff_nzero() -> None: a = num.ones(100) ad = num.diff(a, n=0) assert a is ad +def test_negative_time() -> None: + arr_np = np.random.random((5, 5)) + arr_num = num.array(arr_np) + msg = r"order must be non-negative but got -1" + with pytest.raises(ValueError, match=msg): + num.diff(arr_num, n=-1) + with pytest.raises(ValueError, match=msg): + np.diff(arr_np, n=-1) + + +def test_scalar() -> None: + arr_np = np.array(2) + arr_num = num.array(2) 
+ msg = "diff requires input that is at least one dimensional" + with pytest.raises(ValueError, match=msg): + np.diff(arr_np) + with pytest.raises(ValueError, match=msg): + num.diff(arr_num) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_digitize.py b/tests/integration/test_digitize.py index f7d524f2cf..194814e8d7 100644 --- a/tests/integration/test_digitize.py +++ b/tests/integration/test_digitize.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DTYPES = ( np.uint32, @@ -49,7 +49,7 @@ def test_bad_array(self): bins = [0, 5, 3] expected_exc = ValueError with pytest.raises(expected_exc): - # cunumeric raises TypeError + # cupynumeric raises TypeError num.digitize(None, bins) with pytest.raises(expected_exc): np.digitize(None, bins) @@ -59,7 +59,7 @@ def test_bad_bins(self): a = [2, 3, 10, 9] expected_exc = ValueError with pytest.raises(expected_exc): - # cunumeric raises TypeError + # cupynumeric raises TypeError num.digitize(a, None) with pytest.raises(expected_exc): np.digitize(a, None) diff --git a/tests/integration/test_dot.py b/tests/integration/test_dot.py index e3b775145e..49c1762900 100644 --- a/tests/integration/test_dot.py +++ b/tests/integration/test_dot.py @@ -14,16 +14,16 @@ # import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.contractions import check_default from utils.generators import mk_0to1_array +from utils.utils import MAX_DIM_RANGE -import cunumeric as num -from cunumeric._utils.linalg import dot_modes +import cupynumeric as num +from cupynumeric._utils.linalg import dot_modes -@pytest.mark.parametrize("b_ndim", range(LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("a_ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("b_ndim", MAX_DIM_RANGE) +@pytest.mark.parametrize("a_ndim", MAX_DIM_RANGE) def test_dot(a_ndim, b_ndim): name = f"dot({a_ndim} x {b_ndim})" modes = dot_modes(a_ndim, b_ndim) @@ -64,7 +64,7 @@ def 
test_out_invalid_shape(self, shape): ) def test_out_invalid_dtype(self, dtype): # In Numpy, for np.float32 and np.int64, it raises ValueError - # In cuNumeric, + # In cuPyNumeric, # for np.float32, it pass # for np.int64, it raises TypeError: Unsupported type: int64 out = num.zeros((5, 2), dtype=dtype) diff --git a/tests/integration/test_eig.py b/tests/integration/test_eig.py new file mode 100644 index 0000000000..943d46b1af --- /dev/null +++ b/tests/integration/test_eig.py @@ -0,0 +1,161 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest + +import cupynumeric as num + +SIZES = [ + (5, 5), + ( + 3, + 3, + ), + (2, 5, 5), + (12, 3, 3), + (1, 5, 5), + (3, 1, 1), + ( + 10, + 2, + 2, + ), + (1, 4, 4), + (1, 0, 0), + (1, 1, 1), +] + + +SIZES_4D = [ + (3, 2, 5, 5), + (1, 2, 5, 5), + (4, 1, 5, 5), + (2, 1, 0, 0), +] + + +def assert_individual(a, ew, ev): + assert num.linalg.norm(ev, ord=np.inf) > 0 + + ew_diag = num.array(np.diagflat(ew)) + a_ev = num.matmul(a, ev) + ev_ew = num.matmul(ev, ew_diag) + + if ev_ew.dtype is np.dtype(np.complex64): + rtol = 1e-02 + atol = 1e-04 + else: + rtol = 1e-05 + atol = 1e-08 + + assert num.allclose(a_ev, ev_ew, rtol=rtol, atol=atol) + + +def assert_result(a, ew, ev): + m = a.shape[-1] + if m == 0: + return + num_matrices = int(np.prod(a.shape) // (m * m)) + batch_view_a = a.reshape(num_matrices, m, m) + batch_view_ew = ew.reshape(num_matrices, m) + batch_view_ev = ev.reshape(num_matrices, m, m) + + for idx in range(num_matrices): + assert_individual( + batch_view_a[idx, :, :], + batch_view_ew[idx, :], + batch_view_ev[idx, :, :], + ) + + +class TestEig(object): + @pytest.mark.xfail + def test_arr_none(self): + res_np = np.linalg.eig( + None + ) # AxisError: axis -1 is out of bounds for array of dimension 0 + res_num = num.linalg.eig( + None + ) # AttributeError: 'NoneType' object has no attribute 'shape' + assert np.equal(res_np, res_num) + + @pytest.mark.xfail + @pytest.mark.parametrize("arr", ([], [[]], [[], []])) + def test_arr_empty(self, arr): + res_np = np.linalg.eig(arr) + res_num = num.linalg.eig(arr) + assert np.equal(res_np, res_num) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "arr", ([1], [[2]], [[2], [1]], [[[2], [1]], [[3], [4]]]) + ) + def atest_arr_dim_1(self, arr): + res_np = np.linalg.eig(arr) + res_num = num.linalg.eig(arr) + assert np.equal(res_np, res_num) + + @pytest.mark.parametrize("size", SIZES) + @pytest.mark.parametrize("dtype", (np.float32, np.float64)) + def test_arr_basic_real(self, size, 
dtype): + arr_np = np.random.randint(-100, 100, size).astype(dtype) + arr_num = num.array(arr_np) + ew, ev = num.linalg.eig(arr_num) + assert_result(arr_num, ew, ev) + + @pytest.mark.parametrize("size", SIZES) + @pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) + def test_arr_basic_complex(self, size, dtype): + arr_np = ( + np.random.randint(-100, 100, size) + + np.random.randint(-100, 100, size) * 1.0j + ).astype(dtype) + arr_num = num.array(arr_np) + ew, ev = num.linalg.eig(arr_num) + assert_result(arr_num, ew, ev) + + @pytest.mark.parametrize("size", SIZES) + @pytest.mark.parametrize("dtype", (np.int32, np.int64)) + def test_arr_basic_int(self, size, dtype): + arr_np = np.random.randint(-100, 100, size).astype(dtype) + arr_num = num.array(arr_np) + ew, ev = num.linalg.eig(arr_num) + assert_result(arr_num, ew, ev) + + @pytest.mark.parametrize("size", SIZES_4D) + @pytest.mark.parametrize("dtype", (np.float32, np.float64)) + def test_arr_4d_real(self, size, dtype): + arr_np = np.random.randint(-100, 100, size).astype(dtype) + arr_num = num.array(arr_np) + ew, ev = num.linalg.eig(arr_num) + assert_result(arr_num, ew, ev) + + @pytest.mark.parametrize("size", SIZES_4D) + @pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) + def test_arr_4d_complex(self, size, dtype): + arr_np = ( + np.random.randint(-100, 100, size) + + np.random.randint(-100, 100, size) * 1.0j + ).astype(dtype) + arr_num = num.array(arr_np) + ew, ev = num.linalg.eig(arr_num) + assert_result(arr_num, ew, ev) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_einsum.py b/tests/integration/test_einsum.py index f79033b1f7..3debf76749 100644 --- a/tests/integration/test_einsum.py +++ b/tests/integration/test_einsum.py @@ -22,7 +22,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array, permutes_to -import cunumeric as num +import cupynumeric as num # Limits for exhaustive 
expression generation routines MAX_MODES = 3 @@ -294,7 +294,7 @@ def test_expr_opposite(): # sum subscripts string, subscripts must be letters with pytest.raises(expected_exc): num.einsum("ik,kj=>ij", a, b) - # cuNumeric raises ValueError: Subscripts can only contain one '->' + # cuPyNumeric raises ValueError: Subscripts can only contain one '->' @pytest.mark.xfail @@ -309,6 +309,17 @@ def test_order(order): assert allclose(np_res, num_res) +def test_negative() -> None: + a = np.random.rand(256, 256) + b = np.random.rand(256, 256) + msg = r"invalid subscript" + with pytest.raises(ValueError, match=msg): + np.einsum("ik,1j->ij", a, b) + msg = r"Non-alphabetic mode labels" + with pytest.raises(NotImplementedError, match=msg): + num.einsum("ik,1j->ij", a, b) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_einsum_path.py b/tests/integration/test_einsum_path.py index 675ae44500..c5b59cb5da 100644 --- a/tests/integration/test_einsum_path.py +++ b/tests/integration/test_einsum_path.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num expr = "ij,jk,kl->il" np_a = np.empty((2, 2)) @@ -61,7 +61,7 @@ def test_einsum_path_optimize_opposite(optimize): path_num, _ = num.einsum_path( expr, num_a, num_b, num_c, optimize=optimize ) - # cuNumeric raises ValueError: einsum_path: unexpected value + # cuPyNumeric raises ValueError: einsum_path: unexpected value # for optimize: 2 @@ -71,7 +71,7 @@ def test_einsum_path_optimize_none(): path_np, _ = np.einsum_path(expr, np_a, np_b, np_c, optimize=optimize) # Numpy returns results path_num, _ = num.einsum_path(expr, num_a, num_b, num_c, optimize=optimize) - # cunumeric raises ValueError: einsum_path: unexpected value + # cupynumeric raises ValueError: einsum_path: unexpected value # for optimize: None assert path_np == path_num diff --git a/tests/integration/test_exp.py b/tests/integration/test_exp.py index 1b0fe195a0..af14b1f64a 100644 --- 
a/tests/integration/test_exp.py +++ b/tests/integration/test_exp.py @@ -16,9 +16,9 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num -# cunumeric.exp(*args: Any, out: Union[ndarray, None] = None, +# cupynumeric.exp(*args: Any, out: Union[ndarray, None] = None, # where: bool = True, casting: CastingKind = 'same_kind', # order: str = 'K', # dtype: Union[np.dtype[Any], None] = None, **kwargs: Any) → ndarray @@ -69,7 +69,7 @@ def test_casting_negative(casting): arr_np = np.array(arr_num) res_num = num.exp(arr_num, casting=casting) res_np = np.exp(arr_np, casting=casting) - # cuNumeric run successfully. + # cuPyNumeric run successfully. # Numpy raises " numpy.core._exceptions._UFuncInputCastingError: # Cannot cast ufunc 'exp' input from dtype('int64') to dtype('float64') # with casting rule 'no' @@ -98,7 +98,7 @@ def test_where_false(): arr_np = np.array(arr_num) np_out = np.ones(shape=shape) # Numpy get the results. - # cuNumeric raises "NotImplementedError: + # cuPyNumeric raises "NotImplementedError: # the 'where' keyword is not yet supported" num.exp(arr_num, where=False, out=num_out) np.exp(arr_np, where=False, out=np_out) diff --git a/tests/integration/test_expand_dims.py b/tests/integration/test_expand_dims.py index 043d209e78..ab69bea0ec 100644 --- a/tests/integration/test_expand_dims.py +++ b/tests/integration/test_expand_dims.py @@ -17,7 +17,7 @@ import pytest from utils.utils import AxisError -import cunumeric as num +import cupynumeric as num DIM = 5 SIZES = [ @@ -37,7 +37,7 @@ def test_none_array_compare(): res_num = num.expand_dims( None, 0 - ) # TypeError: cuNumeric does not support dtype=object + ) # TypeError: cuPyNumeric does not support dtype=object res_np = np.expand_dims(None, 0) # return array([None], dtype=object) assert np.array_equal(res_num, res_np, equal_nan=True) diff --git a/tests/integration/test_expm_sh.py b/tests/integration/test_expm_sh.py new file mode 100644 index 0000000000..6a41eeb6b6 --- 
/dev/null +++ b/tests/integration/test_expm_sh.py @@ -0,0 +1,195 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest +import scipy as sp +from utils.comparisons import allclose + +import cupynumeric as num + +SIZES = (4, 10) + +RTOL = { + np.dtype(np.float32): 1e-1, + np.dtype(np.complex64): 1e-1, + np.dtype(np.float64): 1e-5, + np.dtype(np.complex128): 1e-5, +} + +ATOL = { + np.dtype(np.float32): 1e-3, + np.dtype(np.complex64): 1e-3, + np.dtype(np.float64): 1e-8, + np.dtype(np.complex128): 1e-8, +} + + +def make_skew_hermitian( + n: int, min_v: float = 0.0, max_v: float = 100.0 +) -> np.ndarray: + num_off_d = int(n * (n - 1) / 2) + + np.random.seed(1729) + + r_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(num_off_d)], + dtype=np.dtype("float64"), + ) + + i_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(num_off_d)], + dtype=np.dtype("float64"), + ) + + d_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(n)], + dtype=np.dtype("float64"), + ) + + mat = np.zeros((n, n), dtype=np.dtype("complex64")) + + arr_index = 0 + for col in range(1, n): + for row in range(0, col): + mat[row, col] = r_array[arr_index] + i_array[arr_index] * 1.0j + mat[col, row] = -np.conjugate(mat[row, col]) + + arr_index = arr_index + 1 + + c_1 = col - 1 + mat[c_1, c_1] = d_array[c_1] * 1.0j + + mat[n - 1][n - 1] = d_array[n - 1] * 1.0j + + return mat + + +def 
check_skew_hermitian(A: np.ndarray) -> bool: + assert A.ndim == 2 + n = A.shape[0] + assert n == A.shape[1] + num_half_off_d = int(n * (n - 1) / 2) + + arr_off_d = np.array( + [A[i, j] + np.conjugate(A[j, i]) for i in range(n) for j in range(i)], + dtype=np.dtype("complex64"), + ) + + check_arr = np.zeros((num_half_off_d,), dtype=np.dtype("complex64")) + assert arr_off_d.size == num_half_off_d + + assert allclose( + arr_off_d, check_arr, atol=ATOL[A.dtype], check_dtype=False + ) + + assert np.all([np.real(A[k, k]) for k in range(n)] == np.zeros(n)) + return True + + +@pytest.mark.parametrize("n", SIZES) +@pytest.mark.parametrize("min_v", (0.0,)) +@pytest.mark.parametrize("max_v", (10.0,)) +def test_expm_rnd_sh_tensor_pade(n, min_v, max_v): + m = 3 + a = np.zeros(shape=(m, n, n), dtype=np.complex64) + for idx in np.ndindex(a.shape[:-2]): + a[idx] = make_skew_hermitian(n, min_v, max_v) + + # more info for debug purposes: + # (out_num, m, s) = num.linalg.expm_impl(a) + # + out_num = num.linalg.expm(a, method="pade") + out_s = sp.linalg.expm(a) + + rtol = RTOL[out_num.dtype] + atol = ATOL[out_num.dtype] + if n > 1024: + atol *= 20.0 + + tol_satisfied = allclose( + out_num, out_s, rtol=rtol, atol=atol, check_dtype=False + ) + + # scipy result may not be reliable, + # hence check which exp L2 norm is + # closer to unity: + # + if tol_satisfied == False: + for i in range(m): + # check diff in ||exp(A)||_2: + # + norm_exp_s = np.linalg.norm(out_s[i], ord=2) + norm_exp_num = np.linalg.norm(out_num[i], ord=2) + # + # conversion to string shows more decimals... 
+ # + print("external ||exp(A)|| = %s\n" % (str(norm_exp_s))) + print("cuPyNumeric ||exp(A)|| = %s\n" % (str(norm_exp_num))) + + assert np.abs(1.0 - norm_exp_num) <= np.abs(1.0 - norm_exp_s) + + assert True + + +@pytest.mark.parametrize("n", SIZES) +@pytest.mark.parametrize("min_v", (0.0,)) +@pytest.mark.parametrize("max_v", (10.0,)) +def test_expm_rnd_sh_tensor_taylor(n, min_v, max_v): + m = 3 + a = np.zeros(shape=(m, n, n), dtype=np.complex64) + for idx in np.ndindex(a.shape[:-2]): + a[idx] = make_skew_hermitian(n, min_v, max_v) + + # more info for debug purposes: + # (out_num, m, s) = num.linalg.expm_impl(a) + # + out_num = num.linalg.expm(a, method="taylor") + out_s = sp.linalg.expm(a) + + rtol = RTOL[out_num.dtype] + atol = ATOL[out_num.dtype] + if n > 1024: + atol *= 20.0 + + tol_satisfied = allclose( + out_num, out_s, rtol=rtol, atol=atol, check_dtype=False + ) + + # scipy result may not be reliable, + # hence check which exp L2 norm is + # closer to unity: + # + if tol_satisfied == False: + for i in range(m): + # check diff in ||exp(A)||_2: + # + norm_exp_s = np.linalg.norm(out_s[i], ord=2) + norm_exp_num = np.linalg.norm(out_num[i], ord=2) + # + # conversion to string shows more decimals... + # + print("external ||exp(A)|| = %s\n" % (str(norm_exp_s))) + print("cuPyNumeric ||exp(A)|| = %s\n" % (str(norm_exp_num))) + + assert np.abs(1.0 - norm_exp_num) <= np.abs(1.0 - norm_exp_s) + + assert True + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_expm_sh_cpx_qr.py b/tests/integration/test_expm_sh_cpx_qr.py new file mode 100644 index 0000000000..934277d242 --- /dev/null +++ b/tests/integration/test_expm_sh_cpx_qr.py @@ -0,0 +1,163 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np + +# import cupy as cp +# import cupyx.scipy.linalg as cpxl +import pytest +import scipy as sp +from utils.comparisons import allclose + +import cupynumeric as num + +SIZES = (4, 10, 50) + +RTOL = { + np.dtype(np.float32): 1e-1, + np.dtype(np.complex64): 1e-1, + np.dtype(np.float64): 1e-5, + np.dtype(np.complex128): 1e-5, +} + +ATOL = { + np.dtype(np.float32): 1e-3, + np.dtype(np.complex64): 1e-3, + np.dtype(np.float64): 1e-6, + np.dtype(np.complex128): 1e-6, +} + + +def make_skew_hermitian( + n: int, min_v: float = 0.0, max_v: float = 100.0 +) -> np.ndarray: + num_off_d = int(n * (n - 1) / 2) + + np.random.seed(1729) + + r_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(num_off_d)], + dtype=np.dtype("float64"), + ) + + i_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(num_off_d)], + dtype=np.dtype("float64"), + ) + + d_array = np.array( + [np.random.uniform(min_v, max_v) for k in range(n)], + dtype=np.dtype("float64"), + ) + + mat = np.zeros((n, n), dtype=np.dtype("complex64")) + + arr_index = 0 + for col in range(1, n): + for row in range(0, col): + mat[row, col] = r_array[arr_index] + i_array[arr_index] * 1.0j + mat[col, row] = -np.conjugate(mat[row, col]) + + arr_index = arr_index + 1 + + c_1 = col - 1 + mat[c_1, c_1] = d_array[c_1] * 1.0j + + mat[n - 1][n - 1] = d_array[n - 1] * 1.0j + + return mat + + +def check_skew_hermitian(A: np.ndarray) -> bool: + assert A.ndim == 2 + n = A.shape[0] + assert n == A.shape[1] + num_half_off_d = int(n * (n - 1) / 2) + + arr_off_d = np.array( + [A[i, j] 
+ np.conjugate(A[j, i]) for i in range(n) for j in range(i)], + dtype=np.dtype("complex64"), + ) + + check_arr = np.zeros((num_half_off_d,), dtype=np.dtype("complex64")) + assert arr_off_d.size == num_half_off_d + + assert allclose( + arr_off_d, check_arr, atol=ATOL[A.dtype], check_dtype=False + ) + + assert np.all([np.real(A[k, k]) for k in range(n)] == np.zeros(n)) + return True + + +@pytest.mark.parametrize("n", SIZES) +@pytest.mark.parametrize("min_v", (0.0,)) # 10.0) +@pytest.mark.parametrize("max_v", (2.0,)) # 100.0) +def test_expm_rnd_skew_h(n, min_v, max_v): + a = make_skew_hermitian(n, min_v, max_v) + check_skew_hermitian(a) + + # more info for debug purposes: + # (out_num, m, s) = num.linalg.expm_impl(a) + # + out_num = num.linalg.expm(a) + out_s = sp.linalg.expm(a) + + # cupy experiments: + # (keep this code for possible future use) + # + # a_cp = cp.asarray(a) + # out_cp = cpxl.expm(a_cp) + # out_s = cp.asnumpy(out_cp) + + rtol = RTOL[out_num.dtype] + atol = ATOL[out_num.dtype] + if n > 1024: + atol *= 20.0 + + print("\nexternal solver: %s\n" % (str(out_s))) + print("cuPyNumeric: %s\n" % (str(out_num))) + + tol_satisfied = allclose( + out_num, out_s, rtol=rtol, atol=atol, check_dtype=False + ) + + if tol_satisfied == False: + # check diff in ||exp(A)||_2: + # + norm_exp_s = np.linalg.norm(out_s, ord=2) + norm_exp_num = np.linalg.norm(out_num, ord=2) + # + # conversion to string shows more decimals... 
+ # + print("external ||exp(A)|| = %s\n" % (str(norm_exp_s))) + print("cuPyNumeric ||exp(A)|| = %s\n" % (str(norm_exp_num))) + assert np.abs(1.0 - norm_exp_num) <= np.abs(1.0 - norm_exp_s) + + (_, R) = np.linalg.qr(a) + min_abs_diag = np.min([np.abs(R[k, k]) for k in range(a.shape[0])]) + if min_abs_diag.item() < atol: + print("source matrix close to singular!") + assert False + + return + + assert True + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_extract.py b/tests/integration/test_extract.py index 5268a51f8c..a87f473169 100644 --- a/tests/integration/test_extract.py +++ b/tests/integration/test_extract.py @@ -17,7 +17,7 @@ import pytest from utils.generators import mk_seq_array -import cunumeric as num +import cupynumeric as num DIM = 5 SIZES = [ @@ -91,7 +91,7 @@ def test_negative_condition(con): @pytest.mark.xfail def test_complex_condition(): # when condition is complex type a+bj, - # if a==0, cuNumeric take it as 0, while Numpy take it as 1 + # if a==0, cuPyNumeric take it as 0, while Numpy take it as 1 a = np.array([1, 2, 3, 4]) b = num.array([1, 2, 3, 4]) condition = [1 + 2j, 2, 2, 5j] @@ -179,7 +179,7 @@ def test_place_basic(shape, vals): assert np.array_equal(arr_np, arr_num) -@pytest.mark.xfail(reason="cunumeric raises exception when vals is ndim") +@pytest.mark.xfail(reason="cupynumeric raises exception when vals is ndim") @pytest.mark.parametrize("vals", VALUES, ids=str) @pytest.mark.parametrize("ndim", range(2, DIM), ids=str) def test_place_vals_ndim(vals, ndim): @@ -194,7 +194,7 @@ # NumPy pass, array([[[2, 2, 2], [2, 2, 2]]]) np.place(arr_np, mask_np, vals_np) - # cuNumeric raises ValueError: vals array has to be 1-dimensional + # cuPyNumeric raises ValueError: vals array has to be 1-dimensional num.place(arr_num, mask_num, vals_num) assert np.array_equal(arr_np, arr_num) diff --git a/tests/integration/test_eye.py 
b/tests/integration/test_eye.py index 7d7bf43598..a1abca2791 100644 --- a/tests/integration/test_eye.py +++ b/tests/integration/test_eye.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num N = 5 KS = [0, -1, 1, -2, 2] @@ -26,27 +26,27 @@ @pytest.mark.parametrize("k", KS + [-N, N, -10 * N, 10 * N]) @pytest.mark.parametrize("M", [N, N + 1, N - 1, N * 10, 0]) def test_eye(M, k): - print_msg = f"np & cunumeric.eye({N},{M}, k={k})" + print_msg = f"np & cupynumeric.eye({N},{M}, k={k})" check_module_function("eye", [N, M], {"k": k}, print_msg) @pytest.mark.parametrize("dtype", [np.int32, np.float64, None], ids=str) @pytest.mark.parametrize("k", KS, ids=str) def test_square(k, dtype): - print_msg = f"np & cunumeric.eye({N},k={k},dtype={dtype})" + print_msg = f"np & cupynumeric.eye({N},k={k},dtype={dtype})" check_module_function("eye", [N], {"k": k, "dtype": dtype}, print_msg) def test_N_zero(): N = 0 - print_msg = f"np & cunumeric eye({N})" + print_msg = f"np & cupynumeric eye({N})" check_module_function("eye", [N], {}, print_msg) def test_M_zero(): N = 5 M = 0 - print_msg = f"np & cunumeric eye({N},{M})" + print_msg = f"np & cupynumeric eye({N},{M})" check_module_function("eye", [N, M], {}, print_msg) @@ -74,7 +74,7 @@ def testBadM(self): @pytest.mark.xfail def testBadK(self): # numpy: raises TypeError - # cunumeric: the error is found by legate, raises struct.error + # cupynumeric: the error is found by legate, raises struct.error with pytest.raises(TypeError): num.eye(5, k=0.0) diff --git a/tests/integration/test_fallback.py b/tests/integration/test_fallback.py index 885762993a..7d71f6aa0f 100644 --- a/tests/integration/test_fallback.py +++ b/tests/integration/test_fallback.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_array_equal(): @@ -33,7 +33,7 @@ def test_ufunc(): # methods. 
If logical_and.accumulate becomes implemented in the future, # this assertion will start to fail, and a new (unimplemented) ufunc method # should be found to replace it - assert not num.logical_and.accumulate._cunumeric.implemented + assert not num.logical_and.accumulate._cupynumeric_metadata.implemented out_num = num.logical_and.accumulate(in_num) out_np = np.logical_and.accumulate(in_np) diff --git a/tests/integration/test_fft_c2c.py b/tests/integration/test_fft_c2c.py index 16f4598946..6e5fb2125f 100644 --- a/tests/integration/test_fft_c2c.py +++ b/tests/integration/test_fft_c2c.py @@ -18,11 +18,16 @@ from utils.comparisons import allclose as _allclose from utils.generators import mk_0to1_array -import cunumeric as num +import cupynumeric as num -def allclose(A, B): - if B.dtype == np.float32 or B.dtype == np.complex64: +def allclose(A: np.ndarray, B: np.ndarray) -> bool: + if ( + B.dtype == np.float32 + or B.dtype == np.float64 + or B.dtype == np.complex64 + or B.dtype == np.complex128 + ): l2 = (A - B) * np.conj(A - B) l2 = np.sqrt(np.sum(l2) / np.sum(A * np.conj(A))) return l2 < 1e-6 @@ -247,7 +252,7 @@ def test_4d(): pytest.param(np.uint64, marks=pytest.mark.xfail), pytest.param(np.float16, marks=pytest.mark.xfail), # NumPy accepts the dtypes - # cuNumeric raises + # cuPyNumeric raises # TypeError: FFT input not supported (missing a conversion?) 
), ids=str, diff --git a/tests/integration/test_fft_c2r.py b/tests/integration/test_fft_c2r.py index 861977b59d..2fa4aefc0d 100644 --- a/tests/integration/test_fft_c2r.py +++ b/tests/integration/test_fft_c2r.py @@ -17,11 +17,16 @@ import pytest from utils.comparisons import allclose as _allclose -import cunumeric as num +import cupynumeric as num -def allclose(A, B): - if B.dtype == np.float32 or B.dtype == np.complex64: +def allclose(A: np.ndarray, B: np.ndarray) -> bool: + if ( + B.dtype == np.float32 + or B.dtype == np.float64 + or B.dtype == np.complex64 + or B.dtype == np.complex128 + ): l2 = (A - B) * np.conj(A - B) l2 = np.sqrt(np.sum(l2) / np.sum(A * np.conj(A))) return l2 < 1e-6 @@ -36,6 +41,8 @@ def check_1d_c2r(N, dtype=np.float64): all_kwargs = ( {}, {"norm": "forward"}, + {"norm": "ortho"}, + {"norm": "backward"}, {"n": N // 2}, {"n": N // 2 + 1}, {"n": N * 2}, @@ -48,6 +55,11 @@ def check_1d_c2r(N, dtype=np.float64): out_num = num.fft.irfft(Z_num, **kwargs) assert allclose(out, out_num) + out = np.fft.hfft(Z, **kwargs) + out_num = num.fft.hfft(Z_num, **kwargs) + assert allclose(out, out_num) + assert allclose(Z, Z_num) + # Odd types out = np.fft.rfft(Z.real) out_num = num.fft.rfft(Z_num.real) @@ -68,6 +80,8 @@ def check_2d_c2r(N, dtype=np.float64): all_kwargs = ( {}, {"norm": "forward"}, + {"norm": "ortho"}, + {"norm": "backward"}, {"s": (N[0] // 2, N[1] - 2)}, {"s": (N[0] + 1, N[0] + 2)}, {"s": (N[0] // 2 + 1, N[0] + 2)}, @@ -211,6 +225,33 @@ def test_4d(): check_4d_c2r(N=(6, 12, 10, 8), dtype=np.float32) +def test_1d_int() -> None: + Z = np.random.randint(1, 10, size=8) + Z_num = num.array(Z) + msg = r"Data type for FFT not supported" + with pytest.raises(TypeError, match=msg): + num.fft.rfft(Z_num) + msg = r"Data type for FFT not supported" + with pytest.raises(TypeError, match=msg): + num.fft.irfft(Z_num) + msg = r"Data type for FFT not supported" + with pytest.raises(TypeError, match=msg): + num.fft.ihfft(Z_num) + + +def test_norm_invalid() -> 
None: + Z = ( + np.random.rand(8).astype(np.float64) + + np.random.rand(8).astype(np.float64) * 1j + ) + Z_num = num.array(Z) + msg = r"Invalid norm value" + with pytest.raises(ValueError, match=msg): + np.fft.rfft(Z, norm="other") + with pytest.raises(ValueError, match=msg): + num.fft.rfft(Z_num, norm="other") + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_fft_hermitian.py b/tests/integration/test_fft_hermitian.py index 623e6ec31f..8431e6b82f 100644 --- a/tests/integration/test_fft_hermitian.py +++ b/tests/integration/test_fft_hermitian.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose as _allclose -import cunumeric as num +import cupynumeric as num def allclose(A, B): diff --git a/tests/integration/test_fft_r2c.py b/tests/integration/test_fft_r2c.py index 408b0de2fe..38e9a31023 100644 --- a/tests/integration/test_fft_r2c.py +++ b/tests/integration/test_fft_r2c.py @@ -17,11 +17,16 @@ import pytest from utils.comparisons import allclose as _allclose -import cunumeric as num +import cupynumeric as num -def allclose(A, B): - if B.dtype == np.float32 or B.dtype == np.complex64: +def allclose(A: np.ndarray, B: np.ndarray) -> bool: + if ( + B.dtype == np.float32 + or B.dtype == np.float64 + or B.dtype == np.complex64 + or B.dtype == np.complex128 + ): l2 = (A - B) * np.conj(A - B) l2 = np.sqrt(np.sum(l2) / np.sum(A * np.conj(A))) return l2 < 1e-6 diff --git a/tests/integration/test_fftshift.py b/tests/integration/test_fftshift.py index ec30914a0f..d2262ef818 100644 --- a/tests/integration/test_fftshift.py +++ b/tests/integration/test_fftshift.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_fftshift_1d(): @@ -40,6 +40,13 @@ def test_fftshift_axis(): assert np.array_equal(a_num, a_np) +def test_fftshift_axis_int() -> None: + freqs = np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3) + a_np = np.fft.fftshift(freqs, axes=1) + a_num = num.fft.fftshift(freqs, 
axes=1) + assert np.array_equal(a_num, a_np) + + def test_ifftshift_1d(): freqs = np.fft.fftshift(np.fft.fftfreq(10, 0.1)) a_np = np.fft.ifftshift(freqs) @@ -64,6 +71,15 @@ def test_ifftshift_axis(): assert np.array_equal(a_num, a_np) +def test_ifftshift_axis_int() -> None: + freqs = np.fft.fftshift( + np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3), axes=(1,) + ) + a_np = np.fft.ifftshift(freqs, axes=1) + a_num = num.fft.ifftshift(freqs, axes=1) + assert np.array_equal(a_num, a_np) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_file.py b/tests/integration/test_file.py index 0f815a45cd..9df5281484 100644 --- a/tests/integration/test_file.py +++ b/tests/integration/test_file.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_load(): diff --git a/tests/integration/test_fill.py b/tests/integration/test_fill.py index 89cfde7a5a..6b29230e48 100644 --- a/tests/integration/test_fill.py +++ b/tests/integration/test_fill.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num INF_VALUES = [-np.inf, np.inf] FLOAT_FILL_VALUES = (-2.4e120, -1.3, 8.9e-130, 0.0, 5.7e-150, 0.6, 3.7e160) @@ -53,11 +53,11 @@ def test_fill_int_with_none(): a_np = np.full((2, 3), 1) a_num = num.array(a_np) # numpy fill with -9223372036854775808, - # while cunumeric raises TypeError + # while cupynumeric raises TypeError # # Update (wonchan): Numpy 1.23.3 no longer fills # the array with -9223372036854775808 on 'array.fill(None)' - # but raises the same exception as cuNumeric + # but raises the same exception as cuPyNumeric try: int(None) except TypeError as e: @@ -70,7 +70,7 @@ def test_fill_int_with_nan(): a_np = np.full((2, 3), 1) a_num = num.array(a_np) # numpy fill with -9223372036854775808, - # while cunumeric raises ValueError + # while cupynumeric raises ValueError msg = r"cannot convert float NaN to integer" with pytest.raises(ValueError, match=msg): 
a_num.fill(np.nan) @@ -81,7 +81,7 @@ def test_fill_inf_to_int(value: float) -> None: a_np = np.full((2, 3), 1) a_num = num.array(a_np) # numpy fill with -9223372036854775808, - # while cunumeric raises OverflowError + # while cupynumeric raises OverflowError msg = r"cannot convert float infinity to integer" with pytest.raises(OverflowError, match=msg): a_num.fill(value) @@ -110,7 +110,7 @@ def test_fill_float_to_int(value: float) -> None: a_np = np.full((2, 3), 1) a_num = num.array(a_np) # numpy fill with -9223372036854775808, - # while cunumeric raises OverflowError + # while cupynumeric raises OverflowError msg = r"Python int too large to convert to C long" with pytest.raises(OverflowError, match=msg): a_num.fill(value) diff --git a/tests/integration/test_fill_diagonal.py b/tests/integration/test_fill_diagonal.py index 7482fc4128..f384d03285 100644 --- a/tests/integration/test_fill_diagonal.py +++ b/tests/integration/test_fill_diagonal.py @@ -15,12 +15,11 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num -# cunumeric.fill_diagonal(a: ndarray, val: ndarray, wrap: bool = False) → None WRAP = [True, False] @@ -35,7 +34,7 @@ def test_wrap(wrap): assert np.array_equal(np_array, num_array) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) @pytest.mark.parametrize("val_shape", ((0,), (3,), (6,), (2, 2), (2, 2, 6))) @pytest.mark.parametrize("wrap", WRAP, ids=str) def test_basic(ndim, val_shape, wrap): @@ -121,7 +120,7 @@ def test_val_none(self): # a bytes-like object or a real number, not 'NoneType' with pytest.raises(expected_exc): num.fill_diagonal(num_array, val) - # cuNumeric raises AttributeError: + # cuPyNumeric raises AttributeError: # 'NoneType' object has no attribute 'size' diff --git a/tests/integration/test_flags.py 
b/tests/integration/test_flags.py index ae63f117ea..1995dd2e9b 100644 --- a/tests/integration/test_flags.py +++ b/tests/integration/test_flags.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DIM_CASE = (5, 5) @@ -113,6 +113,17 @@ def test_non_writeable(self): with pytest.raises(ValueError, match="not writeable"): arr[0, 0] = 12 + def test_flags(self) -> None: + arr = num.zeros(shape=DIM_CASE) + np_arr = np.zeros(shape=DIM_CASE) + arr.flags.writeable = True + np_arr.flags.writeable = True + assert arr.flags.writeable == np_arr.flags.writeable + + arr.flags.aligned = True + np_arr.flags.aligned = True + assert arr.flags.aligned == np_arr.flags.aligned + def test_cannot_make_nonwriteable_writeable(self): arr = num.zeros(shape=DIM_CASE) arr.flags["W"] = False diff --git a/tests/integration/test_flatten.py b/tests/integration/test_flatten.py index f143597e93..d571bee64b 100644 --- a/tests/integration/test_flatten.py +++ b/tests/integration/test_flatten.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_array_method -import cunumeric as num +import cupynumeric as num DIM = 10 @@ -39,7 +39,7 @@ @pytest.mark.parametrize("size", SIZES, ids=str) def test_basic(order, size): a = np.random.randint(low=0, high=100, size=size) - print_msg = f"np & cunumeric.ndarray.flatten({order})" + print_msg = f"np & cupynumeric.ndarray.flatten({order})" check_array_method(a, "flatten", [order], {}, print_msg) diff --git a/tests/integration/test_flip.py b/tests/integration/test_flip.py index 97e57ef26b..59cb85e2bd 100644 --- a/tests/integration/test_flip.py +++ b/tests/integration/test_flip.py @@ -16,10 +16,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM -from utils.utils import AxisError +from utils.utils import TWO_MAX_DIM_RANGE, AxisError -import cunumeric as num +import cupynumeric as num a = num.random.random((10, 10, 10)) AXES_1d = [-2, 0, 1, 2] @@ -148,7 +147,7 @@ def 
test_wrong_dim(self): @pytest.mark.parametrize("func_name", FLIP_FUNCS) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) def test_max_dims(func_name, ndim): func_np = getattr(np, func_name) func_num = getattr(num, func_name) diff --git a/tests/integration/test_floating.py b/tests/integration/test_floating.py index b68731ce27..4fd9856238 100644 --- a/tests/integration/test_floating.py +++ b/tests/integration/test_floating.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num SHAPES = [ (10, 20), diff --git a/tests/integration/test_get_item.py b/tests/integration/test_get_item.py index 6293737064..9cc49de438 100644 --- a/tests/integration/test_get_item.py +++ b/tests/integration/test_get_item.py @@ -15,7 +15,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_gradient.py b/tests/integration/test_gradient.py index d5b4804fb9..3d891a8166 100644 --- a/tests/integration/test_gradient.py +++ b/tests/integration/test_gradient.py @@ -16,9 +16,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import ONE_MAX_DIM_RANGE, TWO_MAX_DIM_RANGE -import cunumeric as cn +import cupynumeric as cn def test_gradient_with_scalar_dx(): @@ -38,7 +38,7 @@ def test_gradient_1d(): assert np.allclose(res_np, res_cn) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("edge_order", [1, 2]) def test_nd_arrays(ndim, edge_order): shape = (5,) * ndim @@ -53,7 +53,7 @@ def test_nd_arrays(ndim, edge_order): assert np.allclose(res_np, res_cn) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("varargs", [0.5, 1, 2, 0.3, 0]) def test_scalar_varargs(ndim, 
varargs): shape = (5,) * ndim @@ -66,7 +66,7 @@ def test_scalar_varargs(ndim, varargs): assert np.allclose(res_np, res_cn, equal_nan=True) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) def test_array_1d_varargs(ndim): shape = (5,) * ndim size = prod(shape) @@ -79,7 +79,7 @@ def test_array_1d_varargs(ndim): assert np.allclose(res_np, res_cn) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) def test_list_of_axes(ndim): shape = (5,) * ndim size = prod(shape) diff --git a/tests/integration/test_histogram.py b/tests/integration/test_histogram.py index 3764f7f51b..39eba1acbe 100644 --- a/tests/integration/test_histogram.py +++ b/tests/integration/test_histogram.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize( diff --git a/tests/integration/test_identity.py b/tests/integration/test_identity.py index f84e000d17..ec669cec8f 100644 --- a/tests/integration/test_identity.py +++ b/tests/integration/test_identity.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DTYPE_ALL = [ np.int8, diff --git a/tests/integration/test_index_routines.py b/tests/integration/test_index_routines.py index 405a6a2d5a..13c1670edf 100644 --- a/tests/integration/test_index_routines.py +++ b/tests/integration/test_index_routines.py @@ -19,10 +19,10 @@ import pytest from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array -from utils.utils import AxisError +from utils.utils import ONE_MAX_DIM_RANGE, TWO_MAX_DIM_RANGE, AxisError -import cunumeric as num -from cunumeric._thunk.eager import diagonal_reference +import cupynumeric as num +from cupynumeric._thunk.eager import diagonal_reference class TestChoose1d: @@ -87,7 +87,7 @@ def test_choose_2d(): ) -@pytest.mark.parametrize("ndim", 
range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_choose_target_ndim(ndim): tgt_shape = (5,) * ndim # try various shapes that broadcast to the target shape @@ -176,7 +176,7 @@ def test_choose_out(): num_a = mk_seq_array(num, shape_a) % shape_choices[0] num_a = num_a.astype( np.int32 - ) # cuNumeric would convert np.int32 to default type np.int64 + ) # cuPyNumeric would convert np.int32 to default type np.int64 np_choices = mk_seq_array(np, shape_choices) num_choices = mk_seq_array(num, shape_choices) np_aout = mk_seq_array(np, shape_a_out) - 10 @@ -191,7 +191,7 @@ def test_choose_out(): @pytest.mark.xfail def test_choose_mode_none(): # In Numpy, pass and returns array equals default mode - # In cuNumeric, raises ValueError: mode=None not understood. + # In cuPyNumeric, raises ValueError: mode=None not understood. # Must be 'raise', 'wrap', or 'clip' shape_choices = (3, 2, 4) shape_a = (2, 4) @@ -242,7 +242,7 @@ def test_a_invalid_shape(self, shape_a): @pytest.mark.xfail def test_a_none(self): # In Numpy, it raises TypeError - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'NoneType' object has no attribute 'choose' with pytest.raises(TypeError): num.choose(None, self.choices) @@ -255,7 +255,7 @@ def test_empty_choices(self): @pytest.mark.xfail def test_choices_none(self): # In Numpy, it raises TypeError - # In cuNumeric, it raises IndexError: tuple index out of range + # In cuPyNumeric, it raises IndexError: tuple index out of range with pytest.raises(TypeError): num.choose(self.a, None) @@ -343,7 +343,7 @@ def test_select(size): def test_select_maxdim(): - for ndim in range(2, LEGATE_MAX_DIM + 1): + for ndim in TWO_MAX_DIM_RANGE: a_shape = tuple(np.random.randint(1, 9) for i in range(ndim)) arr = mk_seq_array(np, a_shape) condlist_np = list() @@ -405,7 +405,7 @@ def test_diagonal(): assert np.array_equal(ad.diagonal(-1, 0, 2), num_ad.diagonal(-1, 0, 2)) # test diagonal - for ndim 
in range(2, LEGATE_MAX_DIM + 1): + for ndim in TWO_MAX_DIM_RANGE: a_shape = tuple(np.random.randint(1, 9) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) @@ -444,7 +444,7 @@ def test_diagonal(): def test_diagonal_offset(shape, k): # for shape=(5, 1) and k=1, 2, # for shape=(1, 5) and k=-1, -2, - # In cuNumeric, raise ValueError: 'offset' + # In cuPyNumeric, raise ValueError: 'offset' # for diag or diagonal must be in range # In Numpy, pass and returns empty array a = mk_seq_array(num, shape) @@ -462,7 +462,7 @@ def test_diagonal_offset(shape, k): ) def test_diagonal_empty_array(shape): # for shape=(3, 0) and k=0, - # In cuNumeric, raise ValueError: 'offset' + # In cuPyNumeric, raise ValueError: 'offset' # for diag or diagonal must be in range # In Numpy, pass and returns empty array a = mk_seq_array(num, shape) @@ -473,13 +473,13 @@ def test_diagonal_empty_array(shape): assert np.array_equal(b, bn) -@pytest.mark.xfail(reason="cuNumeric does not take single axis") +@pytest.mark.xfail(reason="cuPyNumeric does not take single axis") def test_diagonal_axis1(): shape = (3, 1, 2) a = mk_seq_array(num, shape) an = mk_seq_array(np, shape) - # cuNumeric hits AssertionError in _diag_helper: assert axes is not None + # cuPyNumeric hits AssertionError in _diag_helper: assert axes is not None b = num.diagonal(a, axis1=2) # NumPy passes bn = np.diagonal(an, axis1=2) @@ -504,7 +504,7 @@ def test_1d_array(self): @pytest.mark.xfail def test_array_none(self): - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'NoneType' object has no attribute 'diagonal' # In Numpy, it raises ValueError: # diag requires an array of at least two dimensions. 
@@ -518,7 +518,7 @@ def test_array_none(self): ) def test_axes_same(self, axes): # For axes = (0, -3), - # In cuNumeric, it raises ValueError: + # In cuPyNumeric, it raises ValueError: # axes must be the same size as ndim for transpose # In Numpy, it raises ValueError: axis1 and axis2 cannot be the same axis1, axis2 = axes @@ -532,7 +532,7 @@ def test_axes_same(self, axes): ) def test_axes_out_of_bound(self, axes): # In Numpy, it raises AxisError: is out of bounds - # In cuNumeric, it raises ValueError: + # In cuPyNumeric, it raises ValueError: # axes must be the same size as ndim for transpose axis1, axis2 = axes with pytest.raises(AxisError): @@ -541,14 +541,14 @@ def test_axes_out_of_bound(self, axes): @pytest.mark.xfail def test_axes_float(self): # In Numpy, it raise TypeError - # In cuNumeric, it raises AssertionError + # In cuPyNumeric, it raises AssertionError with pytest.raises(TypeError): num.diagonal(self.a, 0, 0.0, 1) @pytest.mark.xfail def test_axes_none(self): # In Numpy, it raise TypeError - # In cuNumeric, it raises AssertionError + # In cuPyNumeric, it raises AssertionError with pytest.raises(TypeError): num.diagonal(self.a, 0, None, 0) @@ -572,7 +572,7 @@ def test_n_axes_offset(self): ) def test_k_float(self, k): # for k=0.0, - # In cuNumeric, pass + # In cuPyNumeric, pass # In Numpy, raises TypeError: integer argument expected, got float with pytest.raises(TypeError): num.diagonal(self.a, k) @@ -596,7 +596,7 @@ def test_k_none(self): def test_diag(shape, k): # for shape=(5, 1) and k=1, 2, # for shape=(1, 5) and k=-1, -2, - # In cuNumeric, raise ValueError: + # In cuPyNumeric, raise ValueError: # 'offset' for diag or diagonal must be in range # In Numpy, pass and returns empty array a = mk_seq_array(num, shape) @@ -614,7 +614,7 @@ def test_diag(shape, k): ) def test_diag_empty_array(shape): # for shape=(3, 0) and k=0, - # In cuNumeric, raise ValueError: + # In cuPyNumeric, raise ValueError: # 'offset' for diag or diagonal must be in range # In 
Numpy, pass and returns empty array a = mk_seq_array(num, shape) @@ -640,7 +640,7 @@ def test_3d_array(self): @pytest.mark.xfail def test_array_none(self): - # In cuNumeric, it raises AttributeError, + # In cuPyNumeric, it raises AttributeError, # 'NoneType' object has no attribute 'ndim' # In Numpy, it raises ValueError, Input must be 1- or 2-d. with pytest.raises(ValueError): @@ -653,7 +653,7 @@ def test_array_none(self): ) def test_k_float(self, k): # for k=0.0, - # In cuNumeric, pass + # In cuPyNumeric, pass # In Numpy, raises TypeError: integer argument expected, got float shape = (3, 3) a = mk_seq_array(num, shape) @@ -686,6 +686,28 @@ def test_ix_(seqs): assert all(np.array_equal(*elts) for elts in zip(a, an)) +def test_ix_bool() -> None: + a = num.ix_([0, 1], [True]) + an = np.ix_([0, 1], [True]) + assert all(isinstance(elt, num.ndarray) for elt in a) + assert all(np.array_equal(*elts) for elts in zip(a, an)) + + +def test_ix_empty() -> None: + a = num.ix_([0, 1], []) + an = np.ix_([0, 1], []) + assert all(isinstance(elt, num.ndarray) for elt in a) + assert all(np.array_equal(*elts) for elts in zip(a, an)) + + +def test_ix_2d() -> None: + msg = r"Cross index must be 1 dimensional" + with pytest.raises(ValueError, match=msg): + num.ix_([0, 1], [[1, 2], [2, 3]]) + with pytest.raises(ValueError, match=msg): + np.ix_([0, 1], [[1, 2], [2, 3]]) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_indices.py b/tests/integration/test_indices.py index 5a8346bb1a..4de187dcd9 100644 --- a/tests/integration/test_indices.py +++ b/tests/integration/test_indices.py @@ -15,9 +15,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestIndicesErrors: @@ -47,7 +47,7 @@ def test_float_dimensions(self): def test_negative_tuple_dimensions(self): dimensions = (1, -1) # numpy raises: "ValueError: negative dimensions are not allowed" - 
# In cunumeric Eager Executions test, + # In cupynumeric Eager Executions test, # it raises "ValueError: negative dimensions are not allowed" # in other conditions, it raises # "ValueError: Invalid shape: Shape((2, 1, -1))" @@ -75,7 +75,7 @@ def test_indices_zero(self, dimensions): assert np.array_equal(np_res, num_res) - @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE[:-1]) def test_indices_basic(self, ndim): dimensions = tuple(np.random.randint(1, 5) for _ in range(ndim)) @@ -83,7 +83,7 @@ def test_indices_basic(self, ndim): num_res = num.indices(dimensions) assert np.array_equal(np_res, num_res) - @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE[:-1]) def test_indices_dtype_none(self, ndim): dimensions = tuple(np.random.randint(1, 5) for _ in range(ndim)) @@ -91,14 +91,14 @@ def test_indices_dtype_none(self, ndim): num_res = num.indices(dimensions, dtype=None) assert np.array_equal(np_res, num_res) - @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE[:-1]) def test_indices_dtype_float(self, ndim): dimensions = tuple(np.random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions, dtype=float) num_res = num.indices(dimensions, dtype=float) assert np.array_equal(np_res, num_res) - @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE[:-1]) def test_indices_sparse(self, ndim): dimensions = tuple(np.random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions, sparse=True) diff --git a/tests/integration/test_inlinemap-keeps-region-alive.py b/tests/integration/test_inlinemap-keeps-region-alive.py index 6f6bbf92c8..d7d7380abf 100644 --- a/tests/integration/test_inlinemap-keeps-region-alive.py +++ b/tests/integration/test_inlinemap-keeps-region-alive.py @@ -17,7 +17,7 @@ import pytest -import cunumeric as num +import 
cupynumeric as num def test_all(): diff --git a/tests/integration/test_inner.py b/tests/integration/test_inner.py index d1f27a12f9..982342e277 100644 --- a/tests/integration/test_inner.py +++ b/tests/integration/test_inner.py @@ -14,16 +14,16 @@ # import pytest -from legate.core import LEGATE_MAX_DIM from utils.contractions import check_default from utils.generators import mk_0to1_array +from utils.utils import MAX_DIM_RANGE -import cunumeric as num -from cunumeric._utils.linalg import inner_modes +import cupynumeric as num +from cupynumeric._utils.linalg import inner_modes -@pytest.mark.parametrize("b_ndim", range(LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("a_ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("b_ndim", MAX_DIM_RANGE) +@pytest.mark.parametrize("a_ndim", MAX_DIM_RANGE) def test_inner(a_ndim, b_ndim): name = f"inner({a_ndim} x {b_ndim})" modes = inner_modes(a_ndim, b_ndim) diff --git a/tests/integration/test_input_output.py b/tests/integration/test_input_output.py index 2e2b2f5947..409b9cc19d 100644 --- a/tests/integration/test_input_output.py +++ b/tests/integration/test_input_output.py @@ -20,7 +20,7 @@ import pytest from utils.generators import mk_0to1_array, mk_seq_array -import cunumeric as num +import cupynumeric as num def test_ndarray_dumps(): diff --git a/tests/integration/test_intra_array_copy.py b/tests/integration/test_intra_array_copy.py index 03b1976358..eb943f7235 100644 --- a/tests/integration/test_intra_array_copy.py +++ b/tests/integration/test_intra_array_copy.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def random_array(lib, ndim): @@ -137,7 +137,7 @@ def array_gen(lib, ndim): yield from full_overlap(lib, ndim) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def 
test_overlap(ndim): for np_arr, num_arr in zip(array_gen(np, ndim), array_gen(num, ndim)): assert np.array_equal(np_arr, num_arr) diff --git a/tests/integration/test_item.py b/tests/integration/test_item.py index a80bc070b9..a242ec790e 100644 --- a/tests/integration/test_item.py +++ b/tests/integration/test_item.py @@ -14,10 +14,10 @@ # import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import generate_item +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.mark.xfail @@ -32,7 +32,7 @@ def test_no_item(): # Numpy raises KeyError: 'invalid key' with pytest.raises(expected_exc): arr_num.item() - # cuNumeric raises ValueError: can only convert an array + # cuPyNumeric raises ValueError: can only convert an array # of size 1 to a Python scalar @@ -47,7 +47,7 @@ def test_out_of_bound(): # Numpy raises IndexError: index 10 is out of bounds for size 9 with pytest.raises(expected_exc): arr_num.item(10) - # cuNumeric returns some value + # cuPyNumeric returns some value @pytest.mark.xfail @@ -62,7 +62,7 @@ def test_out_of_index(): # for axis 1 with size 3 with pytest.raises(expected_exc): arr_num.item(2, 4) - # cuNumeric raises ValueError: Out-of-bounds projection on dimension 0 + # cuPyNumeric: ValueError: Out-of-bounds projection on dimension 0 # with index 4 for a store of shape Shape((3,)) @@ -76,7 +76,7 @@ def test_empty_no_item(): assert np.array_equal(res_np, res_num) -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_ndim(ndim): shape = (4,) * ndim arr_num = num.random.randint(0, 3, size=shape) diff --git a/tests/integration/test_itemset.py b/tests/integration/test_itemset.py index 283d976a8d..935c613a79 100644 --- a/tests/integration/test_itemset.py +++ b/tests/integration/test_itemset.py @@ -14,11 +14,11 @@ # import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators 
import generate_item +from utils.utils import MAX_DIM_RANGE -import cunumeric as num -from cunumeric._utils import is_np2 +import cupynumeric as num +from cupynumeric._utils import is_np2 # itemset was removed in numpy 2.0, skip the entire module if is_np2: @@ -38,7 +38,7 @@ def test_no_itemset(): # at least one argument with pytest.raises(expected_exc): arr_np.itemset() - # cuNumeric raises KeyError: 'itemset() requires + # cuPyNumeric raises KeyError: 'itemset() requires # at least one argument' @@ -55,7 +55,7 @@ def test_invalid_itemset(): # to a Python scalar with pytest.raises(expected_exc): arr_num.itemset(8) - # cuNumeric raises KeyError: 'invalid key' + # cuPyNumeric raises KeyError: 'invalid key' @pytest.mark.xfail @@ -69,7 +69,7 @@ def test_out_of_index(): # Numpy raises IndexError: index 10 is out of bounds for size 9 with pytest.raises(expected_exc): arr_num.itemset(10, 4) - # cuNumeric set the value of index 1 as 4 + # cuPyNumeric set the value of index 1 as 4 # Original array: # [[193 212 238] # [ 97 103 225] @@ -93,11 +93,11 @@ def test_tuple_out_of_index(): # for axis 1 with size 3 with pytest.raises(expected_exc): arr_num.itemset((2, 2), 4) - # cuNumeric raises ValueError: Out-of-bounds projection on + # cuPyNumeric raises ValueError: Out-of-bounds projection on # dimension 0 with index 3 for a store of shape Shape((3,)) -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_ndim(ndim): shape = (4,) * ndim arr_num = num.random.randint(0, 30, size=shape) diff --git a/tests/integration/test_jacobi.py b/tests/integration/test_jacobi.py index 82b4ff0b63..74fd679e45 100644 --- a/tests/integration/test_jacobi.py +++ b/tests/integration/test_jacobi.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_length.py b/tests/integration/test_length.py index 
c00157eeb4..683ae3f963 100644 --- a/tests/integration/test_length.py +++ b/tests/integration/test_length.py @@ -15,7 +15,7 @@ import pytest -import cunumeric as num +import cupynumeric as num LIST_X = [1, 2, 3] diff --git a/tests/integration/test_linspace.py b/tests/integration/test_linspace.py index 0937ac10f7..c1680123fb 100644 --- a/tests/integration/test_linspace.py +++ b/tests/integration/test_linspace.py @@ -19,7 +19,7 @@ import pytest from utils.generators import broadcasts_to, mk_seq_array -import cunumeric as num +import cupynumeric as num def equivalent_shapes_gen(shape): @@ -206,10 +206,10 @@ def test_empty_array_retstep(shape, endpoint): "axis", range(-3, 3), ids=lambda axis: f"(axis={axis})" ) def test_arrays_axis(axis, number): - # In cuNumeric, if axis < -1, raise ValueError + # In cuPyNumeric, if axis < -1, raise ValueError # 'Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM' # In Numpy, if axis is -2 or -3, also pass - # In cuNumeric, for axis >= -1, if num=0, raise IndexError: + # In cuPyNumeric, for axis >= -1, if num=0, raise IndexError: # tuple index out of range # In Numpy, if num=0, pass and returns empty array x = np.array([[0, 1], [2, 3]]) @@ -253,7 +253,7 @@ def setup_method(self): @pytest.mark.xfail def test_num_float(self): # In Numpy, raise TypeError - # In cuNumeric, pass + # In cuPyNumeric, pass msg = "cannot be interpreted as an integer" with pytest.raises(TypeError, match=msg): num.linspace(0, 10, num=4.5) @@ -273,7 +273,7 @@ def test_num_none(self): "axis", (-4, 3), ids=lambda axis: f"(axis={axis})" ) def test_axis_out_of_bound_array(self, axis): - # In cuNumeric, if axis < -1, raise ValueError + # In cuPyNumeric, if axis < -1, raise ValueError # 'Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM' msg = "out of bounds" # In Numpy, it raises AxisError @@ -285,7 +285,7 @@ def test_axis_out_of_bound_array(self, axis): "axis", (-2, 1), ids=lambda axis: f"(axis={axis})" ) def test_axis_out_of_bound_scalar(self, axis): - # 
In cuNumeric, it pass and the result equals when axis=0 + # In cuPyNumeric, it pass and the result equals when axis=0 # In Numpy, it raises AxisError msg = "out of bounds" with pytest.raises(ValueError, match=msg): @@ -299,7 +299,7 @@ def test_axis_float(self): @pytest.mark.xfail def test_axis_none(self): - # In cuNumeric, pass and treat it as axis=0 + # In cuPyNumeric, pass and treat it as axis=0 # In Numpy, raises TypeError axis = None msg = "'NoneType' object is not iterable" diff --git a/tests/integration/test_local_task_array.py b/tests/integration/test_local_task_array.py new file mode 100644 index 0000000000..3ff3c27caa --- /dev/null +++ b/tests/integration/test_local_task_array.py @@ -0,0 +1,46 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest +from legate.core import StoreTarget, get_legate_runtime, types as ty + +import cupynumeric as num + +runtime = get_legate_runtime() + + +def test_local_task_array_with_array() -> None: + array = runtime.create_array(ty.int64, shape=(10,)).get_physical_array() + result = num.local_task_array(array) + assert result.shape == (10,) + assert result.dtype == np.int64 + on_cpu = array.data().target not in {StoreTarget.FBMEM, StoreTarget.ZCMEM} + assert isinstance(result, np.ndarray) == on_cpu + + +def test_local_task_array_with_store() -> None: + store = runtime.create_store(ty.int64, shape=(20,)).get_physical_store() + result = num.local_task_array(store) + assert result.shape == (20,) + assert result.dtype == np.int64 + on_cpu = store.target not in {StoreTarget.FBMEM, StoreTarget.ZCMEM} + assert isinstance(result, np.ndarray) == on_cpu + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_logic.py b/tests/integration/test_logic.py index c4b8a33b47..9bd15566a3 100644 --- a/tests/integration/test_logic.py +++ b/tests/integration/test_logic.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num SCALARS_INF = (np.inf, -np.inf, np.nan, 0) ARRAYS_INF = ([np.inf, -np.inf, np.nan, 0],) @@ -126,7 +126,7 @@ def test_isscalar_array(): # NumPy's scalar reduction returns a Python scalar assert num.isscalar(np.sum(in_np)) is True - # but cuNumeric's scalar reduction returns a 0-D array that behaves like + # but cuPyNumeric's scalar reduction returns a 0-D array that behaves like # a deferred scalar assert num.isscalar(num.sum(in_np)) is False @@ -238,7 +238,7 @@ def test_isclose_arrays_rtol_atol(rtol, atol): def test_isclose_euqal_nan(equal_nan): # If equal_nan is True, # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError values = [np.inf, -np.inf, np.nan, 0.0, -0.0] 
pairs = tuple(combinations_with_replacement(values, 2)) in1_np = np.array([x for x, _ in pairs]) diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py index dac2de22ae..dfd1f8d228 100644 --- a/tests/integration/test_logical.py +++ b/tests/integration/test_logical.py @@ -15,9 +15,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num INPUTS = ( [-1, 4, 5], @@ -39,7 +39,7 @@ @pytest.mark.parametrize("func", FUNCTIONS) def test_basic(func, input, keepdims): in_np = np.array(input) - # cuNumeric doesn't support reductions for complex128 + # cuPyNumeric doesn't support reductions for complex128 if in_np.dtype.kind == "c": in_np = in_np.astype("F") in_num = num.array(in_np) @@ -64,7 +64,7 @@ def test_basic(func, input, keepdims): def test_axis_tuple(func, axes): # For axes=(-1, 0), # in Numpy, it pass - # in cuNumeric, raises ValueError: + # in cuPyNumeric, raises ValueError: # Invalid promotion on dimension 2 for a 1-D store input = [[[5, 10], [0, 100]]] in_np = np.array(input) @@ -77,7 +77,7 @@ def test_axis_tuple(func, axes): assert np.array_equal(out_np, out_num) -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("func", FUNCTIONS) def test_nd_inputs(ndim, func): shape = (3,) * ndim diff --git a/tests/integration/test_logical_reduction.py b/tests/integration/test_logical_reduction.py index 21b6216dd0..405836c2bf 100644 --- a/tests/integration/test_logical_reduction.py +++ b/tests/integration/test_logical_reduction.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize("axis", [None, 0, 1, 2, (0, 1, 2)]) @@ -36,7 +36,7 @@ def 
test_logical_reductions(axis): assert num.array_equal(out_num, out_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize( "axis", [ @@ -45,7 +45,7 @@ def test_logical_reductions(axis): -1, ], ) -def test_logical_reductions_over_cunumeric_arrays(ndim, axis): +def test_logical_reductions_over_cupynumeric_arrays(ndim, axis): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) in_np = tuple(np_arr % 2 for dim in range(ndim)) diff --git a/tests/integration/test_lstm_backward_test.py b/tests/integration/test_lstm_backward_test.py index aa8c5bfb20..394438ffc1 100644 --- a/tests/integration/test_lstm_backward_test.py +++ b/tests/integration/test_lstm_backward_test.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_lstm_simple_forward.py b/tests/integration/test_lstm_simple_forward.py index 629d1ef761..10b4502c8d 100644 --- a/tests/integration/test_lstm_simple_forward.py +++ b/tests/integration/test_lstm_simple_forward.py @@ -15,7 +15,7 @@ import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_map_reduce.py b/tests/integration/test_map_reduce.py index 15ac5c7d05..e5eab87139 100644 --- a/tests/integration/test_map_reduce.py +++ b/tests/integration/test_map_reduce.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_mask.py b/tests/integration/test_mask.py index 648d54e5e2..ebaa5a0058 100644 --- a/tests/integration/test_mask.py +++ b/tests/integration/test_mask.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_lhs(): diff --git a/tests/integration/test_mask_indices.py b/tests/integration/test_mask_indices.py index 
bd45879164..44bb90cd0e 100644 --- a/tests/integration/test_mask_indices.py +++ b/tests/integration/test_mask_indices.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num KS = (0, -1, 1, -2, 2) FUNCTIONS = ("tril", "triu") @@ -61,7 +61,7 @@ def test_mask_indices(k, mask_func): ) @pytest.mark.parametrize("mask_func", FUNCTIONS) def test_mask_indices_float_k(k, mask_func): - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error: required argument is not an integer # Numpy: pass _test(mask_func, N, k) @@ -79,7 +79,7 @@ def test_float_n(self, n): @pytest.mark.xfail def test_k_complex(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is '<=' not supported between instances of 'complex' and 'int' @@ -88,7 +88,7 @@ def test_k_complex(self): @pytest.mark.xfail def test_k_none(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is unsupported operand type(s) for -: 'NoneType' and 'int' diff --git a/tests/integration/test_matmul.py b/tests/integration/test_matmul.py index 0952e4d605..4ebdab42d5 100644 --- a/tests/integration/test_matmul.py +++ b/tests/integration/test_matmul.py @@ -15,7 +15,6 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.comparisons import allclose from utils.contractions import ( check_default, @@ -23,13 +22,14 @@ check_shapes, check_types, ) +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num -from cunumeric._utils.linalg import matmul_modes +import cupynumeric as num +from cupynumeric._utils.linalg import matmul_modes -@pytest.mark.parametrize("a_ndim", range(1, LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("b_ndim", range(1, LEGATE_MAX_DIM + 1)) 
+@pytest.mark.parametrize("a_ndim", ONE_MAX_DIM_RANGE) +@pytest.mark.parametrize("b_ndim", ONE_MAX_DIM_RANGE) def test_function(a_ndim, b_ndim): name = f"matmul({a_ndim} x {b_ndim})" modes = matmul_modes(a_ndim, b_ndim) @@ -163,7 +163,7 @@ def test_invalid_shape_dim_greater_than_one(self, shapesAB): def test_invalid_shape_with_vector(self, shapesAB): # For ((4, 1), (3,)), ((3,), (1, 4)), ((3,), (1,)), # In Numpy, raise ValueError - # In cuNumeric, broadcast 1 to 3 and pass + # In cuPyNumeric, broadcast 1 to 3 and pass expected_exc = ValueError shapeA, shapeB = shapesAB A_np = np.ones(shapeA) @@ -221,7 +221,7 @@ def test_out_invalid_shape(self, shape): @pytest.mark.xfail def test_out_invalid_shape_DIVERGENCE(self): # In Numpy, PASS - # In cuNumeric, raise ValueError + # In cuPyNumeric, raise ValueError A = num.ones((3, 2, 4)) B = num.ones((3, 4, 3)) shape = (3, 3, 2, 3) @@ -279,7 +279,7 @@ def test_invalid_casting(self, dtype): # In Numpy, raise ValueError with pytest.raises(expected_exc): np.matmul(A_np, B_np, casting=casting) - # cuNumeric does not check casting when A and B are of the same dtype + # cuPyNumeric does not check casting when A and B are of the same dtype with pytest.raises(expected_exc): num.matmul(A_num, B_num, casting=casting) diff --git a/tests/integration/test_matrix_power.py b/tests/integration/test_matrix_power.py index 58508897c7..dca930ff69 100644 --- a/tests/integration/test_matrix_power.py +++ b/tests/integration/test_matrix_power.py @@ -19,7 +19,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as num +import cupynumeric as num # TODO: add negative exponents here, once they become supported EXPONENTS = (0, 1, 2, 3, 5) @@ -38,7 +38,7 @@ def test_matrix_power(ndim, exp, dtype): # If dtype=np.int32 and exp greater than 1, # In Numpy, pass - # In cuNumeric, raises TypeError: Unsupported type: int32 + # In cuPyNumeric, raises TypeError: Unsupported type: int32 shape = (3,) * ndim + (2, 2) 
a_np = mk_0to1_array(np, shape, dtype=dtype) a_num = mk_0to1_array(num, shape, dtype=dtype) @@ -59,7 +59,7 @@ def test_matrix_power(ndim, exp, dtype): def test_matrix_power_empty_matrix(exp): # If exp =2 or 3, # In Numpy, pass and returns empty array - # In cuNumeric, raise AssertionError in _contract + # In cuPyNumeric, raise AssertionError in _contract shape = (0, 0) a_np = mk_0to1_array(np, shape) a_num = mk_0to1_array(num, shape) diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index 2065b6f758..d0055945a7 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DIM = 7 @@ -103,7 +103,7 @@ def test_where_broadcast(size): @pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) def test_axis_tuple(axis): # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError size = (3, 4, 7) arr_np = np.random.randint(-5, 5, size=size) arr_num = num.array(arr_np) diff --git a/tests/integration/test_median.py b/tests/integration/test_median.py index 37dec719c9..6e388ef792 100644 --- a/tests/integration/test_median.py +++ b/tests/integration/test_median.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestMedianErrors: @@ -56,7 +56,7 @@ def test_median_empty_array(self): class TestMedian: - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize( "keepdims", ( @@ -65,7 +65,7 @@ class TestMedian: ), ) def test_median_basic(self, ndim, keepdims): - shape = np.random.randint(1, 6, ndim, dtype=int) + shape = np.random.randint(1, 4, ndim, dtype=int) size = 1 for dim in shape: size *= 
dim @@ -170,9 +170,9 @@ def test_median_overwrite_input(self): class TestNanmedian: - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_nanmedian_basic(self, ndim): - shape = np.random.randint(2, 6, ndim, dtype=int) + shape = np.random.randint(2, 5, ndim, dtype=int) size = 1 for dim in shape: size *= dim diff --git a/tests/integration/test_meshgrid.py b/tests/integration/test_meshgrid.py index 909ca56d57..d3ae666a48 100644 --- a/tests/integration/test_meshgrid.py +++ b/tests/integration/test_meshgrid.py @@ -17,7 +17,7 @@ import pytest from utils.generators import mk_0to1_array, mk_seq_array -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize( diff --git a/tests/integration/test_min_on_gpu.py b/tests/integration/test_min_on_gpu.py index ccb09aa82d..d4803281aa 100644 --- a/tests/integration/test_min_on_gpu.py +++ b/tests/integration/test_min_on_gpu.py @@ -15,7 +15,7 @@ import pytest -import cunumeric as num +import cupynumeric as num def test_min(): diff --git a/tests/integration/test_moveaxis.py b/tests/integration/test_moveaxis.py index f8a58f7991..860026a41f 100644 --- a/tests/integration/test_moveaxis.py +++ b/tests/integration/test_moveaxis.py @@ -15,11 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array -from utils.utils import AxisError +from utils.utils import TWO_MAX_DIM_RANGE, AxisError -import cunumeric as num +import cupynumeric as num AXES = ( (0, 0), @@ -31,7 +30,7 @@ ) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) @pytest.mark.parametrize("axes", AXES) def test_moveaxis(ndim, axes): source, destination = axes diff --git a/tests/integration/test_msort.py b/tests/integration/test_msort.py index cc99a3fbe6..15e4a3f17d 100644 --- a/tests/integration/test_msort.py +++ b/tests/integration/test_msort.py @@ -16,10 
+16,10 @@ import numpy as np import pytest -import cunumeric as num -from cunumeric._utils import is_np2 +import cupynumeric as num +from cupynumeric._utils import is_np2 -# cunumeric.msort(a: ndarray) → ndarray +# cupynumeric.msort(a: ndarray) → ndarray DIM = 5 SIZES = [ diff --git a/tests/integration/test_multi_dot.py b/tests/integration/test_multi_dot.py index 399f70e239..b6702d3951 100644 --- a/tests/integration/test_multi_dot.py +++ b/tests/integration/test_multi_dot.py @@ -18,7 +18,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as num +import cupynumeric as num SHAPES = [ # 2 arrays @@ -124,7 +124,7 @@ def test_out_invalid_dim(self): @pytest.mark.xfail def test_out_invalid_shape(self): - # In cuNumeric, it raises AssertionError + # In cuPyNumeric, it raises AssertionError out = num.zeros((2, 1)) with pytest.raises(ValueError): num.linalg.multi_dot(self.arrays, out=out) @@ -135,7 +135,7 @@ def test_out_invalid_shape(self): ) def test_out_invalid_dtype(self, dtype): # In Numpy, for np.float32 and np.int64, it raises ValueError - # In cuNumeric, + # In cuPyNumeric, # for np.float32, it pass # for np.int64, it raises TypeError: Unsupported type: int64 out = num.zeros((2, 2), dtype=dtype) diff --git a/tests/integration/test_nan_reduction.py b/tests/integration/test_nan_reduction.py index f0a4509974..837a2e5d13 100644 --- a/tests/integration/test_nan_reduction.py +++ b/tests/integration/test_nan_reduction.py @@ -18,17 +18,17 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.comparisons import allclose +from utils.utils import MAX_DIM_RANGE, ONE_MAX_DIM_RANGE -import cunumeric as num -from cunumeric.settings import settings +import cupynumeric as num +from cupynumeric.settings import settings NAN_FUNCS = ("nanmax", "nanmin", "nanprod", "nansum") -EAGER_TEST = os.environ.get("CUNUMERIC_FORCE_THUNK", None) == "eager" +EAGER_TEST = os.environ.get("CUPYNUMERIC_FORCE_THUNK", 
None) == "eager" -NDIMS = range(LEGATE_MAX_DIM + 1) +NDIMS = MAX_DIM_RANGE DTYPE = ["l", "L", "f", "d", "h", "i", "H", "I", "?", "b", "B"] @@ -43,11 +43,11 @@ class TestNanReductions: """ @pytest.mark.parametrize("func_name", ("nansum", "nanprod")) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_basic_nan_sum_prod(self, func_name, ndim, keepdims): """This test sets an element to NaN and checks if the output - from cuNumeric and numpy match.""" + from cuPyNumeric and numpy match.""" shape = (5,) * ndim size = prod(shape) in_np = np.random.random(shape) @@ -68,11 +68,11 @@ def test_basic_nan_sum_prod(self, func_name, ndim, keepdims): assert allclose(out_num, out_np, rtol=1e-4) @pytest.mark.parametrize("func_name", ("nanmin", "nanmax")) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_basic_nan_min_max(self, func_name, ndim, keepdims): """This test sets an element to NaN and checks if the output - from cuNumeric and numpy match.""" + from cuPyNumeric and numpy match.""" shape = (5,) * ndim size = prod(shape) in_np = np.random.random(shape) @@ -93,7 +93,7 @@ def test_basic_nan_min_max(self, func_name, ndim, keepdims): assert np.array_equal(out_num, out_np) @pytest.mark.parametrize("func_name", NAN_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_out(self, func_name, ndim): """This test checks that the out argument is updated with the output""" @@ -118,7 +118,7 @@ def test_out(self, func_name, ndim): assert allclose(out_num, out_np, rtol=1e-4) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) 
@pytest.mark.parametrize("keepdims", [True, False]) def test_complex_dtype_nansum(self, ndim, dtype, keepdims): @@ -151,7 +151,7 @@ def test_complex_dtype_nansum(self, ndim, dtype, keepdims): assert allclose(out_num, out_np, rtol=1e-4) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_complex_dtype_nanprod(self, ndim, keepdims): """This test checks if nanprod works as expected for complex @@ -230,7 +230,7 @@ def test_slice_nan_no_numpy_compat(self, identity, func_name): settings.numpy_compat.unset_value() @pytest.mark.parametrize("func_name", ("nanmin", "nanmax")) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nans_numpy_compat(self, ndim, func_name): """This test checks if we comply with the expected behavior when the array contains only NaNs. @@ -260,7 +260,7 @@ def test_all_nans_numpy_compat(self, ndim, func_name): ], ids=str, ) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nans_no_numpy_compat(self, ndim, identity, func_name): """This test checks if we comply with the expected behavior when the array contains only NaNs for nanmin and nanmax. 
@@ -278,7 +278,7 @@ def test_all_nans_no_numpy_compat(self, ndim, identity, func_name): settings.numpy_compat.unset_value() - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nans_nanprod(self, ndim): shape = (3,) * ndim in_num = num.random.random(shape) @@ -310,7 +310,7 @@ def test_dtype_nansum(self, dtype) -> None: out_num = num.nansum(in_num) assert allclose(out_np, out_num) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nans_nansum(self, ndim): shape = (3,) * ndim in_num = num.random.random(shape) diff --git a/tests/integration/test_nanarg_reduction.py b/tests/integration/test_nanarg_reduction.py index c5f2d66d35..4ec51e5344 100644 --- a/tests/integration/test_nanarg_reduction.py +++ b/tests/integration/test_nanarg_reduction.py @@ -18,14 +18,14 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE, ONE_MAX_DIM_RANGE -import cunumeric as num -from cunumeric.settings import settings +import cupynumeric as num +from cupynumeric.settings import settings NAN_ARG_FUNCS = ("nanargmax", "nanargmin") -EAGER_TEST = os.environ.get("CUNUMERIC_FORCE_THUNK", None) == "eager" +EAGER_TEST = os.environ.get("CUPYNUMERIC_FORCE_THUNK", None) == "eager" DISALLOWED_DTYPES = ( np.complex64, @@ -33,7 +33,7 @@ ) # Note that when an element is repeated mulitple times in an array, -# the output from cuNumeric and numpy will vary. This is expected and +# the output from cuPyNumeric and numpy will vary. This is expected and # is not a bug. So, whenever we compare with numpy, we try to make # sure the elements in the array are unique. 
Another way to circumvent # this problem would be to make sure that argument corresponding @@ -46,11 +46,11 @@ class TestNanArgReductions: """ @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("keepdims", [True, False]) def test_basic(self, func_name, ndim, keepdims): """This test inserts a NaN in the array and checks if the - output from cuNumeric and numpy match + output from cuPyNumeric and numpy match """ shape = (5,) * ndim size = prod(shape) @@ -66,7 +66,7 @@ def test_basic(self, func_name, ndim, keepdims): func_np = getattr(np, func_name) func_num = getattr(num, func_name) - # make sure numpy and cunumeric give the same out array and max val + # make sure numpy and cupynumeric give the same out array and max val out_np = np.unravel_index(func_np(in_np, keepdims=keepdims), shape) out_num = np.unravel_index(func_num(in_num, keepdims=keepdims), shape) @@ -77,7 +77,7 @@ def test_basic(self, func_name, ndim, keepdims): assert np.array_equal(index_array_num, index_array_np) @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_out(self, func_name, ndim): """This test checks that the out argument is updated with the output""" @@ -103,7 +103,7 @@ def test_out(self, func_name, ndim): assert np.array_equal(out_np, out_num) @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) def test_floating_point_types(self, func_name, ndim, dtype): """This test checks the most frequently used datatypes @@ -127,7 +127,7 @@ def test_floating_point_types(self, func_name, ndim, dtype): assert np.array_equal(out_num, out_np) 
@pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nan_numpy_compat(self, func_name, ndim): """This test checks if we comply with the expected behavior when the array contains only NaNs. The expected behavior is to @@ -152,7 +152,7 @@ def test_all_nan_numpy_compat(self, func_name, ndim): reason="Eager and Deferred mode will give different results", ) @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_all_nan_no_numpy_compat(self, func_name, ndim): """This test checks that we return identity for all-NaN arrays. Note that scalar reductions (e.g., argmin/argmax) on arrays @@ -181,7 +181,7 @@ def test_all_nan_no_numpy_compat(self, func_name, ndim): @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) def test_slice_nan_numpy_compat(self, func_name): """This test checks if we comply with the numpy when - a slice contains only NaNs and CUNUMERIC_NUMPY_COMPATABILITY + a slice contains only NaNs and CUPYNUMERIC_NUMPY_COMPATABILITY is set to 1. """ settings.numpy_compat = True @@ -211,7 +211,7 @@ def test_slice_nan_numpy_compat(self, func_name): ) def test_slice_nan_no_numpy_compat(self, identity, func_name): """This test checks if we return identity for a slice that - contains NaNs when CUNUMERIC_NUMPY_COMPATABILITY is set to 0. + contains NaNs when CUPYNUMERIC_NUMPY_COMPATABILITY is set to 0. 
""" settings.numpy_compat = False @@ -248,7 +248,7 @@ class TestXFail: @pytest.mark.xfail @pytest.mark.parametrize("func_name", NAN_ARG_FUNCS) - @pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("disallowed_dtype", DISALLOWED_DTYPES) def test_disallowed_dtypes(self, func_name, ndim, disallowed_dtype): """This test checks if we raise an error for types that are diff --git a/tests/integration/test_nanmean.py b/tests/integration/test_nanmean.py index f02487d116..5d13e5f8a8 100755 --- a/tests/integration/test_nanmean.py +++ b/tests/integration/test_nanmean.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DIM = 7 @@ -94,7 +94,7 @@ def test_basic_where(size): @pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) def test_axis_tuple(axis): # In Numpy, it pass - # In cuNumeric, it raises NotImplementedError + # In cuPyNumeric, it raises NotImplementedError size = (3, 4, 7) arr_np = np.random.randint(-5, 5, size=size).astype(float) arr_np[arr_np % 2 == 1] = np.nan diff --git a/tests/integration/test_nanpercentiles.py b/tests/integration/test_nanpercentiles.py index 620577da73..c490d72768 100644 --- a/tests/integration/test_nanpercentiles.py +++ b/tests/integration/test_nanpercentiles.py @@ -19,7 +19,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -62,8 +62,8 @@ def test_multi_axes(str_method, axes, qin_arr, keepdims, overwrite_input): else: qs_arr = np.array(qin_arr) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.nanpercentile( arr, qs_arr, diff --git a/tests/integration/test_nanquantiles.py b/tests/integration/test_nanquantiles.py index 56f4ae7f2f..31a24484f3 100644 --- a/tests/integration/test_nanquantiles.py +++ 
b/tests/integration/test_nanquantiles.py @@ -19,7 +19,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -39,7 +39,7 @@ @pytest.mark.parametrize("str_method", ALL_METHODS) -@pytest.mark.parametrize("axes", (0, 1)) +@pytest.mark.parametrize("axes", (0, 1, (0, 1), (0,))) @pytest.mark.parametrize( "qin_arr", (0.5, [0.001, 0.37, 0.42, 0.67, 0.83, 0.99, 0.39, 0.49, 0.5]) ) @@ -62,8 +62,8 @@ def test_multi_axes(str_method, axes, qin_arr, keepdims, overwrite_input): else: qs_arr = np.array(qin_arr) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.nanquantile( arr, qs_arr, diff --git a/tests/integration/test_nd_convolve.py b/tests/integration/test_nd_convolve.py index 4090707945..cde3feff23 100644 --- a/tests/integration/test_nd_convolve.py +++ b/tests/integration/test_nd_convolve.py @@ -13,56 +13,64 @@ # limitations under the License. 
# -import os - -import numpy as np import pytest -import scipy.signal as sig from utils.comparisons import allclose -import cunumeric as num - -CUDA_TEST = os.environ.get("LEGATE_NEED_CUDA") == "1" +import cupynumeric as num def test_interpolation_x(): - import scipy.signal as signal + import scipy.signal as signal - nz = 100 - nx = 200 - hs = 2 - nvariables = 4 - shape = (nvariables, nz + 2 * hs, nx + 2 * hs) + nz = 100 + nx = 200 + hs = 2 + nvariables = 4 + shape = (nvariables, nz + 2 * hs, nx + 2 * hs) nelements = num.prod(shape) - kernel = num.array([-1.0 / 12, 7.0 / 12, 7.0 / 12, -1.0 / 12], dtype=num.float64).reshape(1, 1, 4) + kernel = num.array( + [-1.0 / 12, 7.0 / 12, 7.0 / 12, -1.0 / 12], dtype=num.float64 + ).reshape(1, 1, 4) state = num.arange(nelements).astype(num.float64).reshape(shape) - out_legate = num.convolve(state[:, 2 : nz + 2, :], kernel, mode="same",) - out_scipy = signal.convolve(state[:, 2 : nz + 2, :], kernel, mode="same",) - - #print(f"min/max, legate: {out_legate.min()}, {out_legate.max()}", flush=True) - #print(f"min/max, scipy : {out_scipy.min()}, {out_scipy.max()}", flush=True) + out_legate = num.convolve( + state[:, 2 : nz + 2, :], + kernel, + mode="same", + ) + out_scipy = signal.convolve( + state[:, 2 : nz + 2, :], + kernel, + mode="same", + ) assert allclose(out_scipy, out_legate) def test_interpolation_z(): - import scipy.signal as signal + import scipy.signal as signal - nz = 100 - nx = 200 - hs = 2 - nvariables = 4 - shape = (nvariables, nz + 2 * hs, nx + 2 * hs) + nz = 100 + nx = 200 + hs = 2 + nvariables = 4 + shape = (nvariables, nz + 2 * hs, nx + 2 * hs) nelements = num.prod(shape) - kernel = num.array([-1.0 / 12, 7.0 / 12, 7.0 / 12, -1.0 / 12], dtype=num.float64).reshape(1, 4, 1) + kernel = num.array( + [-1.0 / 12, 7.0 / 12, 7.0 / 12, -1.0 / 12], dtype=num.float64 + ).reshape(1, 4, 1) state = num.arange(nelements).astype(num.float64).reshape(shape) - out_legate = num.convolve(state[:, :, 2 : nx + 2], kernel, mode="same",) 
- out_scipy = signal.convolve(state[:, :, 2 : nx + 2], kernel, mode="same",) - - #print(f"min/max, legate: {out_legate.min()}, {out_legate.max()}", flush=True) - #print(f"min/max, scipy : {out_scipy.min()}, {out_scipy.max()}", flush=True) + out_legate = num.convolve( + state[:, :, 2 : nx + 2], + kernel, + mode="same", + ) + out_scipy = signal.convolve( + state[:, :, 2 : nx + 2], + kernel, + mode="same", + ) assert allclose(out_scipy, out_legate) diff --git a/tests/integration/test_ndim.py b/tests/integration/test_ndim.py index d520928d3f..05ce740701 100644 --- a/tests/integration/test_ndim.py +++ b/tests/integration/test_ndim.py @@ -17,10 +17,10 @@ import pytest from legate.core import LEGATE_MAX_DIM -import cunumeric as num +import cupynumeric as num -@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM)) def test_ndarray(ndim): shape = (4,) * ndim a = num.ones(shape) diff --git a/tests/integration/test_negaxes_quantiles.py b/tests/integration/test_negaxes_quantiles.py index 40cec5a7af..7648cf234d 100644 --- a/tests/integration/test_negaxes_quantiles.py +++ b/tests/integration/test_negaxes_quantiles.py @@ -16,10 +16,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -78,8 +77,8 @@ def test_quantiles_negative_axes(str_method, axes): dtype=float, ) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) num_q_out = num.quantile( arr, qs_arr, axis=axes, method=str_method, keepdims=keepdims ) diff --git a/tests/integration/test_nonzero.py b/tests/integration/test_nonzero.py index 9c4c44b97a..3dd7e4e529 100644 --- a/tests/integration/test_nonzero.py +++ b/tests/integration/test_nonzero.py @@ -17,12 +17,13 @@ import pytest from utils.utils import AxisError -import cunumeric as num +import cupynumeric 
as num +from cupynumeric._utils import is_np2_1 -# cunumeric.count_nonzero(a: ndarray, +# cupynumeric.count_nonzero(a: ndarray, # axis: Optional[Union[int, tuple[int, ...]]] = None) → Union[int, ndarray] -# cunumeric.nonzero(a: ndarray) → tuple[cunumeric.array.ndarray, ...] -# cunumeric.flatnonzero(a: ndarray) → ndarray +# cupynumeric.nonzero(a: ndarray) → tuple[cupynumeric.array.ndarray, ...] +# cupynumeric.flatnonzero(a: ndarray) → ndarray DIM = 5 EMPTY_SIZES = [ @@ -51,6 +52,13 @@ SIZES = NO_EMPTY_SIZE + EMPTY_SIZES +@pytest.mark.skipif(not is_np2_1, reason="numpy 1.0 does not raise") +@pytest.mark.parametrize("value", (0, 1, 2, 7)) +def test_0d_error(value): + with pytest.raises(ValueError): + num.nonzero(value) + + @pytest.mark.parametrize("size", EMPTY_SIZES) def test_empty(size): arr_np = np.random.randint(-5, 5, size=size) @@ -96,9 +104,9 @@ def test_axis_tuple(axis): out_np = np.count_nonzero(arr_np, axis=axis) # Numpy passed all axis values out_num = num.count_nonzero(arr_num, axis=axis) - # For (-1, 1), cuNumeric raises 'ValueError: + # For (-1, 1), cuPyNumeric raises 'ValueError: # Invalid promotion on dimension 2 for a 1-D store' - # For the others, cuNumeric raises 'NotImplementedError: + # For the others, cuPyNumeric raises 'NotImplementedError: # Need support for reducing multiple dimensions' assert np.array_equal(out_np, out_num) @@ -123,7 +131,7 @@ def test_empty_axis(size): for axis in range(-ndim + 1, ndim, 1): out_np = np.count_nonzero(arr_np, axis=axis) out_num = num.count_nonzero(arr_num, axis=axis) - # Numpy and cuNumeric have diffrent out. + # Numpy and cuPyNumeric have diffrent out. # out_np = array([[0]]) # out_num = 0 assert np.array_equal(out_np, out_num) @@ -140,8 +148,8 @@ def test_axis_keepdims(size, keepdims): out_np = np.count_nonzero(arr_np, axis=axis, keepdims=keepdims) out_num = num.count_nonzero(arr_num, axis=axis, keepdims=keepdims) # Numpy has the parameter 'keepdims', - # cuNumeric do not have this parameter. 
- # cuNumeric raises "TypeError: count_nonzero() got an unexpected + # cuPyNumeric do not have this parameter. + # cuPyNumeric raises "TypeError: count_nonzero() got an unexpected # keyword argument 'keepdims'" assert np.array_equal(out_np, out_num) @@ -164,28 +172,6 @@ def test_flatnonzero(size): np.array_equal(res_np, res_num) -def test_deprecated_0d(): - with pytest.deprecated_call(): - assert num.count_nonzero(num.array(0)) == 0 - assert num.count_nonzero(num.array(0, dtype="?")) == 0 - assert_equal(num.nonzero(0), np.nonzero(0)) - - with pytest.deprecated_call(): - assert num.count_nonzero(num.array(1)) == 1 - assert num.count_nonzero(num.array(1, dtype="?")) == 1 - assert_equal(num.nonzero(1), np.nonzero(1)) - - with pytest.deprecated_call(): - assert_equal(num.nonzero(0), ([],)) - - with pytest.deprecated_call(): - assert_equal(num.nonzero(1), ([0],)) - - x_np = np.array([True, True]) - x = num.array(x_np) - assert np.array_equal(x_np.nonzero(), x.nonzero()) - - if __name__ == "__main__": import sys diff --git a/tests/integration/test_norm.py b/tests/integration/test_norm.py index 51f9b4a5ad..348b5a3ddb 100644 --- a/tests/integration/test_norm.py +++ b/tests/integration/test_norm.py @@ -15,25 +15,19 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.comparisons import allclose from utils.generators import mk_0to1_array +from utils.utils import MAX_DIM_RANGE, ONE_MAX_DIM_RANGE, TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num VECTOR_ORDS = [None, np.inf, -np.inf, 0, 1, -1, 2, -2] # TODO: Add "nuc", 2, -2 once they are implemented MATRIX_ORDS = [None, "fro", np.inf, -np.inf, 1, -1] -np_arrays = [ - mk_0to1_array(np, (3,) * ndim) - 0.5 - for ndim in range(0, LEGATE_MAX_DIM + 1) -] -num_arrays = [ - mk_0to1_array(num, (3,) * ndim) - 0.5 - for ndim in range(0, LEGATE_MAX_DIM + 1) -] +np_arrays = [mk_0to1_array(np, (3,) * ndim) - 0.5 for ndim in MAX_DIM_RANGE] +num_arrays = [mk_0to1_array(num, (3,) * ndim) - 
0.5 for ndim in MAX_DIM_RANGE] @pytest.mark.parametrize("ord", VECTOR_ORDS) @@ -44,7 +38,7 @@ def test_noaxis_1d(ord, keepdims, dtype): # for ord=0, dtype is np.complex64 # Numpy output array is float32 - # cuNumeric output array is complex64 + # cuPyNumeric output array is complex64 np_res = np.linalg.norm( np_arrays[1].astype(dtype), ord=ord, keepdims=keepdims ) @@ -67,7 +61,7 @@ def test_noaxis_2d(ord, keepdims, dtype): assert allclose(np_res, num_res) -@pytest.mark.parametrize("ndim", [0] + list(range(3, LEGATE_MAX_DIM + 1))) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize("keepdims", [False, True]) @pytest.mark.parametrize("dtype", (np.float64, np.complex64)) def test_noaxis_other(ndim, keepdims, dtype): @@ -78,7 +72,7 @@ def test_noaxis_other(ndim, keepdims, dtype): assert allclose(np_res, num_res) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize("ord", VECTOR_ORDS) @pytest.mark.parametrize("keepdims", [False, True]) def test_axis_1d(ndim, ord, keepdims): @@ -91,7 +85,7 @@ def test_axis_1d(ndim, ord, keepdims): assert allclose(np_res, num_res) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize("ord", MATRIX_ORDS) @pytest.mark.parametrize("keepdims", [False, True]) @pytest.mark.parametrize( @@ -101,7 +95,7 @@ def test_axis_1d(ndim, ord, keepdims): ) def test_axis_2d(ndim, ord, keepdims, axis): # For all cases when axis is (1, 0) and ord is None or fro, - # output values of cuNumeric and Numpy are different and not close enough + # output values of cuPyNumeric and Numpy are different and not close enough np_res = np.linalg.norm( np_arrays[ndim], ord=ord, axis=axis, keepdims=keepdims ) @@ -113,7 +107,7 @@ def test_axis_2d(ndim, ord, keepdims, axis): class TestNormErrors: def test_axis_invalid_type(self): - # In cuNumeric, raises error in 
normalize_axis_tuple + # In cuPyNumeric, raises error in normalize_axis_tuple expected_exc = TypeError x_np = np.array([1, 2, 3]) x_num = num.array([1, 2, 3]) @@ -131,7 +125,7 @@ def test_axis_invalid_type(self): ids=lambda axis: f"(axis={axis})", ) def test_axis_invalid_value(self, axis): - # for (1, 1), In cuNumeric, raises error in normalize_axis_tuple + # for (1, 1), In cuPyNumeric, raises error in normalize_axis_tuple expected_exc = ValueError ndim = 2 diff --git a/tests/integration/test_ones.py b/tests/integration/test_ones.py index 6e5f047e3f..b648d6a08a 100644 --- a/tests/integration/test_ones.py +++ b/tests/integration/test_ones.py @@ -16,9 +16,9 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num -# cunumeric.ones(shape: NdShapeLike, +# cupynumeric.ones(shape: NdShapeLike, # dtype: npt.DTypeLike = ) → ndarray DIM = 5 @@ -54,9 +54,9 @@ class TestOnes(object): def test_size_none(self): res_np = np.ones(None) # output is 1.0 res_num = num.ones(None) - # cunumeric raises AssertionError + # cupynumeric raises AssertionError # 'assert shape is not None' - # in cunumeric/array.py:ndarray:__init__ + # in cupynumeric/array.py:ndarray:__init__ assert np.equal(res_np, res_num) @pytest.mark.parametrize("size", (200 + 20j, "hello")) diff --git a/tests/integration/test_outer.py b/tests/integration/test_outer.py index 850cb0e837..5e2ea1caa6 100644 --- a/tests/integration/test_outer.py +++ b/tests/integration/test_outer.py @@ -17,7 +17,7 @@ import pytest from utils.generators import mk_0to1_array -import cunumeric as num +import cupynumeric as num SHAPES = ((), (0,), (1,), (10,), (4, 5), (1, 4, 5)) diff --git a/tests/integration/test_overlap.py b/tests/integration/test_overlap.py index e695297486..580331d0af 100644 --- a/tests/integration/test_overlap.py +++ b/tests/integration/test_overlap.py @@ -19,7 +19,7 @@ from utils.comparisons import allclose from utils.generators import mk_seq_array -import cunumeric as num +import cupynumeric 
as num def setitem(lib, a, slice_lhs, slice_rhs): diff --git a/tests/integration/test_overwrite_slice.py b/tests/integration/test_overwrite_slice.py index 005daff0ef..892e9a5817 100644 --- a/tests/integration/test_overwrite_slice.py +++ b/tests/integration/test_overwrite_slice.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_partition.py b/tests/integration/test_partition.py index 2f75aeb1a3..7fb282981c 100644 --- a/tests/integration/test_partition.py +++ b/tests/integration/test_partition.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num def assert_partition(a_num, kth, axis): @@ -184,7 +184,7 @@ def test_axis_out_of_bound(self, axis): def test_kth_out_of_bound(self, kth): # For all cases, # In numpy, it raises ValueError - # In cunumeric, it pass + # In cupynumeric, it pass expected_exc = ValueError axis = 0 with pytest.raises(expected_exc): @@ -214,7 +214,7 @@ def test_axis_out_of_bound(self, axis): def test_kth_out_of_bound(self, kth): # For all cases, # In numpy, it raises ValueError - # In cunumeric, it pass + # In cupynumeric, it pass expected_exc = ValueError axis = 0 with pytest.raises(expected_exc): diff --git a/tests/integration/test_percentiles.py b/tests/integration/test_percentiles.py index 9699d462fe..9dda04a009 100644 --- a/tests/integration/test_percentiles.py +++ b/tests/integration/test_percentiles.py @@ -18,7 +18,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -84,8 +84,8 @@ def test_multi_axes(str_method, axes, qin_arr, keepdims, overwrite_input): else: qs_arr = np.array(qin_arr) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.percentile( arr, qs_arr, diff --git 
a/tests/integration/test_prod.py b/tests/integration/test_prod.py index 9fecf6c4f4..f7d6f9db32 100644 --- a/tests/integration/test_prod.py +++ b/tests/integration/test_prod.py @@ -17,8 +17,8 @@ from utils.comparisons import allclose from utils.utils import AxisError -import cunumeric as num -from cunumeric._utils import is_np2 +import cupynumeric as num +from cupynumeric._utils import is_np2 # numpy.prod(a, axis=None, dtype=None, out=None, keepdims=, # initial=, where=) @@ -182,7 +182,7 @@ def test_dtype(self, dtype): out_num = num.prod(arr_num) assert allclose(out_np, out_num) - @pytest.mark.xfail(reason="numpy and cunumeric return different dtypes") + @pytest.mark.xfail(reason="numpy and cupynumeric return different dtypes") @pytest.mark.parametrize("dtype", INTEGER_DTYPE, ids=to_dtype) def test_dtype_integer_precision(self, dtype): arr_np = np.arange(0, 5).astype(dtype) @@ -192,7 +192,7 @@ def test_dtype_integer_precision(self, dtype): assert allclose(out_num, arr_num.prod()) # When input precision is less than default platform integer # NumPy returns the product with dtype of platform integer - # cuNumeric returns the product with dtype of the input array + # cuPyNumeric returns the product with dtype of the input array assert allclose(out_np, out_num) @pytest.mark.parametrize( @@ -211,11 +211,11 @@ def test_dtype_complex(self, dtype): arr_np = np.array(arr, dtype=dtype) arr_num = num.array(arr, dtype=dtype) out_np = np.prod(arr_np) - # cunumeric always returns [1+0.j] when LEGATE_TEST=1 + # cupynumeric always returns [1+0.j] when LEGATE_TEST=1 out_num = num.prod(arr_num) - # When running tests with CUNUMERIC_TEST=1 and dtype is complex256, + # When running tests with CUPYNUMERIC_TEST=1 and dtype is complex256, # allclose hits assertion error: - # File "/legate/cunumeric/cunumeric/eager.py", line 293, + # File "/legate/cupynumeric/cupynumeric/eager.py", line 293, # in to_deferred_array # assert self.runtime.is_supported_dtype(self.array.dtype) # AssertionError 
@@ -230,7 +230,9 @@ def test_axis_basic(self, axis): out_np = np.prod(arr_np, axis=axis) assert allclose(out_np, out_num) - @pytest.mark.xfail(reason="cunumeric raises exceptions when LEGATE_TEST=1") + @pytest.mark.xfail( + reason="cupynumeric raises exceptions when LEGATE_TEST=1" + ) @pytest.mark.parametrize( "axis", ((-1, 1), (0, 1), (1, 2), (0, 2)), ids=str ) @@ -239,7 +241,7 @@ def test_axis_tuple(self, axis): arr_np = np.random.random(size) * 10 arr_num = num.array(arr_np) out_np = np.prod(arr_np, axis=axis) - # when LEGATE_TEST = 1 cuNumeric raises two types of exceptions + # when LEGATE_TEST = 1 cuPyNumeric raises two types of exceptions # (-1, 1): ValueError: Invalid promotion on dimension 2 for a 1-D store # others: # NotImplementedError: Need support for reducing multiple dimensions @@ -317,7 +319,7 @@ def test_axis_keepdims_true(self, size): for axis in range(-ndim + 1, ndim, 1): out_np = np.prod(arr_np, axis=axis, keepdims=True) out_num = num.prod(arr_num, axis=axis, keepdims=True) - # in cunumeric/deferred/unary_reduction: + # in cupynumeric/deferred/unary_reduction: # if lhs_array.size == 1: # > assert axes is None or len(axes) == rhs_array.ndim - ( # 0 if keepdims else lhs_array.ndim diff --git a/tests/integration/test_put.py b/tests/integration/test_put.py index ef8c593b9a..ca1947868f 100644 --- a/tests/integration/test_put.py +++ b/tests/integration/test_put.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num INDICES_VALUES = ( (0, 10), @@ -137,7 +137,7 @@ def test_indices_array_and_shape_array(shape, indices_values_shape): assert np.array_equal(np_arr, num_arr) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_default_mode(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ 
-157,7 +157,7 @@ def test_ndim_default_mode(ndim): INDICES = ([1, 2, 3.2, 100], [[2, 1], [3, 100]], [1], [100]) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize("mode", ("wrap", "clip")) @pytest.mark.parametrize( "indices", INDICES, ids=lambda indices: f"(indices={indices})" diff --git a/tests/integration/test_put_along_axis.py b/tests/integration/test_put_along_axis.py index 89cb3725a5..05c6aca741 100644 --- a/tests/integration/test_put_along_axis.py +++ b/tests/integration/test_put_along_axis.py @@ -15,14 +15,14 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import ( broadcasts_to, broadcasts_to_along_axis, mk_seq_array, ) +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def equivalent_shapes_gen(shape): @@ -50,7 +50,7 @@ def test_axis_None(): N = 10 -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim(ndim): shape = (N,) * ndim np_arr = mk_seq_array(np, shape) @@ -127,7 +127,7 @@ def test_empty_indice(): pytest.param( np.array((0,)), marks=pytest.mark.xfail( - reason="NumPy: IndexError, cuNumeric: return None" + reason="NumPy: IndexError, cuPyNumeric: return None" ), ), ], diff --git a/tests/integration/test_putmask.py b/tests/integration/test_putmask.py index fa40f7fb09..0272648c87 100644 --- a/tests/integration/test_putmask.py +++ b/tests/integration/test_putmask.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array, mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def test_scalar(): @@ -118,7 +118,7 @@ def test_type_convert(): assert np.array_equal(x_num, x) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", 
ONE_MAX_DIM_RANGE) def test_ndim(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -178,7 +178,7 @@ def test_ndim(ndim): def test_a_values_different_shapes(shape_val): # for (2, 3, 4), # In Numpy, pass - # In cuNumeric, it raises ValueError + # In cuPyNumeric, it raises ValueError shape_arr = (3, 4) np_arr = mk_seq_array(np, shape_arr) num_arr = mk_seq_array(num, shape_arr) @@ -226,7 +226,7 @@ def test_invalid_mask_shape(self): def test_a_values_different_dtype(self, dtype_val): # for both cases, # In Numpy, it raises TypeError - # In cuNumeric, it pass + # In cuPyNumeric, it pass expected_exc = TypeError shape = (3, 4) dtype_arr = int diff --git a/tests/integration/test_qr.py b/tests/integration/test_qr.py index 80bcea47e1..670d763c58 100644 --- a/tests/integration/test_qr.py +++ b/tests/integration/test_qr.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num SIZES = (8, 9, 255) diff --git a/tests/integration/test_quantiles.py b/tests/integration/test_quantiles.py index b16da8e39d..ae96f509c8 100644 --- a/tests/integration/test_quantiles.py +++ b/tests/integration/test_quantiles.py @@ -18,7 +18,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num ALL_METHODS = ( "inverted_cdf", @@ -84,8 +84,8 @@ def test_multi_axes(str_method, axes, qin_arr, keepdims, overwrite_input): else: qs_arr = np.array(qin_arr) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.quantile( arr, qs_arr, @@ -143,8 +143,8 @@ def test_nd_quantile(str_method, ls_in, axes, keepdims): buffer=np.array([0.001, 0.37, 0.42, 0.5, 0.67, 0.83, 0.99, 0.39]).data, ) - # cunumeric: - # print("cunumeric axis = %d:"%(axis)) + # cupynumeric: + # print("cupynumeric axis = %d:"%(axis)) q_out = num.quantile( arr, qs_arr, axis=axes, method=str_method, keepdims=keepdims ) diff --git 
a/tests/integration/test_randint.py b/tests/integration/test_randint.py index de83116870..616f0f7add 100644 --- a/tests/integration/test_randint.py +++ b/tests/integration/test_randint.py @@ -13,17 +13,46 @@ # limitations under the License. # +import numpy as np import pytest -import cunumeric as num - - -def test_1d(): - num.random.randint(8000, size=8000) - - -def test_2d(): - num.random.randint(8000, size=(8000, 2)) +import cupynumeric as num + + +class TestRandint: + @pytest.mark.parametrize("size", (1, 8000, (8000, 2))) + def test_randint(self, size: int | tuple[int, ...]) -> None: + L1 = num.random.randint(8000, size=size) + L2 = np.random.randint(8000, size=size) + assert L1.ndim == L2.ndim + assert L1.dtype.kind == "i" + + def test_randint_0(self): + L1 = num.random.randint(8000, size=0) + L2 = np.random.randint(8000, size=0) + assert np.array_equal(L1, L2) + + def test_low(self): + L1 = num.random.randint(500) + L2 = np.random.randint(500) + assert L1 < 500 + assert L2 < 500 + + def test_high(self): + L1 = num.random.randint(500, 800) + L2 = np.random.randint(500, 800) + assert 500 < L1 < 800 + assert 500 < L2 < 800 + + @pytest.mark.xfail( + reason="https://github.com/nv-legate/cunumeric.internal/issues/199" + ) + def test_same_seed(self) -> None: + num.random.seed(13) + L1 = num.random.randint(100) + num.random.seed(13) + L2 = num.random.randint(100) + assert np.array_equal(L1, L2) if __name__ == "__main__": diff --git a/tests/integration/test_random.py b/tests/integration/test_random.py index e238fee2ca..46f9f370b3 100644 --- a/tests/integration/test_random.py +++ b/tests/integration/test_random.py @@ -11,84 +11,203 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import re + import numpy as np import pytest -import cunumeric as num - -@pytest.mark.xfail( - reason = "https://github.com/nv-legate/cunumeric.internal/issues/199" -) -def test_basic_num() -> None: - num.random.seed(10) - L1 = num.random.randn(3, 3) - num.random.seed(10) - L2 = num.random.randn(3, 3) - assert np.array_equal(L1, L2) - - -@pytest.mark.xfail( - reason = "numpy failures in random.mtrand.RandomState.standard_normal" -) -def test_basic_np() -> None: - np.random.seed(10) - L1 = np.random.randn(3, 3) - np.random.seed(10) - L2 = np.random.randn(3, 3) - assert np.array_equal(L1, L2) +import cupynumeric as num + + +class TestRand: + def test_rand_null(self) -> None: + L1 = num.random.rand() + assert L1.dtype.kind == "f" + assert L1.ndim == 0 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + @pytest.mark.parametrize("size", (0, 1, 3)) + def test_rand(self, size: int) -> None: + L1 = num.random.rand(size) + L2 = np.random.rand(size) + assert L1.ndim == L2.ndim == 1 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_rand_2d(self) -> None: + L1 = num.random.rand(3, 3) + L2 = np.random.rand(3, 3) + assert L1.ndim == L2.ndim == 2 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_float(self) -> None: + msg = r"expected a sequence of integers or a single integer" + with pytest.raises(TypeError, match=msg): + num.random.rand(1.5) + msg = r"'float' object cannot be interpreted as an integer" + with pytest.raises(TypeError, match=msg): + np.random.rand(1.5) + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_negative_value(self) -> None: + msg = r"Extent must be a positive number" + with pytest.raises(ValueError, match=msg): + num.random.rand(-2, -2) + msg = r"negative dimensions are not allowed" + with pytest.raises(ValueError, match=msg): + 
np.random.rand(-2, -2) + + @pytest.mark.xfail( + reason="https://github.com/nv-legate/cunumeric.internal/issues/199" + ) + def test_same_seed(self) -> None: + num.random.seed(10) + L1 = num.random.rand(3, 3) + num.random.seed(10) + L2 = num.random.rand(3, 3) + assert np.array_equal(L1, L2) + + +class TestRandn: + def test_randn_null(self) -> None: + L1 = num.random.randn() + assert L1.dtype.kind == "f" + assert L1.ndim == 0 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + @pytest.mark.parametrize("size", (0, 1, 3)) + def test_randn(self, size: int) -> None: + L1 = num.random.randn(size) + L2 = np.random.randn(size) + assert L1.ndim == L2.ndim == 1 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_2d(self) -> None: + L1 = num.random.randn(3, 3) + L2 = np.random.randn(3, 3) + assert L1.ndim == L2.ndim == 2 + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_float(self) -> None: + msg = r"expected a sequence of integers or a single integer" + with pytest.raises(TypeError, match=msg): + num.random.randn(1.5) + msg = r"'float' object cannot be interpreted as an integer" + with pytest.raises(TypeError, match=msg): + np.random.randn(1.5) + + def test_negative_value(self) -> None: + with pytest.raises(ValueError): + num.random.randn(-2, -2) + msg = r"negative dimensions are not allowed" + with pytest.raises(ValueError, match=msg): + np.random.randn(-2, -2) + + @pytest.mark.xfail( + reason="https://github.com/nv-legate/cunumeric.internal/issues/199" + ) + def test_same_seed(self) -> None: + num.random.seed(10) + L1 = num.random.randn(3, 3) + num.random.seed(10) + L2 = num.random.randn(3, 3) + assert np.array_equal(L1, L2) + + +class TestRandom: + def test_random_null(self) -> None: + L1 = num.random.random() + assert L1.dtype.kind == "f" + assert L1.ndim == 1 + + @pytest.mark.xfail( + reason="numpy failures 
in random.mtrand.RandomState.standard_normal" + ) + @pytest.mark.parametrize("size", (0, 1, 3)) + def test_random(self, size: int) -> None: + L1 = num.random.random(size) + L2 = np.random.random(size) + assert L1.ndim == L2.ndim == 1 + + def test_float(self) -> None: + msg = r"expected a sequence of integers or a single integer" + with pytest.raises(TypeError, match=msg): + num.random.random(1.5) + msg = r"expected a sequence of integers or a single integer, got '1.5'" + with pytest.raises(TypeError, match=msg): + np.random.random(1.5) + + def test_negative_value(self) -> None: + with pytest.raises(ValueError): + num.random.random(-2) + msg = r"negative dimensions are not allowed" + with pytest.raises(ValueError, match=msg): + np.random.random(-2) + + @pytest.mark.xfail( + reason="https://github.com/nv-legate/cunumeric.internal/issues/199" + ) + def test_same_seed(self) -> None: + num.random.seed(10) + L1 = num.random.random(3) + num.random.seed(10) + L2 = num.random.random(3) + assert np.array_equal(L1, L2) + + +class TestRandomSeed: + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + def test_none(self) -> None: + num.random.seed() + L1 = num.random.randn(3, 3) + np.random.seed() + L2 = np.random.randn(3, 3) + assert L1.ndim == L2.ndim + + @pytest.mark.xfail( + reason="numpy failures in random.mtrand.RandomState.standard_normal" + ) + @pytest.mark.parametrize("seed", (None, 1, 100, 20000)) + def test_seed(self, seed: int | None) -> None: + num.random.seed(seed) + L1 = num.random.randn(3, 3) + np.random.seed(seed) + L2 = np.random.randn(3, 3) + assert L1.ndim == L2.ndim + + def test_negative_seed(self) -> None: + with pytest.raises(ValueError): + np.random.seed(-10) + num.random.seed(-10) + # See https://github.com/nv-legate/cunumeric.internal/issues/484 + # cuNumeric passed with negative value + + def test_seed_float(self) -> None: + msg = r"Cannot cast scalar from dtype('float64') to dtype('int64') " + " according 
to the rule 'safe'" + with pytest.raises(TypeError, match=re.escape(msg)): + np.random.seed(10.5) + + num.random.seed(10.5) + # See https://github.com/nv-legate/cunumeric.internal/issues/199 + # cuNumeric passed with float value - np.random.seed(10) - L1 = np.random.randn(3, 3) - L2 = np.random.randn(3, 3) - assert not np.array_equal(L1, L2) - -@pytest.mark.xfail( - reason = "https://github.com/nv-legate/cunumeric.internal/issues/199" -) -def test_none_num() -> None: - num.random.seed() - L1 = num.random.randn(3, 3) - num.random.seed() - L2 = num.random.randn(3, 3) - assert np.array_equal(L1, L2) - - num.random.seed() - L1 = num.random.randn(3, 3) - L2 = num.random.randn(3, 3) - assert not np.array_equal(L1, L2) - - -@pytest.mark.xfail( - reason = "numpy failures in random.mtrand.RandomState.standard_normal" -) -def test_none_np() -> None: - np.random.seed() - L1 = np.random.randn(3, 3) - np.random.seed() - L2 = np.random.randn(3, 3) - assert not np.array_equal(L1, L2) - - np.random.seed() - L1 = np.random.randn(3, 3) - L2 = np.random.randn(3, 3) - assert not np.array_equal(L1, L2) - - -@pytest.mark.xfail( - reason = "numpy failures in random.mtrand.RandomState.standard_normal" -) -def test_basic_num_np() -> None: - np.random.seed(10) - L1 = np.random.randn(3, 3) - num.random.seed(10) - L2 = num.random.randn(3, 3) - assert not np.array_equal(L1, L2) - - -@pytest.mark.xfail( - reason = "https://github.com/nv-legate/cunumeric.internal/issues/199" -) def test_RandomState() -> None: rdm_num = num.random.RandomState(10) L1 = rdm_num.randn(3, 3) @@ -97,14 +216,6 @@ def test_RandomState() -> None: assert np.array_equal(L1, L2) -def test_float() -> None: - with pytest.raises(TypeError): - np.random.seed(10.5) - # TypeError: 'float' object cannot be interpreted as an integer - num.random.seed(10.5) - # cuNumeric passed with float - - if __name__ == "__main__": import sys diff --git a/tests/integration/test_random_advanced.py b/tests/integration/test_random_advanced.py index 
c470643231..ec4898a72a 100644 --- a/tests/integration/test_random_advanced.py +++ b/tests/integration/test_random_advanced.py @@ -18,7 +18,7 @@ import pytest from utils.random import ModuleGenerator, assert_distribution -import cunumeric as num +import cupynumeric as num LEGATE_TEST = os.environ.get("LEGATE_TEST", None) == "1" @@ -121,7 +121,7 @@ def test_geometric(t): (1.2, 3.1415), marks=pytest.mark.xfail, # NumPy returns 1-dim array - # cuNumeric raises TypeError: float() argument must be a string + # cuPyNumeric raises TypeError: float() argument must be a string # or a real number, not 'tuple' ), ), @@ -219,13 +219,13 @@ def test_random_size_none(func, args): gen_num = num.random.Generator(num.random.XORWOW(seed=seed)) a_np = getattr(gen_np, func)(*args, size=None) a_num = getattr(gen_num, func)(*args, size=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar assert np.ndim(a_np) == np.ndim(a_num) class TestRandomErrors: - # cuNumeric zipf hangs on the invalid args when LEGATE_TEST=1 + # cuPyNumeric zipf hangs on the invalid args when LEGATE_TEST=1 @pytest.mark.skipif(LEGATE_TEST, reason="Test hang when LEGATE_TEST=1") @pytest.mark.parametrize( "dist, expected_exc", diff --git a/tests/integration/test_random_beta.py b/tests/integration/test_random_beta.py index 1670b742a1..bcaf419a38 100644 --- a/tests/integration/test_random_beta.py +++ b/tests/integration/test_random_beta.py @@ -17,7 +17,7 @@ import pytest from utils.random import ModuleGenerator, assert_distribution -import cunumeric as num +import cupynumeric as num BITGENERATOR_ARGS = [ ModuleGenerator, @@ -155,7 +155,7 @@ def test_beta_size_none(func, args): gen_num = num.random.Generator(num.random.XORWOW(seed=seed)) a_np = getattr(gen_np, func)(*args, size=None) a_num = getattr(gen_num, func)(*args, size=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar assert np.ndim(a_np) == np.ndim(a_num) 
diff --git a/tests/integration/test_random_bitgenerator.py b/tests/integration/test_random_bitgenerator.py index c0b8b70a2f..0d6784cef5 100644 --- a/tests/integration/test_random_bitgenerator.py +++ b/tests/integration/test_random_bitgenerator.py @@ -17,7 +17,7 @@ import pytest from utils.random import ModuleGenerator, assert_distribution -import cunumeric as num +import cupynumeric as num BITGENERATOR_ARGS = [ ModuleGenerator, @@ -60,7 +60,7 @@ def test_bitgenerator_size_none(): gen_num = num.random.XORWOW(seed=seed) a_np = gen_np.random_raw(size=None) a_num = gen_num.random_raw(shape=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar assert np.ndim(a_np) == np.ndim(a_num) @@ -254,7 +254,7 @@ def test_random_size_none(func): gen_num = num.random.Generator(num.random.XORWOW(seed=seed)) a_np = getattr(gen_np, func)(size=None) a_num = getattr(gen_num, func)(size=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar assert np.ndim(a_np) == np.ndim(a_num) diff --git a/tests/integration/test_random_creation.py b/tests/integration/test_random_creation.py index 82fd8f5483..9cfecb35fa 100644 --- a/tests/integration/test_random_creation.py +++ b/tests/integration/test_random_creation.py @@ -20,10 +20,10 @@ import pytest from utils.random import assert_distribution -import cunumeric as num +import cupynumeric as num LEGATE_TEST = os.environ.get("LEGATE_TEST", None) == "1" -EAGER_TEST = os.environ.get("CUNUMERIC_FORCE_THUNK", None) == "eager" +EAGER_TEST = os.environ.get("CUPYNUMERIC_FORCE_THUNK", None) == "eager" def test_randn(): @@ -42,14 +42,14 @@ def test_randn(): def reseed_and_gen_random( func: str, seed: Any, *args: Any, **kwargs: Any ) -> tuple[Any, Any]: - """Reseeed singleton rng and generate random in NumPy and cuNumeric.""" + """Reseeed singleton rng and generate random in NumPy and cuPyNumeric.""" return gen_random_from_both(func, *args, **kwargs) def 
gen_random_from_both( func: str, *args: Any, **kwargs: Any ) -> tuple[Any, Any]: - """Call the same random function from both NumPy and cuNumeric.""" + """Call the same random function from both NumPy and cuPyNumeric.""" return ( getattr(np.random, func)(*args, **kwargs), getattr(num.random, func)(*args, **kwargs), @@ -63,21 +63,21 @@ def gen_random_from_both( 12345, marks=pytest.mark.xfail( not EAGER_TEST, - reason="cuNumeric does not respect the singleton generator", + reason="cuPyNumeric does not respect the singleton generator", ), - # https://github.com/nv-legate/cunumeric/issues/601 + # https://github.com/nv-legate/cupynumeric/issues/601 # NumPy: generates the same array after initializing with the seed. - # cuNumeric: keeps generating different arrays. + # cuPyNumeric: keeps generating different arrays. # seed is respected in Eager mode. ), pytest.param(None), pytest.param( (4, 6, 8), marks=pytest.mark.xfail( - reason="cuNumeric does not take tuple as seed" + reason="cuPyNumeric does not take tuple as seed" ), # NumPy: pass - # cuNumeric: from runtime.set_next_random_epoch(int(init)): + # cuPyNumeric: from runtime.set_next_random_epoch(int(init)): # TypeError: int() argument must be a string, a bytes-like object # or a real number, not 'tuple' ), @@ -95,7 +95,7 @@ def test_singleton_seed(seed): @pytest.mark.xfail( EAGER_TEST, - reason="cuNumeric does not respect seed in Eager mode", + reason="cuPyNumeric does not respect seed in Eager mode", ) @pytest.mark.parametrize( "seed", @@ -104,9 +104,9 @@ def test_singleton_seed(seed): pytest.param( (0, 4, 5), marks=pytest.mark.xfail( - reason="cuNumeric fails to generate random" + reason="cuPyNumeric fails to generate random" # NumPy: pass - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error:required argument is not an integer ), ), ], @@ -121,7 +121,7 @@ def test_default_rng_seed(seed): @pytest.mark.xfail( EAGER_TEST, - reason="cuNumeric does not respect seed in Eager 
mode", + reason="cuPyNumeric does not respect seed in Eager mode", ) def test_default_rng_bitgenerator(): seed = 12345 @@ -135,9 +135,9 @@ def test_default_rng_bitgenerator(): @pytest.mark.xfail( EAGER_TEST, - reason="cuNumeric does not respect seed in Eager mode", + reason="cuPyNumeric does not respect seed in Eager mode", ) -@pytest.mark.xfail(reason="cunumeric.internal#135") +@pytest.mark.xfail(reason="cupynumeric.internal#135") def test_default_rng_generator(): steps = 3 seed = 12345 @@ -215,13 +215,13 @@ def test_random_integers_high_limit(): assert np.max(arr_num) <= limit -@pytest.mark.xfail(reason="cuNumeric raises NotImplementedError") +@pytest.mark.xfail(reason="cuPyNumeric raises NotImplementedError") @pytest.mark.parametrize( "low, high", [(3000.45, 15000), (123, 456.7), (12.3, 45.6)], ids=str ) def test_randint_float_range(low, high): # NumPy returns integer scalar - # cuNumeric raises one of the following + # cuPyNumeric raises one of the following # NotImplementedError: 'low' must be an integer # NotImplementedError: 'high' must be an integer or None arr_np, arr_num = gen_random_from_both( @@ -234,13 +234,13 @@ def test_randint_float_range(low, high): @pytest.mark.xfail( not EAGER_TEST, - reason="cuNumeric raises NotImplementedError", + reason="cuPyNumeric raises NotImplementedError", ) @pytest.mark.parametrize("size", ALL_RNG_SIZES, ids=str) @pytest.mark.parametrize("low, high", [(1000, 65535), (0, 1024)], ids=str) @pytest.mark.parametrize("dtype", UINT_DTYPES, ids=str) def test_randint_uint(low, high, dtype, size): - # NotImplementedError: cunumeric.random.randint must be given an integer + # NotImplementedError: cupynumeric.random.randint must be given an integer # dtype # NotImplementedError: type for random.integers has to be int64 or int32 # or int16 @@ -278,7 +278,7 @@ def test_randint_distribution(low, high, size, dtype): @pytest.mark.xfail( not EAGER_TEST, - reason="cuNumeric raises NotImplementedError", + reason="cuPyNumeric raises 
NotImplementedError", ) @pytest.mark.parametrize("size", (1024, 1025)) def test_randint_bool(size): @@ -308,7 +308,7 @@ def test_random_sample_basic_stats(size): @pytest.mark.xfail( - reason="NumPy returns scalar, cuNumeric returns 1-dim array" + reason="NumPy returns scalar, cuPyNumeric returns 1-dim array" ) def test_random_sample_size_none(): arr_np, arr_num = gen_random_from_both("random_sample", size=None) @@ -355,7 +355,7 @@ def assert_exc_from_both(self, func, exc, *args, **kwargs): ], ids=lambda x: f" {str(getattr(x, 'expected_exception', x))} ", ) - @pytest.mark.xfail(reason="NumPy raises exceptions, cuNumeric pass") + @pytest.mark.xfail(reason="NumPy raises exceptions, cuPyNumeric pass") def test_invalid_seed(self, seed, expected_exc): self.assert_exc_from_both("seed", expected_exc, seed) # -100: NumPy raises ValueError: Seed must be between 0 and 2**32 - 1 @@ -363,7 +363,7 @@ def test_invalid_seed(self, seed, expected_exc): # dtype('float64') to dtype('int64') according to the rule 'safe' # "abc": TypeError: Cannot cast scalar from dtype(' 1024 or LEGATE_TEST=1: + # cuPyNumeric size > 1024 or LEGATE_TEST=1: # NotImplementedError: type for random.integers has to be int64 or # int32 or int16 - # cuNumeric size <= 1024 and LEGATE_TEST=0: returns array of booleans + # cuPyNumeric size <= 1024 and LEGATE_TEST=0: returns array of booleans @pytest.mark.parametrize( "size, expected_exc", diff --git a/tests/integration/test_random_gamma.py b/tests/integration/test_random_gamma.py index 56ec28159b..2af341b974 100644 --- a/tests/integration/test_random_gamma.py +++ b/tests/integration/test_random_gamma.py @@ -17,7 +17,7 @@ import pytest from utils.random import ModuleGenerator, assert_distribution -import cunumeric as num +import cupynumeric as num BITGENERATOR_ARGS = [ ModuleGenerator, @@ -140,7 +140,7 @@ def test_gamma_size_none(func): gen_num = num.random.Generator(num.random.XORWOW(seed=seed)) a_np = getattr(gen_np, func)(3.1415, 1.414, size=None) a_num = 
getattr(gen_num, func)(3.1415, 1.414, size=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar assert np.ndim(a_np) == np.ndim(a_num) diff --git a/tests/integration/test_random_straightforward.py b/tests/integration/test_random_straightforward.py index f3e022c35f..19dcb0ddd5 100644 --- a/tests/integration/test_random_straightforward.py +++ b/tests/integration/test_random_straightforward.py @@ -19,7 +19,7 @@ import pytest from utils.random import ModuleGenerator, assert_distribution -import cunumeric as num +import cupynumeric as num BITGENERATOR_ARGS = [ ModuleGenerator, @@ -366,7 +366,7 @@ def test_beta_sizes(t, func, args, size): @pytest.mark.xfail( - reason="cuNumeric returns singleton array; NumPy returns scalar" + reason="cuPyNumeric returns singleton array; NumPy returns scalar" ) @pytest.mark.parametrize("t", BITGENERATOR_ARGS, ids=str) @pytest.mark.parametrize("func, args", FUNC_ARGS, ids=str) @@ -376,7 +376,7 @@ def test_beta_size_none(t, func, args): gen_num = num.random.Generator(t(seed=seed)) a_np = getattr(gen_np, func)(*args, size=None) a_num = getattr(gen_num, func)(*args, size=None) - # cuNumeric returns singleton array + # cuPyNumeric returns singleton array # NumPy returns scalar # print("a_np: %s, a_num=%s\n"%(str(a_np), str(a_num))) assert (1 + np.ndim(a_np)) == np.ndim(a_num) diff --git a/tests/integration/test_reduction.py b/tests/integration/test_reduction.py index ea341ec607..32c237f08a 100644 --- a/tests/integration/test_reduction.py +++ b/tests/integration/test_reduction.py @@ -17,7 +17,7 @@ from utils.comparisons import allclose from utils.utils import AxisError -import cunumeric as num +import cupynumeric as num # numpy.sum(a, axis=None, dtype=None, out=None, keepdims=, # initial=, where=) @@ -88,7 +88,7 @@ def test_dtype_negative(self, dtype): out_np = np.sum(arr_np) # Numpy return sum of all datas out_num = num.sum( arr_num - ) # cuNumeric return an array with different data + ) # 
cuPyNumeric return an array with different data assert allclose(out_np, out_num) def test_axis_out_bound(self): @@ -104,7 +104,7 @@ def test_axis_tuple(self, axis): arr_np = np.random.random(size) * 10 arr_num = num.array(arr_np) out_np = np.sum(arr_np, axis=axis) - # cuNumeric raises NotImplementedError: + # cuPyNumeric raises NotImplementedError: # 'Need support for reducing multiple dimensions' # Numpy get results out_num = num.sum(arr_num, axis=axis) @@ -140,7 +140,7 @@ def test_initial_scalar_list(self): def test_initial_list(self): arr = [[1, 2], [3, 4]] initial_value = [2, 3] - with pytest.raises(ValueError): + with pytest.raises((ValueError, TypeError)): num.sum(arr, initial=initial_value) @pytest.mark.xfail @@ -296,7 +296,7 @@ def test_axis_keepdims(self, size, keepdims): for axis in range(-ndim + 1, ndim, 1): out_np = np.sum(arr_np, axis=axis, keepdims=keepdims) out_num = num.sum(arr_num, axis=axis, keepdims=keepdims) - # in cunumeric/deferred/unary_reduction: + # in cupynumeric/deferred/unary_reduction: # if lhs_array.size == 1: # > assert axes is None or len(axes) == rhs_array.ndim - ( # 0 if keepdims else lhs_array.ndim diff --git a/tests/integration/test_repeat.py b/tests/integration/test_repeat.py index 0df92ff172..eb4b4c4814 100644 --- a/tests/integration/test_repeat.py +++ b/tests/integration/test_repeat.py @@ -14,11 +14,10 @@ # import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array -from utils.utils import AxisError +from utils.utils import ONE_MAX_DIM_RANGE, AxisError -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize( @@ -49,7 +48,7 @@ def test_array_empty_repeats_invalid_negative(repeats): # together with shape (0,) (2,) with pytest.raises(expected_exc): num.repeat([], repeats) - # while cunumeric is pass with the result [] + # while cupynumeric is pass with the result [] @pytest.mark.xfail @@ -158,7 +157,7 @@ def test_array_1d_repeats_fatal_error(arr, 
repeats): # numpy raises "ValueError: negative dimensions are not allowed" with pytest.raises(expected_exc): num.repeat(anum, repeats) - # cuNumeric got "Fatal Python error: Aborted" + # cuPyNumeric got "Fatal Python error: Aborted" @pytest.mark.parametrize("arr", (None, [], 3, [1, 2, 3], [[1, 3], [2, 4]])) @@ -201,7 +200,7 @@ def test_array_axis_negative_equal(): assert np.array_equal(res_np, res_num) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_nd_basic(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) np_array = mk_seq_array(np, a_shape) @@ -212,7 +211,7 @@ def test_nd_basic(ndim): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_nd_axis(ndim): for axis in range(0, ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) @@ -224,7 +223,7 @@ def test_nd_axis(ndim): assert np.array_equal(res_num2, res_np2) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_nd_repeats(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) np_array = mk_seq_array(np, a_shape) diff --git a/tests/integration/test_reshape.py b/tests/integration/test_reshape.py index 696ca0f410..e2a72f335c 100644 --- a/tests/integration/test_reshape.py +++ b/tests/integration/test_reshape.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num SQUARE_CASES = [ (10, 5, 2), @@ -133,7 +133,7 @@ def test_shape(self, shape, order): def test_0d(self, shape): # for shape=None, # In Numpy, pass, returns the flattened 1-D array - # In cuNumeric, raises TypeError: 'NoneType' object is not iterable + # In cuPyNumeric, raises TypeError: 'NoneType' object is not iterable a = num.array(self.anp) assert np.array_equal( num.reshape(a, shape), @@ -154,7 +154,7 
@@ def test_1d(self): ) def test_ravel(self, order): # In Numpy, pass with 'K' - # In cuNumeric, when order is 'K', raise ValueError: + # In cuPyNumeric, when order is 'K', raise ValueError: # order 'K' is not permitted for reshaping a = num.array(self.anp) assert np.array_equal( @@ -165,7 +165,7 @@ def test_ravel(self, order): @pytest.mark.xfail def test_ravel_a_none(self): # In Numpy, pass and returns [None] - # In cuNumeric, raises AttributeError: + # In cuPyNumeric, raises AttributeError: # 'NoneType' object has no attribute 'ravel' assert np.array_equal( num.ravel(None), @@ -197,7 +197,7 @@ def setup_method(self): @pytest.mark.xfail def test_a_none(self): # In Numpy, it raises ValueError: cannot reshape array - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'NoneType' object has no attribute with pytest.raises(ValueError): num.reshape(None, self.shape) diff --git a/tests/integration/test_roll.py b/tests/integration/test_roll.py index 571b67975d..bed4fa402b 100644 --- a/tests/integration/test_roll.py +++ b/tests/integration/test_roll.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num # roll tests adapted directly from numpy/_core/tests/test_numeric.py diff --git a/tests/integration/test_rot90.py b/tests/integration/test_rot90.py index bcc9d00f81..2e0211e4d9 100644 --- a/tests/integration/test_rot90.py +++ b/tests/integration/test_rot90.py @@ -15,7 +15,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num class TestRot90: diff --git a/tests/integration/test_round.py b/tests/integration/test_round.py index 3ea87bab43..22bce80c02 100644 --- a/tests/integration/test_round.py +++ b/tests/integration/test_round.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as 
num FLOAT = ( np.float32, @@ -40,9 +40,9 @@ def test_empty_array(decimals): @pytest.mark.parametrize("decimals", range(-3, 3)) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_basic_float16(ndim, decimals): - shape = (5,) * ndim + shape = (3,) * ndim np_arr = mk_0to1_array(np, shape, dtype=np.float16) num_arr = mk_0to1_array(num, shape, dtype=np.float16) @@ -53,10 +53,10 @@ def test_basic_float16(ndim, decimals): @pytest.mark.parametrize("decimals", range(-5, 5)) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", FLOAT) def test_basic_float(dtype, ndim, decimals): - shape = (5,) * ndim + shape = (3,) * ndim np_arr = mk_0to1_array(np, shape, dtype=dtype) num_arr = mk_0to1_array(num, shape, dtype=dtype) @@ -67,10 +67,10 @@ def test_basic_float(dtype, ndim, decimals): @pytest.mark.parametrize("decimals", range(-5, 5)) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", FLOAT) def test_randomized_float(dtype, ndim, decimals): - shape = (5,) * ndim + shape = (3,) * ndim values = np.random.uniform(-10, 10, shape) * 10**6 np_arr = np.array(values, dtype=dtype) num_arr = num.array(values, dtype=dtype) @@ -82,7 +82,7 @@ def test_randomized_float(dtype, ndim, decimals): @pytest.mark.parametrize("decimals", range(-5, 5)) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("dtype", COMPLEX) def test_randomized_complex(dtype, ndim, decimals): shape = (1,) * ndim diff --git a/tests/integration/test_scan.py b/tests/integration/test_scan.py index 4b102f39cd..a91208037b 100644 --- a/tests/integration/test_scan.py +++ b/tests/integration/test_scan.py @@ -18,7 +18,7 @@ import pytest from utils.generators import mk_0to1_array -import cunumeric as num 
+import cupynumeric as num def _gen_array(n0, shape, dt, axis, outtype): @@ -43,9 +43,9 @@ def _gen_array(n0, shape, dt, axis, outtype): A[(1,) * len(shape)] = np.nan elif n0 == "second_half": # second from last element along all axes is a NAN - A[ - tuple(map(lambda i, j: i - j, A.shape, (2,) * len(A.shape))) - ] = np.nan + A[tuple(map(lambda i, j: i - j, A.shape, (2,) * len(A.shape)))] = ( + np.nan + ) if outtype is None: B = None C = None @@ -86,7 +86,7 @@ def _run_tests(op, n0, shape, dt, axis, out0, outtype): else: print("FAIL!") print(f"INPUT : {A}") - print(f"CUNUMERIC: {B}") + print(f"CUPYNUMERIC: {B}") print(f"NUMPY : {C}") assert False @@ -136,7 +136,7 @@ def _run_tests(op, n0, shape, dt, axis, out0, outtype): "dtype, outtype", [ pytest.param(np.int16, np.float64, marks=pytest.mark.xfail), - # NumPy and cuNumeric produce different values + # NumPy and cuPyNumeric produce different values # out_np: array([0., 0., 0., 0., 0., 0.]) # out_num: array([0.16666667, 0.05555556, 0.02777778, 0.01851852, # 0.0154321, 0.0154321 ])) @@ -149,7 +149,7 @@ def _run_tests(op, n0, shape, dt, axis, out0, outtype): "op", [ pytest.param("cumsum", marks=pytest.mark.xfail), - # cunumeric.cumsum returns different value to numpy.cumsum + # cupynumeric.cumsum returns different value to numpy.cumsum # out_np: array([0., 0., 0., 0., 0., 0.]) # out_num: # array([6.8983227e-310, 6.8983227e-310, 6.8983227e-310, @@ -256,7 +256,7 @@ def test_axis_out_of_bound(self, op, axis): def test_out_invalid_shape(self, op, out_shape): # for all ops and all out_shape, # in Numpy, it raises ValueError - # in cuNumeric, it raises NotImplementedError + # in cuPyNumeric, it raises NotImplementedError expected_exc = ValueError A = [1, 2, 3, 4] out_np = np.zeros(out_shape) diff --git a/tests/integration/test_searchsorted.py b/tests/integration/test_searchsorted.py index 8ef77f944b..43a749fd00 100644 --- a/tests/integration/test_searchsorted.py +++ b/tests/integration/test_searchsorted.py @@ -15,11 +15,11 
@@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num -# cunumeric.searchsorted(a: ndarray, v: Union[int, float, ndarray], +# cupynumeric.searchsorted(a: ndarray, v: Union[int, float, ndarray], # side: Literal['left', 'right'] = 'left', # sorter: Optional[ndarray] = None) → Union[int, ndarray] @@ -80,13 +80,13 @@ def test_val_none(self): # instances of 'NoneType' and 'NoneType' with pytest.raises(expected_exc): num.searchsorted(arr, None) - # cuNumeric raises AssertionError + # cuPyNumeric raises AssertionError # if self.deferred is None: # if self.parent is None: # > assert self.runtime.is_supported_dtype # (self.array.dtype) # E AssertionError - # cunumeric/cunumeric/eager.py:to_deferred_array() + # cupynumeric/cupynumeric/eager.py:to_deferred_array() @pytest.mark.xfail def test_side_invalid(self): @@ -98,7 +98,7 @@ def test_side_invalid(self): # (got 'hi') with pytest.raises(expected_exc): num.searchsorted(arr, 10, "hi") - # cuNumeric passed, and the result is the same as that of 'right'. + # cuPyNumeric passed, and result is the same as that of 'right'. 
def test_ndim_mismatch(self): a = np.random.random((5, 5, 5)) @@ -208,7 +208,7 @@ def test_standard_cases(volume, dtype, side): check_api(generate_random(volume, dtype), side=side) -@pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) @pytest.mark.parametrize("side", SIDES) def test_ndim(ndim, side): a = np.random.randint(-100, 100, size=100) diff --git a/tests/integration/test_set_item.py b/tests/integration/test_set_item.py index bfbda631dc..67e701ebb1 100644 --- a/tests/integration/test_set_item.py +++ b/tests/integration/test_set_item.py @@ -15,7 +15,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_setflags.py b/tests/integration/test_setflags.py index a3bc81699b..86cbbf0faf 100644 --- a/tests/integration/test_setflags.py +++ b/tests/integration/test_setflags.py @@ -14,9 +14,9 @@ # import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num @pytest.mark.parametrize("write", (None, False, True, 1, -1, 100, "11")) @@ -73,7 +73,7 @@ def test_writeable(): array_num = num.array([0, 0, 0, 0, 0]) array_np.setflags(1) array_num.setflags(1) - # cuNumeric raises ValueError: cannot set WRITEABLE flag to + # cuPyNumeric raises ValueError: cannot set WRITEABLE flag to # True of this array array_np[2] = 1 array_num[2] = 1 @@ -95,10 +95,10 @@ def test_logic(): expected_exc = ValueError with pytest.raises(expected_exc): array_num.setflags(uic=True) - # cuNumeric raises ValueError: cannot set WRITEBACKIFCOPY flag to True + # cuPyNumeric: ValueError: cannot set WRITEBACKIFCOPY flag to True -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_set_write_true(ndim): shape = (3,) * ndim array_np = np.random.randint(1, 100, shape, dtype=int) @@ -108,7 +108,7 @@ 
def test_set_write_true(ndim): assert array_np.flags["WRITEABLE"] == array_num.flags["WRITEABLE"] -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_set_write_false(ndim): shape = (3,) * ndim array_np = np.random.randint(1, 100, shape, dtype=int) @@ -118,7 +118,7 @@ def test_set_write_false(ndim): assert array_np.flags["WRITEABLE"] == array_num.flags["WRITEABLE"] -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_set_align_true(ndim): shape = (3,) * ndim array_np = np.random.randint(1, 100, shape, dtype=int) @@ -129,7 +129,7 @@ def test_set_align_true(ndim): @pytest.mark.xfail -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_set_align_false(ndim): shape = (3,) * ndim array_np = np.random.randint(1, 100, shape, dtype=int) diff --git a/tests/integration/test_shape.py b/tests/integration/test_shape.py index e134f64ae3..05e4848c05 100644 --- a/tests/integration/test_shape.py +++ b/tests/integration/test_shape.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_singleton_access.py b/tests/integration/test_singleton_access.py index 8d146a35b1..8278d9f14d 100644 --- a/tests/integration/test_singleton_access.py +++ b/tests/integration/test_singleton_access.py @@ -15,14 +15,14 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array, scalar_gen +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def nonscalar_gen(lib): - for ndim in range(1, LEGATE_MAX_DIM + 1): + for ndim in MAX_DIM_RANGE: yield mk_0to1_array(lib, ndim * (5,)) @@ -55,7 +55,7 @@ def array_gen(lib): # get "multiple" items from scalar array for arr in scalar_gen(lib, 42): yield arr[arr.ndim * 
(slice(None),)] # arr[:,:] - # TODO: fix cunumeric#34 + # TODO: fix cupynumeric#34 # yield arr[arr.ndim * (slice(1, None),)] # arr[1:,1:] # set single item on non-scalar array for arr in nonscalar_gen(lib): @@ -87,7 +87,7 @@ def array_gen(lib): for arr in scalar_gen(lib, 42): arr[arr.ndim * (slice(None),)] = -1 # arr[:,:] = -1 yield arr - # TODO: fix cunumeric#34 + # TODO: fix cupynumeric#34 # for arr in scalar_gen(lib, 42): # arr[arr.ndim * (slice(1, None),)] = -1 # arr[1:,1:] = -1 # yield arr diff --git a/tests/integration/test_slicing.py b/tests/integration/test_slicing.py index 38e4a5f518..1d186f129d 100644 --- a/tests/integration/test_slicing.py +++ b/tests/integration/test_slicing.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_solve.py b/tests/integration/test_solve.py index 5747693001..3ecf30f123 100644 --- a/tests/integration/test_solve.py +++ b/tests/integration/test_solve.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num SIZES = (8, 9, 255) diff --git a/tests/integration/test_sort.py b/tests/integration/test_sort.py index 2618d0491f..4b5ef09046 100644 --- a/tests/integration/test_sort.py +++ b/tests/integration/test_sort.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DIM = 5 SIZES = [ @@ -42,7 +42,7 @@ SORT_TYPES = ["quicksort", "mergesort", "heapsort", "stable"] -# cunumeric.sort(a: ndarray, axis: int = -1, +# cupynumeric.sort(a: ndarray, axis: int = -1, # kind: SortType = 'quicksort', order: Optional = None) → ndarray # ndarray.sort(axis=-1, kind=None, order=None) @@ -78,7 +78,7 @@ def test_sorttype_invalid(self): res_num = num.sort(arr_num, kind="negative") # Numpy raises "ValueError: sort kind must be one of 'quick', 'heap', # or 'stable' (got 'negative')" - # cuNumeric passed. 
The code basically supports ‘stable’ + # cuPyNumeric passed. The code basically supports ‘stable’ # or not ‘stable’. assert np.array_equal(res_num, res_np) @@ -104,7 +104,7 @@ def test_basic_axis_sort_type(self, size, sort_type): @pytest.mark.skip @pytest.mark.parametrize("size", SIZES) def test_arr_basic_axis(self, size): - # Set skip due to https://github.com/nv-legate/cunumeric/issues/781 + # Set skip due to https://github.com/nv-legate/cupynumeric/issues/781 arr_np = np.random.randint(-100, 100, size) arr_num = num.array(arr_np) for axis in range(-arr_num.ndim + 1, arr_num.ndim): @@ -118,7 +118,7 @@ def test_arr_basic_axis(self, size): @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("sort_type", SORT_TYPES) def test_arr_basic_axis_sort(self, size, sort_type): - # Set skip due to https://github.com/nv-legate/cunumeric/issues/781 + # Set skip due to https://github.com/nv-legate/cupynumeric/issues/781 arr_np = np.random.randint(-100, 100, size) arr_num = num.array(arr_np) for axis in range(-arr_num.ndim + 1, arr_num.ndim): @@ -131,7 +131,7 @@ def test_arr_basic_axis_sort(self, size, sort_type): @pytest.mark.skip @pytest.mark.parametrize("size", SIZES) def test_compare_arr_axis(self, size): - # Set skip due to https://github.com/nv-legate/cunumeric/issues/781 + # Set skip due to https://github.com/nv-legate/cupynumeric/issues/781 arr_num = num.random.randint(-100, 100, size) for axis in range(-arr_num.ndim + 1, arr_num.ndim): arr_num_copy = arr_num @@ -143,7 +143,7 @@ def test_compare_arr_axis(self, size): @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("sort_type", SORT_TYPES) def test_compare_arr_axis_sort(self, size, sort_type): - # Set skip due to https://github.com/nv-legate/cunumeric/issues/781 + # Set skip due to https://github.com/nv-legate/cupynumeric/issues/781 arr_num = num.random.randint(-100, 100, size) for axis in range(-arr_num.ndim + 1, arr_num.ndim): arr_num_copy = arr_num @@ -181,7 +181,7 @@ def 
test_basic_complex_axis_sort(self, size, sort_type): @pytest.mark.parametrize("size", SIZES) @pytest.mark.parametrize("sort_type", SORT_TYPES) def test_compare_complex_arr_axis_sort(self, size, sort_type): - # Set skip due to https://github.com/nv-legate/cunumeric/issues/781 + # Set skip due to https://github.com/nv-legate/cupynumeric/issues/781 arr_num = ( num.random.randint(-100, 100, size) + num.random.randint(-100, 100, size) * 1.0j diff --git a/tests/integration/test_sort_complex.py b/tests/integration/test_sort_complex.py index eeb0bc85d6..9216604453 100644 --- a/tests/integration/test_sort_complex.py +++ b/tests/integration/test_sort_complex.py @@ -16,9 +16,9 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num -# cunumeric.sort_complex(a: ndarray) → ndarray +# cupynumeric.sort_complex(a: ndarray) → ndarray DIM = 5 SIZES = [ diff --git a/tests/integration/test_split.py b/tests/integration/test_split.py index 3fcf6aa663..b5416cca83 100644 --- a/tests/integration/test_split.py +++ b/tests/integration/test_split.py @@ -18,16 +18,16 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num -# cunumeric.split(a: ndarray, indices: Union[int, ndarray], axis: int = 0) -# → list[cunumeric.array.ndarray] -# cunumeric.vsplit(a: ndarray, indices: Union[int, ndarray]) -# → list[cunumeric.array.ndarray] (axis=0) -# cunumeric.hsplit(a: ndarray, indices: Union[int, ndarray]) -# → list[cunumeric.array.ndarray] (axis=1) -# cunumeric.dsplit(a: ndarray, indices: Union[int, ndarray]) -# → list[cunumeric.array.ndarray] (axis=2) +# cupynumeric.split(a: ndarray, indices: Union[int, ndarray], axis: int = 0) +# → list[cupynumeric.array.ndarray] +# cupynumeric.vsplit(a: ndarray, indices: Union[int, ndarray]) +# → list[cupynumeric.array.ndarray] (axis=0) +# cupynumeric.hsplit(a: ndarray, indices: Union[int, ndarray]) +# → list[cupynumeric.array.ndarray] (axis=1) +# cupynumeric.dsplit(a: ndarray, indices: Union[int, 
ndarray]) +# → list[cupynumeric.array.ndarray] (axis=2) DIM = 6 @@ -142,7 +142,7 @@ def test_dimensions_vsplit(self): expected_exc = ValueError with pytest.raises(expected_exc): num.vsplit(ary, 1) - # cuNumeric returns [array([], dtype=float64)] + # cuPyNumeric returns [array([], dtype=float64)] with pytest.raises(expected_exc): np.vsplit(ary, 1) # Numpy raises @@ -154,7 +154,7 @@ def test_dimensions_vsplit_1(self): expected_exc = ValueError with pytest.raises(expected_exc): num.vsplit(ary, 1) - # cuNumeric returns the array + # cuPyNumeric returns the array with pytest.raises(expected_exc): np.vsplit(ary, 1) # Numpy raises diff --git a/tests/integration/test_squeeze.py b/tests/integration/test_squeeze.py index c68cae2796..ade60a1acc 100644 --- a/tests/integration/test_squeeze.py +++ b/tests/integration/test_squeeze.py @@ -17,7 +17,7 @@ import pytest from utils.utils import AxisError -import cunumeric as num +import cupynumeric as num DIM = 5 SIZES = [ @@ -55,36 +55,28 @@ def test_none_array(): num.squeeze(None) -def test_num_invalid_axis(): +def test_invalid_axis() -> None: size = (1, 2, 1) - a = num.random.randint(low=-10, high=10, size=size) + a_np = np.random.randint(low=-10, high=10, size=size) msg = r"one" with pytest.raises(ValueError, match=msg): - num.squeeze(a, axis=1) + np.squeeze(a_np, axis=1) - -def test_array_invalid_axis(): - size = (1, 2, 1) - a = num.random.randint(low=-10, high=10, size=size) - msg = r"one" + a_num = num.array(a_np) with pytest.raises(ValueError, match=msg): - a.squeeze(axis=1) + num.squeeze(a_num, axis=1) -def test_num_axis_out_bound(): +def test_axis_out_bound() -> None: size = (1, 2, 1) - a = num.random.randint(low=-10, high=10, size=size) - msg = r"bounds" + a_np = np.random.randint(low=-10, high=10, size=size) + msg = r"axis 3 is out of bounds for array of dimension 3" with pytest.raises(AxisError, match=msg): - num.squeeze(a, axis=3) - + np.squeeze(a_np, axis=3) -def test_array_axis_out_bound(): - size = (1, 2, 1) - a = 
num.random.randint(-10, 10, size=size) - msg = r"bounds" - with pytest.raises(AxisError, match=msg): - a.squeeze(axis=3) + a_num = num.array(a_np) + with pytest.raises(ValueError, match=msg): + num.squeeze(a_num, axis=3) @pytest.mark.parametrize("axes", (-1, -3)) diff --git a/tests/integration/test_stack.py b/tests/integration/test_stack.py index d981a94f22..23121d2567 100644 --- a/tests/integration/test_stack.py +++ b/tests/integration/test_stack.py @@ -17,7 +17,7 @@ import pytest -import cunumeric._utils.stack as m # module under test +import cupynumeric._utils.stack as m # module under test def test_find_last_user_stacklevel() -> None: diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py index 12f82e598d..285c74d2de 100644 --- a/tests/integration/test_stats.py +++ b/tests/integration/test_stats.py @@ -19,7 +19,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num np.random.seed(143) @@ -43,14 +43,14 @@ def check_result(in_np, out_np, out_num, **isclose_kwargs): and out_np.dtype == out_num.dtype ) if not result and not is_negative_test: - print("cunumeric failed the test") + print("cupynumeric failed the test") print("Input:") print(in_np) print(f"dtype: {in_np.dtype}") print("NumPy output:") print(out_np) print(f"dtype: {out_np.dtype}") - print("cuNumeric output:") + print("cuPyNumeric output:") print(out_num) print(f"dtype: {out_num.dtype}") return result diff --git a/tests/integration/test_svd.py b/tests/integration/test_svd.py index a8c26d22e7..b20bf82991 100644 --- a/tests/integration/test_svd.py +++ b/tests/integration/test_svd.py @@ -17,7 +17,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num SIZES = (8, 9, 255) diff --git a/tests/integration/test_swapaxes.py b/tests/integration/test_swapaxes.py index ded45a663c..124a0a00a3 100644 --- a/tests/integration/test_swapaxes.py +++ b/tests/integration/test_swapaxes.py @@ 
-17,7 +17,7 @@ import pytest from utils.generators import mk_seq_array -import cunumeric as num +import cupynumeric as num a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) diff --git a/tests/integration/test_take.py b/tests/integration/test_take.py index 07e34567aa..ee967b77d6 100644 --- a/tests/integration/test_take.py +++ b/tests/integration/test_take.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num x = mk_seq_array(np, (3, 4, 5)) x_num = mk_seq_array(num, (3, 4, 5)) @@ -110,11 +110,11 @@ def test_empty_array_and_indices(): ((4,), (0,), pytest.param((2, 2), marks=pytest.mark.xfail)), ids=lambda shape_in: f"(shape_in={shape_in})", ) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_default_mode(ndim, shape_in): # for shape_in=(2, 2) and ndim=4, # In Numpy, pass - # In cuNumeric, it raises ValueError: + # In cuPyNumeric, it raises ValueError: # Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -138,11 +138,11 @@ def test_ndim_default_mode(ndim, shape_in): ((8,), pytest.param((3, 4), marks=pytest.mark.xfail)), ids=lambda shape_in: f"(shape_in={shape_in})", ) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim_mode(ndim, mode, shape_in): # for shape_in=(3, 4) and ndim=4, # In Numpy, pass - # In cuNumeric, it raises ValueError: + # In cuPyNumeric, it raises ValueError: # Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM shape = (5,) * ndim np_arr = mk_seq_array(np, shape) @@ -260,7 +260,7 @@ def test_out_invalid_dtype(self, dtype): # In Numpy, # for np.float32, it raises TypeError # for np.int64 and np.int32, it pass - # In cuNumeric, + # In cuPyNumeric, # for 
np.float32, it raises ValueError # for np.int32, it raises ValueError # for np.int64, it pass diff --git a/tests/integration/test_take_along_axis.py b/tests/integration/test_take_along_axis.py index a2c930e100..abbe466ded 100644 --- a/tests/integration/test_take_along_axis.py +++ b/tests/integration/test_take_along_axis.py @@ -15,15 +15,15 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import broadcasts_to_along_axis, mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num N = 10 -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) def test_ndim(ndim): shape = (N,) * ndim np_arr = mk_seq_array(np, shape) @@ -89,7 +89,7 @@ def test_indices_bad_type(self, dtype): ) def test_indices_bad_shape(self, shape): # In Numpy, it raises IndexError. - # In cuNumeric, it raises ValueError. + # In cuPyNumeric, it raises ValueError. ai = num.ones(shape, dtype=int) msg = "shape mismatch: indexing arrays could not be broadcast" with pytest.raises(IndexError, match=msg): diff --git a/tests/integration/test_tensordot.py b/tests/integration/test_tensordot.py index 0091a46277..0c6212173b 100644 --- a/tests/integration/test_tensordot.py +++ b/tests/integration/test_tensordot.py @@ -14,12 +14,12 @@ # import pytest -from legate.core import LEGATE_MAX_DIM from utils.contractions import check_default from utils.generators import mk_0to1_array +from utils.utils import MAX_DIM_RANGE -import cunumeric as num -from cunumeric._utils.linalg import tensordot_modes +import cupynumeric as num +from cupynumeric._utils.linalg import tensordot_modes def gen_axes(a_ndim, b_ndim): @@ -29,8 +29,8 @@ def gen_axes(a_ndim, b_ndim): yield ([0, 1], [1, 0]) -@pytest.mark.parametrize("b_ndim", range(LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("a_ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("b_ndim", MAX_DIM_RANGE) 
+@pytest.mark.parametrize("a_ndim", MAX_DIM_RANGE) def test_tensordot(a_ndim, b_ndim): for axes in gen_axes(a_ndim, b_ndim): name = f"tensordot({a_ndim} x {b_ndim}, axes={axes})" @@ -71,7 +71,7 @@ def test_axis_invalid_value(self, axis): ) def test_axis_invalid_index(self, axis): # In Numpy, for both cases, it raises IndexError - # In cuNumeric, for both cases, it raises ValueError + # In cuPyNumeric, for both cases, it raises ValueError with pytest.raises(IndexError): num.tensordot(self.A, self.B, axis) diff --git a/tests/integration/test_tile.py b/tests/integration/test_tile.py index 72ce9ee616..3ace08b807 100644 --- a/tests/integration/test_tile.py +++ b/tests/integration/test_tile.py @@ -15,7 +15,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_negative(): diff --git a/tests/integration/test_trace.py b/tests/integration/test_trace.py index 4423f83a97..105d40bbae 100644 --- a/tests/integration/test_trace.py +++ b/tests/integration/test_trace.py @@ -17,10 +17,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import TWO_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def test_2d(): @@ -69,7 +69,7 @@ def test_4d(): assert np.array_equal(res, res_num) -@pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", TWO_MAX_DIM_RANGE) def test_ndim(ndim): a_shape = tuple(np.random.randint(1, 9) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) @@ -101,7 +101,7 @@ def test_ndim(ndim): def test_offset(offset): # For -3, -2, 3 # In Numpy, pass and return 0 - # In cuNumeric, it raises ValueError: + # In cuPyNumeric, it raises ValueError: # 'offset' for diag or diagonal must be in range a = np.arange(24).reshape((2, 3, 4)) a_num = num.array(a) @@ -119,7 +119,7 @@ def test_offset(offset): def test_negative_axes(axes): # For all 3 cases, # In Numpy, pass - # In cuNumeric, it 
raises ValueError: + # In cuPyNumeric, it raises ValueError: # axes must be the same size as ndim for transpose axis1, axis2 = axes a = np.arange(24).reshape((2, 3, 4)) @@ -158,7 +158,7 @@ def test_invalid_arrays(self, array): def test_axes_none(self, axes): # For (None, None) # In Numpy, it raises TypeError - # In cuNumeric, it pass + # In cuPyNumeric, it pass expected_exc = TypeError axis1, axis2 = axes with pytest.raises(expected_exc): diff --git a/tests/integration/test_transpose.py b/tests/integration/test_transpose.py index f97fa3b196..f0ff40165b 100644 --- a/tests/integration/test_transpose.py +++ b/tests/integration/test_transpose.py @@ -15,7 +15,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num DIM = 5 SIZES = [ @@ -61,7 +61,7 @@ def test_int_axis(self): size = (2, 3, 4) a = num.random.randint(low=-10, high=10, size=size) # numpy raises "ValueError: axes don't match array". - # cunumeric raises "TypeError". + # cupynumeric raises "TypeError". with pytest.raises(TypeError): num.transpose(a, axes=2) @@ -70,7 +70,7 @@ def test_int_axis_compare(self): size = (2, 3, 4) a = num.random.randint(low=-10, high=10, size=size) # numpy raises "ValueError: axes don't match array". - # cunumeric raises "TypeError". + # cupynumeric raises "TypeError". with pytest.raises(ValueError): num.transpose(a, axes=2) @@ -101,8 +101,8 @@ def test_axes_1d(self, size): @pytest.mark.parametrize("size", (0, 1, DIM)) @pytest.mark.parametrize("axes", (-3, 3)) def test_axes_1d_int(self, size, axes): - # For cunumeric, if array.dim==1, it returns the array itself directly, - # no matter what the axes value is. + # For cupynumeric, if array.dim==1, it returns the array itself + # directly, no matter what the axes value is. # For numpy, it raises # "AxisError: axis * is out of bounds for array of dimension 1". 
a = np.random.randint(low=-10, high=10, size=size) @@ -115,8 +115,8 @@ def test_axes_1d_int(self, size, axes): @pytest.mark.parametrize("size", (0, 1, DIM)) @pytest.mark.parametrize("axes", ((1,), (3, 1))) def test_axes_1d_tuple(self, size, axes): - # For cunumeric, if array.dim==1, it returns the array itself directly, - # no matter what the axes value is. + # For cupynumeric, if array.dim==1, it returns the array itself + # directly, no matter what the axes value is. # For numpy, it raises "ValueError: axes don't match array". a = np.random.randint(low=-10, high=10, size=size) b = num.array(a) @@ -172,7 +172,7 @@ def test_int_axis(self): size = (2, 3, 4) a = num.random.randint(low=-10, high=10, size=size) # numpy raises "ValueError: axes don't match array". - # cunumeric raises "TypeError". + # cupynumeric raises "TypeError". with pytest.raises(TypeError): a.transpose(axes=2) @@ -181,7 +181,7 @@ def test_int_axis_compare(self): size = (2, 3, 4) a = num.random.randint(low=-10, high=10, size=size) # numpy raises "ValueError: axes don't match array". - # cunumeric raises "TypeError". + # cupynumeric raises "TypeError". with pytest.raises(ValueError): a.transpose(axes=2) @@ -212,8 +212,8 @@ def test_axes_1d(self, size): @pytest.mark.parametrize("size", (0, 1, DIM)) @pytest.mark.parametrize("axes", (-3, 3)) def test_axes_1d_int(self, size, axes): - # For cunumeric, if array.dim==1, it returns the array itself directly, - # no matter what the axes value is. + # For cupynumeric, if array.dim==1, it returns the array itself + # directly, no matter what the axes value is. # For Numpy, it raises # "AxisError: axis * is out of bounds for array of dimension 1". 
a = np.random.randint(low=-10, high=10, size=size) @@ -226,8 +226,8 @@ def test_axes_1d_int(self, size, axes): @pytest.mark.parametrize("size", (0, 1, DIM)) @pytest.mark.parametrize("axes", ((1,), (3, 1))) def test_axes_1d_tuple(self, size, axes): - # For cunumeric, if array.dim==1, it returns the array itself directly, - # no matter what the axes value is. + # For cupynumeric, if array.dim==1, it returns the array itself + # directly, no matter what the axes value is. # For Numpy, it raises "ValueError: axes don't match array". a = np.random.randint(low=-10, high=10, size=size) b = num.array(a) diff --git a/tests/integration/test_tri.py b/tests/integration/test_tri.py index 194ed2b122..a348614663 100644 --- a/tests/integration/test_tri.py +++ b/tests/integration/test_tri.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num KS = (0, -1, 1, -2, 2) N = 100 @@ -25,7 +25,7 @@ @pytest.mark.parametrize("n", (0, 1, N), ids=lambda n: f"(n={n})") def test_tri_n(n): - print_msg = f"np & cunumeric.tri({n})" + print_msg = f"np & cupynumeric.tri({n})" check_module_function("tri", [n], {}, print_msg) @@ -33,13 +33,13 @@ def test_tri_n(n): @pytest.mark.parametrize("m", (1, 10, N), ids=lambda m: f"(M={m})") @pytest.mark.parametrize("n", (1, N), ids=lambda n: f"(n={n})") def test_tri_full(n, m, k): - print_msg = f"np & cunumeric.tri({n}, k={k}, M={m})" + print_msg = f"np & cupynumeric.tri({n}, k={k}, M={m})" check_module_function("tri", [n], {"k": k, "M": m}, print_msg) @pytest.mark.parametrize("m", (0, None), ids=lambda m: f"(M={m})") def test_tri_m(m): - print_msg = f"np & cunumeric.tri({N}, M={m})" + print_msg = f"np & cupynumeric.tri({N}, M={m})" check_module_function("tri", [N], {"M": m}, print_msg) @@ -53,18 +53,18 @@ def test_tri_m(m): @pytest.mark.parametrize("dtype", DTYPES, ids=str) def test_tri_dtype(dtype): - # cuNumeric: returns an array with dtype=int + # cuPyNumeric: returns an array with 
dtype=int # Numpy: returns an array with dtype=float - print_msg = f"np & cunumeric.tri({N}, dtype={dtype})" + print_msg = f"np & cupynumeric.tri({N}, dtype={dtype})" check_module_function("tri", [N], {"dtype": dtype}, print_msg) @pytest.mark.xfail @pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})") def test_tri_float_k(k): - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error: required argument is not an integer # Numpy: pass - print_msg = f"np & cunumeric.tri({N}, k={k})" + print_msg = f"np & cupynumeric.tri({N}, k={k})" check_module_function("tri", [N], {"k": k}, print_msg) @@ -140,7 +140,7 @@ def test_n_none(self): @pytest.mark.xfail def test_k_none(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is bad operand type for unary -: 'NoneType' diff --git a/tests/integration/test_trilu.py b/tests/integration/test_trilu.py index 30b22c51ce..447add7c57 100644 --- a/tests/integration/test_trilu.py +++ b/tests/integration/test_trilu.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num KS = (0, -1, 1, -2, 2) FUNCTIONS = ("tril", "triu") @@ -61,7 +61,7 @@ def test_trilu(func, shape, dtype, k): @pytest.mark.parametrize("k", (-2.5, 0.0, 2.5), ids=lambda k: f"(k={k})") @pytest.mark.parametrize("func", FUNCTIONS) def test_trilu_float_k(func, k): - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error: required argument is not an integer # Numpy: pass shape = (10, 10) anp = np.ones(shape) @@ -78,7 +78,7 @@ def test_arr_none(self): @pytest.mark.xfail def test_k_none(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is bad operand type for unary -: 'NoneType' diff --git 
a/tests/integration/test_trilu_indices.py b/tests/integration/test_trilu_indices.py index 8f1604e457..6233e11649 100644 --- a/tests/integration/test_trilu_indices.py +++ b/tests/integration/test_trilu_indices.py @@ -17,7 +17,7 @@ import pytest from utils.utils import check_module_function -import cunumeric as num +import cupynumeric as num KS = (0, -1, 1, -2, 2) FUNCTIONS_INDICES = ("tril_indices", "triu_indices") @@ -39,7 +39,7 @@ def _test_from(func, shape, k): @pytest.mark.parametrize("n", (0, 1, 100), ids=lambda n: f"(n={n})") @pytest.mark.parametrize("func", FUNCTIONS_INDICES) def test_trilu_indices_default(func, n): - print_msg = f"np & cunumeric.{func}({n})" + print_msg = f"np & cupynumeric.{func}({n})" check_module_function(func, [n], {}, print_msg) @@ -48,14 +48,14 @@ def test_trilu_indices_default(func, n): @pytest.mark.parametrize("n", (1, N), ids=lambda n: f"(n={n})") @pytest.mark.parametrize("func", FUNCTIONS_INDICES) def test_trilu_indices_full(func, n, m, k): - print_msg = f"np & cunumeric.{func}({n}, k={k}, m={m})" + print_msg = f"np & cupynumeric.{func}({n}, k={k}, m={m})" check_module_function(func, [n], {"k": k, "m": m}, print_msg) @pytest.mark.parametrize("m", (0, None), ids=lambda m: f"(m={m})") @pytest.mark.parametrize("func", FUNCTIONS_INDICES) def test_trilu_indices_m(func, m): - print_msg = f"np & cunumeric.{func}({N}, m={m})" + print_msg = f"np & cupynumeric.{func}({N}, m={m})" check_module_function(func, [N], {"m": m}, print_msg) @@ -63,9 +63,9 @@ def test_trilu_indices_m(func, m): @pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})") @pytest.mark.parametrize("func", FUNCTIONS_INDICES) def test_trilu_indices_float_k(func, k): - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error: required argument is not an integer # Numpy: pass - print_msg = f"np & cunumeric.{func}({N}, k={k})" + print_msg = f"np & cupynumeric.{func}({N}, k={k})" check_module_function(func, [N], {"k": k}, 
print_msg) @@ -141,7 +141,7 @@ def test_n_none(self): @pytest.mark.xfail def test_k_none(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is bad operand type for unary -: 'NoneType' @@ -181,7 +181,7 @@ def test_trilu_indices_from_empty_array(func, shape): @pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})") @pytest.mark.parametrize("func", FUNCTIONS_INDICES_FROM) def test_trilu_indices_from_float_k(func, k): - # cuNumeric: struct.error: required argument is not an integer + # cuPyNumeric: struct.error: required argument is not an integer # Numpy: pass shape = (10, 10) _test_from(func, shape, k) @@ -221,7 +221,7 @@ def test_arr_none(self, func): @pytest.mark.xfail def test_k_none(self): - # In cuNumeric, it raises struct.error, + # In cuPyNumeric, it raises struct.error, # msg is required argument is not an integer # In Numpy, it raises TypeError, # msg is bad operand type for unary -: 'NoneType' diff --git a/tests/integration/test_unary_functions_2d_complex.py b/tests/integration/test_unary_functions_2d_complex.py index df3a896033..897ccc7630 100644 --- a/tests/integration/test_unary_functions_2d_complex.py +++ b/tests/integration/test_unary_functions_2d_complex.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num xn = np.array( [[1 + 2j, 3 - 4j, 5 + 6j], [7 - 8j, -9 + 10j, -11 - 12j]], complex diff --git a/tests/integration/test_unary_ufunc.py b/tests/integration/test_unary_ufunc.py index 5e264c6777..9d60252599 100644 --- a/tests/integration/test_unary_ufunc.py +++ b/tests/integration/test_unary_ufunc.py @@ -17,9 +17,19 @@ import numpy as np import pytest +from packaging.version import Version from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num + +complex_data = [ + 1 + 1j, + -1 - 1j, + 5 + 1j, + 1 + 0.5j, + 2.0 + 1.4j, + -1 
+ 2j, +] def deterministic_op_test(func): @@ -47,14 +57,14 @@ def check_result(op, in_np, out_np, out_num, **isclose_kwargs): and out_np.dtype == out_num.dtype ) if not result: - print(f"cunumeric.{op} failed the test") + print(f"cupynumeric.{op} failed the test") print("Input:") print(in_np) print(f"dtype: {in_np.dtype}") print("NumPy output:") print(out_np) print(f"dtype: {out_np.dtype}") - print("cuNumeric output:") + print("cuPyNumeric output:") print(out_num) print(f"dtype: {out_num.dtype}") return result @@ -90,7 +100,7 @@ def check_op(op, in_np, out_dtype="d", **check_kwargs): assert check_result(op, in_np, out_np, out_num, **check_kwargs) - # Ask cuNumeric to produce outputs to NumPy ndarrays + # Ask cuPyNumeric to produce outputs to NumPy ndarrays out_num = np.ones(out_np.shape, dtype=out_dtype) op_num(in_num, out_num) @@ -118,12 +128,15 @@ def check_op_input( astype=None, out_dtype="d", replace_zero=None, + complex_type=False, **check_kwargs, ): if randint: assert a_min is not None assert a_max is not None in_np = np.random.randint(a_min, a_max, size=shape) + elif complex_type: + in_np = np.array(complex_data) else: in_np = np.random.randn(*shape) if offset is not None: @@ -158,6 +171,20 @@ def check_math_ops(op, **kwargs): check_op_input(op, astype="B", **kwargs) check_op_input(op, randint=True, a_min=1, a_max=10, **kwargs) check_op_input(op, shape=(1,), **kwargs) + no_complex_test_list = ( + "fabs", + "logical_not", + ) + numpy_version = Version(np.__version__) + # sign has an incorrect implementation for complex + # numbers in numpy <2.0 + if numpy_version < Version("2.0"): + no_complex_test_list += ("sign",) + + if op not in no_complex_test_list: + check_op_input( + op, complex_type=True, out_dtype=np.complex128, **kwargs + ) # Math operations @@ -224,6 +251,9 @@ def test_log_ops(op): check_op_input(op, randint=True, a_min=3, a_max=10) check_op_input(op, shape=(1,), a_min=0.1, offset=3) + # check with complex data type + check_op_input(op, 
complex_type=True, out_dtype=np.complex128) + even_root_ops = ("sqrt",) @@ -239,6 +269,8 @@ def test_even_root_ops(op): check_op_input(op, astype="F", out_dtype="D") check_op_input(op, randint=True, a_min=3, a_max=10) check_op_input(op, shape=(1,), a_min=0.1, offset=3) + # check with complex data type + check_op_input(op, complex_type=True, out_dtype=np.complex128) odd_root_ops = ("cbrt",) @@ -276,6 +308,12 @@ def test_trig_ops(op): check_op(op, np.random.uniform(low=-1, high=1, size=(4, 5))) check_op(op, np.random.uniform(low=-1, high=1, size=(4, 5)).astype("e")) check_op(op, np.array(np.random.uniform(low=-1, high=1))) + # check with complex data type + if op not in ( + "deg2rad", + "rad2deg", + ): + check_op_input(op, complex_type=True, out_dtype=np.complex128) arc_hyp_trig_ops = ( @@ -290,6 +328,8 @@ def test_arc_hyp_trig_ops(op): check_op(op, np.random.uniform(low=1, high=5, size=(4, 5))) check_op(op, np.random.uniform(low=1, high=5, size=(4, 5)).astype("e")) check_op(op, np.array(np.random.uniform(low=1, high=5))) + # check with complex data type + check_op_input(op, complex_type=True, out_dtype=np.complex128) bit_ops = ("invert", "~") @@ -344,6 +384,8 @@ def test_nan_ops(op): check_op(op, np.array([-np.inf, 0.0, 1.0, np.inf, np.nan], dtype="F")) check_op(op, np.array([-np.inf, 0.0, 1.0, np.inf, np.nan], dtype="e")) check_op(op, np.array(np.inf)) + # check with complex data type + check_op_input(op, complex_type=True, out_dtype=np.complex128) def parse_inputs(in_str, dtype_str): diff --git a/tests/integration/test_unique.py b/tests/integration/test_unique.py index a6c7013308..67e040e5be 100644 --- a/tests/integration/test_unique.py +++ b/tests/integration/test_unique.py @@ -15,9 +15,9 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM +from utils.utils import MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num def test_with_nonzero(): @@ -30,7 +30,7 @@ def test_with_nonzero(): assert np.array_equal(b, b_np) 
-@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", MAX_DIM_RANGE) def test_ndim(ndim): shape = (4,) * ndim a = num.random.randint(0, 3, size=shape) @@ -56,7 +56,7 @@ def test_parameters(return_index, return_inverse, return_counts, axis): return_inverse=return_inverse, return_counts=return_counts, ) - # cuNumeric raises NotImplementedError: Keyword arguments + # cuPyNumeric raises NotImplementedError: Keyword arguments # for `unique` are not yet supported res_np = np.unique( arr_np, diff --git a/tests/integration/test_unravel_index.py b/tests/integration/test_unravel_index.py index 1fa15aa37d..a8a1d59e1e 100644 --- a/tests/integration/test_unravel_index.py +++ b/tests/integration/test_unravel_index.py @@ -15,10 +15,10 @@ import numpy as np import pytest -from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +from utils.utils import ONE_MAX_DIM_RANGE -import cunumeric as num +import cupynumeric as num class TestUnravelIndexErrors: @@ -122,7 +122,7 @@ def test_large_index(): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE) @pytest.mark.parametrize( "order", ( @@ -141,7 +141,7 @@ def test_basic(ndim, order): assert np.array_equal(res_num, res_np) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("ndim", ONE_MAX_DIM_RANGE[:-1]) @pytest.mark.parametrize( "order", ( @@ -152,8 +152,8 @@ def test_basic(ndim, order): def test_uneven_shape(ndim, order): shape = np.random.randint(1, 6, ndim, dtype=int) size = ndim - np_arr = mk_seq_array(np, size) - num_arr = mk_seq_array(num, size) + np_arr = mk_seq_array(np, size) - 1 + num_arr = mk_seq_array(num, size) - 1 res_np = np.unravel_index(np_arr, shape, order) res_num = num.unravel_index(num_arr, shape, order) diff --git a/tests/integration/test_update.py b/tests/integration/test_update.py index 
3f4d76b8e0..660b20fe5d 100644 --- a/tests/integration/test_update.py +++ b/tests/integration/test_update.py @@ -15,7 +15,7 @@ import pytest -import cunumeric as num +import cupynumeric as num def test_basic(): diff --git a/tests/integration/test_vdot.py b/tests/integration/test_vdot.py index 0b9a20197b..6a5ee19a68 100644 --- a/tests/integration/test_vdot.py +++ b/tests/integration/test_vdot.py @@ -18,7 +18,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as num +import cupynumeric as num DTYPES = [np.float32, np.complex64] @@ -104,7 +104,7 @@ class TestVdotErrors: def test_a_b_invalid_shape(self, shapeAB): # for ((0,), (1,)) and ((1,), (0,)) # In Numpy, it raises ValueError - # In cuNumeric, it pass + # In cuPyNumeric, it pass expected_exc = ValueError shapeA, shapeB = shapeAB A_np = mk_0to1_array(np, shapeA) @@ -126,7 +126,7 @@ def test_a_b_invalid_shape(self, shapeAB): def test_a_b_scalar_and_arrays(self, shapeB): # For shape of (0,), (2,), (1, 2), # In Numpy, it raises ValueError - # In cuNumeric, it pass + # In cuPyNumeric, it pass expected_exc = ValueError A = 5 B_np = mk_0to1_array(np, shapeB) diff --git a/tests/integration/test_view.py b/tests/integration/test_view.py index 1894dc60a6..1be11f49bf 100644 --- a/tests/integration/test_view.py +++ b/tests/integration/test_view.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric as num +import cupynumeric as num def test_update_orig(): diff --git a/tests/integration/test_where.py b/tests/integration/test_where.py index a8193f9bf8..89905b6c56 100644 --- a/tests/integration/test_where.py +++ b/tests/integration/test_where.py @@ -17,7 +17,7 @@ import pytest from utils.generators import mk_seq_array -import cunumeric as num +import cupynumeric as num CONDITIONS = [ [[True, False], [True, True]], @@ -73,7 +73,7 @@ def test_broadcast(shape_a): @pytest.mark.xfail def test_condition_none(): # In Numpy, pass and returns [1, 2] - # In cuNumeric, 
raises AttributeError: + # In cuPyNumeric, raises AttributeError: # 'NoneType' object has no attribute '_maybe_convert' x = 0 y_np = np.array([1, 2]) @@ -90,10 +90,10 @@ def test_condition_none(): def test_x_y_none(values): # For x=None and y=None, # In Numpy, pass and returns [None, None] - # In cuNumeric, pass and returns (array([0]),) + # In cuPyNumeric, pass and returns (array([0]),) # For x=None and y=1 # In Numpy, pass and returns [None, 1] - # In cuNumeric, raises ValueError: both 'x' and 'y' parameters + # In cuPyNumeric, raises ValueError: both 'x' and 'y' parameters # must be specified together for where cond = [True, False] a_np = np.array(cond) @@ -164,7 +164,7 @@ def test_argwhere(input): @pytest.mark.xfail def test_argwhere_none(): # In Numpy, it pass and returns [] - # In cuNumeric, it raises AttributeError: + # In cuPyNumeric, it raises AttributeError: # 'NoneType' object has no attribute '_thunk' assert np.array_equal(np.argwhere(None), num.argwhere(None)) diff --git a/tests/integration/test_window.py b/tests/integration/test_window.py index 71503b8f9a..0ca141145a 100644 --- a/tests/integration/test_window.py +++ b/tests/integration/test_window.py @@ -18,7 +18,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as num +import cupynumeric as num window_functions = ("bartlett", "blackman", "hamming", "hanning") diff --git a/tests/integration/utils/comparisons.py b/tests/integration/utils/comparisons.py index a8dd4a1f6c..cae989c8d1 100644 --- a/tests/integration/utils/comparisons.py +++ b/tests/integration/utils/comparisons.py @@ -20,8 +20,8 @@ def allclose( - a: Any, # numpy or cunumeric array-like - b: Any, # numpy or cunumeric array-like + a: Any, # numpy or cupynumeric array-like + b: Any, # numpy or cupynumeric array-like rtol: float = 1e-5, atol: float = 1e-8, equal_nan: bool = False, @@ -50,7 +50,7 @@ def allclose( inds = islice(zip(*np.where(~close)), diff_limit) diffs = [f" index {i}: {a[i]} {b[i]}" for i in inds] N = 
len(diffs) - print(f"First {N} difference{'s' if N>1 else ''} for allclose:\n") + print(f"First {N} difference{'s' if N > 1 else ''} for allclose:\n") print("\n".join(diffs)) print(f"\nWith diff_limit={diff_limit}\n") diff --git a/tests/integration/utils/contractions.py b/tests/integration/utils/contractions.py index 641020b4f3..9e61135d16 100644 --- a/tests/integration/utils/contractions.py +++ b/tests/integration/utils/contractions.py @@ -17,7 +17,7 @@ from legate.core import LEGATE_MAX_DIM from legate.core.utils import OrderedSet -import cunumeric as num +import cupynumeric as num from .comparisons import allclose from .generators import mk_0to1_array diff --git a/tests/integration/utils/random.py b/tests/integration/utils/random.py index afca73c31c..0bc990b37e 100644 --- a/tests/integration/utils/random.py +++ b/tests/integration/utils/random.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num class ModuleGenerator: diff --git a/tests/integration/utils/utils.py b/tests/integration/utils/utils.py index ee885b157a..215b8b6478 100644 --- a/tests/integration/utils/utils.py +++ b/tests/integration/utils/utils.py @@ -14,9 +14,10 @@ # import numpy as np +from legate.core import LEGATE_MAX_DIM -import cunumeric as num -from cunumeric._utils import is_np2 +import cupynumeric as num +from cupynumeric._utils import is_np2 if is_np2: from numpy.exceptions import AxisError # noqa: F401 @@ -50,8 +51,8 @@ def compare_array_and_print_results(a, b, print_msg, check_type=True): assert is_equal, ( f"Failed, {print_msg}\n" f"numpy result: {err_arr[0]}\n" - f"cunumeric_result: {err_arr[1]}\n" - f"cunumeric and numpy shows" + f"cupynumeric_result: {err_arr[1]}\n" + f"cupynumeric and numpy shows" f" different result\n" ) print(f"Passed, {print_msg}") @@ -61,13 +62,13 @@ def compare_array_and_print_results(a, b, print_msg, check_type=True): assert is_equal, ( f"Failed, {print_msg}\n" f"numpy result: {err_arr[0]}, {a.shape}\n" - 
f"cunumeric_result: {err_arr[1]}, {b.shape}\n" - f"cunumeric and numpy shows" + f"cupynumeric_result: {err_arr[1]}, {b.shape}\n" + f"cupynumeric and numpy shows" f" different result\n" ) print( f"Passed, {print_msg}, np: ({a.shape}, {a.dtype})" - f", cunumeric: ({b.shape}, {b.dtype})" + f", cupynumeric: ({b.shape}, {b.dtype})" ) @@ -103,3 +104,13 @@ def check_module_function( a = getattr(np, fn)(*args, **kwargs) b = getattr(num, fn)(*args, **kwargs) compare_array_and_print_results(a, b, print_msg, check_type=check_type) + + +# MAX_DIM_RANGE is a list of array dimensions, that is used to test APIs +# on different array dims. We reduce this list to a sub-set of possible +# dimensions to reduce walltime for testing +MAX_DIM_RANGE = list(range(min(4, LEGATE_MAX_DIM))) +if LEGATE_MAX_DIM > MAX_DIM_RANGE[-1]: + MAX_DIM_RANGE.append(LEGATE_MAX_DIM) +ONE_MAX_DIM_RANGE = MAX_DIM_RANGE[1:] +TWO_MAX_DIM_RANGE = MAX_DIM_RANGE[2:] diff --git a/tests/todo/2d_reduction_complex.py b/tests/todo/2d_reduction_complex.py index 0ed704be41..429b32cc0b 100644 --- a/tests/todo/2d_reduction_complex.py +++ b/tests/todo/2d_reduction_complex.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num def test(): diff --git a/tests/todo/assign_slice.py b/tests/todo/assign_slice.py index 75fa19af6e..943300ff27 100644 --- a/tests/todo/assign_slice.py +++ b/tests/todo/assign_slice.py @@ -13,7 +13,7 @@ # limitations under the License. 
# -import cunumeric as num +import cupynumeric as num def test(): diff --git a/tests/todo/complex_test.py b/tests/todo/complex_test.py index 22f1d667cb..1aa82fe9df 100644 --- a/tests/todo/complex_test.py +++ b/tests/todo/complex_test.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num M = 32 alpha = 4.0 diff --git a/tests/todo/dot.py b/tests/todo/dot.py index e90bef46d2..c189c020f2 100644 --- a/tests/todo/dot.py +++ b/tests/todo/dot.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num def test(): diff --git a/tests/todo/indirect.py b/tests/todo/indirect.py index 4e24494217..21ed4d4c56 100644 --- a/tests/todo/indirect.py +++ b/tests/todo/indirect.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num def test(): diff --git a/tests/todo/kmeans_test.py b/tests/todo/kmeans_test.py index f9d0a6c8c5..52ba16fdf9 100644 --- a/tests/todo/kmeans_test.py +++ b/tests/todo/kmeans_test.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num def test(): diff --git a/tests/todo/lstm_batch.py b/tests/todo/lstm_batch.py index 97b57a1d2b..2d72321d33 100644 --- a/tests/todo/lstm_batch.py +++ b/tests/todo/lstm_batch.py @@ -16,7 +16,7 @@ """ This is a batched LSTM forward and backward pass """ -import cunumeric as np +import cupynumeric as np class LSTM: @@ -137,9 +137,7 @@ def backward(dHout_in, cache, dcn=None, dhn=None): tanhCt = Ct[t] dIFOGf[t, :, 2 * d : 3 * d] = tanhCt * dHout[t] # backprop tanh non-linearity first then continue backprop - dC[t] += (1 - tanhCt**2) * ( - IFOGf[t, :, 2 * d : 3 * d] * dHout[t] - ) + dC[t] += (1 - tanhCt**2) * (IFOGf[t, :, 2 * d : 3 * d] * dHout[t]) if t > 0: dIFOGf[t, :, d : 2 * d] = C[t - 1] * dC[t] dC[t - 1] += IFOGf[t, :, d : 2 * d] * dC[t] diff --git a/tests/todo/lstm_simple_backward.py b/tests/todo/lstm_simple_backward.py index 772af1a91f..a8126573b7 100644 --- 
a/tests/todo/lstm_simple_backward.py +++ b/tests/todo/lstm_simple_backward.py @@ -15,7 +15,7 @@ import numpy as np -import cunumeric as num +import cupynumeric as num def testtion(): diff --git a/tests/unit/cunumeric/__init__.py b/tests/unit/cupynumeric/__init__.py similarity index 100% rename from tests/unit/cunumeric/__init__.py rename to tests/unit/cupynumeric/__init__.py diff --git a/tests/unit/cunumeric/_array/__init__.py b/tests/unit/cupynumeric/_array/__init__.py similarity index 100% rename from tests/unit/cunumeric/_array/__init__.py rename to tests/unit/cupynumeric/_array/__init__.py diff --git a/tests/unit/cunumeric/_array/test_util.py b/tests/unit/cupynumeric/_array/test_util.py similarity index 97% rename from tests/unit/cunumeric/_array/test_util.py rename to tests/unit/cupynumeric/_array/test_util.py index 1f5c26e79d..f419d32075 100644 --- a/tests/unit/cunumeric/_array/test_util.py +++ b/tests/unit/cupynumeric/_array/test_util.py @@ -17,7 +17,7 @@ from mock import MagicMock from pytest_mock import MockerFixture -import cunumeric._array.util as m # module under test +import cupynumeric._array.util as m # module under test from ...util import powerset @@ -44,7 +44,9 @@ def _where_explicit(a, b, where): @pytest.fixture(autouse=True) def mock_convert(mocker: MockerFixture) -> MagicMock: - return mocker.patch("cunumeric._array.util.convert_to_cunumeric_ndarray") + return mocker.patch( + "cupynumeric._array.util.convert_to_cupynumeric_ndarray" + ) class Test_add_boilerplate_bad: diff --git a/tests/unit/cunumeric/_sphinxext/__init__.py b/tests/unit/cupynumeric/_sphinxext/__init__.py similarity index 100% rename from tests/unit/cunumeric/_sphinxext/__init__.py rename to tests/unit/cupynumeric/_sphinxext/__init__.py diff --git a/tests/unit/cunumeric/_sphinxext/test__comparison_util.py b/tests/unit/cupynumeric/_sphinxext/test__comparison_util.py similarity index 63% rename from tests/unit/cunumeric/_sphinxext/test__comparison_util.py rename to 
tests/unit/cupynumeric/_sphinxext/test__comparison_util.py index c36e69ffbf..97d2880ac7 100644 --- a/tests/unit/cunumeric/_sphinxext/test__comparison_util.py +++ b/tests/unit/cupynumeric/_sphinxext/test__comparison_util.py @@ -16,8 +16,8 @@ import numpy as np import pytest -import cunumeric as num -import cunumeric._sphinxext._comparison_util as m # module under test +import cupynumeric as num +import cupynumeric._sphinxext._comparison_util as m # module under test def test_get_namespaces_None(): @@ -35,26 +35,31 @@ def test_get_namespaces_attr(attr): assert res[1] is getattr(num, attr) +class _wrapped: + class _cupynumeric_metadata: + implemeneted = True + + class _TestObj: - a = 10 - b = 10.2 - c = "str" - _priv = "priv" + a = _wrapped + b = _wrapped + c = _wrapped + d = 10 + _priv = _wrapped -class Test_filter_names: +class Test_filter_wrapped_names: def test_default(self): - assert set(m.filter_names(_TestObj)) == {"a", "b", "c"} - - def test_types(self): - assert set(m.filter_names(_TestObj, (int,))) == {"a"} - assert set(m.filter_names(_TestObj, (int, str))) == {"a", "c"} - assert set(m.filter_names(_TestObj, (int, set))) == {"a"} - assert set(m.filter_names(_TestObj, (set,))) == set() + assert set(m.filter_wrapped_names(_TestObj())) == {"a", "b", "c"} def test_skip(self): - assert set(m.filter_names(_TestObj, skip=("a",))) == {"b", "c"} - assert set(m.filter_names(_TestObj, skip=("a", "c"))) == {"b"} + assert set(m.filter_wrapped_names(_TestObj(), skip=("a",))) == { + "b", + "c", + } + assert set(m.filter_wrapped_names(_TestObj(), skip=("a", "c"))) == { + "b" + } if __name__ == "__main__": diff --git a/tests/unit/cunumeric/_utils/__init__.py b/tests/unit/cupynumeric/_utils/__init__.py similarity index 100% rename from tests/unit/cunumeric/_utils/__init__.py rename to tests/unit/cupynumeric/_utils/__init__.py diff --git a/tests/unit/cunumeric/_utils/test_array.py b/tests/unit/cupynumeric/_utils/test_array.py similarity index 98% rename from 
tests/unit/cunumeric/_utils/test_array.py rename to tests/unit/cupynumeric/_utils/test_array.py index 34e124c479..01e490eedb 100644 --- a/tests/unit/cunumeric/_utils/test_array.py +++ b/tests/unit/cupynumeric/_utils/test_array.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric._utils.array as m # module under test +import cupynumeric._utils.array as m # module under test EXPECTED_SUPPORTED_DTYPES = set( [ diff --git a/tests/unit/cunumeric/_utils/test_coverage.py b/tests/unit/cupynumeric/_utils/test_coverage.py similarity index 82% rename from tests/unit/cunumeric/_utils/test_coverage.py rename to tests/unit/cupynumeric/_utils/test_coverage.py index 5c6bf1aeee..b58f8940b6 100644 --- a/tests/unit/cunumeric/_utils/test_coverage.py +++ b/tests/unit/cupynumeric/_utils/test_coverage.py @@ -19,14 +19,14 @@ import pytest from mock import MagicMock, patch -import cunumeric -import cunumeric._utils.coverage as m # module under test -from cunumeric.settings import settings +import cupynumeric +import cupynumeric._utils.coverage as m # module under test +from cupynumeric.settings import settings def test_FALLBACK_WARNING() -> None: assert m.FALLBACK_WARNING.format(what="foo") == ( - "cuNumeric has not implemented foo " + "cuPyNumeric has not implemented foo " + "and is falling back to canonical NumPy. " + "You may notice significantly decreased performance " + "for this function call." 
@@ -104,7 +104,7 @@ def _test_func(a: int, b: int) -> int: return a + b -class _Test_ufunc(cunumeric._ufunc.ufunc): +class _Test_ufunc(cupynumeric._ufunc.ufunc): """docstring""" def __init__(self): @@ -117,8 +117,25 @@ def __call__(self, a: int, b: int) -> int: _test_ufunc = _Test_ufunc() +class Test_helpers: + def test_is_wrapped_true(self) -> None: + wrapped = m.implemented(_test_func, "foo", "_test_func") + assert m.is_wrapped(wrapped) + + def test_is_wrapped_false(self) -> None: + assert not m.is_wrapped(10) + + def test_is_implemented_true(self) -> None: + wrapped = m.implemented(_test_func, "foo", "_test_func") + assert m.is_implemented(wrapped) + + def test_is_implemented_false(self) -> None: + wrapped = m.unimplemented(_test_func, "foo", "_test_func") + assert not m.is_implemented(wrapped) + + class Test_implemented: - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_True_func( self, mock_record_api_call: MagicMock ) -> None: @@ -140,7 +157,7 @@ def test_reporting_True_func( ) assert int(lineno) - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_False_func( self, mock_record_api_call: MagicMock ) -> None: @@ -157,7 +174,7 @@ def test_reporting_False_func( mock_record_api_call.assert_not_called() - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_True_ufunc( self, mock_record_api_call: MagicMock ) -> None: @@ -181,7 +198,7 @@ def test_reporting_True_ufunc( ) assert int(lineno) - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_False_ufunc( self, mock_record_api_call: MagicMock ) -> None: @@ -202,7 +219,7 @@ def test_reporting_False_ufunc( class Test_unimplemented: - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_True_func( self, mock_record_api_call: 
MagicMock ) -> None: @@ -225,7 +242,7 @@ def test_reporting_True_func( ) assert int(lineno) - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_False_func( self, mock_record_api_call: MagicMock ) -> None: @@ -249,7 +266,7 @@ def test_reporting_False_func( mock_record_api_call.assert_not_called() - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_True_ufunc( self, mock_record_api_call: MagicMock ) -> None: @@ -270,7 +287,7 @@ def test_reporting_True_ufunc( ) assert int(lineno) - @patch("cunumeric.runtime.record_api_call") + @patch("cupynumeric.runtime.record_api_call") def test_reporting_False_ufunc( self, mock_record_api_call: MagicMock ) -> None: @@ -347,12 +364,12 @@ def test_report_coverage_True(self) -> None: assert _Dest.attr2 == 30 assert _Dest.function1.__wrapped__ is _OriginMod.function1 - assert not _Dest.function1._cunumeric.implemented + assert not _Dest.function1._cupynumeric_metadata.implemented assert _Dest.function2.__wrapped__ - assert _Dest.function2._cunumeric.implemented + assert _Dest.function2._cupynumeric_metadata.implemented - assert not hasattr(_Dest.extra, "_cunumeric") + assert not hasattr(_Dest.extra, "_cupynumeric") settings.report_coverage.unset_value() @@ -373,12 +390,12 @@ def test_report_coverage_False(self) -> None: assert _Dest.attr2 == 30 assert _Dest.function1.__wrapped__ is _OriginMod.function1 - assert not _Dest.function1._cunumeric.implemented + assert not _Dest.function1._cupynumeric_metadata.implemented assert _Dest.function2.__wrapped__ - assert _Dest.function2._cunumeric.implemented + assert _Dest.function2._cupynumeric_metadata.implemented - assert not hasattr(_Dest.extra, "_cunumeric") + assert not hasattr(_Dest.extra, "_cupynumeric") settings.report_coverage.unset_value() @@ -428,12 +445,12 @@ def test_report_coverage_True(self) -> None: assert _Test_ndarray.attr2 == 30 assert 
_Test_ndarray.foo.__wrapped__ is _Orig_ndarray.foo - assert not _Test_ndarray.foo._cunumeric.implemented + assert not _Test_ndarray.foo._cupynumeric_metadata.implemented assert _Test_ndarray.bar.__wrapped__ - assert _Test_ndarray.bar._cunumeric.implemented + assert _Test_ndarray.bar._cupynumeric_metadata.implemented - assert not hasattr(_Test_ndarray.extra, "_cunumeric") + assert not hasattr(_Test_ndarray.extra, "_cupynumeric") settings.report_coverage.unset_value() @@ -447,12 +464,12 @@ def test_report_coverage_False(self) -> None: assert _Test_ndarray.attr2 == 30 assert _Test_ndarray.foo.__wrapped__ is _Orig_ndarray.foo - assert not _Test_ndarray.foo._cunumeric.implemented + assert not _Test_ndarray.foo._cupynumeric_metadata.implemented assert _Test_ndarray.bar.__wrapped__ - assert _Test_ndarray.bar._cunumeric.implemented + assert _Test_ndarray.bar._cupynumeric_metadata.implemented - assert not hasattr(_Test_ndarray.extra, "_cunumeric") + assert not hasattr(_Test_ndarray.extra, "_cupynumeric") settings.report_coverage.unset_value() @@ -465,36 +482,36 @@ def test_fallback(self): def test_ufunc_methods_binary() -> None: - import cunumeric as np + import cupynumeric as np # reduce is implemented assert np.add.reduce.__wrapped__ - assert np.add.reduce._cunumeric.implemented + assert np.add.reduce._cupynumeric_metadata.implemented # the rest are not assert np.add.reduceat.__wrapped__ - assert not np.add.reduceat._cunumeric.implemented + assert not np.add.reduceat._cupynumeric_metadata.implemented assert np.add.outer.__wrapped__ - assert not np.add.outer._cunumeric.implemented + assert not np.add.outer._cupynumeric_metadata.implemented assert np.add.at.__wrapped__ - assert not np.add.at._cunumeric.implemented + assert not np.add.at._cupynumeric_metadata.implemented assert np.add.accumulate.__wrapped__ - assert not np.add.accumulate._cunumeric.implemented + assert not np.add.accumulate._cupynumeric_metadata.implemented def test_ufunc_methods_unary() -> None: - import 
cunumeric as np + import cupynumeric as np assert np.negative.reduce.__wrapped__ - assert not np.negative.reduce._cunumeric.implemented + assert not np.negative.reduce._cupynumeric_metadata.implemented assert np.negative.reduceat.__wrapped__ - assert not np.negative.reduceat._cunumeric.implemented + assert not np.negative.reduceat._cupynumeric_metadata.implemented assert np.negative.outer.__wrapped__ - assert not np.negative.outer._cunumeric.implemented + assert not np.negative.outer._cupynumeric_metadata.implemented assert np.negative.at.__wrapped__ - assert not np.negative.at._cunumeric.implemented + assert not np.negative.at._cupynumeric_metadata.implemented assert np.negative.accumulate.__wrapped__ - assert not np.negative.accumulate._cunumeric.implemented + assert not np.negative.accumulate._cupynumeric_metadata.implemented if __name__ == "__main__": diff --git a/tests/unit/cunumeric/_utils/test_linalg.py b/tests/unit/cupynumeric/_utils/test_linalg.py similarity index 99% rename from tests/unit/cunumeric/_utils/test_linalg.py rename to tests/unit/cupynumeric/_utils/test_linalg.py index f863f55afb..51ee3eea4f 100644 --- a/tests/unit/cunumeric/_utils/test_linalg.py +++ b/tests/unit/cupynumeric/_utils/test_linalg.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cunumeric._utils.linalg as m # module under test +import cupynumeric._utils.linalg as m # module under test def _dot_modes_oracle(a_ndim: int, b_ndim: int) -> bool: diff --git a/tests/unit/cunumeric/random/__init__.py b/tests/unit/cupynumeric/random/__init__.py similarity index 100% rename from tests/unit/cunumeric/random/__init__.py rename to tests/unit/cupynumeric/random/__init__.py diff --git a/tests/unit/cunumeric/random/test_bitgenerator.py b/tests/unit/cupynumeric/random/test_bitgenerator.py similarity index 96% rename from tests/unit/cunumeric/random/test_bitgenerator.py rename to tests/unit/cupynumeric/random/test_bitgenerator.py index d9b83ab1a6..7d38b3279a 100644 --- 
a/tests/unit/cunumeric/random/test_bitgenerator.py +++ b/tests/unit/cupynumeric/random/test_bitgenerator.py @@ -16,8 +16,8 @@ import pytest from mock import patch -import cunumeric.random._bitgenerator as m # module under test -from cunumeric.config import BitGeneratorType +import cupynumeric.random._bitgenerator as m # module under test +from cupynumeric.config import BitGeneratorType class TestXORWOW: diff --git a/tests/unit/cupynumeric/test___init__.py b/tests/unit/cupynumeric/test___init__.py new file mode 100644 index 0000000000..d0de454d59 --- /dev/null +++ b/tests/unit/cupynumeric/test___init__.py @@ -0,0 +1,49 @@ +# Copyright 2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import re +from importlib import reload +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pytest + +import cupynumeric # noqa: [F401] + + +def test___version___override(monkeypatch: pytest.MonkeyPatch) -> None: + global cupynumeric # noqa: PLW0603 + monkeypatch.setenv("CUPYNUMERIC_USE_VERSION", "24.01.00") + cupynumeric = reload(cupynumeric) + assert cupynumeric.__version__ == "24.01.00" + + +def test___version___format() -> None: + global cupynumeric # noqa: PLW0603 + cupynumeric = reload(cupynumeric) + + # just being cautious, if the test are functioning properly, the + # actual non-overriden version should never equal the bogus version + # from test___version___override above + assert cupynumeric.__version__ != "24.01.00" + + assert re.match(r"^\d{2}\.\d{2}\.\d{2}$", cupynumeric.__version__[:8]) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/cunumeric/test_config.py b/tests/unit/cupynumeric/test_config.py similarity index 84% rename from tests/unit/cunumeric/test_config.py rename to tests/unit/cupynumeric/test_config.py index e002040304..5a7edf2f03 100644 --- a/tests/unit/cunumeric/test_config.py +++ b/tests/unit/cupynumeric/test_config.py @@ -15,45 +15,45 @@ import pytest -import cunumeric.config as m # module under test +import cupynumeric.config as m # module under test -class TestCuNumericLib: +class TestCuPyNumericLib: def test___init__(self) -> None: - lib = m.CuNumericLib("foo") + lib = m.CuPyNumericLib("foo") assert lib.name == "foo" def test_get_shared_library(self) -> None: - lib = m.CuNumericLib("foo") + lib = m.CuPyNumericLib("foo") result = lib.get_shared_library() assert isinstance(result, str) - from cunumeric.install_info import libpath + from cupynumeric.install_info import libpath assert result.startswith(libpath) - assert "libcunumeric" in result + assert "libcupynumeric" in result assert 
result.endswith(lib.get_library_extension()) def test_get_c_header(self) -> None: - lib = m.CuNumericLib("foo") + lib = m.CuPyNumericLib("foo") - from cunumeric.install_info import header + from cupynumeric.install_info import header assert lib.get_c_header() == header -def test_CUNUMERIC_LIB_NAME() -> None: - assert m.CUNUMERIC_LIB_NAME == "cunumeric" +def test_CUPYNUMERIC_LIB_NAME() -> None: + assert m.CUPYNUMERIC_LIB_NAME == "cupynumeric" -def test_cunumeric_lib() -> None: - assert isinstance(m.cunumeric_lib, m.CuNumericLib) +def test_cupynumeric_lib() -> None: + assert isinstance(m.cupynumeric_lib, m.CuPyNumericLib) -def test_CuNumericOpCode() -> None: - assert set(m.CuNumericOpCode.__members__) == { +def test_CuPyNumericOpCode() -> None: + assert set(m.CuPyNumericOpCode.__members__) == { "ADVANCED_INDEXING", "ARANGE", "ARGWHERE", @@ -72,6 +72,7 @@ def test_CuNumericOpCode() -> None: "FFT", "FILL", "FLIP", + "GEEV", "GEMM", "HISTOGRAM", "LOAD_CUDALIBS", diff --git a/tests/unit/cunumeric/test_nptest.py b/tests/unit/cupynumeric/test_nptest.py similarity index 81% rename from tests/unit/cunumeric/test_nptest.py rename to tests/unit/cupynumeric/test_nptest.py index d2ecf2c5a9..4ee7a16a02 100644 --- a/tests/unit/cunumeric/test_nptest.py +++ b/tests/unit/cupynumeric/test_nptest.py @@ -15,13 +15,13 @@ import pytest -from cunumeric import test as nptest +from cupynumeric import test as nptest MSG = ( - "cuNumeric cannot execute numpy.test() due to reliance " + "cuPyNumeric cannot execute numpy.test() due to reliance " "on Numpy internals. 
For information about running the " - "cuNumeric test suite, see: " - "https://docs.nvidia.com/cunumeric/latest/developer/index.html" + "cuPyNumeric test suite, see: " + "https://docs.nvidia.com/cupynumeric/latest/developer/index.html" ) diff --git a/tests/unit/cunumeric/test_patch.py b/tests/unit/cupynumeric/test_patch.py similarity index 84% rename from tests/unit/cunumeric/test_patch.py rename to tests/unit/cupynumeric/test_patch.py index 41a5107e80..4bb61406b9 100644 --- a/tests/unit/cunumeric/test_patch.py +++ b/tests/unit/cupynumeric/test_patch.py @@ -25,14 +25,14 @@ @pytest.mark.skip def test_no_patch() -> None: - cmd = "import sys; import cunumeric; import numpy; sys.exit(numpy is cunumeric)" # noqa E501 + cmd = "import sys; import cupynumeric; import numpy; sys.exit(numpy is cupynumeric)" # noqa E501 proc = run([legate, "-c", cmd]) assert proc.returncode == 0, "numpy is unexpectedly patched" @pytest.mark.skip def test_patch() -> None: - cmd = "import sys; import cunumeric.patch; import numpy; sys.exit(numpy is cunumeric)" # noqa E501 + cmd = "import sys; import cupynumeric.patch; import numpy; sys.exit(numpy is cupynumeric)" # noqa E501 proc = run([legate, "-c", cmd]) assert proc.returncode == 1, "numpy failed to patch" diff --git a/tests/unit/cunumeric/test_settings.py b/tests/unit/cupynumeric/test_settings.py similarity index 94% rename from tests/unit/cunumeric/test_settings.py rename to tests/unit/cupynumeric/test_settings.py index 66cee2eb72..00cfe9cf5d 100644 --- a/tests/unit/cunumeric/test_settings.py +++ b/tests/unit/cupynumeric/test_settings.py @@ -20,7 +20,7 @@ from legate.util.fs import read_c_define from legate.util.settings import EnvOnlySetting, PrioritizedSetting -import cunumeric.settings as m +import cupynumeric.settings as m _expected_settings = ( "preload_cudalibs", @@ -61,7 +61,7 @@ def test_standard_settings(self) -> None: @pytest.mark.parametrize("name", _expected_settings) def test_prefix(self, name: str) -> None: ps = 
getattr(m.settings, name) - assert ps.env_var.startswith("CUNUMERIC_") + assert ps.env_var.startswith("CUPYNUMERIC_") def test_types(self) -> None: assert m.settings.preload_cudalibs.convert_type == 'bool ("0" or "1")' @@ -98,7 +98,7 @@ def test_numpy_compat(self) -> None: @pytest.mark.parametrize("name", _settings_with_test_defaults) def test_default(self, name: str) -> None: setting = getattr(m.settings, name) - define = setting.env_var.removeprefix("CUNUMERIC_") + "_DEFAULT" + define = setting.env_var.removeprefix("CUPYNUMERIC_") + "_DEFAULT" expected = setting._convert(read_c_define(ENV_HEADER, define)) assert setting.default == expected @@ -106,7 +106,7 @@ def test_default(self, name: str) -> None: @pytest.mark.parametrize("name", _settings_with_test_defaults) def test_test_default(self, name: str) -> None: setting = getattr(m.settings, name) - define = setting.env_var.removeprefix("CUNUMERIC_") + "_TEST" + define = setting.env_var.removeprefix("CUPYNUMERIC_") + "_TEST" expected = setting._convert(read_c_define(ENV_HEADER, define)) assert setting.test_default == expected diff --git a/tests/unit/util.py b/tests/unit/util.py index a6bb0a49e2..17a8c53da7 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -16,10 +16,9 @@ from __future__ import annotations from itertools import chain, combinations -from typing import Any, Iterable, Iterator +from typing import Any, Iterable, Iterator, TypeAlias import pytest -from typing_extensions import TypeAlias Capsys: TypeAlias = pytest.CaptureFixture[str]