diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 9c83535b771..f5886540252 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index a559be18077..270bfa239ad 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index ca10c04edee..e31428e4b0c 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 6e2bf45700a..835274999ba 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": 
"rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bc489ffd3f0..b272fb43e35 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -47,7 +47,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,13 +77,13 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.10 + extra-repo-sha: branch-24.12 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,12 +101,12 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.10 + extra-repo-sha: branch-24.12 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-nx-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: wheel-publish-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -136,7 +136,7 @@ jobs: wheel-build-cugraph-dgl: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -146,7 +146,7 @@ jobs: wheel-publish-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -156,7 +156,7 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -166,7 +166,7 @@ jobs: wheel-publish-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -175,7 +175,7 @@ jobs: package-name: cugraph-pyg wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -185,7 +185,7 @@ jobs: wheel-publish-cugraph-equivariant: needs: wheel-build-cugraph-equivariant secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} 
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index dacd9a93399..b0a1308237e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -34,29 +35,69 @@ jobs: - wheel-tests-cugraph-equivariant - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!mg_utils/**' + - '!notebooks/**' + - '!python/**' + - '!readme_pages/**' + # TODO: Remove this before merging + - '!.github/**' + test_notebooks: + - '**' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + # TODO: Remove this before merging + - '!.github/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + # TODO: Remove this before merging + - '!.github/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request node_type: cpu32 conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: pull-request enable_check_symbols: true @@ -64,19 +105,21 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request conda-notebook-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + if:
fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -86,7 +129,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -96,63 +139,67 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.10 + extra-repo-sha: branch-24.12 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-tests-pylibcugraph: - needs: wheel-build-pylibcugraph + needs: [wheel-build-pylibcugraph, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_pylibcugraph.sh wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.10 + extra-repo-sha: branch-24.12 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-tests-cugraph: - needs: wheel-build-cugraph + needs: [wheel-build-cugraph, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cugraph.sh wheel-build-nx-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_nx-cugraph.sh wheel-tests-nx-cugraph: - needs: wheel-build-nx-cugraph + needs: [wheel-build-nx-cugraph, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh wheel-build-cugraph-dgl: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cugraph-dgl.sh wheel-tests-cugraph-dgl: - needs: wheel-build-cugraph-dgl + needs: [wheel-build-cugraph-dgl, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cugraph-dgl.sh @@ -160,35 +207,37 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cugraph-pyg.sh wheel-tests-cugraph-pyg: - needs: wheel-build-cugraph-pyg + needs: [wheel-build-cugraph-pyg, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64")) wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cugraph-equivariant.sh wheel-tests-cugraph-equivariant: - needs: wheel-build-cugraph-equivariant + needs: [wheel-build-cugraph-equivariant, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cugraph-equivariant.sh matrix_filter: map(select(.ARCH == "amd64")) devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 957d29ce72b..5fbdd276bd6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: 
nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibcugraph.sh wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -60,7 +60,7 @@ jobs: script: ci/test_wheel_cugraph.sh wheel-tests-nx-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: script: ci/test_wheel_nx-cugraph.sh wheel-tests-cugraph-dgl: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) wheel-tests-cugraph-pyg: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) wheel-tests-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 36c5fa84166..8ff284210b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,11 +17,11 @@ repos: hooks: - id: black language_version: python3 - args: [--target-version=py39] + args: [--target-version=py310] files: ^(python/.*|benchmarks/.*)$ exclude: ^python/nx-cugraph/ - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: ["--config=.flake8"] @@ -34,7 +34,7 @@ repos: hooks: - id: yesqa additional_dependencies: - - flake8==6.0.0 + - flake8==7.1.1 - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.6 hooks: @@ -42,7 +42,7 @@ repos: types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.2.0 + rev: v0.4.0 hooks: - id: verify-copyright files: | diff --git a/CHANGELOG.md b/CHANGELOG.md index f85c7d03f03..689a214751f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,88 @@ +# cugraph 24.08.00 (7 Aug 2024) + +## 🚨 Breaking Changes + +- Use MNMG version of ECG in python layer instead, and remove legacy ECG and Louvain ([#4514](https://github.com/rapidsai/cugraph/pull/4514)) [@naimnv](https://github.com/naimnv) + +## 🐛 Bug Fixes + +- add setuptools to host requirements for conda packages that need it ([#4582](https://github.com/rapidsai/cugraph/pull/4582)) [@jameslamb](https://github.com/jameslamb) +- Add pylibcugraph dependency on pylibraft. 
([#4570](https://github.com/rapidsai/cugraph/pull/4570)) [@bdice](https://github.com/bdice) +- Fix build error with NO_CUGRAPH_OPS ([#4563](https://github.com/rapidsai/cugraph/pull/4563)) [@seunghwak](https://github.com/seunghwak) +- [BUG] Fix Failing WholeGraph Tests ([#4560](https://github.com/rapidsai/cugraph/pull/4560)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Temporarily Disable Feature Store Tests with WholeGraph ([#4559](https://github.com/rapidsai/cugraph/pull/4559)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Fix MG `katz_centrality`: Check if DataFrame Arg is Not None ([#4555](https://github.com/rapidsai/cugraph/pull/4555)) [@nv-rliu](https://github.com/nv-rliu) +- nx-cugraph: fix `from_pandas_edgekey` given edgekey but not edgeattr ([#4550](https://github.com/rapidsai/cugraph/pull/4550)) [@eriknw](https://github.com/eriknw) +- Fix triangle count test bug ([#4549](https://github.com/rapidsai/cugraph/pull/4549)) [@jnke2016](https://github.com/jnke2016) +- [BUG] Use the Correct WG Communicator ([#4548](https://github.com/rapidsai/cugraph/pull/4548)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Add Additional Check For SSSP Source Vertex & Fix SSSP Benchmark ([#4541](https://github.com/rapidsai/cugraph/pull/4541)) [@nv-rliu](https://github.com/nv-rliu) +- Fix OOM Bug for Jaccard, Sorensen, and Overlap benchmarks ([#4524](https://github.com/rapidsai/cugraph/pull/4524)) [@nv-rliu](https://github.com/nv-rliu) +- Distribute start_list across ranks ([#4519](https://github.com/rapidsai/cugraph/pull/4519)) [@jnke2016](https://github.com/jnke2016) +- [FIX] Skip Distributed Sampler Tests if PyTorch with CUDA is not Available ([#4518](https://github.com/rapidsai/cugraph/pull/4518)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- [BUG] Fix a hang issue in MG triangle counts (when invoked with a small number of vertices to update triangle counts) ([#4517](https://github.com/rapidsai/cugraph/pull/4517)) [@seunghwak](https://github.com/seunghwak) +- Update MG Benchmark List ([#4516](https://github.com/rapidsai/cugraph/pull/4516)) [@nv-rliu](https://github.com/nv-rliu) +- Fix TensorProductConv test and improve docs ([#4480](https://github.com/rapidsai/cugraph/pull/4480)) [@tingyu66](https://github.com/tingyu66) +- Test nx-cugraph package instead of editable install ([#4442](https://github.com/rapidsai/cugraph/pull/4442)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) + +## 📖 Documentation + +- DOC: typo in nx_transition.rst ([#4491](https://github.com/rapidsai/cugraph/pull/4491)) [@raybellwaves](https://github.com/raybellwaves) +- Doc cleanup for nx-cugraph: fixed typos, cleaned up various descriptions, renamed notebook to match naming convetion. 
([#4478](https://github.com/rapidsai/cugraph/pull/4478)) [@rlratzel](https://github.com/rlratzel) +- [DOC] Minor Improvements to cuGraph-PyG Documentation ([#4460](https://github.com/rapidsai/cugraph/pull/4460)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) + +## 🚀 New Features + +- Use MNMG version of ECG in python layer instead, and remove legacy ECG and Louvain ([#4514](https://github.com/rapidsai/cugraph/pull/4514)) [@naimnv](https://github.com/naimnv) +- c_api and plc binding for lookup src dst using edge ids and type(s) ([#4494](https://github.com/rapidsai/cugraph/pull/4494)) [@naimnv](https://github.com/naimnv) +- Forward merge branch-24.06 into branch-24.08 ([#4489](https://github.com/rapidsai/cugraph/pull/4489)) [@nv-rliu](https://github.com/nv-rliu) +- [FEA] New Graph Interface and Loaders for Distributed Sampling in DGL ([#4486](https://github.com/rapidsai/cugraph/pull/4486)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- compute cosine similarity for vertex pairs ([#4482](https://github.com/rapidsai/cugraph/pull/4482)) [@naimnv](https://github.com/naimnv) +- Define heterogeneous renumbering API ([#4463](https://github.com/rapidsai/cugraph/pull/4463)) [@seunghwak](https://github.com/seunghwak) +- Lookup edge src dst using edge id and type ([#4449](https://github.com/rapidsai/cugraph/pull/4449)) [@naimnv](https://github.com/naimnv) +- Biased sampling ([#4443](https://github.com/rapidsai/cugraph/pull/4443)) [@seunghwak](https://github.com/seunghwak) + +## 🛠️ Improvements + +- nx-cugraph: check networkx version ([#4571](https://github.com/rapidsai/cugraph/pull/4571)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: add `G.__networkx_cache__` to enable graph conversion caching ([#4567](https://github.com/rapidsai/cugraph/pull/4567)) [@eriknw](https://github.com/eriknw) +- split up CUDA-suffixed dependencies in dependencies.yaml ([#4552](https://github.com/rapidsai/cugraph/pull/4552)) [@jameslamb](https://github.com/jameslamb) +- Use workflow branch 24.08 again ([#4544](https://github.com/rapidsai/cugraph/pull/4544)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Support non p2p configuration when initializing the comms ([#4543](https://github.com/rapidsai/cugraph/pull/4543)) [@jnke2016](https://github.com/jnke2016) +- Fix Warning from `simpleDistributedGraph.py` ([#4540](https://github.com/rapidsai/cugraph/pull/4540)) [@nv-rliu](https://github.com/nv-rliu) +- Create a graph from the edge list in multiple chunks ([#4539](https://github.com/rapidsai/cugraph/pull/4539)) [@seunghwak](https://github.com/seunghwak) +- nx-cugraph: add dijkstra sssp functions ([#4538](https://github.com/rapidsai/cugraph/pull/4538)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: add `from_dict_of_lists` and `to_dict_of_lists` ([#4537](https://github.com/rapidsai/cugraph/pull/4537)) [@eriknw](https://github.com/eriknw) +- Ensure `get_test_data.sh` doesn't re-download datasets ([#4536](https://github.com/rapidsai/cugraph/pull/4536)) [@trxcllnt](https://github.com/trxcllnt) +- Define and Implement C API for biased sampling ([#4535](https://github.com/rapidsai/cugraph/pull/4535)) [@ChuckHastings](https://github.com/ChuckHastings) +- Build and test with CUDA 12.5.1 ([#4534](https://github.com/rapidsai/cugraph/pull/4534)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Refactor C++ unit tests to allow finer grained filtering ([#4533](https://github.com/rapidsai/cugraph/pull/4533)) [@ChuckHastings](https://github.com/ChuckHastings) +- [IMP] Set the Default WG Memory Type to 
'distributed' for the MNMG PyG Example ([#4532](https://github.com/rapidsai/cugraph/pull/4532)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- nx-cugraph: add `relabel_nodes` and `convert_node_labels_to_integers` ([#4531](https://github.com/rapidsai/cugraph/pull/4531)) [@eriknw](https://github.com/eriknw) +- Add `-cuXX` suffixed versions of cugraph-service-client dependency to pyproject.toml's project.dependencies list ([#4530](https://github.com/rapidsai/cugraph/pull/4530)) [@trxcllnt](https://github.com/trxcllnt) +- Further optimize `from_pandas_edgelist` with cudf ([#4528](https://github.com/rapidsai/cugraph/pull/4528)) [@eriknw](https://github.com/eriknw) +- Performance optimize BFS (including direction optimizing BFS implementation, mainly for single-GPU) ([#4527](https://github.com/rapidsai/cugraph/pull/4527)) [@seunghwak](https://github.com/seunghwak) +- Add CUDA_STATIC_MATH_LIBRARIES ([#4526](https://github.com/rapidsai/cugraph/pull/4526)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Better handle cudf.pandas in `from_pandas_edgelist` ([#4525](https://github.com/rapidsai/cugraph/pull/4525)) [@eriknw](https://github.com/eriknw) +- Skip the benchmark ctests within CI ([#4522](https://github.com/rapidsai/cugraph/pull/4522)) [@ChuckHastings](https://github.com/ChuckHastings) +- remove thriftpy2 ceiling ([#4521](https://github.com/rapidsai/cugraph/pull/4521)) [@jameslamb](https://github.com/jameslamb) +- Avoid --find-links in wheel jobs ([#4509](https://github.com/rapidsai/cugraph/pull/4509)) [@jameslamb](https://github.com/jameslamb) +- Refactor code base to reduce memory requirement for building libcugraph ([#4506](https://github.com/rapidsai/cugraph/pull/4506)) [@naimnv](https://github.com/naimnv) +- Tweak rmm configuration for C++ unit tests ([#4503](https://github.com/rapidsai/cugraph/pull/4503)) [@ChuckHastings](https://github.com/ChuckHastings) +- Expose new all-pairs Similarity algorithms ([#4502](https://github.com/rapidsai/cugraph/pull/4502)) [@jnke2016](https://github.com/jnke2016) +- remove openmpi ceiling ([#4496](https://github.com/rapidsai/cugraph/pull/4496)) [@jameslamb](https://github.com/jameslamb) +- Cut peak memory footprint in per_v_transform_reduce_dst_key_aggregated_outgoing_e ([#4484](https://github.com/rapidsai/cugraph/pull/4484)) [@seunghwak](https://github.com/seunghwak) +- Skip MG `dgl_uniform_sampler` test in nightlies ([#4479](https://github.com/rapidsai/cugraph/pull/4479)) [@nv-rliu](https://github.com/nv-rliu) +- Remove text builds of documentation ([#4468](https://github.com/rapidsai/cugraph/pull/4468)) [@vyasr](https://github.com/vyasr) +- [IMP] Limit the Test Data Size when Running CI in `gcn_dist_sg.py` ([#4461](https://github.com/rapidsai/cugraph/pull/4461)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Forward Merge branch-24.06 into branch-24.08 ([#4454](https://github.com/rapidsai/cugraph/pull/4454)) [@nv-rliu](https://github.com/nv-rliu) +- Properly clean up python directories ([#4453](https://github.com/rapidsai/cugraph/pull/4453)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fixes for On-Going MG Test Failures ([#4450](https://github.com/rapidsai/cugraph/pull/4450)) [@nv-rliu](https://github.com/nv-rliu) +- remove unnecessary 'setuptools' and 'wheel' dependencies ([#4448](https://github.com/rapidsai/cugraph/pull/4448)) [@jameslamb](https://github.com/jameslamb) +- MG Implementation K-Truss ([#4438](https://github.com/rapidsai/cugraph/pull/4438)) [@jnke2016](https://github.com/jnke2016) +- Overhaul 
ops-codeowners ([#4409](https://github.com/rapidsai/cugraph/pull/4409)) [@raydouglass](https://github.com/raydouglass) +- Use rapids-build-backend ([#4393](https://github.com/rapidsai/cugraph/pull/4393)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Optimize K-Truss ([#4375](https://github.com/rapidsai/cugraph/pull/4375)) [@jnke2016](https://github.com/jnke2016) + # cugraph 24.06.00 (5 Jun 2024) ## 🚨 Breaking Changes diff --git a/VERSION b/VERSION index 7c7ba04436f..af28c42b528 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/benchmarks/cugraph-dgl/notebooks/get_node_storage.ipynb b/benchmarks/cugraph-dgl/notebooks/get_node_storage.ipynb index 95b456c7812..4681c8ec825 100644 --- a/benchmarks/cugraph-dgl/notebooks/get_node_storage.ipynb +++ b/benchmarks/cugraph-dgl/notebooks/get_node_storage.ipynb @@ -18,7 +18,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } diff --git a/benchmarks/cugraph-dgl/notebooks/heterogeneous_dataloader_benchmark.ipynb b/benchmarks/cugraph-dgl/notebooks/heterogeneous_dataloader_benchmark.ipynb index d3b054bb0ee..2c4a934827a 100644 --- a/benchmarks/cugraph-dgl/notebooks/heterogeneous_dataloader_benchmark.ipynb +++ b/benchmarks/cugraph-dgl/notebooks/heterogeneous_dataloader_benchmark.ipynb @@ -176,7 +176,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/dgl/dataloading/dataloader.py:859: DGLWarning: Dataloader CPU affinity opt is not enabled, consider switching it on (see enable_cpu_affinity() or CPU best practices for DGL [https://docs.dgl.ai/tutorials/cpu/cpu_best_practises.html])\n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/dgl/dataloading/dataloader.py:859: DGLWarning: Dataloader CPU affinity opt is not enabled, consider switching it on (see enable_cpu_affinity() or CPU best practices for DGL [https://docs.dgl.ai/tutorials/cpu/cpu_best_practises.html])\n", " dgl_warning(f'Dataloader CPU affinity opt is not enabled, consider switching it on '\n" ] }, diff --git a/benchmarks/cugraph-dgl/notebooks/homogenous_dataloader_benchmark.ipynb b/benchmarks/cugraph-dgl/notebooks/homogenous_dataloader_benchmark.ipynb index ea1e9b34965..ecd111dabdf 100644 --- a/benchmarks/cugraph-dgl/notebooks/homogenous_dataloader_benchmark.ipynb +++ b/benchmarks/cugraph-dgl/notebooks/homogenous_dataloader_benchmark.ipynb @@ -26,7 +26,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } @@ -190,7 +190,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/dgl/dataloading/dataloader.py:859: DGLWarning: Dataloader CPU affinity opt is not enabled, consider switching it on (see enable_cpu_affinity() or CPU best practices for DGL [https://docs.dgl.ai/tutorials/cpu/cpu_best_practises.html])\n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/dgl/dataloading/dataloader.py:859: DGLWarning: Dataloader CPU affinity opt is not enabled, consider switching it on (see enable_cpu_affinity() or CPU best practices for DGL [https://docs.dgl.ai/tutorials/cpu/cpu_best_practises.html])\n", " dgl_warning(f'Dataloader CPU affinity opt is not enabled, consider switching it on '\n" ] }, @@ -278,7 +278,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/distributed/worker.py:2988: UserWarning: Large object of size 1.42 MiB detected in task graph: \n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/distributed/worker.py:2988: UserWarning: Large object of size 1.42 MiB detected in task graph: \n", " [b'\\xad\\xd1\\xe3\\x9c\\x96\\x83O\\xb3\\xba1\\x86\\x94\\xb6\\ ... =int32), False]\n", "Consider scattering large objects ahead of time\n", "with client.scatter to reduce scheduler burden and \n", diff --git a/benchmarks/cugraph-dgl/python-script/ogbn_mag_benchmark.py b/benchmarks/cugraph-dgl/python-script/ogbn_mag_benchmark.py index 539fe333b1e..55ff0043e30 100644 --- a/benchmarks/cugraph-dgl/python-script/ogbn_mag_benchmark.py +++ b/benchmarks/cugraph-dgl/python-script/ogbn_mag_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -126,4 +126,4 @@ def sampling_func(g, seed_nodes, labels, train_loader): st = time.time() sampling_func(g, subset_split_idx["train"], labels, train_loader) et = time.time() - print(f"Sampling time taken = {et-st} s") + print(f"Sampling time taken = {et - st} s") diff --git a/benchmarks/cugraph/notebooks/feature_storage.ipynb b/benchmarks/cugraph/notebooks/feature_storage.ipynb index 7413ac00cde..440d76fbdb4 100644 --- a/benchmarks/cugraph/notebooks/feature_storage.ipynb +++ b/benchmarks/cugraph/notebooks/feature_storage.ipynb @@ -18,7 +18,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/datasets/vjawa/miniconda3/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/datasets/vjawa/miniforge/envs/all_cuda-115_arch-x86_64/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } diff --git a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py index 8c46095a7da..083acdde2f4 100644 --- a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py +++ b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py @@ -266,7 +266,7 @@ def uns_func(*args, **kwargs): @pytest.mark.managedmem_off @pytest.mark.poolallocator_on @pytest.mark.parametrize("batch_size", params.batch_sizes.values()) -@pytest.mark.parametrize("fanout", [params.fanout_10_25, params.fanout_5_10_15]) +@pytest.mark.parametrize("fanout", [params.fanout_10_25]) @pytest.mark.parametrize( "with_replacement", [False], ids=lambda v: f"with_replacement={v}" ) @@ -287,6 +287,8 @@ def bench_cugraph_uniform_neighbor_sample( start_list=uns_args["start_list"], fanout_vals=uns_args["fanout"], with_replacement=uns_args["with_replacement"], + use_legacy_names=False, + with_edge_properties=True, ) """ dtmap = {"int32": 32 // 8, "int64": 64 // 8} diff --git a/benchmarks/nx-cugraph/pytest-based/README.md b/benchmarks/nx-cugraph/pytest-based/README.md new file mode 100644 index 00000000000..781550fa560 --- /dev/null +++ b/benchmarks/nx-cugraph/pytest-based/README.md @@ -0,0 +1,54 @@ +## `nx-cugraph` Benchmarks + +### Overview + +This directory contains a set of scripts designed to benchmark NetworkX with the `nx-cugraph` backend and deliver a report that summarizes the speed-up and runtime deltas over default NetworkX. + +Our current benchmarks provide the following datasets: + +| Dataset | Nodes | Edges | Directed | | -------- | ------- | ------- | ------- | | netscience | 1,461 | 5,484 | Yes | | email-Eu-core | 1,005 | 25,571 | Yes | | cit-Patents | 3,774,768 | 16,518,948 | Yes | | hollywood | 1,139,905 | 57,515,616 | No | | soc-LiveJournal1 | 4,847,571 | 68,993,773 | Yes | + + + +### Scripts + +#### 1. `run-main-benchmarks.sh` +This script allows users to run a small set of commonly-used algorithms across multiple datasets and backends. All results are stored inside a sub-directory (`logs/`) and output files are named based on the combination of parameters for that benchmark. + +NOTE: If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running. + +**Usage:** + - Run with `--cpu-only`: + ```bash + ./run-main-benchmarks.sh --cpu-only + ``` + - Run with `--gpu-only`: + ```bash + ./run-main-benchmarks.sh --gpu-only + ``` + - Run without any arguments (all backends): + ```bash + ./run-main-benchmarks.sh + ``` + +#### 2. `get_graph_bench_dataset.py` +This script downloads the specified dataset using `cugraph.datasets`. + +**Usage:** + ```bash + python get_graph_bench_dataset.py [dataset] + ``` + +#### 3. `create_results_summary_page.py` +This script is designed to be run after `run-main-benchmarks.sh` in order to generate an HTML page displaying a results table comparing default NetworkX to nx-cugraph. The script also provides information about the current system, so it should be run on the machine on which benchmarks were run.
+ +**Usage:** + ```bash + python create_results_summary_page.py > report.html + ``` diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py index d40b5130827..f88d93c3f17 100644 --- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py +++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py @@ -271,9 +271,8 @@ def bench_from_networkx(benchmark, graph_obj): # normalized_param_values = [True, False] -# k_param_values = [10, 100] normalized_param_values = [True] -k_param_values = [10] +k_param_values = [10, 100, 1000] @pytest.mark.parametrize( @@ -282,6 +281,10 @@ def bench_from_networkx(benchmark, graph_obj): @pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}") def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k): G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + + if k > G.number_of_nodes(): + pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}") + result = benchmark.pedantic( target=backend_wrapper(nx.betweenness_centrality), args=(G,), @@ -305,6 +308,10 @@ def bench_edge_betweenness_centrality( benchmark, graph_obj, backend_wrapper, normalized, k ): G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + + if k > G.number_of_nodes(): + pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}") + result = benchmark.pedantic( target=backend_wrapper(nx.edge_betweenness_centrality), args=(G,), @@ -473,6 +480,26 @@ def bench_pagerank_personalized(benchmark, graph_obj, backend_wrapper): assert type(result) is dict +def bench_shortest_path(benchmark, graph_obj, backend_wrapper): + """ + This passes in the source node with the highest degree, but no target. + """ + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + + result = benchmark.pedantic( + target=backend_wrapper(nx.shortest_path), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper): G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) node = get_highest_degree_node(graph_obj) diff --git a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py new file mode 100644 index 00000000000..f1cc4b06ccc --- /dev/null +++ b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py @@ -0,0 +1,291 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import re +import pathlib +import json +import platform +import psutil +import socket +import subprocess + + +def get_formatted_time_value(time): + res = "" + if time < 1: + if time < 0.001: + units = "us" + time *= 1e6 + else: + units = "ms" + time *= 1e3 + else: + units = "s" + return f"{time:.3f}{units}" + + +def get_all_benchmark_info(): + benchmarks = {} + # Populate benchmarks dir from .json files + for json_file in logs_dir.glob("*.json"): + try: + data = json.loads(open(json_file).read()) + except json.decoder.JSONDecodeError: + continue + + for benchmark_run in data["benchmarks"]: + # example name: "bench_triangles[ds=netscience-backend=cugraph-preconverted]" + name = benchmark_run["name"] + + algo_name = name.split("[")[0] + if algo_name.startswith("bench_"): + algo_name = algo_name[6:] + # special case for betweenness_centrality + match = k_patt.match(name) + if match is not None: + algo_name += f", k={match.group(1)}" + + match = dataset_patt.match(name) + if match is None: + raise RuntimeError( + f"benchmark name {name} in file {json_file} has an unexpected format" + ) + dataset = match.group(1) + if dataset.endswith("-backend"): + dataset = dataset[:-8] + + match = backend_patt.match(name) + if match is None: + raise RuntimeError( + f"benchmark name {name} in file {json_file} has an unexpected format" + ) + backend = match.group(1) + if backend == "None": + backend = "networkx" + + runtime = benchmark_run["stats"]["mean"] + benchmarks.setdefault(algo_name, {}).setdefault(backend, {})[ + dataset + ] = runtime + return benchmarks + + +def compute_perf_vals(cugraph_runtime, networkx_runtime): + speedup_string = f"{networkx_runtime / cugraph_runtime:.3f}X" + delta = networkx_runtime - cugraph_runtime + if abs(delta) < 1: + if abs(delta) < 0.001: + units = "us" + delta *= 1e6 + else: + units = "ms" + delta *= 1e3 + else: + units = "s" + delta_string = f"{delta:.3f}{units}" + + return (speedup_string, delta_string) + + +def get_mem_info(): + return round(psutil.virtual_memory().total / (1024**3), 2) + + +def get_cuda_version(): + output = subprocess.check_output("nvidia-smi", shell=True).decode() + try: + return next( + line.split("CUDA Version: ")[1].split()[0] + for line in output.splitlines() + if "CUDA Version" in line + ) + except subprocess.CalledProcessError: + return "Failed to get CUDA version." + + +def get_first_gpu_info(): + try: + gpu_info = ( + subprocess.check_output( + "nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv,noheader", + shell=True, + ) + .decode() + .strip() + ) + if gpu_info: + gpus = gpu_info.split("\n") + num_gpus = len(gpus) + first_gpu = gpus[0] # Get the information for the first GPU + gpu_name, mem_total, _, _ = first_gpu.split(",") + return f"{num_gpus} x {gpu_name.strip()} ({round(int(mem_total.strip().split()[0]) / (1024), 2)} GB)" + else: + print("No GPU found or unable to query GPU details.") + except subprocess.CalledProcessError: + print("Failed to execute nvidia-smi. No GPU information available.") + + +def get_system_info(): + print('
<div>') + print(f"<p><strong>Hostname:</strong> {socket.gethostname()}</p>") + print( + f'<p><strong>Operating System:</strong> {platform.system()} {platform.release()}</p>' + ) + print(f'<p><strong>Kernel Version :</strong> {platform.version()}</p>') + with open("/proc/cpuinfo") as f: + print( + f'<p><strong>CPU:</strong> {next(line.strip().split(": ")[1] for line in f if "model name" in line)} ({psutil.cpu_count(logical=False)} cores)</p>' + ) + print(f'<p><strong>Memory:</strong> {get_mem_info()} GB</p>') + print(f"<p><strong>GPU:</strong> {get_first_gpu_info()}</p>") + print(f"<p><strong>CUDA Version:</strong> {get_cuda_version()}</p>") + + +if __name__ == "__main__": + logs_dir = pathlib.Path("logs") + + dataset_patt = re.compile(".*ds=([\w-]+).*") + backend_patt = re.compile(".*backend=(\w+).*") + k_patt = re.compile(".*k=(10*).*") + + # Organize all benchmark runs by the following hierarchy: algo -> backend -> dataset + benchmarks = get_all_benchmark_info() + + # dump HTML table + ordered_datasets = [ + "netscience", + "email_Eu_core", + "cit-patents", + "hollywood", + "soc-livejournal1", + ] + # dataset, # Node, # Edge, Directed info + dataset_meta = { + "netscience": ["1,461", "5,484", "Yes"], + "email_Eu_core": ["1,005", "25,571", "Yes"], + "cit-patents": ["3,774,768", "16,518,948", "Yes"], + "hollywood": ["1,139,905", "57,515,616", "No"], + "soc-livejournal1": ["4,847,571", "68,993,773", "Yes"], + } + + print( + """ + <html> + <body> + <table> + <tr> + <td>Dataset<br>Nodes<br>Edges<br>Directed</td> + """ + ) + for ds in ordered_datasets: + print( + f" <td>{ds}<br>{dataset_meta[ds][0]}<br>{dataset_meta[ds][1]}<br>{dataset_meta[ds][2]}</td>" + ) + print( + """ + </tr> + """ + ) + for algo_name in sorted(benchmarks): + algo_runs = benchmarks[algo_name] + print(" <tr>") + print(f" <td>{algo_name}</td>") + # Proceed only if any results are present for both cugraph and NX + if "cugraph" in algo_runs and "networkx" in algo_runs: + cugraph_algo_runs = algo_runs["cugraph"] + networkx_algo_runs = algo_runs["networkx"] + datasets_in_both = set(cugraph_algo_runs).intersection(networkx_algo_runs) + + # populate the table with speedup results for each dataset in the order + # specified in ordered_datasets. If results for a run using a dataset + # are not present for both cugraph and NX, output an empty cell. + for dataset in ordered_datasets: + if dataset in datasets_in_both: + cugraph_runtime = cugraph_algo_runs[dataset] + networkx_runtime = networkx_algo_runs[dataset] + (speedup, runtime_delta) = compute_perf_vals( + cugraph_runtime=cugraph_runtime, + networkx_runtime=networkx_runtime, + ) + nx_formatted = get_formatted_time_value(networkx_runtime) + cg_formatted = get_formatted_time_value(cugraph_runtime) + print( + f" <td>{nx_formatted} / {cg_formatted}<br>{speedup}<br>{runtime_delta}</td>" + ) + else: + print(f" <td></td>") + + # If a comparison between cugraph and NX cannot be made, output empty cells + # for each dataset + else: + for _ in range(len(ordered_datasets)): + print(" <td></td>") + print(" </tr>") + print( + """ + </table>\n</body>\n</html>
\n""") diff --git a/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py b/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py new file mode 100644 index 00000000000..5a0a15da8ee --- /dev/null +++ b/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Checks if a particular dataset has been downloaded inside the datasets dir +(RAPIDS_DATAEST_ROOT_DIR). If not, the file will be downloaded using the +datasets API. + +Positional Arguments: + 1) dataset name (e.g. 'email_Eu_core', 'cit-patents') + available datasets can be found here: `python/cugraph/cugraph/datasets/__init__.py` +""" + +import sys + +import cugraph.datasets as cgds + + +if __name__ == "__main__": + # download and store dataset (csv) by using the Datasets API + dataset = sys.argv[1].replace("-", "_") + dataset_obj = getattr(cgds, dataset) + + if not dataset_obj.get_path().exists(): + dataset_obj.get_edgelist(download=True) diff --git a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh new file mode 100755 index 00000000000..3059e3d4bdf --- /dev/null +++ b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# location to store datasets used for benchmarking +export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph +mkdir -p logs + +# list of algos, datasets, and back-ends to use in combinations +algos=" + pagerank + betweenness_centrality + louvain + shortest_path + weakly_connected_components + triangles + bfs_predecessors +" +datasets=" + netscience + email_Eu_core + cit-patents + hollywood + soc-livejournal +" +# None backend is default networkx +# cugraph-preconverted backend is nx-cugraph +backends=" + None + cugraph-preconverted +" +# check for --cpu-only or --gpu-only args +if [[ "$#" -eq 1 ]]; then + case $1 in + --cpu-only) + backends="None" + ;; + --gpu-only) + backends="cugraph-preconverted" + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +fi + +for algo in $algos; do + for dataset in $datasets; do + # this script can be used to download benchmarking datasets by name via cugraph.datasets + python get_graph_bench_dataset.py $dataset + for backend in $backends; do + name="${backend}__${algo}__${dataset}" + echo "Running: $backend, $dataset, bench_$algo" + # command to reproduce test + # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py" + pytest -sv \ + -k "$backend and $dataset and bench_$algo and not 1000" \ + --benchmark-json="logs/${name}.json" \ + bench_algos.py 2>&1 | tee "logs/${name}.out" + done + done +done diff --git a/benchmarks/pytest.ini b/benchmarks/pytest.ini index fe7fc31b6d6..d692b78de37 100644 --- a/benchmarks/pytest.ini +++ b/benchmarks/pytest.ini @@ -8,6 +8,7 @@ testpaths = addopts = --benchmark-columns="min, max, mean, stddev, outliers" + --tb=native markers = managedmem_on: RMM managed memory enabled diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 707c61e4d3e..f3979ab3049 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -17,7 +17,7 @@ cd "${package_dir}" python -m pip wheel \ -w dist \ - -vvv \ + -v \ --no-deps \ --disable-pip-version-check \ --extra-index-url https://pypi.nvidia.com \ @@ -30,7 +30,23 @@ if [[ ${package_name} == "nx-cugraph" ]] || \ [[ ${package_name} == "cugraph-equivariant" ]]; then RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 dist else + case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXCLUDE_ARGS=( + --exclude "libcublas.so.12" + --exclude "libcublasLt.so.12" + --exclude "libcurand.so.10" + --exclude "libcusolver.so.11" + --exclude "libcusparse.so.12" + --exclude "libnvJitLink.so.12" + ) + ;; + 11.*) + EXCLUDE_ARGS=() + ;; + esac + mkdir -p final_dist - python -m auditwheel repair -w final_dist dist/* + python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist fi diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 6f1b23923ff..20d9bf47e3e 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -19,8 +19,16 @@ export PIP_CONSTRAINT="${PWD}/constraints.txt" PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export
SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index ee33ab4a82d..fa967b0be29 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -6,7 +6,16 @@ set -euo pipefail PARALLEL_LEVEL=$(python -c \ "from math import ceil; from multiprocessing import cpu_count; print(ceil(cpu_count()/4))") -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac + +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/${EXTRA_CMAKE_ARGS}" export SKBUILD_BUILD_TOOL_ARGS="-j${PARALLEL_LEVEL};-l${PARALLEL_LEVEL}" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 08c22fca02e..5859ebde953 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -30,7 +30,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" +NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -45,7 +45,8 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" > VERSION # Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))") DEPENDENCIES=( cudf @@ -71,23 +72,30 @@ DEPENDENCIES=( rmm rapids-dask-dependency ) +UCXX_DEPENDENCIES=( + ucx-py +) for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done - sed_runner "/-.* ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}" + for DEP in "${UCXX_DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" + done done for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" done - sed_runner "/\"ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}" + for DEP in "${UCXX_DEPENDENCIES[@]}"; do + sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" + done done # ucx-py version 
-sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}.*\"/}" conda/recipes/cugraph/conda_build_config.yaml -sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}.*\"/}" conda/recipes/cugraph-service/conda_build_config.yaml -sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}.*\"/}" conda/recipes/pylibcugraph/conda_build_config.yaml +for FILE in conda/recipes/*/conda_build_config.yaml; do + sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCXX_SHORT_TAG_PEP440}.*\"/}" "${FILE}" +done # CI files for FILE in .github/workflows/*.yaml; do diff --git a/ci/run_nx_cugraph_pytests.sh b/ci/run_nx_cugraph_pytests.sh index b0caffd0a0f..0e309d1e2d4 100755 --- a/ci/run_nx_cugraph_pytests.sh +++ b/ci/run_nx_cugraph_pytests.sh @@ -6,4 +6,5 @@ set -euo pipefail # Support invoking run_nx_cugraph_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/nx-cugraph/nx_cugraph -pytest --capture=no --cache-clear --benchmark-disable "$@" tests +NX_CUGRAPH_USE_COMPAT_GRAPHS=False pytest --capture=no --cache-clear --benchmark-disable "$@" tests +NX_CUGRAPH_USE_COMPAT_GRAPHS=True pytest --capture=no --cache-clear --benchmark-disable "$@" tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index ba106d34a46..6c14870164e 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -38,7 +38,7 @@ nvidia-smi # RAPIDS_DATASET_ROOT_DIR is used by test scripts export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" pushd "${RAPIDS_DATASET_ROOT_DIR}" -./get_test_data.sh --subset +./get_test_data.sh --cpp_ci_subset popd export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ diff --git a/ci/test_python.sh b/ci/test_python.sh index e8c8272e8d6..f21a06cf061 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -108,7 +108,7 @@ echo "nx-cugraph coverage from networkx tests: $_coverage" echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }' # Ensure all algorithms were called by comparing covered lines to function lines. 
# Run our tests again (they're fast enough) to add their coverage, then create coverage.json -pytest \ +NX_CUGRAPH_USE_COMPAT_GRAPHS=False pytest \ --pyargs nx_cugraph \ --config-file=../pyproject.toml \ --cov-config=../pyproject.toml \ @@ -159,7 +159,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then cugraph \ cugraph-dgl \ 'dgl>=1.1.0.cu*,<=2.0.0.cu*' \ - 'pytorch>=2.0' \ + 'pytorch>=2.3,<2.4' \ 'cuda-version=11.8' rapids-print-env @@ -198,10 +198,10 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then # TODO re-enable logic once CUDA 12 is testable #if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then CONDA_CUDA_VERSION="11.8" - PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" + PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu118.html" #else # CONDA_CUDA_VERSION="12.1" - # PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html" + # PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu121.html" #fi # Will automatically install built dependencies of cuGraph-PyG diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 158704e08d1..e3690dfde6e 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -37,6 +37,7 @@ else DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \ DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \ + NX_CUGRAPH_USE_COMPAT_GRAPHS=False \ python -m pytest \ -v \ --import-mode=append \ diff --git a/ci/test_wheel_cugraph-dgl.sh b/ci/test_wheel_cugraph-dgl.sh index 564b46cb07e..9b79cb17fe4 100755 --- a/ci/test_wheel_cugraph-dgl.sh +++ b/ci/test_wheel_cugraph-dgl.sh @@ -32,18 +32,8 @@ fi PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}" DGL_URL="https://data.dgl.ai/wheels/cu${PYTORCH_CUDA_VER}/repo.html" -# Starting from 2.2, PyTorch wheels depend on nvidia-nccl-cuxx>=2.19 wheel and -# dynamically link to NCCL. RAPIDS CUDA 11 CI images have an older NCCL version that -# might shadow the newer NCCL required by PyTorch during import (when importing -# `cupy` before `torch`). 
-if [[ "${NCCL_VERSION}" < "2.19" ]]; then - PYTORCH_VER="2.1.0" -else - PYTORCH_VER="2.3.0" -fi - rapids-logger "Installing PyTorch and DGL" -rapids-retry python -m pip install "torch==${PYTORCH_VER}" --index-url ${PYTORCH_URL} +rapids-retry python -m pip install torch==2.3.0 --index-url ${PYTORCH_URL} rapids-retry python -m pip install dgl==2.0.0 --find-links ${DGL_URL} python -m pytest python/cugraph-dgl/tests diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index c55ae033344..8f4b16a2dec 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -29,13 +29,13 @@ export CI_RUN=1 if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then PYTORCH_URL="https://download.pytorch.org/whl/cu118" - PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" + PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu118.html" else PYTORCH_URL="https://download.pytorch.org/whl/cu121" - PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html" + PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu121.html" fi rapids-logger "Installing PyTorch and PyG dependencies" -rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} +rapids-retry python -m pip install torch==2.3.0 --index-url ${PYTORCH_URL} rapids-retry python -m pip install "torch-geometric>=2.5,<2.6" rapids-retry python -m pip install \ ogb \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f0b86c791f8..a23c2395646 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,55 +16,55 @@ dependencies: - cuda-nvtx - cuda-version=11.8 - cudatoolkit -- cudf==24.10.*,>=0.0.0a0 +- cudf==24.12.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.10.*,>=0.0.0a0 -- dask-cudf==24.10.*,>=0.0.0a0 +- dask-cuda==24.12.*,>=0.0.0a0 +- dask-cudf==24.12.*,>=0.0.0a0 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* - graphviz - ipython -- libcudf==24.10.*,>=0.0.0a0 -- libcugraphops==24.10.*,>=0.0.0a0 -- libraft-headers==24.10.*,>=0.0.0a0 -- libraft==24.10.*,>=0.0.0a0 -- librmm==24.10.*,>=0.0.0a0 +- libcudf==24.12.*,>=0.0.0a0 +- libcugraphops==24.12.*,>=0.0.0a0 +- libraft-headers==24.12.*,>=0.0.0a0 +- libraft==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - nbsphinx -- nccl>=2.9.9 +- nccl>=2.19 - networkx>=2.5.1 - networkx>=3.0 - ninja - notebook>=0.5.0 - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 +- ogb - openmpi -- packaging>=21 - pandas - pre-commit - pydantic - pydata-sphinx-theme -- pylibcugraphops==24.10.*,>=0.0.0a0 -- pylibraft==24.10.*,>=0.0.0a0 -- pylibwholegraph==24.10.*,>=0.0.0a0 +- pylibcugraphops==24.12.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 +- pylibwholegraph==24.12.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- pytorch>=2.0,<2.2.0a0 -- raft-dask==24.10.*,>=0.0.0a0 +- pytorch>=2.3,<2.4.0a0 +- raft-dask==24.12.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - recommonmark - requests -- rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- rmm==24.12.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 - scikit-learn>=0.23.1 - scipy - setuptools>=61.0.0 @@ -74,8 +74,9 @@ dependencies: - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 - torchdata +- torchmetrics - ucx-proc=*=gpu -- ucx-py==0.40.*,>=0.0.0a0 +- ucx-py==0.41.*,>=0.0.0a0 - wget - wheel name: 
all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ebded3eec92..eca10584304 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -18,58 +18,58 @@ dependencies: - cuda-nvtx-dev - cuda-profiler-api - cuda-version=12.5 -- cudf==24.10.*,>=0.0.0a0 +- cudf==24.12.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.10.*,>=0.0.0a0 -- dask-cudf==24.10.*,>=0.0.0a0 +- dask-cuda==24.12.*,>=0.0.0a0 +- dask-cudf==24.12.*,>=0.0.0a0 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* - graphviz - ipython - libcublas-dev -- libcudf==24.10.*,>=0.0.0a0 -- libcugraphops==24.10.*,>=0.0.0a0 +- libcudf==24.12.*,>=0.0.0a0 +- libcugraphops==24.12.*,>=0.0.0a0 - libcurand-dev - libcusolver-dev - libcusparse-dev -- libraft-headers==24.10.*,>=0.0.0a0 -- libraft==24.10.*,>=0.0.0a0 -- librmm==24.10.*,>=0.0.0a0 +- libraft-headers==24.12.*,>=0.0.0a0 +- libraft==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - nbsphinx -- nccl>=2.9.9 +- nccl>=2.19 - networkx>=2.5.1 - networkx>=3.0 - ninja - notebook>=0.5.0 - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc +- ogb - openmpi -- packaging>=21 - pandas - pre-commit - pydantic - pydata-sphinx-theme -- pylibcugraphops==24.10.*,>=0.0.0a0 -- pylibraft==24.10.*,>=0.0.0a0 -- pylibwholegraph==24.10.*,>=0.0.0a0 +- pylibcugraphops==24.12.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 +- pylibwholegraph==24.12.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- pytorch>=2.0,<2.2.0a0 -- raft-dask==24.10.*,>=0.0.0a0 +- pytorch>=2.3,<2.4.0a0 +- raft-dask==24.12.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - recommonmark - requests -- rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- rmm==24.12.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 - scikit-learn>=0.23.1 - scipy - setuptools>=61.0.0 @@ -79,8 +79,9 @@ dependencies: - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 - torchdata +- torchmetrics - ucx-proc=*=gpu -- ucx-py==0.40.*,>=0.0.0a0 +- ucx-py==0.41.*,>=0.0.0a0 - wget - wheel name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index d1cf6fcd9e9..c80ca6890a8 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -27,11 +27,11 @@ requirements: - cugraph ={{ version }} - dgl >=1.1.0.cu* - numba >=0.57 - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - pylibcugraphops ={{ minor_version }} - tensordict >=0.1.2 - python - - pytorch >=2.0 + - pytorch >=2.3,<2.4.0a0 - cupy >=12.0.0 tests: diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 2e1788ac0c6..38d4a3d7d15 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -29,9 +29,9 @@ requirements: run: - rapids-dask-dependency ={{ minor_version }} - numba >=0.57 - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - python - - pytorch >=2.0 + - pytorch >=2.3,<2.4.0a0 - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml index 2ac251ab10a..67ed3e26b0e 100644 --- a/conda/recipes/cugraph-service/conda_build_config.yaml +++ b/conda/recipes/cugraph-service/conda_build_config.yaml @@ -1,2 
+1,2 @@ ucx_py_version: - - "0.40.*" + - "0.41.*" diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index c1027582c78..7df7573e2d0 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -63,7 +63,7 @@ outputs: - dask-cuda ={{ minor_version }} - dask-cudf ={{ minor_version }} - numba >=0.57 - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - python - rapids-dask-dependency ={{ minor_version }} - thriftpy2 >=0.4.15,!=0.5.0,!=0.5.1 diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml index 2525441f92d..10f2e15c550 100644 --- a/conda/recipes/cugraph/conda_build_config.yaml +++ b/conda/recipes/cugraph/conda_build_config.yaml @@ -20,4 +20,4 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.40.*" + - "0.41.*" diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index fccc3a208d6..9f5a137faba 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -68,7 +68,7 @@ requirements: - python - raft-dask ={{ minor_version }} - rmm ={{ minor_version }} - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 run: - aiohttp diff --git a/conda/recipes/libcugraph/conda_build_config.yaml b/conda/recipes/libcugraph/conda_build_config.yaml index 26aa428d7f5..55bd635c330 100644 --- a/conda/recipes/libcugraph/conda_build_config.yaml +++ b/conda/recipes/libcugraph/conda_build_config.yaml @@ -17,7 +17,7 @@ doxygen_version: - ">=1.8.11" nccl_version: - - ">=2.9.9" + - ">=2.19" c_stdlib: - sysroot diff --git a/conda/recipes/nx-cugraph/meta.yaml b/conda/recipes/nx-cugraph/meta.yaml index d67287be757..263f53d9a8f 100644 --- a/conda/recipes/nx-cugraph/meta.yaml +++ b/conda/recipes/nx-cugraph/meta.yaml @@ -14,9 +14,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} requirements: host: diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml index 2525441f92d..10f2e15c550 100644 --- a/conda/recipes/pylibcugraph/conda_build_config.yaml +++ b/conda/recipes/pylibcugraph/conda_build_config.yaml @@ -20,4 +20,4 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.40.*" + - "0.41.*" diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml index 15632cfcc0e..54d29a68d91 100644 --- a/conda/recipes/pylibcugraph/meta.yaml +++ b/conda/recipes/pylibcugraph/meta.yaml @@ -65,7 +65,7 @@ requirements: - libcugraph ={{ version }} - pylibraft ={{ minor_version }} - python - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 441627cabce..b8eaba9d575 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -332,8 +332,12 @@ set(CUGRAPH_SOURCES src/sampling/neighbor_sampling_sg_v32_e64.cpp src/sampling/neighbor_sampling_sg_v32_e32.cpp src/sampling/neighbor_sampling_sg_v64_e64.cpp - src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu - src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu + src/sampling/negative_sampling_sg_v32_e64.cu + src/sampling/negative_sampling_sg_v32_e32.cu + 
src/sampling/negative_sampling_sg_v64_e64.cu + src/sampling/negative_sampling_mg_v32_e64.cu + src/sampling/negative_sampling_mg_v32_e32.cu + src/sampling/negative_sampling_mg_v64_e64.cu src/sampling/sampling_post_processing_sg_v64_e64.cu src/sampling/sampling_post_processing_sg_v32_e32.cu src/sampling/sampling_post_processing_sg_v32_e64.cu @@ -483,6 +487,7 @@ set(CUGRAPH_SOURCES src/centrality/betweenness_centrality_mg_v32_e32.cu src/centrality/betweenness_centrality_mg_v32_e64.cu src/tree/legacy/mst.cu + src/from_cugraph_ops/sampling_index.cu src/components/weakly_connected_components_sg_v64_e64.cu src/components/weakly_connected_components_sg_v32_e32.cu src/components/weakly_connected_components_sg_v32_e64.cu @@ -656,6 +661,7 @@ add_library(cugraph_c src/c_api/louvain.cpp src/c_api/triangle_count.cpp src/c_api/neighbor_sampling.cpp + src/c_api/negative_sampling.cpp src/c_api/labeling_result.cpp src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp diff --git a/cpp/examples/developers/graph_operations/graph_operations.cu b/cpp/examples/developers/graph_operations/graph_operations.cu index 014cedcab7e..912f9f1fd46 100644 --- a/cpp/examples/developers/graph_operations/graph_operations.cu +++ b/cpp/examples/developers/graph_operations/graph_operations.cu @@ -131,7 +131,7 @@ create_graph(raft::handle_t const& handle, // if (multi_gpu) { - std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore) = + std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore, std::ignore) = cugraph::shuffle_external_edges(handle, std::move(d_edge_srcs), std::move(d_edge_dsts), @@ -215,10 +215,10 @@ void perform_example_graph_operations( graph_view); cugraph::update_edge_src_property( - handle, graph_view, vertex_weights.begin(), src_vertex_weights_cache); + handle, graph_view, vertex_weights.begin(), src_vertex_weights_cache.mutable_view()); cugraph::update_edge_dst_property( - handle, graph_view, vertex_weights.begin(), dst_vertex_weights_cache); + handle, graph_view, vertex_weights.begin(), dst_vertex_weights_cache.mutable_view()); rmm::device_uvector weighted_averages( size_of_the_vertex_partition_assigned_to_this_process, handle.get_stream()); @@ -259,10 +259,10 @@ void perform_example_graph_operations( graph_view); cugraph::update_edge_src_property( - handle, graph_view, vertex_weights.begin(), src_vertex_weights_cache); + handle, graph_view, vertex_weights.begin(), src_vertex_weights_cache.mutable_view()); cugraph::update_edge_dst_property( - handle, graph_view, vertex_weights.begin(), dst_vertex_weights_cache); + handle, graph_view, vertex_weights.begin(), dst_vertex_weights_cache.mutable_view()); rmm::device_uvector weighted_averages( size_of_the_vertex_partition_assigned_to_this_process, handle.get_stream()); diff --git a/cpp/examples/developers/vertex_and_edge_partition/vertex_and_edge_partition.cu b/cpp/examples/developers/vertex_and_edge_partition/vertex_and_edge_partition.cu index ce02e3b2639..c261ff6d843 100644 --- a/cpp/examples/developers/vertex_and_edge_partition/vertex_and_edge_partition.cu +++ b/cpp/examples/developers/vertex_and_edge_partition/vertex_and_edge_partition.cu @@ -127,7 +127,7 @@ create_graph(raft::handle_t const& handle, // if (multi_gpu) { - std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore) = + std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore, std::ignore) = cugraph::shuffle_external_edges(handle, std::move(d_edge_srcs), std::move(d_edge_dsts), diff --git 
a/cpp/examples/users/multi_gpu_application/mg_graph_algorithms.cpp b/cpp/examples/users/multi_gpu_application/mg_graph_algorithms.cpp index a9e2a170208..db629117604 100644 --- a/cpp/examples/users/multi_gpu_application/mg_graph_algorithms.cpp +++ b/cpp/examples/users/multi_gpu_application/mg_graph_algorithms.cpp @@ -123,7 +123,7 @@ create_graph(raft::handle_t const& handle, // if (multi_gpu) { - std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore) = + std::tie(d_edge_srcs, d_edge_dsts, d_edge_wgts, std::ignore, std::ignore, std::ignore) = cugraph::shuffle_external_edges(handle, std::move(d_edge_srcs), std::move(d_edge_dsts), @@ -248,9 +248,8 @@ void run_graph_algorithms( std::cout); } -int main(int argc, char** argv) +void run_tests() { - initialize_mpi_and_set_device(argc, argv); std::unique_ptr handle = initialize_mg_handle(); // @@ -279,6 +278,7 @@ int main(int argc, char** argv) std::move(std::make_optional(edge_wgts)), renumber, is_symmetric); + // Non-owning view of the graph object auto graph_view = graph.view(); @@ -292,5 +292,14 @@ int main(int argc, char** argv) run_graph_algorithms( *handle, graph_view, edge_weight_view); + handle.release(); +} + +int main(int argc, char** argv) +{ + initialize_mpi_and_set_device(argc, argv); + + run_tests(); + RAFT_MPI_TRY(MPI_Finalize()); } diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 8ba39fa2328..7e5af4ac686 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1579,11 +1579,11 @@ std:: template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed = std::numeric_limits::max()); + size_t max_length); /** * @brief returns biased random walks from starting sources, where each path is of given @@ -1623,11 +1623,11 @@ uniform_random_walks(raft::handle_t const& handle, template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed = std::numeric_limits::max()); + size_t max_length); /** * @brief returns biased random walks with node2vec biases from starting sources, @@ -1670,13 +1670,13 @@ biased_random_walks(raft::handle_t const& handle, template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, weight_t p, - weight_t q, - uint64_t seed = std::numeric_limits::max());
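The three random-walk signatures above replace the old trailing `uint64_t seed` parameter with a caller-owned `raft::random::RngState`, so repeated calls can draw from one reproducible random stream. A minimal sketch of the new calling convention follows; it is not part of the diff, and it assumes the template parameter order `(vertex_t, edge_t, weight_t, multi_gpu)` plus a single-GPU `graph_view` built elsewhere (the flattened diff elides template argument lists):

```cpp
// Hypothetical usage sketch (not from this PR): uniform random walks with
// the reworked API. Assumes vertex_t=int32_t, edge_t=int32_t, weight_t=float.
#include <cugraph/algorithms.hpp>
#include <raft/core/device_span.hpp>
#include <raft/core/handle.hpp>
#include <raft/random/rng_state.hpp>
#include <rmm/device_uvector.hpp>

void walk_example(raft::handle_t const& handle,
                  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
                  rmm::device_uvector<int32_t> const& start_vertices)
{
  raft::random::RngState rng_state{0};  // seeded once, reused across calls

  auto [vertex_paths, weight_paths] =
    cugraph::uniform_random_walks<int32_t, int32_t, float, false>(
      handle,
      rng_state,     // replaces the removed `uint64_t seed` parameter
      graph_view,
      std::nullopt,  // edge weights are optional for uniform walks
      raft::device_span<int32_t const>{start_vertices.data(), start_vertices.size()},
      size_t{4});    // max_length
}
```

`biased_random_walks` and `node2vec_random_walks` follow the same pattern: the RngState moves to the second argument and the defaulted seed disappears.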
#ifndef NO_CUGRAPH_OPS /** @@ -1684,6 +1684,8 @@ node2vec_random_walks(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API is deprecated; use uniform_neighbor_sample instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1714,6 +1716,8 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle, * list of vertices and sample size per vertex. The output graph consists of the given * vertices with each vertex having at most `sample_size` neighbors from the original graph * + * @deprecated This API is deprecated; use uniform_neighbor_sample instead. + * * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and * single-gpu). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1869,12 +1873,16 @@ void triangle_count(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object. + * * @param do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). * * @return edge_property_t containing the edge triangle count */ template edge_property_t, edge_t> edge_triangle_count( - raft::handle_t const& handle, graph_view_t const& graph_view); + raft::handle_t const& handle, + graph_view_t const& graph_view, + bool do_expensive_check = false); /* * @brief Compute K-Truss. diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp index 61ac1bd2804..3d99b85556b 100644 --- a/cpp/include/cugraph/detail/utility_wrappers.hpp +++ b/cpp/include/cugraph/detail/utility_wrappers.hpp @@ -87,6 +87,28 @@ void sequence_fill(rmm::cuda_stream_view const& stream_view, size_t size, value_t start_value); +/** + * @brief Fill a buffer with a sequence of values with the input stride + * + * Fills the buffer with the sequence with the input stride: + * {start_value, start_value+stride, start_value+stride*2, ..., start_value+stride*(size-1)} + * + * @tparam value_t type of the value to operate on + * + * @param[in] stream_view stream view + * @param[out] d_value device array to fill + * @param[in] size number of elements in array + * @param[in] start_value starting value for sequence + * @param[in] stride input stride + * + */ +template +void stride_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t start_value, + value_t stride); + /** * @brief Compute the maximum vertex id of an edge list * diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index e1364f69991..866ab16ee97 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -988,63 +988,6 @@ rmm::device_uvector select_random_vertices( bool sort_vertices, bool do_expensive_check = false); -/** - * @brief renumber sampling output - * - * @deprecated This API will be deprecated and will be replaced by the - * renumber_and_compress_sampled_edgelist and renumber_and_sort_sampled_edgelist functions in - * sampling_functions.hpp. - * - * This function renumbers sampling function (e.g. uniform_neighbor_sample) outputs satisfying the - * following requirements. - * - * 1. If @p edgelist_hops is valid, we can consider (vertex ID, flag=src, hop) triplets for each - * vertex ID in @p edgelist_srcs and (vertex ID, flag=dst, hop) triplets for each vertex ID in @p - * edgelist_dsts. From these triplets, we can find the minimum (hop, flag) pairs for every unique - * vertex ID (hop is the primary key and flag is the secondary key, flag=src is considered smaller - * than flag=dst if hop numbers are same).
Vertex IDs with smaller (hop, flag) pairs precede vertex - * IDs with larger (hop, flag) pairs in renumbering. Ordering can be arbitrary among the vertices - * with the same (hop, flag) pairs. - * 2. If @p edgelist_hops is invalid, unique vertex IDs in @p edgelist_srcs precede vertex IDs that - * appear only in @p edgelist_dsts. - * 3. If label_offsets.has_value() is ture, edge lists for different labels will be renumbered - * separately. - * - * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). - * - * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. - * @tparam label_t Type of labels. Needs to be an integral type. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param edgelist_srcs A vector storing original edgelist source vertices. - * @param edgelist_dsts A vector storing original edgelist destination vertices (size = @p - * edgelist_srcs.size()). - * @param edgelist_hops An optional pointer to the array storing hops for each edge list (source, - * destination) pairs (size = @p edgelist_srcs.size() if valid). - * @param label_offsets An optional tuple of unique labels and the input edge list (@p - * edgelist_srcs, @p edgelist_hops, and @p edgelist_dsts) offsets for the labels (siez = # unique - * labels + 1). - * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return Tuple of vectors storing renumbered edge sources (size = @p edgelist_srcs.size()) , - * renumbered edge destinations (size = @p edgelist_dsts.size()), renumber_map to query original - * verties (size = # unique vertices or aggregate # unique vertices for every label), and - * renumber_map offsets (size = std::get<0>(*label_offsets).size() + 1, valid only if @p - * label_offsets.has_value() is true). 
- */ -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - std::optional>> -renumber_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional> edgelist_hops, - std::optional, raft::device_span>> - label_offsets, - bool do_expensive_check = false); - /** * @brief Remove self loops from an edge list * @@ -1178,7 +1121,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& edge_srcs, rmm::device_uvector&& edge_dsts, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index cbb52ef3b1e..a2ff3166fa4 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -636,7 +636,7 @@ class graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity( raft::handle_t const& handle, @@ -945,7 +945,7 @@ class graph_view_t has_edge(raft::handle_t const& handle, raft::device_span edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check = false); + bool do_expensive_check = false) const; rmm::device_uvector compute_multiplicity(raft::handle_t const& handle, raft::device_span edge_srcs, diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp index a2111804997..759635b4a34 100644 --- a/cpp/include/cugraph/mtmg/instance_manager.hpp +++ b/cpp/include/cugraph/mtmg/instance_manager.hpp @@ -20,6 +20,8 @@ #include +#include + #include namespace cugraph { diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp index a9e4b81f894..e9d25c4576b 100644 --- a/cpp/include/cugraph/mtmg/resource_manager.hpp +++ b/cpp/include/cugraph/mtmg/resource_manager.hpp @@ -27,6 +27,8 @@ #include #include +#include + #include namespace cugraph { diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index fec1a07604e..783cd3a7e2b 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -476,12 +476,12 @@ renumber_and_sort_sampled_edgelist( * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, - * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * we can find the minimum (hop, flag) pair for every unique vertex ID (hop is the primary key and * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs * in renumbering (if their vertex types are same, vertices with different types are renumbered * separately). Ordering can be arbitrary among the vertices with the same (vertex type, hop, flag) - * triplets. If @p seed_vertices.has-value() is true, we assume (hop=0, flag=major) for every vertex + * triplets. If @p seed_vertices.has_value() is true, we assume (hop=0, flag=major) for every vertex * in @p *seed_vertices in renumbering (this is relevant when there are seed vertices with no * neighbors). * 2. 
If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that @@ -495,11 +495,15 @@ renumber_and_sort_sampled_edgelist( * Edge IDs are renumbered fulfilling the following requirements (This is relevant only when @p * edgelist_edge_ids.has_value() is true). * - * 1. If @p edgelist_edge_types.has_value() is true, unique (edge type, edge ID) pairs are - * renumbered to consecutive integers starting from 0 for each edge type. If @p - * edgelist_edge_types.has_value() is true, unique edge IDs are renumbered to consecutive inetgers - * starting from 0. - * 2. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * 1. If @p edgelist_hops is valid, we can consider (edge ID, hop) pairs. From these pairs, we can + * find the minimum hop value for every unique edge ID. Edge IDs with smaller hop values precede + * edge IDs with larger hop values in renumbering (if their edge types are same, edges with + * different edge types are renumbered separately). Ordering can be arbitrary among the edge IDs + * with the same (edge type, hop) pairs. + * 2. If @p edgelist_hops.has_value() is false, unique edge IDs (for each edge type if @p + * edgelist_edge_types.has_value() is true) are mapped to consecutive integers starting from 0. The + * ordering can be arbitrary. + * 3. If @p edgelist_label_offsets.has_value() is true, edge lists for different labels will be * renumbered separately. * * The renumbered edges are sorted based on the following rules. @@ -510,6 +514,11 @@ renumber_and_sort_sampled_edgelist( * true. * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. * + * This function assumes that there is a single edge source vertex type and a single edge + * destination vertex type for each edge. If @p edgelist_edge_types.has_value() is false (i.e. there + * is only one edge type), there should be only one edge source vertex type and only one edge + * destination vertex type; the source & destination vertex types may or may not coincide. + * * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -530,19 +539,16 @@ renumber_and_sort_sampled_edgelist( * edgelist_srcs.size() if valid). * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. - * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input - * edges (size = @p num_labels + 1). @p edgelist_label_offsets should be valid if @p num_labels - * >= 2. * @param seed_vertices An optional pointer to the array storing seed vertices in hop 0. * @param seed_vertex_label_offsets An optional pointer to the array storing label offsets to the * seed vertices (size = @p num_labels + 1). @p seed_vertex_label_offsets should be valid if @p * num_labels >= 2 and @p seed_vertices is valid and invalid otherwise. - * ext_vertices A pointer to the array storing external vertex IDs for the local internal vertices. - * The local internal vertex range can be obatined bgy invoking a graph_view_t object's - * local_vertex_partition_range() function. ext_vertex_type offsets A pointer to the array storing - * vertex type offsets for the entire external vertex ID range (array size = @p num_vertex_types + - * 1).
For example, if the array stores [0, 100, 200], external vertex IDs [0, 100) has vertex type - 0 and external vertex IDs [100, 200) has vertex type 1. + * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input + * edges (size = @p num_labels + 1). @p edgelist_label_offsets should be valid if @p num_labels + * >= 2. + * @param vertex_type_offsets A pointer to the array storing vertex type offsets for the entire + * vertex ID range (array size = @p num_vertex_types + 1). For example, if the array stores [0, 100, + * 200], vertex IDs [0, 100) has vertex type 0 and vertex IDs [100, 200) has vertex type 1. * @param num_labels Number of labels. Labels are considered if @p num_labels >=2 and ignored if @p * num_labels = 1. * @param num_hops Number of hops. Hop numbers are considered if @p num_hops >=2 and ignored if @p @@ -552,31 +558,36 @@ renumber_and_sort_sampled_edgelist( * @param src_is_major A flag to determine whether to use the source or destination as the * major key in renumbering and sorting. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid - * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p - * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p - * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the renumbered - * and sorted edges (size = @p num_labels * @p num_hops + 1, valid only when @p - * edgelist_hops.has_value() or @p edgelist_label_offsetes.has_value() is true), renumber_map to - * query original vertices (size = # unique or aggregate # unique vertices for each label), and - * label offsets to the renumber map (size = @p num_labels + 1, valid only if @p - * edgelist_label_offsets.has_value() is true). + * @return Tuple of vectors storing renumbered edge sources, renumbered edge destinations, optional + * edge weights (valid only if @p edgelist_weights.has_value() is true), optional renumbered edge + * IDs (valid only if @p edgelist_edge_ids.has_value() is true), optional (label, edge type, hop) + * offset values to the renumbered and sorted edges (size = @p num_labels * @p num_edge_types * @p + * num_hops + 1, valid only when @p edgelist_edge_types.has_value(), @p edgelist_hops.has_value(), + * or @p edgelist_label_offsets.has_value() is true), renumber_map to query original vertices (size + * = # unique or aggregate # unique vertices for each label), (label, vertex type) offsets to the + * vertex renumber map (size = @p num_labels * @p num_vertex_types + 1), optional renumber_map to + * query original edge IDs (size = # unique (edge_type, edge ID) pairs, valid only if @p + * edgelist_edge_ids.has_value() is true), and optional (label, edge type) offsets to the edge ID + * renumber map (size = @p num_labels * @p num_edge_types + 1, valid only if @p + * edgelist_edge_ids.has_value() is true). We do not explicitly return edge source & destination + * vertex types as we assume that source & destination vertex types are implicitly determined for a + * given edge type.
*/ template std::tuple< - rmm::device_uvector, // srcs - rmm::device_uvector, // dsts - std::optional>, // weights - std::optional>, // edge IDs - std::optional>, // edge types - std::optional>, // (label, edge type, hop) offsets to the edges - rmm::device_uvector, // vertex renumber map - std::optional>, // (label, type) offsets to the vertex renumber map + rmm::device_uvector, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // (label, edge type, hop) offsets to the edges + rmm::device_uvector, // vertex renumber map + rmm::device_uvector, // (label, vertex type) offsets to the vertex renumber map std::optional>, // edge ID renumber map - std::optional>> // (label, type) offsets to the edge ID renumber map + std::optional< + rmm::device_uvector>> // (label, edge type) offsets to the edge ID renumber map heterogeneous_renumber_and_sort_sampled_edgelist( raft::handle_t const& handle, rmm::device_uvector&& edgelist_srcs, @@ -585,11 +596,10 @@ heterogeneous_renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, std::optional>&& edgelist_hops, - std::optional> edgelist_label_offsets, std::optional> seed_vertices, std::optional> seed_vertex_label_offsets, - raft::device_span ext_vertices, - raft::device_span ext_vertex_type_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, size_t num_labels, size_t num_hops, size_t num_vertex_types, @@ -743,4 +753,61 @@ lookup_endpoints_from_edge_ids_and_types( raft::device_span edge_ids_to_lookup, raft::device_span edge_types_to_lookup); +/** + * @brief Negative Sampling + * + * This function generates negative samples for a graph. + * + * Negative sampling is done by generating a random graph according to the specified + * parameters and optionally removing samples that represent actual edges in the graph. + * + * Sampling occurs by creating a list of source vertex ids from biased sampling + * of the source vertex space, and destination vertex ids from biased sampling of the + * destination vertex space, and using this as the putative list of edges. We + * can then optionally remove duplicates and remove actual edges in the graph to generate + * the final list. If necessary we will repeat the process to end with a resulting + * edge list of the appropriate size. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if + * true) are major indices + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object to generate negative samples for + * @param rng_state RNG state + * @param src_biases Optional bias for randomly selecting source vertices. If std::nullopt vertices + * will be selected uniformly. In multi-GPU environment the biases should be partitioned based + * on the vertex partitions. + * @param dst_biases Optional bias for randomly selecting destination vertices. If std::nullopt + * vertices will be selected uniformly. In multi-GPU environment the biases should be partitioned + * based on the vertex partitions. + * @param num_samples Number of negative samples to generate + * @param remove_duplicates If true, remove duplicate samples + * @param remove_existing_edges If true, remove samples that are actually edges in the graph + * @param exact_number_of_samples If true, repeat generation until we get the exact number of + * negative samples + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * + * @return tuple containing source vertex ids and destination vertex ids for the negative samples + */ +template +std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_biases, + std::optional> dst_biases, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + } // namespace cugraph
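To make the parameter list above concrete, here is a hedged single-GPU sketch; it is not part of the diff, and it assumes the template parameters elided by the flattened signature are `(vertex_t, edge_t, weight_t, store_transposed, multi_gpu)` and that a renumbered `graph_view` and a `raft::handle_t` already exist:

```cpp
// Hypothetical usage sketch (not from this PR): uniform negative sampling,
// i.e. no bias spans, requesting exactly 10,000 non-edges.
raft::random::RngState rng_state{42};

auto [neg_srcs, neg_dsts] =
  cugraph::negative_sampling<int32_t, int32_t, float, false, false>(
    handle,
    rng_state,
    graph_view,
    std::nullopt,    // src_biases: uniform over the source vertex space
    std::nullopt,    // dst_biases: uniform over the destination vertex space
    size_t{10'000},  // num_samples
    true,            // remove_duplicates
    true,            // remove_existing_edges
    true,            // exact_number_of_samples: resample until the count is met
    false);          // do_expensive_check
// Given the flags above, neg_srcs/neg_dsts hold de-duplicated (src, dst)
// pairs that do not appear as edges in the input graph.
```

Note the trade-off the flags encode: with `exact_number_of_samples` set, the implementation may repeat the sample-then-filter loop until `num_samples` survivors remain; without it, a single pass can return fewer than `num_samples` edges after filtering.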
diff --git a/cpp/include/cugraph_c/coo.h b/cpp/include/cugraph_c/coo.h new file mode 100644 index 00000000000..ef746c6ed6a --- /dev/null +++ b/cpp/include/cugraph_c/coo.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#ifdef __cplusplus extern "C" { +#endif + +/** + * @brief Opaque COO definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_t; + +/** + * @brief Opaque COO list definition + */ +typedef struct { + int32_t align_; +} cugraph_coo_list_t; + +/** + * @brief Get the source vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of source vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); + +/** + * @brief Get the destination vertex ids + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of destination vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); + +/** + * @brief Get the edge weights + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge weights, NULL if no edge weights in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); + +/** + * @brief Get the edge id + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge id, NULL if no edge ids in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); + +/** + * @brief Get the edge type + * + * @param [in] coo Opaque pointer to COO + * @return type erased array view of edge type, NULL if no edge types in COO + */ +cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); + +/** + * @brief Get the number of COO objects in the list + * + * @param [in] coo_list Opaque pointer to COO list + * @return number of elements + */ +size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); + +/** + * @brief Get a COO
from the list + * + * @param [in] coo_list Opaque pointer to COO list + * @param [in] index Index of desired COO from list + * @return a cugraph_coo_t* object from the list + */ +cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); + +/** + * @brief Free coo object + * + * @param [in] coo Opaque pointer to COO + */ +void cugraph_coo_free(cugraph_coo_t* coo); + +/** + * @brief Free coo list + * + * @param [in] coo_list Opaque pointer to list of COO objects + */ +void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); + +#ifdef __cplusplus +} +#endif diff --git a/cpp/include/cugraph_c/graph_generators.h b/cpp/include/cugraph_c/graph_generators.h index 272131d2aab..553be530e95 100644 --- a/cpp/include/cugraph_c/graph_generators.h +++ b/cpp/include/cugraph_c/graph_generators.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -27,91 +28,6 @@ extern "C" { typedef enum { POWER_LAW = 0, UNIFORM } cugraph_generator_distribution_t; -/** - * @brief Opaque COO definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_t; - -/** - * @brief Opaque COO list definition - */ -typedef struct { - int32_t align_; -} cugraph_coo_list_t; - -/** - * @brief Get the source vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of source vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo); - -/** - * @brief Get the destination vertex ids - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of destination vertex ids - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo); - -/** - * @brief Get the edge weights - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge weights, NULL if no edge weights in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo); - -/** - * @brief Get the edge id - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge id, NULL if no edge ids in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo); - -/** - * @brief Get the edge type - * - * @param [in] coo Opaque pointer to COO - * @return type erased array view of edge type, NULL if no edge types in COO - */ -cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo); - -/** - * @brief Get the number of coo object in the list - * - * @param [in] coo_list Opaque pointer to COO list - * @return number of elements - */ -size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list); - -/** - * @brief Get a COO from the list - * - * @param [in] coo_list Opaque pointer to COO list - * @param [in] index Index of desired COO from list - * @return a cugraph_coo_t* object from the list - */ -cugraph_coo_t* cugraph_coo_list_element(cugraph_coo_list_t* coo_list, size_t index); - -/** - * @brief Free coo object - * - * @param [in] coo Opaque pointer to COO - */ -void cugraph_coo_free(cugraph_coo_t* coo); - -/** - * @brief Free coo list - * - * @param [in] coo_list Opaque pointer to list of COO objects - */ -void cugraph_coo_list_free(cugraph_coo_list_t* coo_list); - /** * @brief Generate RMAT edge list * diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 1a3d20b9339..bb26e577915 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ 
-16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -674,6 +675,57 @@ cugraph_error_code_t cugraph_select_random_vertices(const cugraph_resource_handl cugraph_type_erased_device_array_t** vertices, cugraph_error_t** error); +/** + * @ingroup samplingC + * @brief Perform negative sampling + * + * Negative sampling generates a COO structure defining edges according to the specified parameters. + * + * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each + * call + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex ids for the source biases. If @p src_biases and + * @p dst_biases are not specified this is ignored. If + * @p vertices is specified then vertices[i] is the vertex + * id of src_biases[i] and dst_biases[i]. If @p vertices + * is not specified then i is the vertex id of src_biases[i] + * and dst_biases[i] + * @param [in] src_biases Bias for selecting source vertices. If NULL, do uniform + * sampling, if provided probability of vertex i will be + * src_biases[i] / (sum of all source biases) + * @param [in] dst_biases Bias for selecting destination vertices. If NULL, do + * uniform sampling, if provided probability of vertex i + * will be dst_biases[i] / (sum of all destination biases) + * @param [in] num_samples Number of negative samples to generate + * @param [in] remove_duplicates If true, remove duplicates from sampled edges + * @param [in] remove_existing_edges If true, remove sampled edges that actually exist in + * the graph + * @param [in] exact_number_of_samples If true, result should contain exactly @p num_samples. If + * false the code will generate @p num_samples and then do + * any filtering as specified + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if + * set to true) + * @param [out] result Opaque pointer to the generated COO + * @param [out] error Pointer to an error object storing details of any error. + * Will be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif
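A matching sketch for this C entry point (written as C++ so it stays in one language with the other examples; error handling trimmed to the success path). `handle` and `graph` are assumed to have been created elsewhere; `cugraph_rng_state_create` and the type-erased array helpers are pre-existing cugraph_c APIs:

```cpp
// Hypothetical usage sketch (not from this PR): 1000 uniform negative samples.
cugraph_rng_state_t* rng_state{nullptr};
cugraph_coo_t* result{nullptr};
cugraph_error_t* error{nullptr};

cugraph_error_code_t status = cugraph_rng_state_create(handle, 42, &rng_state, &error);

// NULL vertices/src_biases/dst_biases selects uniform sampling.
status = cugraph_negative_sampling(handle,
                                   rng_state,
                                   graph,
                                   nullptr,  // vertices
                                   nullptr,  // src_biases
                                   nullptr,  // dst_biases
                                   1000,     // num_samples
                                   TRUE,     // remove_duplicates
                                   TRUE,     // remove_existing_edges
                                   FALSE,    // exact_number_of_samples
                                   FALSE,    // do_expensive_check
                                   &result,
                                   &error);

if (status == CUGRAPH_SUCCESS) {
  cugraph_type_erased_device_array_view_t* srcs = cugraph_coo_get_sources(result);
  cugraph_type_erased_device_array_view_t* dsts = cugraph_coo_get_destinations(result);
  size_t num_edges = cugraph_type_erased_device_array_view_size(srcs);
  // ... copy to host with cugraph_type_erased_device_array_view_copy_to_host(...)
  cugraph_coo_free(result);
}
```

The returned views follow the COO accessor contract above: the weight/id/type views are NULL here because negative sampling only populates the source and destination columns (see the `cugraph_coo_t` construction in negative_sampling.cpp below).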
diff --git a/cpp/src/c_api/coo.hpp b/cpp/src/c_api/coo.hpp new file mode 100644 index 00000000000..a83a3af375a --- /dev/null +++ b/cpp/src/c_api/coo.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/array.hpp" + +#include + +namespace cugraph { +namespace c_api { + +struct cugraph_coo_t { + std::unique_ptr src_{}; + std::unique_ptr dst_{}; + std::unique_ptr wgt_{}; + std::unique_ptr id_{}; + std::unique_ptr type_{}; +}; + +struct cugraph_coo_list_t { + std::vector> list_; +}; + +} // namespace c_api +} // namespace cugraph diff --git a/cpp/src/c_api/graph_generators.cpp b/cpp/src/c_api/graph_generators.cpp index ef478e57098..a58a4d5db35 100644 --- a/cpp/src/c_api/graph_generators.cpp +++ b/cpp/src/c_api/graph_generators.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "c_api/array.hpp" +#include "c_api/coo.hpp" #include "c_api/error.hpp" #include "c_api/random.hpp" #include "c_api/resource_handle.hpp" @@ -26,24 +27,6 @@ #include -namespace cugraph { -namespace c_api { - -struct cugraph_coo_t { - std::unique_ptr src_{}; - std::unique_ptr dst_{}; - std::unique_ptr wgt_{}; - std::unique_ptr id_{}; - std::unique_ptr type_{}; -}; - -struct cugraph_coo_list_t { - std::vector> list_; -}; - -} // namespace c_api -} // namespace cugraph - namespace { template @@ -141,32 +124,41 @@ cugraph_error_code_t cugraph_generate_rmat_edgelists( extern "C" cugraph_type_erased_device_array_view_t* cugraph_coo_get_sources(cugraph_coo_t* coo) { auto internal_pointer = reinterpret_cast(coo); - return reinterpret_cast(internal_pointer->src_->view()); + return (internal_pointer->src_) ? reinterpret_cast( internal_pointer->src_->view()) : nullptr; } extern "C" cugraph_type_erased_device_array_view_t* cugraph_coo_get_destinations(cugraph_coo_t* coo) { auto internal_pointer = reinterpret_cast(coo); - return reinterpret_cast(internal_pointer->dst_->view()); + return (internal_pointer->dst_) ? reinterpret_cast( internal_pointer->dst_->view()) : nullptr; } extern "C" cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_weights(cugraph_coo_t* coo) { auto internal_pointer = reinterpret_cast(coo); - return reinterpret_cast(internal_pointer->wgt_->view()); + return (internal_pointer->wgt_) ? reinterpret_cast( internal_pointer->wgt_->view()) : nullptr; } extern "C" cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_id(cugraph_coo_t* coo) { auto internal_pointer = reinterpret_cast(coo); - return reinterpret_cast(internal_pointer->id_->view()); + return (internal_pointer->id_) ? reinterpret_cast( internal_pointer->id_->view()) : nullptr; } extern "C" cugraph_type_erased_device_array_view_t* cugraph_coo_get_edge_type(cugraph_coo_t* coo) { auto internal_pointer = reinterpret_cast(coo); - return reinterpret_cast( internal_pointer->type_->view()); + return (internal_pointer->type_) ? reinterpret_cast( internal_pointer->type_->view()) : nullptr; } extern "C" size_t cugraph_coo_list_size(const cugraph_coo_list_t* coo_list) diff --git a/cpp/src/c_api/negative_sampling.cpp b/cpp/src/c_api/negative_sampling.cpp new file mode 100644 index 00000000000..54f465d67b4 --- /dev/null +++ b/cpp/src/c_api/negative_sampling.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/abstract_functor.hpp" +#include "c_api/coo.hpp" +#include "c_api/graph.hpp" +#include "c_api/random.hpp" +#include "c_api/resource_handle.hpp" +#include "c_api/utils.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct negative_sampling_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; + cugraph::c_api::cugraph_graph_t* graph_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* vertices_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* src_biases_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* dst_biases_{nullptr}; + size_t num_samples_; + bool remove_duplicates_{false}; + bool remove_existing_edges_{false}; + bool exact_number_of_samples_{false}; + bool do_expensive_check_{false}; + cugraph::c_api::cugraph_coo_t* result_{nullptr}; + + negative_sampling_functor(const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + rng_state_(reinterpret_cast(rng_state)), + graph_(reinterpret_cast(graph)), + vertices_( + reinterpret_cast(vertices)), + src_biases_(reinterpret_cast( + src_biases)), + dst_biases_(reinterpret_cast( + dst_biases)), + num_samples_(num_samples), + remove_duplicates_(remove_duplicates), + remove_existing_edges_(remove_existing_edges), + exact_number_of_samples_(exact_number_of_samples), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + // FIXME: Think about how to handle SG vice MG + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + // negative_sampling expects store_transposed == false + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>(graph_->graph_); + + auto graph_view = graph->view(); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + rmm::device_uvector vertices(0, handle_.get_stream()); + rmm::device_uvector src_biases(0, handle_.get_stream()); + rmm::device_uvector dst_biases(0, handle_.get_stream()); + + if (src_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + src_biases.resize(src_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(src_biases.data(), + src_biases_->as_type(), + src_biases.size(), + handle_.get_stream()); + + src_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + handle_, + std::move(vertices), + std::move(src_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + if (dst_biases_ != nullptr) { + vertices.resize(vertices_->size_, handle_.get_stream()); + 
dst_biases.resize(dst_biases_->size_, handle_.get_stream()); + + raft::copy( + vertices.data(), vertices_->as_type(), vertices.size(), handle_.get_stream()); + raft::copy(dst_biases.data(), + dst_biases_->as_type(), + dst_biases.size(), + handle_.get_stream()); + + dst_biases = cugraph::detail:: + collect_local_vertex_values_from_ext_vertex_value_pairs( + handle_, + std::move(vertices), + std::move(dst_biases), + *number_map, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last(), + weight_t{0}, + do_expensive_check_); + } + + auto&& [src, dst] = cugraph::negative_sampling( + handle_, + rng_state_->rng_state_, + graph_view, + (src_biases_ != nullptr) ? std::make_optional(raft::device_span{ + src_biases.data(), src_biases.size()}) + : std::nullopt, + (dst_biases_ != nullptr) ? std::make_optional(raft::device_span{ + dst_biases.data(), dst_biases.size()}) + : std::nullopt, + num_samples_, + remove_duplicates_, + remove_existing_edges_, + exact_number_of_samples_, + do_expensive_check_); + + std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); + + cugraph::unrenumber_int_vertices(handle_, + src.data(), + src.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + cugraph::unrenumber_int_vertices(handle_, + dst.data(), + dst.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_coo_t{ + std::make_unique(src, + graph_->vertex_type_), + std::make_unique(dst, + graph_->vertex_type_), + nullptr, + nullptr, + nullptr}; + } + } +}; + +} // namespace + +cugraph_error_code_t cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_biases, + const cugraph_type_erased_device_array_view_t* dst_biases, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_existing_edges, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t** result, + cugraph_error_t** error) +{ + negative_sampling_functor functor{handle, + rng_state, + graph, + vertices, + src_biases, + dst_biases, + num_samples, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + do_expensive_check}; + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index b9a2c8e4f60..705d2108437 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -16,6 +16,7 @@ #include "c_api/abstract_functor.hpp" #include "c_api/graph.hpp" +#include "c_api/random.hpp" #include "c_api/resource_handle.hpp" #include "c_api/utils.hpp" @@ -153,10 +154,11 @@ namespace { struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; + // FIXME: rng_state_ should be passed as a parameter + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; size_t max_length_{0}; - size_t seed_{0}; cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; uniform_random_walks_functor(cugraph_resource_handle_t const* handle, @@ -222,13 +224,17 @@ struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), 
false); + // FIXME: remove once rng_state passed as parameter + rng_state_ = reinterpret_cast( + new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); + auto [paths, weights] = cugraph::uniform_random_walks( handle_, + rng_state_->rng_state_, graph_view, (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt, raft::device_span{start_vertices.data(), start_vertices.size()}, - max_length_, - seed_); + max_length_); // // Need to unrenumber the vertices in the resulting paths @@ -255,11 +261,12 @@ struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; + // FIXME: rng_state_ should be passed as a parameter + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; size_t max_length_{0}; cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; - uint64_t seed_{0}; biased_random_walks_functor(cugraph_resource_handle_t const* handle, cugraph_graph_t* graph, @@ -326,13 +333,17 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), false); + // FIXME: remove once rng_state passed as parameter + rng_state_ = reinterpret_cast( + new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); + auto [paths, weights] = cugraph::biased_random_walks( handle_, + rng_state_->rng_state_, graph_view, edge_weights->view(), raft::device_span{start_vertices.data(), start_vertices.size()}, - max_length_, - seed_); + max_length_); // // Need to unrenumber the vertices in the resulting paths @@ -354,12 +365,13 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; + // FIXME: rng_state_ should be passed as a parameter + cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; size_t max_length_{0}; double p_{0}; double q_{0}; - uint64_t seed_{0}; cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; node2vec_random_walks_functor(cugraph_resource_handle_t const* handle, @@ -431,15 +443,19 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), false); + // FIXME: remove once rng_state passed as parameter + rng_state_ = reinterpret_cast( + new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); + auto [paths, weights] = cugraph::node2vec_random_walks( handle_, + rng_state_->rng_state_, graph_view, (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt, raft::device_span{start_vertices.data(), start_vertices.size()}, max_length_, static_cast(p_), - static_cast(q_), - seed_); + static_cast(q_)); // FIXME: Need to fix invalid_vtx issue here. We can't unrenumber max_vertex_id+1 // properly... 
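A note on the random-walks changes above: each functor now threads a raft::random::RngState through the core algorithm in place of the removed seed_ member, so repeated calls draw from one generator stream instead of re-seeding per call (the locally constructed state is a stopgap until rng_state is accepted as a parameter, per the FIXMEs). A minimal sketch of the resulting C++ call shape (illustrative only; it assumes an existing raft::handle_t named handle, a single-GPU graph_view built with store_transposed == false, a device buffer starts of int32_t start vertices, and the usual <vertex_t, edge_t, weight_t, multi_gpu> template parameter list):

#include <cugraph/algorithms.hpp>
#include <raft/core/handle.hpp>
#include <raft/random/rng_state.hpp>

raft::random::RngState rng_state{0};  // seeded once, reused across calls

auto [paths, weights] = cugraph::uniform_random_walks<int32_t, int32_t, float, false>(
  handle,
  rng_state,     // new parameter, replaces the removed seed argument
  graph_view,
  std::nullopt,  // unweighted: no edge-weight view
  raft::device_span<int32_t const>{starts.data(), starts.size()},
  size_t{4} /* max_length */);

The returned paths are then unrenumbered exactly as in the functor bodies above.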
diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh index 225687c4cf0..e3501065008 100644 --- a/cpp/src/community/edge_triangle_count_impl.cuh +++ b/cpp/src/community/edge_triangle_count_impl.cuh @@ -18,8 +18,8 @@ #include "detail/graph_partition_utils.cuh" #include "prims/edge_bucket.cuh" +#include "prims/per_v_pair_dst_nbr_intersection.cuh" #include "prims/transform_e.cuh" -#include "prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh" #include #include @@ -124,7 +124,8 @@ struct extract_q_r { template edge_property_t, edge_t> edge_triangle_count_impl( raft::handle_t const& handle, - graph_view_t const& graph_view) + graph_view_t const& graph_view, + bool do_expensive_check) { using weight_t = float; rmm::device_uvector edgelist_srcs(0, handle.get_stream()); @@ -158,14 +159,11 @@ edge_property_t, edge_t> edge_t num_remaining_edges -= chunk_size; // Perform 'nbr_intersection' in chunks to reduce peak memory. auto [intersection_offsets, intersection_indices] = - detail::nbr_intersection(handle, - graph_view, - cugraph::edge_dummy_property_t{}.view(), - edge_first + prev_chunk_size, - edge_first + prev_chunk_size + chunk_size, - std::array{true, true}, - false /*FIXME: pass 'do_expensive_check' as argument*/); - + per_v_pair_dst_nbr_intersection(handle, + graph_view, + edge_first + prev_chunk_size, + edge_first + prev_chunk_size + chunk_size, + do_expensive_check); // Update the number of triangles of each (p, q) edges by looking at their intersection // size thrust::for_each( @@ -365,9 +363,11 @@ edge_property_t, edge_t> edge_t template edge_property_t, edge_t> edge_triangle_count( - raft::handle_t const& handle, graph_view_t const& graph_view) + raft::handle_t const& handle, + graph_view_t const& graph_view, + bool do_expensive_check) { - return detail::edge_triangle_count_impl(handle, graph_view); + return detail::edge_triangle_count_impl(handle, graph_view, do_expensive_check); } } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_mg_v32_e32.cu b/cpp/src/community/edge_triangle_count_mg_v32_e32.cu index 1212a13323b..5e333139ddf 100644 --- a/cpp/src/community/edge_triangle_count_mg_v32_e32.cu +++ b/cpp/src/community/edge_triangle_count_mg_v32_e32.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int32_t> edge_triangle_count( raft::handle_t const& handle, - cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_mg_v32_e64.cu b/cpp/src/community/edge_triangle_count_mg_v32_e64.cu index 64ee195c7ee..adab2d1fede 100644 --- a/cpp/src/community/edge_triangle_count_mg_v32_e64.cu +++ b/cpp/src/community/edge_triangle_count_mg_v32_e64.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int64_t> edge_triangle_count( raft::handle_t const& handle, - cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_mg_v64_e64.cu b/cpp/src/community/edge_triangle_count_mg_v64_e64.cu index 67c19e5ac52..1f321b2149f 100644 --- a/cpp/src/community/edge_triangle_count_mg_v64_e64.cu +++ b/cpp/src/community/edge_triangle_count_mg_v64_e64.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int64_t> edge_triangle_count( raft::handle_t const& handle, - 
cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_sg_v32_e32.cu b/cpp/src/community/edge_triangle_count_sg_v32_e32.cu index d6a215aa456..3e16a2cf7ef 100644 --- a/cpp/src/community/edge_triangle_count_sg_v32_e32.cu +++ b/cpp/src/community/edge_triangle_count_sg_v32_e32.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int32_t> edge_triangle_count( raft::handle_t const& handle, - cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_sg_v32_e64.cu b/cpp/src/community/edge_triangle_count_sg_v32_e64.cu index e70fa45c257..24a8de868e0 100644 --- a/cpp/src/community/edge_triangle_count_sg_v32_e64.cu +++ b/cpp/src/community/edge_triangle_count_sg_v32_e64.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int64_t> edge_triangle_count( raft::handle_t const& handle, - cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/edge_triangle_count_sg_v64_e64.cu b/cpp/src/community/edge_triangle_count_sg_v64_e64.cu index 849603f781b..81f814df713 100644 --- a/cpp/src/community/edge_triangle_count_sg_v64_e64.cu +++ b/cpp/src/community/edge_triangle_count_sg_v64_e64.cu @@ -20,6 +20,7 @@ namespace cugraph { // SG instantiation template edge_property_t, int64_t> edge_triangle_count( raft::handle_t const& handle, - cugraph::graph_view_t const& graph_view); + cugraph::graph_view_t const& graph_view, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/community/egonet_impl.cuh b/cpp/src/community/egonet_impl.cuh index 8b942be5b6a..c7945831ba8 100644 --- a/cpp/src/community/egonet_impl.cuh +++ b/cpp/src/community/egonet_impl.cuh @@ -17,8 +17,6 @@ // #define TIMING -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/components/legacy/connectivity.cu b/cpp/src/components/legacy/connectivity.cu index ecaaab173db..4d0198fdff6 100644 --- a/cpp/src/components/legacy/connectivity.cu +++ b/cpp/src/components/legacy/connectivity.cu @@ -15,7 +15,6 @@ */ #include "scc_matrix.cuh" -#include "utilities/graph_utils.cuh" #include "weak_cc.cuh" #include diff --git a/cpp/src/detail/utility_wrappers_32.cu b/cpp/src/detail/utility_wrappers_32.cu index 6ab5ae375ca..de407f12493 100644 --- a/cpp/src/detail/utility_wrappers_32.cu +++ b/cpp/src/detail/utility_wrappers_32.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include @@ -68,6 +68,23 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view, size_t size, int32_t start_value); +template void sequence_fill(rmm::cuda_stream_view const& stream_view, + uint32_t* d_value, + size_t size, + uint32_t start_value); + +template void stride_fill(rmm::cuda_stream_view const& stream_view, + int32_t* d_value, + size_t size, + int32_t start_value, + int32_t stride); + +template void stride_fill(rmm::cuda_stream_view const& stream_view, + uint32_t* d_value, + size_t size, + uint32_t start_value, + uint32_t stride); + template int32_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, int32_t const* d_edgelist_srcs, int32_t const* d_edgelist_dsts, diff --git a/cpp/src/detail/utility_wrappers_64.cu b/cpp/src/detail/utility_wrappers_64.cu index a12bc3e952d..2c136d5902b 100644 --- a/cpp/src/detail/utility_wrappers_64.cu +++ b/cpp/src/detail/utility_wrappers_64.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "detail/utility_wrappers.cuh" +#include "detail/utility_wrappers_impl.cuh" #include #include @@ -71,6 +71,18 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view, size_t size, uint64_t start_value); +template void stride_fill(rmm::cuda_stream_view const& stream_view, + int64_t* d_value, + size_t size, + int64_t start_value, + int64_t stride); + +template void stride_fill(rmm::cuda_stream_view const& stream_view, + uint64_t* d_value, + size_t size, + uint64_t start_value, + uint64_t stride); + template int64_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, int64_t const* d_edgelist_srcs, int64_t const* d_edgelist_dsts, diff --git a/cpp/src/detail/utility_wrappers.cuh b/cpp/src/detail/utility_wrappers_impl.cuh similarity index 88% rename from cpp/src/detail/utility_wrappers.cuh rename to cpp/src/detail/utility_wrappers_impl.cuh index ce8549db9f8..074d7044261 100644 --- a/cpp/src/detail/utility_wrappers.cuh +++ b/cpp/src/detail/utility_wrappers_impl.cuh @@ -72,6 +72,22 @@ void sequence_fill(rmm::cuda_stream_view const& stream_view, thrust::sequence(rmm::exec_policy(stream_view), d_value, d_value + size, start_value); } +template <typename value_t> +void stride_fill(rmm::cuda_stream_view const& stream_view, + value_t* d_value, + size_t size, + value_t start_value, + value_t stride) +{ + thrust::transform(rmm::exec_policy(stream_view), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size), + d_value, + cuda::proclaim_return_type<value_t>([start_value, stride] __device__(size_t i) { + return static_cast<value_t>(start_value + stride * i); + })); +} + template <typename vertex_t> vertex_t compute_maximum_vertex_id(rmm::cuda_stream_view const& stream_view, vertex_t const* d_edgelist_srcs, diff --git a/cpp/src/from_cugraph_ops/algo_R.cuh b/cpp/src/from_cugraph_ops/algo_R.cuh new file mode 100644 index 00000000000..031a7d2ceb9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/algo_R.cuh @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws.
+ */ + +#pragma once + +#include "device.cuh" + +#include + +#include +#include +#include + +#include + +namespace cugraph::ops::graph { + +// single warp-separated field of type IdxT +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +__device__ __forceinline__ void warp_algo_r_index(IdxT* smem, + IdxT pop_size, + IdxT idx_offset, + int sample_size, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + // first 'sample_size' are just copied + CUGRAPH_OPS_UNROLL + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + smem[i] = idx_offset + i; + } + auto sample_size_idxt = IdxT{sample_size}; + if (sample_size_idxt >= pop_size) return; + + // we must synchronize here since we have just written to smem + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); + + auto idx_end = idx_offset + pop_size; + auto n = idx_offset + sample_size_idxt; + auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x}; + GenT gen(rng_state, flat_id); + CUGRAPH_OPS_UNROLL + for (auto nidx = n + IdxT{lane}; nidx < idx_end; nidx += IdxT{utils::WARP_SIZE}) { + // nidx - idx_offset inclusive (necessary for correctness of algo R) + auto end = nidx - idx_offset + 1; + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = IdxT{end}; + int_params.diff = static_cast(end); + IdxT idx; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + if (idx < sample_size_idxt) { + // using atomic max instead of exch here because it leads to the same + // output as the sequential algorithm (DGL does this, too) + // Additionally, we use the index instead of the neighbor ID here + // since this allows copying over other node/edge-related data + // (useful for heterogeneous graphs for example) + utils::atomic_max(smem + idx, nidx); + } + } + // must synchronize to make smem valid + utils::warp_sync(); + // TODO(mjoux): when we support more warps per node enable this + //__syncthreads(); +} + +template +__device__ __forceinline__ void warp_algo_r(IdxT* smem, + IdxT row_id, + const IdxT* nodes, + const IdxT* fg_offsets, + int sample_size, + IdxT& node_id, + IdxT& node_start, + IdxT& node_end, + raft::random::DeviceState& rng_state) +{ + auto lane = utils::lane_id(); + if (nodes == nullptr) { + node_id = row_id; + if (lane == 0) + node_start = fg_offsets[node_id]; + else if (lane == 1) + node_end = fg_offsets[node_id + 1]; + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 1); + } else { + if (lane == 0) { + node_id = nodes[row_id]; + node_start = fg_offsets[node_id]; + node_end = fg_offsets[node_id + 1]; + } + node_id = utils::shfl(node_id, 0); + node_start = utils::shfl(node_start, 0); + node_end = utils::shfl(node_end, 0); + } + auto pop_size = node_end - node_start; + warp_algo_r_index(smem, pop_size, node_start, sample_size, rng_state); +} + +// TODO(mjoux): support configuring n_warps_per_node in template +template +CUGRAPH_OPS_KERNEL void algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* neighbors, + IdxT* counts, + // edge_types / node_types should be non-const + // probably detected if `!IS_HG` + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* edge_types, + // NOLINTNEXTLINE(readability-non-const-parameter) + int32_t* node_types, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + int 
sample_size) +{ + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_dst_nodes) { return; } + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + IdxT node_id, node_start, node_end; + warp_algo_r( + s_idx, row_id, nodes, offsets, sample_size, node_id, node_start, node_end, rng_state); + + IdxT count = 0; + for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { + auto nidx = s_idx[i]; + // checking for node_end here because sample_size may be larger than + // the total number of neighbors of the node + auto val = nidx < node_end ? indices[nidx] : cugraph::invalid_idx::value; + // TODO(mjoux) it's possible that we break the ELLPACK format here since + // if we set val to invalid, we should add it to end of list, rather + // than simply at index "i". This is ignored for now since the case + // where SAMPLE_SELF := false is rare and unconventional + if (!SAMPLE_SELF && val == node_id) val = cugraph::invalid_idx::value; + auto local_id = row_id * IdxT{sample_size} + i; + neighbors[local_id] = val; + if (val != cugraph::invalid_idx::value) { + ++count; + if (IS_HG) edge_types[local_id] = g_edge_types[nidx]; + } + } + if (IS_HG && lane == 0) node_types[row_id] = g_node_types[node_id]; + if (counts != nullptr) { + count = utils::warp_reduce(count); + if (lane == 0) { counts[row_id] = count; } + } +} + +template +void algo_r_impl(IdxT* neighbors, + IdxT* counts, + int32_t* edge_types, + int32_t* node_types, + raft::random::RngState& rng, + const IdxT* offsets, + const IdxT* indices, + const int32_t* g_edge_types, + const int32_t* g_node_types, + const IdxT* nodes, + IdxT n_dst_nodes, + IdxT g_n_dst_nodes, + IdxT sample_size, + IdxT max_val, + cudaStream_t stream) +{ + if (nodes == nullptr) { n_dst_nodes = g_n_dst_nodes; } + ASSERT(n_dst_nodes <= g_n_dst_nodes, + "Algo R: expected n_dst_nodes <= graph.n_dst_nodes (%ld > %ld)", + long(n_dst_nodes), + long(g_n_dst_nodes)); + ASSERT( + static_cast(sample_size) + 2 < static_cast(std::numeric_limits::max()), + "Expected sample size [+2] to be lower than INT_MAX"); + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_dst_nodes, N_WARPS); + int sample_size_i = static_cast(sample_size); + int32_t smem_sizes[] = {sample_size_i}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + if (static_cast(max_val) < std::numeric_limits::max()) { + // we'll use the 32-bit based method for generating random integers + // as we most likely do not need less bias + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } else { + RAFT_CALL_RNG_FUNC( + rng, + (algo_r_kernel<<>>), + neighbors, + counts, + edge_types, + node_types, + offsets, + indices, + g_edge_types, + g_node_types, + nodes, + n_dst_nodes, + sample_size_i); + } + // update the rng state (this is a pessimistic update as it is difficult to + // compute the number of RNG calls done per thread!) 
+ auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(max_val, g_n_dst_nodes) - sample_size), utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace cugraph::ops::graph diff --git a/cpp/src/from_cugraph_ops/device.cuh b/cpp/src/from_cugraph_ops/device.cuh new file mode 100644 index 00000000000..f7d37c62f35 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device.cuh @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_atomics.cuh" +#include "device_core.hpp" +#include "device_dim.cuh" +#include "device_smem_helper.cuh" +#include "device_warp_collectives.cuh" +#include "macros.hpp" diff --git a/cpp/src/from_cugraph_ops/device_atomics.cuh b/cpp/src/from_cugraph_ops/device_atomics.cuh new file mode 100644 index 00000000000..b8be7614284 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_atomics.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include +#include + +#include + +namespace cugraph::ops::utils { + +/** + * @defgroup AtomicMax Device atomic max operation + * + * @{ + */ +template +__device__ inline DataT atomic_max(DataT* address, DataT val) +{ + return atomicMax(address, val); +} +template <> +__device__ inline float atomic_max(float* address, float val) +{ + using u32_t = unsigned int; + auto* address_as_u32 = reinterpret_cast(address); + u32_t old = *address_as_u32, assumed; + do { + assumed = old; + old = atomicCAS(address_as_u32, assumed, __float_as_uint(max(val, __uint_as_float(assumed)))); + } while (assumed != old); + return __uint_as_float(old); +} +template <> +__device__ inline double atomic_max(double* address, double val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* address_as_ull = reinterpret_cast(address); + u64_t old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(max(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); +} +template <> +__device__ inline int64_t atomic_max(int64_t* address, int64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +template <> +__device__ inline uint64_t atomic_max(uint64_t* address, uint64_t val) +{ + using u64_t = unsigned long long; // NOLINT(google-runtime-int) + auto* val_as_u64 = reinterpret_cast(&val); + auto* address_as_u64 = reinterpret_cast(address); + auto ret = atomicMax(address_as_u64, *val_as_u64); + return *reinterpret_cast(&ret); +} +/** @} */ + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_core.hpp b/cpp/src/from_cugraph_ops/device_core.hpp new file mode 100644 index 00000000000..b548d2d4d1f --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_core.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "macros.hpp" + +namespace cugraph::ops::utils { + +/** number of threads per warp */ +static constexpr int WARP_SIZE = 32; + +/** minimum CUDA version required for warp shfl sync functions */ +static constexpr int CUDA_VER_WARP_SHFL = 9000; + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * + * @tparam IntT supposed to be only integers for now! + * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT ceil_div(IntT a, IntT b) +{ + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * + * @tparam IntT supposed to be only integers for now! + * + * @param[in] a dividend + * @param[in] b divisor + */ +template +constexpr CUGRAPH_OPS_HD IntT align_to(IntT a, IntT b) +{ + return ceil_div(a, b) * b; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_dim.cuh b/cpp/src/from_cugraph_ops/device_dim.cuh new file mode 100644 index 00000000000..275d0edd485 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_dim.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" + +namespace cugraph::ops::utils { + +/** get the lane id of the current thread */ +__device__ __forceinline__ int lane_id() +{ + int id; + asm("mov.s32 %0, %%laneid;" : "=r"(id)); + return id; +} + +/** + * get the flat id of the current thread (within block) + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y; + if (!USE_X && USE_Y && USE_Z) return threadIdx.y + threadIdx.z * blockDim.y; + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x; + if (USE_X && !USE_Y && USE_Z) return threadIdx.x + threadIdx.z * blockDim.x; + if (USE_X && USE_Y && !USE_Z) return threadIdx.x + threadIdx.y * blockDim.x; + // USE_X && USE_Y && USE_Z + return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; +} + +/** + * get the number of warps of the current block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int num_warps() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.z, WARP_SIZE); + if (!USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.y, WARP_SIZE); + if (!USE_X && USE_Y && USE_Z) return ceil_div(blockDim.y * blockDim.z, WARP_SIZE); + if (USE_X && !USE_Y && !USE_Z) return ceil_div(blockDim.x, WARP_SIZE); + if (USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.x * blockDim.z, WARP_SIZE); + if (USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.x * blockDim.y, WARP_SIZE); + // USE_X && USE_Y && USE_Z + return ceil_div(blockDim.x * blockDim.y * blockDim.z, WARP_SIZE); +} + +/** + * get the warp id of the current thread + * template parameters allow to 
control which CTA dimensions are used + * @note: this only makes sense if the first used dimension of the CTA size + * is a multiple of WARP_SIZE. If this is not the case, use + * `flat_id<...>() / WARP_SIZE` to get the warp id of the current thread + */ +template +__device__ __forceinline__ int warp_id() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return threadIdx.z / WARP_SIZE; + if (!USE_X && USE_Y && !USE_Z) return threadIdx.y / WARP_SIZE; + if (!USE_X && USE_Y && USE_Z) + return threadIdx.y / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && !USE_Y && !USE_Z) return threadIdx.x / WARP_SIZE; + if (USE_X && !USE_Y && USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.z * num_warps(); + if (USE_X && USE_Y && !USE_Z) + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps(); + // USE_X && USE_Y && USE_Z + return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps() + + threadIdx.z * blockDim.y * num_warps(); +} + +/** + * get the block dimension of the current executing block + * template parameters allow to control which CTA dimensions are used + */ +template +__device__ __forceinline__ int block_dim() +{ + if (!USE_X && !USE_Y && !USE_Z) + return 1; // weird case, but if we get here, we should have 1 thread + if (!USE_X && !USE_Y && USE_Z) return blockDim.z; + if (!USE_X && USE_Y && !USE_Z) return blockDim.y; + if (!USE_X && USE_Y && USE_Z) return blockDim.y * blockDim.z; + if (USE_X && !USE_Y && !USE_Z) return blockDim.x; + if (USE_X && !USE_Y && USE_Z) return blockDim.x * blockDim.z; + if (USE_X && USE_Y && !USE_Z) return blockDim.x * blockDim.y; + // USE_X && USE_Y && USE_Z + return blockDim.x * blockDim.y * blockDim.z; +} + +/** + * get the flat id of the current thread (within device/grid) + * template parameters allow to control which grid and block/CTA dimensions are used + */ +template +__device__ __forceinline__ int flat_grid_id() +{ + auto b_id = flat_id(); + auto b_dim = block_dim(); + if (!G_USE_X && !G_USE_Y && !G_USE_Z) + return 0; // weird case, but if we get here, we should have 1 thread + if (!G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.z * b_dim + b_id; + if (!G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.y * b_dim + b_id; + if (!G_USE_X && G_USE_Y && G_USE_Z) return blockIdx.y * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && !G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + b_id; + if (G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.x * b_dim + blockIdx.z * blockDim.z + b_id; + if (G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + blockIdx.y * blockDim.y + b_id; + // G_USE_X && G_USE_Y && G_USE_Z + return blockIdx.x * b_dim + blockIdx.y * blockDim.y * blockDim.z + blockIdx.z * blockDim.z + b_id; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_smem_helper.cuh b/cpp/src/from_cugraph_ops/device_smem_helper.cuh new file mode 100644 index 00000000000..f1b5be071d9 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_smem_helper.cuh @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ */ + +#pragma once + +#include "device_core.hpp" + +#include +#include +#include + +namespace cugraph::ops::utils { + +// The following struct must be used to transmit the size and alignment of +// a field to the shared memory helpers below. +// By default, the alignment is just like the alignment of the original data type. +template <typename DataT, int32_t ALIGN = 0> +struct field_type { + using data_t = DataT; + static constexpr int32_t BYTES = static_cast<int32_t>(sizeof(DataT)); + static constexpr int32_t ALIGNMENT = ALIGN > 0 ? ALIGN : alignof(DataT); +}; + +// Imagine we have 2 fields of data in shared memory, one for ints, one for doubles. +// The intended usage of the following class in simple cases is as follows: +// 1. specify the type somewhere for both host and kernel code: +// using special_smem_name_t = smem_helper< 0, 0, field_type<int>, field_type<double> >; +// /* can be simplified to the following: */ +// using special_smem_name_t = smem_simple_t< int, double >; +// 2. in host code, get the size of shared memory: +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// /* note: sizes are always in number of elements, not bytes */ +// /* sizes always have type `int32_t` */ +// auto size = special_smem_name_t::get_size(smem_sizes); +// 3. in device code, call the empty constructor: +// special_smem_name_t helper {}; +// int* s_ints; +// double* s_doubles; +// int32_t smem_sizes[] = {n_ints, n_doubles}; +// helper.set_ptrs(smem_sizes, s_ints, s_doubles); +// +// For more complicated use cases, it is often useful to create a struct overloading +// operator[] and passing that to the `get_size` or `set_ptrs` helpers. +// The struct can also be used to directly pass the size information from +// host code (launch) to the kernel, avoiding duplication of calculating sizes. +// Be aware that this overload must have a `__host__ __device__` signature. +// Here is an example struct for the above use case: +// struct sizes_t { +// int32_t n_ints, n_doubles; +// __host__ __device__ sizes_t() = delete; +// __host__ __device__ sizes_t(int32_t _n_ints, int32_t _n_doubles) : +// n_ints(_n_ints), n_doubles(_n_doubles) {} +// +// /* you may also just return int32_t here instead of const int32_t& */ +// __host__ __device__ const int32_t& operator[](int idx) const +// { +// return idx == 0 ? n_ints : n_doubles; +// } +// }; +// +// The ALIGN_INIT template parameter is important for correctness: +// By default (ALIGN_INIT=0), we assume that all alignments are powers of 2, +// and we set ALIGN_INIT to the max alignment of the fields. If you want more +// control, you can set it yourself, but we always assume that it is a multiple +// of all alignment values of the fields. +// +// The N_UNIT_FIELDS template parameter allows specifying sub-spaces +// for a given number of "units" (often warps) such that the first +// `N_UNIT_FIELDS` fields are reserved sub-spaces per unit. +// In this case, the `get_size` and `set_ptrs` methods are modified such that +// you have to specify the number of units, and for `set_ptrs` the unit ID +// as well. +// This is useful for reserving exclusive shared memory per warp, for example. +// Each unit (warp) will have its sub-space (containing the `N_UNIT_FIELDS` +// fields) aligned to the initial alignment as described above.
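Putting the usage steps from the comment above together, a toy kernel and launcher for the simple (non-unit) helper might look like the following sketch (toy_kernel, launch_toy, and the launch configuration are illustrative names, not part of this patch):

using toy_smem_t = cugraph::ops::utils::smem_simple_t<int, double>;

CUGRAPH_OPS_KERNEL void toy_kernel(int32_t n_ints, int32_t n_doubles)
{
  int* s_ints;
  double* s_doubles;
  int32_t smem_sizes[] = {n_ints, n_doubles};
  toy_smem_t helper{};  // attaches to this block's dynamic shared memory
  helper.set_ptrs(smem_sizes, s_ints, s_doubles);
  // ... use s_ints[0..n_ints) and s_doubles[0..n_doubles) ...
}

void launch_toy(int32_t n_ints, int32_t n_doubles, cudaStream_t stream)
{
  int32_t smem_sizes[] = {n_ints, n_doubles};
  auto smem_bytes = toy_smem_t::get_size(smem_sizes);  // element counts in, bytes out
  toy_kernel<<<1, 32, smem_bytes, stream>>>(n_ints, n_doubles);
}

The unit variant follows the same pattern: smem_unit_simple_t adds the unit count to get_size and a (unit_id, n_units) pair to set_ptrs, which is how algo_r_kernel above reserves one per-warp index buffer.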
+template +class smem_helper { + public: + static constexpr size_t N_ARGS = sizeof...(FieldsT); + + protected: + static_assert(N_ARGS > 0, "smem_helper: must have at least one field type"); + static_assert(N_UNIT_FIELDS >= 0, "smem_helper: #unit fields must be non-negative"); + static_assert(N_UNIT_FIELDS <= N_ARGS, + "smem_helper: #unit fields must be smaller than #field types"); + // following static assertion for FieldsT to not be scalar types is based on + // https://stackoverflow.com/a/28253503/4134127 + template + struct bool_pack; + template + using all_true_t = std::is_same, bool_pack>; + static_assert(all_true_t::value...>::value, + "smem_helper: the given field template types must be of type `field_type` and " + "cannot be scalars"); + + template + __host__ __device__ static constexpr typename std::enable_if<(IDX < N_ARGS), int32_t>::type + max_align() + { + using f_t = typename std::tuple_element>::type; + static_assert(f_t::ALIGNMENT > 0, "field alignments must be greater than 0"); + return max_align() > f_t::ALIGNMENT ? max_align() : f_t::ALIGNMENT; + } + template + __host__ __device__ static constexpr typename std::enable_if<(IDX >= N_ARGS), int32_t>::type + max_align() + { + return -1; + } + + // this is assumed to be a multiple of all alignments + static constexpr int32_t ALIGN_BASE = ALIGN_INIT > 0 ? ALIGN_INIT : max_align<0>(); + + // here we exploit that the base pointer must be aligned to 16 bytes. + // if 16 is a multiple of ALIGN_BASE, that means we don't have any overhead. + // if ALIGN_BASE is a multiple of 16, it means that we need at most + // ALIGN_BASE - 16 extra bytes, otherwise it's ALIGN_BASE - 1 + static constexpr int32_t SIZE_OVERHEAD = 16 % ALIGN_BASE == 0 ? 0 + : ALIGN_BASE % 16 == 0 ? ALIGN_BASE - 16 + : ALIGN_BASE - 1; + + public: + // cannot easily use "= default" here for host-only code + // NOLINTNEXTLINE(modernize-use-equals-default) + __host__ __device__ smem_helper() + { +#if defined(__CUDA_ARCH__) + // must be aligned to 16 bytes on all supported architectures + // (don't have a reference for this at the moment!) + extern __shared__ uint8_t smem[]; + // align manually to `ALIGN_BASE`: this avoids the `__align(X)__` attribute + // which can cause issues if this is used in the same compilation unit + // with different types / alignments. + // In any case, the compiler/hardware cannot do a better job at providing + // an aligned pointer than we can do manually. 
+ auto smem_aligned = align_to(reinterpret_cast(smem), uintptr_t(ALIGN_BASE)); + base_ptr_ = reinterpret_cast(smem_aligned); +#endif + } + + template + __host__ __device__ static inline typename std::enable_if<(N <= 0), int32_t>::type get_size( + const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to ALIGN_BASE + size_helper<1>(current_total, sizes); + return SIZE_OVERHEAD + current_total; + } + + template + __host__ __device__ static inline typename std::enable_if<(N > 0), int32_t>::type get_size( + const int32_t n_units, const SizeT& sizes) + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // since the unit size is aligned to ALIGN_BASE, every base pointer for + // each unit as well as the base pointer after all units is aligned to + // ALIGN_BASE: since that is a multiple of all alignments, we can safely + // continue adding the sizes afterwards + auto unit_size = align_to(current_total, ALIGN_BASE); + current_total = 0; // base pointer must be aligned to all alignments + size_helper(current_total, sizes); + return SIZE_OVERHEAD + unit_size * n_units + current_total; + } + + template + __device__ inline typename std::enable_if<(N <= 0)>::type set_ptrs( + const SizeT& sizes, typename FieldsT::data_t*&... ptrs) const + { + return ptrs_helper<1>(0, 0, 0, 0, sizes, ptrs...); + } + + template + __device__ inline typename std::enable_if<(N > 0)>::type set_ptrs( + const int32_t& unit_id, + const int32_t& n_units, + const SizeT& sizes, + typename FieldsT::data_t*&... ptrs) const + { + auto current_total = 0; // base pointer must be aligned to all alignments + unit_size_helper<1>(current_total, sizes); + // see explanation in `get_size` for what aligning to ALIGN_BASE means + auto unit_size = align_to(current_total, ALIGN_BASE); + return ptrs_helper<1>(0, unit_id, unit_size, n_units, sizes, ptrs...); + } + + protected: + template + __host__ __device__ static inline void single_size(int32_t& current_total, const SizeT& sizes) + { + using next_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT : N_ARGS - 1), + std::tuple>::type; + using this_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT - 1 : N_ARGS - 1), + std::tuple>::type; + static constexpr int32_t ALIGN = + NEXT == N_UNIT_FIELDS || NEXT >= N_ARGS ? 
1 : next_field_t::ALIGNMENT; + current_total = align_to(current_total + sizes[NEXT - 1] * this_field_t::BYTES, ALIGN); + } + + // parentheses in `enable_if` here are used to help the parser understand "<>" + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_ARGS)>::type size_helper( + int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_ARGS)>::type size_helper( + int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __host__ __device__ static inline typename std::enable_if<(NEXT <= N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& current_total, const SizeT& sizes) + { + single_size(current_total, sizes); + unit_size_helper(current_total, sizes); + } + template + __host__ __device__ static inline typename std::enable_if<(NEXT > N_UNIT_FIELDS)>::type + unit_size_helper(int32_t& /* current_total */, const SizeT& /* sizes */) + { + } + + template + __device__ inline void ptrs_helper(const int32_t& /* offset */, + const int32_t& /* unit_id */, + const int32_t& /* unit_size */, + const int32_t& /* n_units */, + const SizeT& /* sizes */) const + { + } + template + __device__ inline void ptrs_helper(const int32_t& offset, + const int32_t& unit_id, + const int32_t& unit_size, + const int32_t& n_units, + const SizeT& sizes, + PtrT*& ptr, + PtrsT*&... ptrs) const + { + // see `get_size`: base_ptr_ + u_off is always aligned to all alignments + // (whether for each individual unit or after all units) + auto u_off = NEXT <= N_UNIT_FIELDS ? unit_id * unit_size : n_units * unit_size; + ptr = reinterpret_cast(base_ptr_ + (u_off + offset)); + int32_t next_offset = offset; + if (NEXT == N_UNIT_FIELDS) + next_offset = 0; // pointer after all unit fields is aligned to all alignments + else + single_size(next_offset, sizes); + ptrs_helper(next_offset, unit_id, unit_size, n_units, sizes, ptrs...); + } + + uint8_t* base_ptr_{nullptr}; +}; + +template +using smem_simple_t = smem_helper<0, 0, field_type...>; + +template +using smem_unit_simple_t = smem_helper<0, N_UNIT_FIELDS, field_type...>; + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_warp_collectives.cuh b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh new file mode 100644 index 00000000000..198b3be2f12 --- /dev/null +++ b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "device_core.hpp" +#include "device_dim.cuh" +#include "macros.hpp" + +#include + +namespace cugraph::ops::utils { + +/** + * @brief get a bit mask for the `n_threads` lowest threads of a warp + * + * @param[in] n_threads number of threads in the mask + * + * @return the bit mask + */ +__host__ __device__ constexpr uint32_t low_thread_mask(int n_threads) +{ + return n_threads >= WARP_SIZE ? 0xffffffffU : (1U << n_threads) - 1U; +} + +/** + * apply a warp-wide sync (useful from Volta+ archs) + * + * @tparam NP number of participating threads + * + * @note This works on Pascal and earlier archs as well, but all threads with + * lane id <= NP must enter this function together and in convergence. 
+ */ +template <int NP = WARP_SIZE> +__device__ inline void warp_sync() +{ + __syncwarp(low_thread_mask(NP)); +} + +/** + * @brief Shuffle the data inside a warp + * + * @tparam DataT the data type (currently assumed to be 4B) + * + * @param[in] val value to be shuffled + * @param[in] src_lane lane from where to shuffle + * @param[in] width lane width + * @param[in] mask mask of participating threads (Volta+) + * + * @return the shuffled data + */ +template <typename DataT> +__device__ inline DataT shfl(DataT val, + int src_lane, + int width = WARP_SIZE, + uint32_t mask = 0xffffffffU) +{ + static_assert(CUDART_VERSION >= CUDA_VER_WARP_SHFL, + "Expected CUDA >= 9 for warp synchronous shuffle"); + return __shfl_sync(mask, val, src_lane, width); +} + +/** + * @brief Warp-level sum reduction + * + * @tparam DataT data type + * @tparam NP number of participating threads. + * must be a power of 2 and at most warp size + * + * @param[in] val input value + * + * @return only lane 0 will contain the valid reduced result + * + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. + * + * @note All threads with lane id <= NP must enter this function together + * + * TODO(mjoux) Expand this to support arbitrary reduction ops + */ +template <typename DataT, int NP = WARP_SIZE> +__device__ inline DataT warp_reduce(DataT val) +{ + static constexpr uint32_t MASK = low_thread_mask(NP); + CUGRAPH_OPS_UNROLL + for (int i = NP / 2; i > 0; i >>= 1) { + DataT tmp = shfl(val, lane_id() + i, NP, MASK); + val += tmp; + } + return val; +} + +} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/macros.hpp b/cpp/src/from_cugraph_ops/macros.hpp new file mode 100644 index 00000000000..0ff08af0b1a --- /dev/null +++ b/cpp/src/from_cugraph_ops/macros.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#define CUGRAPH_OPS_STRINGIFY_DETAIL(x) #x +#define CUGRAPH_OPS_STRINGIFY(x) CUGRAPH_OPS_STRINGIFY_DETAIL(x) + +#define CUGRAPH_OPS_UNROLL _Pragma("unroll") +#if defined(__clang__) && defined(__CUDA__) +// clang wants pragma unroll without parentheses +#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll n)) +#else +// nvcc / nvrtc want pragma unroll with parentheses +#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll(n))) +#endif + +#if defined(__clang__) +#define CUGRAPH_OPS_CONSTEXPR_D constexpr +#else +#define CUGRAPH_OPS_CONSTEXPR_D constexpr __device__ +#endif + +#if defined(__CUDACC__) || defined(__CUDA__) +#define CUGRAPH_OPS_HD __host__ __device__ +#else +#define CUGRAPH_OPS_HD +#endif + +// The CUGRAPH_OPS_KERNEL macro specifies that a kernel has hidden visibility +// +// cugraph-ops needs to ensure that its CUGRAPH_OPS_KERNEL function +// templates have hidden visibility (the default is weak visibility). +// +// When kernels have weak visibility, it means that if two dynamic libraries +// both contain identical instantiations of a kernel/template, then the linker +// will discard one of the two instantiations and use only one of them. +// +// Due to unique requirements of how CUDA works, this de-duplication +// can lead to the wrong kernels being called (SM version being wrong), +// silently no kernel being called at all, or CUDA runtime errors being +// thrown.
+// +// https://github.com/rapidsai/raft/issues/1722 +#ifndef CUGRAPH_OPS_KERNEL +#define CUGRAPH_OPS_KERNEL __global__ static +#endif diff --git a/cpp/src/from_cugraph_ops/sampling.hpp b/cpp/src/from_cugraph_ops/sampling.hpp new file mode 100644 index 00000000000..5663b8d9c03 --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +// FIXME: This is only here for the prims... +// Need to look how Seunghwa fixed this in his PR +#include + +#include + +#include + +#include + +namespace cugraph::legacy::ops::graph { + +/** + * @brief Generate indexes given population sizes and a sample size, + * with or without replacement + * + * @param[out] index The (dense) index matrix. [on device] + * [dim = `n_sizes x sample_size`] + * In case `replace` is `false`, this may contain + * `ops::graph::INVALID_ID` + * if no index could be generated. + * @param[inout] rng RAFT RngState state object + * @param[in] sizes Input array of population sizes [on device] + * [len = `n_sizes`] + * @param[in] n_sizes number of sizes to sample from. + * @param[in] sample_size max number of indexes to be sampled per element + * in `sizes`. Assumed to be <= 384 at the moment. + * @param[in] replace If `true`, sample with replacement, otherwise + * without replacement. + * @param[in] stream cuda stream + * + @{ + */ +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream); + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cu b/cpp/src/from_cugraph_ops/sampling_index.cu new file mode 100644 index 00000000000..fb1f4ac3f1e --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#include "sampling.hpp" +#include "sampling_index.cuh" + +namespace cugraph::legacy::ops::graph { + +void get_sampling_index(int32_t* index, + raft::random::RngState& rng, + const int32_t* sizes, + int32_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +void get_sampling_index(int64_t* index, + raft::random::RngState& rng, + const int64_t* sizes, + int64_t n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cuh b/cpp/src/from_cugraph_ops/sampling_index.cuh new file mode 100644 index 00000000000..9ac574315bb --- /dev/null +++ b/cpp/src/from_cugraph_ops/sampling_index.cuh @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + */ + +#pragma once + +#include "algo_R.cuh" +#include "sampling.hpp" + +#include +#include + +#include + +namespace cugraph::legacy::ops::graph { + +namespace utils = cugraph::ops::utils; + +template +using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; + +template +CUGRAPH_OPS_KERNEL void index_replace_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = cugraph::ops::utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto n_warps = utils::num_warps(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{n_warps}; + if (row_id >= n_sizes) return; + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. check valid size: possible early-out + if (size <= 0) { + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + index[row_id * IdxT{sample_size} + IdxT{i}] = cugraph::invalid_idx::value; + } + return; + } + + // 4. every thread generates its indexes + auto flat_id = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + GenT gen(rng_state, flat_id); + raft::random::UniformIntDistParams int_params{}; + int_params.start = IdxT{0}; + int_params.end = size; + int_params.diff = static_cast(size); + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + IdxT idx = IdxT{0}; + raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); + + // 5. output index + index[row_id * IdxT{sample_size} + IdxT{i}] = idx; + } +} + +template +void get_sampling_index_replace(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // keep thread per block fairly low since we can expect sample_size < warp_size + // thus we want to have as many blocks as possible to increase parallelism + static constexpr int TPB = 128; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + RAFT_CALL_RNG_FUNC( + rng, (index_replace_kernel<<>>), index, sizes, n_sizes, sample_size); + auto thread_rs = utils::ceil_div(IdxT{sample_size}, utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +CUGRAPH_OPS_KERNEL void index_algo_r_kernel(raft::random::DeviceState rng_state, + IdxT* index, + const IdxT* sizes, + IdxT n_sizes, + int sample_size) +{ + using rand_t = std::make_unsigned_t; + // a warp-wide implementation. + auto lane = utils::lane_id(); + auto warp = utils::warp_id(); // 1D block with X dim + auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; + if (row_id >= n_sizes) return; + IdxT* s_idx; + smem_algo_r_t smem{}; + int32_t smem_sizes[] = {sample_size}; + smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); + // 1. load population size (once per warp) + IdxT size = IdxT{0}; + if (lane == 0) size = sizes[row_id]; + + // 2. shuffle it to all threads in warp + size = utils::shfl(size, 0); + + // 3. 
Get algo R indexes per warp + cugraph::ops::graph::warp_algo_r_index( + s_idx, size, IdxT{0}, sample_size, rng_state); + + CUGRAPH_OPS_UNROLL + for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { + // 4. output index + // still need to check if the index is actually valid + auto idx = s_idx[i]; + index[row_id * IdxT{sample_size} + IdxT{i}] = + idx >= size ? cugraph::invalid_idx::value : idx; + } +} + +template +void get_sampling_index_reservoir(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + cudaStream_t stream) +{ + // same TPB as in algo R: increased SM occupancy is most important here + static constexpr int TPB = 512; + static constexpr int N_WARPS = TPB / utils::WARP_SIZE; + auto n_blks = utils::ceil_div(n_sizes, N_WARPS); + int32_t smem_sizes[] = {sample_size}; + size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); + RAFT_CALL_RNG_FUNC(rng, + (index_algo_r_kernel<<>>), + index, + sizes, + n_sizes, + sample_size); + auto thread_rs = utils::ceil_div( + std::max(IdxT{0}, std::min(std::numeric_limits::max(), n_sizes) - IdxT{sample_size}), + utils::WARP_SIZE); + rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void get_sampling_index_impl(IdxT* index, + raft::random::RngState& rng, + const IdxT* sizes, + IdxT n_sizes, + int32_t sample_size, + bool replace, + cudaStream_t stream) +{ + if (replace) { + get_sampling_index_replace(index, rng, sizes, n_sizes, sample_size, stream); + } else { + get_sampling_index_reservoir(index, rng, sizes, n_sizes, sample_size, stream); + } +} + +} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/generators/erdos_renyi_generator.cuh b/cpp/src/generators/erdos_renyi_generator.cuh index cd461ee1aa2..10573ddb0d0 100644 --- a/cpp/src/generators/erdos_renyi_generator.cuh +++ b/cpp/src/generators/erdos_renyi_generator.cuh @@ -40,6 +40,11 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, vertex_t base_vertex_id, uint64_t seed) { + // NOTE: + // https://networkx.org/documentation/stable/_modules/networkx/generators/random_graphs.html#fast_gnp_random_graph + // identifies a faster algorithm that I think would be very efficient on the GPU. I believe we + // could just compute lr/lp in that code for a batch of values, use prefix sums to generate edge + // ids and then convert the generated values to a batch of edges. CUGRAPH_EXPECTS(num_vertices < std::numeric_limits::max(), "Implementation cannot support specified value"); @@ -88,6 +93,11 @@ generate_erdos_renyi_graph_edgelist_gnm(raft::handle_t const& handle, uint64_t seed) { CUGRAPH_FAIL("Not implemented"); + + // To implement: + // Use sampling function to select `m` unique edge ids from the + // (num_vertices ^ 2) possible edges. Convert these to vertex + // ids. 
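A hedged sketch of that outline (illustrative only, not PR code: it assumes `edge_ids` already holds `m` unique edge ids in `[0, num_vertices^2)`, which is exactly the sampling step the comment above leaves open):

// Convert unique edge ids into (src, dst) vertex pairs by div/mod.
rmm::device_uvector<vertex_t> srcs(edge_ids.size(), handle.get_stream());
rmm::device_uvector<vertex_t> dsts(edge_ids.size(), handle.get_stream());
thrust::transform(
  handle.get_thrust_policy(),
  edge_ids.begin(),
  edge_ids.end(),
  thrust::make_zip_iterator(srcs.begin(), dsts.begin()),
  [v = static_cast<size_t>(num_vertices)] __device__(size_t id) {
    return thrust::make_tuple(static_cast<vertex_t>(id / v), static_cast<vertex_t>(id % v));
  });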
} } // namespace cugraph diff --git a/cpp/src/layout/legacy/barnes_hut.cuh b/cpp/src/layout/legacy/barnes_hut.cuh index fa6d3816417..fdd57c7772d 100644 --- a/cpp/src/layout/legacy/barnes_hut.cuh +++ b/cpp/src/layout/legacy/barnes_hut.cuh @@ -19,7 +19,6 @@ #include "bh_kernels.cuh" #include "converters/legacy/COOtoCSR.cuh" #include "fa2_kernels.cuh" -#include "utilities/graph_utils.cuh" #include "utils.hpp" #include diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index 33e7841a380..195889eebfb 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -17,7 +17,9 @@ #pragma once #define restrict __restrict__ -#include "utilities/graph_utils.cuh" +// From old graph_utils.cuh +#define CUDA_MAX_BLOCKS 65535 +#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block namespace cugraph { namespace detail { diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 43415ba6df4..a6a164d36c1 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/partition_v_frontier.cuh" #include "prims/detail/transform_v_frontier_e.cuh" #include "prims/property_op_utils.cuh" @@ -33,9 +34,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include @@ -394,11 +392,11 @@ compute_unique_keys(raft::handle_t const& handle, cuda::proclaim_return_type( [unique_key_first = get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + local_frontier_unique_key_displacements[i], - num_unique_keys = local_frontier_unique_key_sizes[i]] __device__(key_t key) { + unique_key_last = get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + + local_frontier_unique_key_displacements[i] + + local_frontier_unique_key_sizes[i]] __device__(key_t key) { return static_cast(thrust::distance( - unique_key_first, - thrust::lower_bound( - thrust::seq, unique_key_first, unique_key_first + num_unique_keys, key))); + unique_key_first, thrust::find(thrust::seq, unique_key_first, unique_key_last, key))); })); } @@ -639,7 +637,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( auto mid_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; if (mid_partition_size > 0) { // FIXME: tmp_degrees & tmp_nbr_indices can be avoided if we customize - // cugraph::ops::get_sampling_index + // cugraph::legacy::ops::get_sampling_index rmm::device_uvector tmp_degrees(mid_partition_size, handle.get_stream()); rmm::device_uvector tmp_nbr_indices(mid_partition_size * K, handle.get_stream()); thrust::gather(handle.get_thrust_policy(), @@ -647,13 +645,13 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( frontier_indices.begin() + frontier_partition_offsets[2], frontier_degrees.begin(), tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index(tmp_nbr_indices.data(), - rng_state, - tmp_degrees.data(), - mid_partition_size, - static_cast(K), - false, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(tmp_nbr_indices.data(), + rng_state, + tmp_degrees.data(), + mid_partition_size, + static_cast(K), + false, + handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -736,7 +734,7 @@ 
rmm::device_uvector compute_uniform_sampling_index_without_replacement( } if (retry_segment_indices) { - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( (*retry_nbr_indices).data(), rng_state, (*retry_degrees).begin(), @@ -752,7 +750,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( segment_frontier_degree_first, segment_frontier_degree_first + num_segments, tmp_degrees.begin()); - cugraph::ops::graph::get_sampling_index( + cugraph::legacy::ops::graph::get_sampling_index( tmp_nbr_indices.data(), rng_state, tmp_degrees.data(), @@ -1626,13 +1624,13 @@ uniform_sample_and_compute_local_nbr_indices( if (with_replacement) { if (frontier_degrees.size() > 0) { nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); - cugraph::ops::graph::get_sampling_index(nbr_indices.data(), - rng_state, - frontier_degrees.data(), - static_cast(frontier_degrees.size()), - static_cast(K), - with_replacement, - handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), + rng_state, + frontier_degrees.data(), + static_cast(frontier_degrees.size()), + static_cast(K), + with_replacement, + handle.get_stream()); frontier_degrees.resize(0, handle.get_stream()); frontier_degrees.shrink_to_fit(handle.get_stream()); } @@ -1761,8 +1759,7 @@ biased_sample_and_compute_local_nbr_indices( std::optional> key_indices{std::nullopt}; std::vector local_frontier_sample_offsets{}; if (with_replacement) { - // computet segmented inclusive sums (one segment per seed) - + // compute segmented inclusive sums (one segment per seed) auto unique_key_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), cuda::proclaim_return_type( @@ -2041,7 +2038,7 @@ biased_sample_and_compute_local_nbr_indices( zero_bias_frontier_indices.resize(zero_bias_count_inclusive_sums.back(), handle.get_stream()); zero_bias_frontier_indices.shrink_to_fit(handle.get_stream()); - zero_bias_local_nbr_indices.resize(frontier_indices.size(), handle.get_stream()); + zero_bias_local_nbr_indices.resize(zero_bias_frontier_indices.size(), handle.get_stream()); zero_bias_local_nbr_indices.shrink_to_fit(handle.get_stream()); std::vector zero_bias_counts(zero_bias_count_inclusive_sums.size()); std::adjacent_difference(zero_bias_count_inclusive_sums.begin(), diff --git a/cpp/src/prims/detail/transform_v_frontier_e.cuh b/cpp/src/prims/detail/transform_v_frontier_e.cuh index 7d8824849f0..5ebcddfe8da 100644 --- a/cpp/src/prims/detail/transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/transform_v_frontier_e.cuh @@ -209,9 +209,6 @@ __global__ static void transform_v_frontier_e_mid_degree( auto const lane_id = tid % raft::warp_size(); size_t idx = static_cast(tid / raft::warp_size()); - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage; - while (idx < static_cast(thrust::distance(edge_partition_frontier_key_index_first, edge_partition_frontier_key_index_last))) { auto key_idx = *(edge_partition_frontier_key_index_first + idx); @@ -224,16 +221,15 @@ __global__ static void transform_v_frontier_e_mid_degree( thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); auto this_key_value_first = value_first + edge_partition_frontier_local_degree_offsets[key_idx]; if (edge_partition_e_mask) { - // FIXME: it might be faster to update in warp-sync way - edge_t counter{0}; - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if 
((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; } - } - edge_t offset_within_warp{}; - WarpScan(temp_storage).ExclusiveSum(counter, offset_within_warp); - counter = 0; - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + edge_t base_offset{0}; + for (edge_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + auto valid = (i < local_degree) && (*edge_partition_e_mask).get(edge_offset + i); + auto ballot = __ballot_sync(raft::warp_full_mask(), valid ? uint32_t{1} : uint32_t{0}); + if (valid) { + auto intra_warp_offset = __popc(ballot & ~(raft::warp_full_mask() << lane_id)); transform_v_frontier_e_update_buffer_element( edge_partition, key, @@ -244,9 +240,9 @@ __global__ static void transform_v_frontier_e_mid_degree( edge_partition_dst_value_input, edge_partition_e_value_input, e_op, - this_key_value_first + offset_within_warp + counter); - ++counter; + this_key_value_first + base_offset + intra_warp_offset); } + base_offset += __popc(ballot); } } else { for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { @@ -300,6 +296,7 @@ __global__ static void transform_v_frontier_e_high_degree( using BlockScan = cub::BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; + __shared__ edge_t increment; while (idx < static_cast(thrust::distance(edge_partition_frontier_key_index_first, edge_partition_frontier_key_index_last))) { @@ -313,16 +310,16 @@ __global__ static void transform_v_frontier_e_high_degree( thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); auto this_key_value_first = value_first + edge_partition_frontier_local_degree_offsets[key_idx]; if (edge_partition_e_mask) { - // FIXME: it might be faster to update in block-sync way - edge_t counter{0}; - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; } - } - edge_t offset_within_block{}; - BlockScan(temp_storage).ExclusiveSum(counter, offset_within_block); - counter = 0; - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (transform_v_frontier_e_kernel_block_size - 1)) / + transform_v_frontier_e_kernel_block_size) * + transform_v_frontier_e_kernel_block_size; + edge_t base_offset{0}; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + auto valid = (i < local_degree) && (*edge_partition_e_mask).get(edge_offset + i); + edge_t intra_block_offset{}; + BlockScan(temp_storage).ExclusiveSum(valid ? edge_t{1} : edge_t{0}, intra_block_offset); + if (valid) { transform_v_frontier_e_update_buffer_element( edge_partition, key, @@ -333,9 +330,13 @@ __global__ static void transform_v_frontier_e_high_degree( edge_partition_dst_value_input, edge_partition_e_value_input, e_op, - this_key_value_first + offset_within_block + counter); - ++counter; + this_key_value_first + base_offset + intra_block_offset); + } + if (threadIdx.x == transform_v_frontier_e_kernel_block_size - 1) { + increment = intra_block_offset + (valid ? 
edge_t{1} : edge_t{0}); } + __syncthreads(); + base_offset += increment; } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { diff --git a/cpp/src/prims/per_v_pair_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_dst_nbr_intersection.cuh new file mode 100644 index 00000000000..01c76e5085a --- /dev/null +++ b/cpp/src/prims/per_v_pair_dst_nbr_intersection.cuh @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/nbr_intersection.cuh" + +#include + +#include + +#include + +namespace cugraph { + +/** + * @brief Iterate over each input vertex pair and returns the common destination neighbor list + * pair in a CSR-like format + * + * Iterate over every vertex pair; intersect destination neighbor lists of the two vertices in the + * pair and store the result in a CSR-like format + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexPairIterator Type of the iterator for input vertex pairs. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_pair_first Iterator pointing to the first (inclusive) input vertex pair. + * @param vertex_pair_last Iterator pointing to the last (exclusive) input vertex pair. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return std::tuple Tuple of intersection offsets and indices. + */ +template +std::tuple, rmm::device_uvector> +per_v_pair_dst_nbr_intersection(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPairIterator vertex_pair_first, + VertexPairIterator vertex_pair_last, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + + return detail::nbr_intersection(handle, + graph_view, + cugraph::edge_dummy_property_t{}.view(), + vertex_pair_first, + vertex_pair_last, + std::array{true, true}, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 015a9c683f1..9d0f711d106 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/sample_and_compute_local_nbr_indices.cuh" #include "prims/property_op_utils.cuh" @@ -30,9 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif #include #include @@ -353,7 +351,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) + (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_frontier) : frontier.begin(), local_frontier_displacements, local_frontier_sizes, @@ -365,7 +363,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) : frontier.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, @@ -394,7 +392,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(i)); auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) + ((minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) : frontier.begin()) + local_frontier_displacements[i]; auto edge_partition_sample_local_nbr_index_first = diff --git a/cpp/src/sampling/negative_sampling_impl.cuh b/cpp/src/sampling/negative_sampling_impl.cuh new file mode 100644 index 00000000000..93bb03077bc --- /dev/null +++ b/cpp/src/sampling/negative_sampling_impl.cuh @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "prims/reduce_v.cuh" +#include "prims/update_edge_src_dst_property.cuh" +#include "utilities/collect_comm.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +template +std::tuple>, + std::optional>> +normalize_biases(raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span biases) +{ + std::optional> normalized_biases{std::nullopt}; + std::optional> gpu_biases{std::nullopt}; + + // Need to normalize the biases + normalized_biases = + std::make_optional>(biases.size(), handle.get_stream()); + + weight_t sum = + thrust::reduce(handle.get_thrust_policy(), biases.begin(), biases.end(), weight_t{0}); + + thrust::transform(handle.get_thrust_policy(), + biases.begin(), + biases.end(), + normalized_biases->begin(), + divider_t{sum}); + + thrust::inclusive_scan(handle.get_thrust_policy(), + normalized_biases->begin(), + normalized_biases->end(), + normalized_biases->begin()); + + if constexpr (multi_gpu) { + rmm::device_scalar d_sum(sum, handle.get_stream()); + + gpu_biases = cugraph::device_allgatherv( + handle, handle.get_comms(), raft::device_span{d_sum.data(), d_sum.size()}); + + weight_t aggregate_sum = thrust::reduce( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), weight_t{0}); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 results in the possibility + // that 1 can appear as a random floating point value. We're going to use + // thrust::upper_bound to assign random values to GPUs, we need the value 1.0 to + // be part of the upper-most range. 
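(Worked example, illustrative only: with three GPUs and cumulative gpu_biases = {0.25, 0.60, 1.00}, a draw of r = 0.70 falls in the third range and r = 0.20 in the first, since thrust::upper_bound returns the first cumulative value greater than r; a stray r == 1.0 would fall past the end of the array, hence the padding described next.)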
We'll compute the last non-zero value in the + // gpu_biases array here and below we will fill it with a value larger than 1.0 + size_t trailing_zeros = thrust::distance( + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::find_if(handle.get_thrust_policy(), + thrust::make_reverse_iterator(gpu_biases->end()), + thrust::make_reverse_iterator(gpu_biases->begin()), + [] __device__(weight_t bias) { return bias > weight_t{0}; })); + + thrust::transform(handle.get_thrust_policy(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_biases->begin(), + divider_t{aggregate_sum}); + + thrust::inclusive_scan( + handle.get_thrust_policy(), gpu_biases->begin(), gpu_biases->end(), gpu_biases->begin()); + + // FIXME: conclusion of above. Using 1.1 since it is > 1.0 and easy to type + thrust::copy_n(handle.get_thrust_policy(), + thrust::make_constant_iterator(1.1), + trailing_zeros + 1, + gpu_biases->begin() + gpu_biases->size() - trailing_zeros - 1); + } + + return std::make_tuple(std::move(normalized_biases), std::move(gpu_biases)); +} + +template +rmm::device_uvector create_local_samples( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> const& normalized_biases, + std::optional> const& gpu_biases, + size_t samples_in_this_batch) +{ + rmm::device_uvector samples(0, handle.get_stream()); + + if (normalized_biases) { + size_t samples_to_generate{samples_in_this_batch}; + std::vector sample_count_from_each_gpu; + + rmm::device_uvector position(0, handle.get_stream()); + + if constexpr (multi_gpu) { + // Determine how many vertices are generated on each GPU + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + sample_count_from_each_gpu.resize(comm_size); + + rmm::device_uvector gpu_counts(comm_size, handle.get_stream()); + position.resize(samples_in_this_batch, handle.get_stream()); + + thrust::fill(handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), size_t{0}); + thrust::sequence(handle.get_thrust_policy(), position.begin(), position.end()); + + rmm::device_uvector random_values(samples_in_this_batch, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(random_values.begin(), position.begin()), + thrust::make_zip_iterator(random_values.end(), position.end())); + + thrust::upper_bound(handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + gpu_biases->begin(), + gpu_biases->end(), + gpu_counts.begin()); + + thrust::adjacent_difference( + handle.get_thrust_policy(), gpu_counts.begin(), gpu_counts.end(), gpu_counts.begin()); + + std::vector tx_counts(gpu_counts.size()); + std::fill(tx_counts.begin(), tx_counts.end(), size_t{1}); + + rmm::device_uvector d_sample_count_from_each_gpu(0, handle.get_stream()); + + std::tie(d_sample_count_from_each_gpu, std::ignore) = + shuffle_values(handle.get_comms(), gpu_counts.begin(), tx_counts, handle.get_stream()); + + samples_to_generate = thrust::reduce(handle.get_thrust_policy(), + d_sample_count_from_each_gpu.begin(), + d_sample_count_from_each_gpu.end(), + size_t{0}); + + raft::update_host(sample_count_from_each_gpu.data(), + d_sample_count_from_each_gpu.data(), + d_sample_count_from_each_gpu.size(), + handle.get_stream()); + } + + // Generate samples + // FIXME: We could save this memory if we had an iterator 
that + // generated random values. + rmm::device_uvector random_values(samples_to_generate, handle.get_stream()); + samples.resize(samples_to_generate, handle.get_stream()); + detail::uniform_random_fill(handle.get_stream(), + random_values.data(), + random_values.size(), + weight_t{0}, + weight_t{1}, + rng_state); + + thrust::transform( + handle.get_thrust_policy(), + random_values.begin(), + random_values.end(), + samples.begin(), + [biases = + raft::device_span{normalized_biases->data(), normalized_biases->size()}, + offset = graph_view.local_vertex_partition_range_first()] __device__(weight_t r) { + size_t result = + offset + + static_cast(thrust::distance( + biases.begin(), thrust::lower_bound(thrust::seq, biases.begin(), biases.end(), r))); + + // FIXME: https://github.com/rapidsai/raft/issues/2400 + // results in the possibility that 1 can appear as a + // random floating point value, which results in the sampling + // algorithm below generating a value that's OOB. + if (result == (offset + biases.size())) --result; + + return result; + }); + + // Shuffle them back + if constexpr (multi_gpu) { + std::tie(samples, std::ignore) = shuffle_values( + handle.get_comms(), samples.begin(), sample_count_from_each_gpu, handle.get_stream()); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(position.begin(), samples.begin()), + thrust::make_zip_iterator(position.end(), samples.begin())); + } + } else { + samples.resize(samples_in_this_batch, handle.get_stream()); + + // Uniformly select a vertex from any GPU + detail::uniform_random_fill(handle.get_stream(), + samples.data(), + samples.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + } + + return samples; +} + +} // namespace detail + +template +std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_biases, + std::optional> dst_biases, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check) +{ + rmm::device_uvector src(0, handle.get_stream()); + rmm::device_uvector dst(0, handle.get_stream()); + + // Optimistically assume we can do this in one pass + size_t samples_in_this_batch = num_samples; + + // Normalize the biases and (for MG) determine how the biases are + // distributed across the GPUs. + std::optional> normalized_src_biases{std::nullopt}; + std::optional> gpu_src_biases{std::nullopt}; + std::optional> normalized_dst_biases{std::nullopt}; + std::optional> gpu_dst_biases{std::nullopt}; + + if (src_biases) + std::tie(normalized_src_biases, gpu_src_biases) = + detail::normalize_biases(handle, graph_view, *src_biases); + + if (dst_biases) + std::tie(normalized_dst_biases, gpu_dst_biases) = + detail::normalize_biases(handle, graph_view, *dst_biases); + + while (samples_in_this_batch > 0) { + if constexpr (multi_gpu) { + auto const comm_size = handle.get_comms().get_size(); + auto const comm_rank = handle.get_comms().get_rank(); + + samples_in_this_batch = + (samples_in_this_batch / static_cast(comm_size)) + + (static_cast(comm_rank) < (samples_in_this_batch % static_cast(comm_size)) + ? 
1 + : 0); + } + + auto batch_src = create_local_samples( + handle, rng_state, graph_view, normalized_src_biases, gpu_src_biases, samples_in_this_batch); + auto batch_dst = create_local_samples( + handle, rng_state, graph_view, normalized_dst_biases, gpu_dst_biases, samples_in_this_batch); + + if constexpr (multi_gpu) { + auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + + std::tie(batch_src, batch_dst, std::ignore, std::ignore, std::ignore, std::ignore) = + detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle, + std::move(batch_src), + std::move(batch_dst), + std::nullopt, + std::nullopt, + std::nullopt, + vertex_partition_range_lasts); + } + + if (remove_existing_edges) { + auto has_edge_flags = + graph_view.has_edge(handle, + raft::device_span{batch_src.data(), batch_src.size()}, + raft::device_span{batch_dst.data(), batch_dst.size()}, + do_expensive_check); + + auto begin_iter = thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()); + auto new_end = thrust::remove_if(handle.get_thrust_policy(), + begin_iter, + begin_iter + batch_src.size(), + has_edge_flags.begin(), + thrust::identity()); + + batch_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + batch_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + } + + if (remove_duplicates) { + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + auto new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end())); + + size_t new_size = + thrust::distance(thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), new_end); + + if (src.size() > 0) { + rmm::device_uvector new_src(src.size() + new_size, handle.get_stream()); + rmm::device_uvector new_dst(dst.size() + new_size, handle.get_stream()); + + thrust::merge(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + new_end, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin())); + + new_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), + thrust::make_zip_iterator(new_src.end(), new_dst.end())); + + new_size = + thrust::distance(thrust::make_zip_iterator(new_src.begin(), new_dst.begin()), new_end); + + src = std::move(new_src); + dst = std::move(new_dst); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + src.resize(new_size, handle.get_stream()); + dst.resize(new_size, handle.get_stream()); + } else if (src.size() > 0) { + size_t current_end = src.size(); + + src.resize(src.size() + batch_src.size(), handle.get_stream()); + dst.resize(dst.size() + batch_dst.size(), handle.get_stream()); + + thrust::copy(handle.get_thrust_policy(), + thrust::make_zip_iterator(batch_src.begin(), batch_dst.begin()), + thrust::make_zip_iterator(batch_src.end(), batch_dst.end()), + thrust::make_zip_iterator(src.begin(), dst.begin()) + current_end); + } else { + src = std::move(batch_src); + dst = std::move(batch_dst); + } + + if (exact_number_of_samples) { + size_t current_sample_size = src.size(); + if constexpr (multi_gpu) { + current_sample_size = cugraph::host_scalar_allreduce( + handle.get_comms(), 
current_sample_size, raft::comms::op_t::SUM, handle.get_stream()); + } + + // FIXME: We could oversample and discard the unnecessary samples + // to reduce the number of iterations in the outer loop, but it seems like + // exact_number_of_samples is an edge case not worth optimizing for at this time. + samples_in_this_batch = num_samples - current_sample_size; + } else { + samples_in_this_batch = 0; + } + } + + src.shrink_to_fit(handle.get_stream()); + dst.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(src), std::move(dst)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e32.cu b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu new file mode 100644 index 00000000000..ce54d54d319 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v32_e64.cu b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu new file mode 100644 index 00000000000..af4c28c0f1a --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_mg_v64_e64.cu b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu new file mode 100644 index 00000000000..c5691fb4644 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_mg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e32.cu b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu new file mode 100644 index 00000000000..3712414e4ec --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e32.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v32_e64.cu b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu new file mode 100644 index 00000000000..c66c31a4258 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v32_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/negative_sampling_sg_v64_e64.cu b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu new file mode 100644 index 00000000000..e4fc50890e4 --- /dev/null +++ b/cpp/src/sampling/negative_sampling_sg_v64_e64.cu @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "negative_sampling_impl.cuh" + +#include +#include + +namespace cugraph { + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +template std::tuple, rmm::device_uvector> negative_sampling( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> src_bias, + std::optional> dst_bias, + size_t num_samples, + bool remove_duplicates, + bool remove_existing_edges, + bool exact_number_of_samples, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 3b0bc15df93..0b1d9dcdb56 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include #include diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh index d582893d756..6c10fc473f3 100644 --- a/cpp/src/sampling/random_walks_impl.cuh +++ b/cpp/src/sampling/random_walks_impl.cuh @@ -17,7 +17,10 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/detail/nbr_intersection.cuh" #include "prims/per_v_random_select_transform_outgoing_e.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/update_edge_src_dst_property.cuh" #include "prims/vertex_frontier.cuh" #include @@ -25,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -46,13 +50,6 @@ namespace cugraph { namespace detail { -inline uint64_t get_current_time_nanoseconds() -{ - auto cur = std::chrono::steady_clock::now(); - return static_cast( - std::chrono::duration_cast(cur.time_since_epoch()).count()); -} - template struct sample_edges_op_t { template @@ -70,21 +67,129 @@ struct sample_edges_op_t { } }; +template +struct biased_random_walk_e_bias_op_t { + __device__ bias_t + operator()(vertex_t, vertex_t, bias_t src_out_weight_sum, thrust::nullopt_t, bias_t weight) const + { + return weight / src_out_weight_sum; + } +}; + +template +struct biased_sample_edges_op_t { + __device__ thrust::tuple operator()( + vertex_t, vertex_t dst, weight_t, thrust::nullopt_t, weight_t weight) const + { + return thrust::make_tuple(dst, weight); + } +}; + +template +struct node2vec_random_walk_e_bias_op_t { + bias_t p_{}; + bias_t q_{}; + raft::device_span intersection_offsets_{}; + raft::device_span intersection_indices_{}; + raft::device_span current_vertices_{}; + raft::device_span prev_vertices_{}; + + // Unweighted Bias Operator + template + __device__ std::enable_if_t, bias_t> operator()( + thrust::tuple tagged_src, + vertex_t dst, + thrust::nullopt_t, + thrust::nullopt_t, + thrust::nullopt_t) const + { + // Check tag (prev vert) for destination + if (dst == thrust::get<1>(tagged_src)) { return 1.0 / p_; } + // Search zipped vertices for tagged src + auto lower_itr = thrust::lower_bound( + thrust::seq, + thrust::make_zip_iterator(current_vertices_.begin(), prev_vertices_.begin()), + thrust::make_zip_iterator(current_vertices_.end(), prev_vertices_.end()), + tagged_src); + auto low_idx = thrust::distance( + thrust::make_zip_iterator(current_vertices_.begin(), prev_vertices_.begin()), lower_itr); + auto intersection_index_first = 
intersection_indices_.begin() + intersection_offsets_[low_idx]; + auto intersection_index_last = + intersection_indices_.begin() + intersection_offsets_[low_idx + 1]; + auto itr = + thrust::lower_bound(thrust::seq, intersection_index_first, intersection_index_last, dst); + return (itr != intersection_index_last && *itr == dst) ? 1.0 : 1.0 / q_; + } + + // Weighted Bias Operator + template + __device__ std::enable_if_t, bias_t> operator()( + thrust::tuple tagged_src, + vertex_t dst, + thrust::nullopt_t, + thrust::nullopt_t, + W) const + { + // Check tag (prev vert) for destination + if (dst == thrust::get<1>(tagged_src)) { return 1.0 / p_; } + // Search zipped vertices for tagged src + auto lower_itr = thrust::lower_bound( + thrust::seq, + thrust::make_zip_iterator(current_vertices_.begin(), prev_vertices_.begin()), + thrust::make_zip_iterator(current_vertices_.end(), prev_vertices_.end()), + tagged_src); + auto low_idx = thrust::distance( + thrust::make_zip_iterator(current_vertices_.begin(), prev_vertices_.begin()), lower_itr); + auto intersection_index_first = intersection_indices_.begin() + intersection_offsets_[low_idx]; + auto intersection_index_last = + intersection_indices_.begin() + intersection_offsets_[low_idx + 1]; + auto itr = + thrust::lower_bound(thrust::seq, intersection_index_first, intersection_index_last, dst); + return (itr != intersection_index_last && *itr == dst) ? 1.0 : 1.0 / q_; + } +}; + +template +struct node2vec_sample_edges_op_t { + template + __device__ std::enable_if_t, vertex_t> operator()( + thrust::tuple tagged_src, + vertex_t dst, + thrust::nullopt_t, + thrust::nullopt_t, + thrust::nullopt_t) const + { + return dst; + } + + template + __device__ std::enable_if_t, thrust::tuple> operator()( + thrust::tuple tagged_src, + vertex_t dst, + thrust::nullopt_t, + thrust::nullopt_t, + W w) const + { + return thrust::make_tuple(dst, w); + } +}; + template struct uniform_selector { - raft::random::RngState rng_state_; - - uniform_selector(uint64_t seed) : rng_state_(seed) {} + raft::random::RngState& rng_state_; + static constexpr bool is_second_order_ = false; template std::tuple, + std::optional>, std::optional>> follow_random_edge( raft::handle_t const& handle, GraphViewType const& graph_view, std::optional> edge_weight_view, - rmm::device_uvector const& current_vertices) + rmm::device_uvector&& current_vertices, + std::optional>&& previous_vertices) { using vertex_t = typename GraphViewType::vertex_type; @@ -133,30 +238,67 @@ struct uniform_selector { minors = std::move(sample_e_op_results); } - return std::make_tuple(std::move(minors), std::move(weights)); + return std::make_tuple(std::move(minors), std::move(previous_vertices), std::move(weights)); } }; template struct biased_selector { - uint64_t seed_{0}; + raft::random::RngState& rng_state_; + static constexpr bool is_second_order_ = false; template std::tuple, + std::optional>, std::optional>> follow_random_edge( raft::handle_t const& handle, GraphViewType const& graph_view, std::optional> edge_weight_view, - rmm::device_uvector const& current_vertices) + rmm::device_uvector&& current_vertices, + std::optional>&& previous_vertices) { - // To do biased sampling, I need out_weights instead of out_degrees. - // Then I generate a random float between [0, out_weights[v]). Then - // instead of making a decision based on the index I need to find - // upper_bound (or is it lower_bound) of the random number and - // the cumulative weight. 
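A standalone sketch of that removed idea, under assumed names (`weights` holds one vertex's outgoing edge weights and `rng_state` is a raft::random::RngState; the PR itself instead routes biased selection through per_v_random_select_transform_outgoing_e, as the replacement code below shows):

// Weighted pick of one outgoing edge by inverse CDF: inclusive-scan the
// weights, draw r uniformly in [0, total), and select the first position
// whose running sum exceeds r. upper_bound is the correct choice with
// inclusive sums; lower_bound would misplace an r exactly equal to a sum.
rmm::device_uvector<weight_t> cum(weights.size(), handle.get_stream());
thrust::inclusive_scan(
  handle.get_thrust_policy(), weights.begin(), weights.end(), cum.begin());
weight_t total = cum.back_element(handle.get_stream());
rmm::device_uvector<weight_t> r(1, handle.get_stream());
cugraph::detail::uniform_random_fill(
  handle.get_stream(), r.data(), r.size(), weight_t{0}, total, rng_state);
auto selected = thrust::distance(
  cum.begin(),
  thrust::upper_bound(
    handle.get_thrust_policy(), cum.begin(), cum.end(), r.back_element(handle.get_stream())));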
- CUGRAPH_FAIL("biased sampling not implemented"); + // Create vertex frontier + using vertex_t = typename GraphViewType::vertex_type; + + using tag_t = void; + + cugraph::vertex_frontier_t vertex_frontier( + handle, 1); + + vertex_frontier.bucket(0).insert(current_vertices.begin(), current_vertices.end()); + + auto vertex_weight_sum = compute_out_weight_sums(handle, graph_view, *edge_weight_view); + edge_src_property_t edge_src_out_weight_sums(handle, graph_view); + update_edge_src_property(handle, + graph_view, + vertex_frontier.bucket(0).begin(), + vertex_frontier.bucket(0).end(), + vertex_weight_sum.data(), + edge_src_out_weight_sums.mutable_view()); + auto [sample_offsets, sample_e_op_results] = cugraph::per_v_random_select_transform_outgoing_e( + handle, + graph_view, + vertex_frontier.bucket(0), + edge_src_out_weight_sums.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *edge_weight_view, + biased_random_walk_e_bias_op_t{}, + edge_src_out_weight_sums.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *edge_weight_view, + biased_sample_edges_op_t{}, + rng_state_, + size_t{1}, + true, + std::make_optional( + thrust::make_tuple(vertex_t{cugraph::invalid_vertex_id::value}, weight_t{0.0}))); + + // Return results + return std::make_tuple(std::move(std::get<0>(sample_e_op_results)), + std::move(previous_vertices), + std::move(std::get<1>(sample_e_op_results))); } }; @@ -164,26 +306,232 @@ template struct node2vec_selector { weight_t p_; weight_t q_; - uint64_t seed_{0}; + raft::random::RngState& rng_state_; + static constexpr bool is_second_order_ = true; template std::tuple, + std::optional>, std::optional>> follow_random_edge( raft::handle_t const& handle, GraphViewType const& graph_view, std::optional> edge_weight_view, - rmm::device_uvector const& current_vertices) + rmm::device_uvector&& current_vertices, + std::optional>&& previous_vertices) { - // To do node2vec, I need the following: - // 1) transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v to compute the sum of the - // node2vec style weights - // 2) Generate a random number between [0, output_from_trdnioeebv[v]) - // 3) a sampling value that lets me pick the correct edge based on the same computation - // (essentially weighted sampling, but with a function that computes the weight rather - // than just using the edge weights) - CUGRAPH_FAIL("node2vec not implemented"); + // Create vertex frontier + using vertex_t = typename GraphViewType::vertex_type; + + using tag_t = vertex_t; + + // Zip previous and current vertices for nbr_intersection() + auto intersection_pairs = + thrust::make_zip_iterator(current_vertices.begin(), (*previous_vertices).begin()); + + auto [intersection_offsets, intersection_indices] = + detail::nbr_intersection(handle, + graph_view, + cugraph::edge_dummy_property_t{}.view(), + intersection_pairs, + intersection_pairs + current_vertices.size(), + std::array{true, true}, + false); + + rmm::device_uvector intersection_counts(size_t{0}, handle.get_stream()); + rmm::device_uvector aggregate_offsets(size_t{0}, handle.get_stream()); + rmm::device_uvector aggregate_currents(size_t{0}, handle.get_stream()); + rmm::device_uvector aggregate_previous(size_t{0}, handle.get_stream()); + rmm::device_uvector aggregate_indices(size_t{0}, handle.get_stream()); + + // Aggregate intersection data across minor comm + if constexpr (GraphViewType::is_multi_gpu) { + intersection_counts.resize(intersection_offsets.size(), handle.get_stream()); + thrust::adjacent_difference(handle.get_thrust_policy(), + 
intersection_offsets.begin(), + intersection_offsets.end(), + intersection_counts.begin()); + + auto recv_counts = cugraph::host_scalar_allgather( + handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + current_vertices.size(), + handle.get_stream()); + + std::vector displacements(recv_counts.size()); + std::exclusive_scan(recv_counts.begin(), recv_counts.end(), displacements.begin(), size_t{0}); + + aggregate_offsets.resize(displacements.back() + recv_counts.back() + 1, handle.get_stream()); + aggregate_offsets.set_element_to_zero_async(aggregate_offsets.size() - 1, + handle.get_stream()); + + cugraph::device_allgatherv(handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + intersection_counts.begin() + 1, + aggregate_offsets.begin(), + recv_counts, + displacements, + handle.get_stream()); + + thrust::exclusive_scan(handle.get_thrust_policy(), + aggregate_offsets.begin(), + aggregate_offsets.end(), + aggregate_offsets.begin()); + + aggregate_currents.resize(displacements.back() + recv_counts.back(), handle.get_stream()); + + cugraph::device_allgatherv(handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + current_vertices.begin(), + aggregate_currents.begin(), + recv_counts, + displacements, + handle.get_stream()); + + aggregate_previous.resize(displacements.back() + recv_counts.back(), handle.get_stream()); + + cugraph::device_allgatherv(handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + (*previous_vertices).begin(), + aggregate_previous.begin(), + recv_counts, + displacements, + handle.get_stream()); + + recv_counts = cugraph::host_scalar_allgather( + handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + intersection_offsets.back_element(handle.get_stream()), + handle.get_stream()); + + displacements.resize(recv_counts.size()); + std::exclusive_scan(recv_counts.begin(), recv_counts.end(), displacements.begin(), size_t{0}); + + aggregate_indices.resize(displacements.back() + recv_counts.back(), handle.get_stream()); + + cugraph::device_allgatherv(handle.get_subcomm(cugraph::partition_manager::minor_comm_name()), + intersection_indices.begin(), + aggregate_indices.begin(), + recv_counts, + displacements, + handle.get_stream()); + } + + cugraph::vertex_frontier_t vertex_frontier( + handle, 1); + vertex_frontier.bucket(0).insert( + thrust::make_zip_iterator(current_vertices.begin(), (*previous_vertices).begin()), + thrust::make_zip_iterator(current_vertices.end(), (*previous_vertices).end())); + + // Create data structs for results + rmm::device_uvector minors(0, handle.get_stream()); + std::optional> weights{std::nullopt}; + + if (edge_weight_view) { + auto [sample_offsets, sample_e_op_results] = + cugraph::per_v_random_select_transform_outgoing_e( + handle, + graph_view, + vertex_frontier.bucket(0), + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *edge_weight_view, + GraphViewType::is_multi_gpu + ? 
node2vec_random_walk_e_bias_op_t{p_, + q_, + raft::device_span( + aggregate_offsets.data(), + aggregate_offsets.size()), + raft::device_span( + aggregate_indices.data(), + aggregate_indices.size()), + raft::device_span( + aggregate_currents.data(), + aggregate_currents.size()), + raft::device_span( + aggregate_previous.data(), + aggregate_previous.size())} + : node2vec_random_walk_e_bias_op_t{p_, + q_, + raft::device_span( + intersection_offsets.data(), + intersection_offsets.size()), + raft::device_span( + intersection_indices.data(), + intersection_indices.size()), + raft::device_span< + vertex_t const>(current_vertices.data(), + current_vertices.size()), + raft::device_span( + (*previous_vertices).data(), + (*previous_vertices).size())}, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *edge_weight_view, + node2vec_sample_edges_op_t{}, + rng_state_, + size_t{1}, + true, + std::make_optional(thrust::make_tuple( + vertex_t{cugraph::invalid_vertex_id::value}, weight_t{0.0}))); + minors = std::move(std::get<0>(sample_e_op_results)); + weights = std::move(std::get<1>(sample_e_op_results)); + } else { + auto [sample_offsets, sample_e_op_results] = + cugraph::per_v_random_select_transform_outgoing_e( + handle, + graph_view, + vertex_frontier.bucket(0), + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + GraphViewType::is_multi_gpu + ? node2vec_random_walk_e_bias_op_t{p_, + q_, + raft::device_span( + aggregate_offsets.data(), + aggregate_offsets.size()), + raft::device_span( + aggregate_indices.data(), + aggregate_indices.size()), + raft::device_span( + aggregate_currents.data(), + aggregate_currents.size()), + raft::device_span( + aggregate_previous.data(), + aggregate_previous.size())} + : node2vec_random_walk_e_bias_op_t{p_, + q_, + raft::device_span( + intersection_offsets.data(), + intersection_offsets.size()), + raft::device_span( + intersection_indices.data(), + intersection_indices.size()), + raft::device_span< + vertex_t const>(current_vertices.data(), + current_vertices.size()), + raft::device_span( + (*previous_vertices).data(), + (*previous_vertices).size())}, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + node2vec_sample_edges_op_t{}, + rng_state_, + size_t{1}, + true, + std::make_optional(vertex_t{cugraph::invalid_vertex_id::value})); + minors = std::move(sample_e_op_results); + } + + *previous_vertices = std::move(current_vertices); + + return std::make_tuple(std::move(minors), std::move(previous_vertices), std::move(weights)); } }; @@ -221,6 +569,16 @@ random_walk_impl(raft::handle_t const& handle, ? std::make_optional>(0, handle.get_stream()) : std::nullopt; + auto previous_vertices = (random_selector.is_second_order_) + ? 
@@ -221,6 +569,16 @@ random_walk_impl(raft::handle_t const& handle,
                      ? std::make_optional<rmm::device_uvector<weight_t>>(0, handle.get_stream())
                      : std::nullopt;
+  auto previous_vertices = (random_selector.is_second_order_)
+                             ? std::make_optional<rmm::device_uvector<vertex_t>>(
+                                 current_vertices.size(), handle.get_stream())
+                             : std::nullopt;
+  if (previous_vertices) {
+    raft::copy((*previous_vertices).data(),
+               start_vertices.data(),
+               start_vertices.size(),
+               handle.get_stream());
+  }
   raft::copy(
     current_vertices.data(), start_vertices.data(), start_vertices.size(), handle.get_stream());
   detail::sequence_fill(
@@ -255,25 +613,73 @@ random_walk_impl(raft::handle_t const& handle,
       auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
       auto const minor_comm_size = minor_comm.get_size();
 
-      // Shuffle vertices to correct GPU to compute random indices
-      std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position),
-                            std::ignore) =
-        cugraph::groupby_gpu_id_and_shuffle_values(
-          handle.get_comms(),
+      if (previous_vertices) {
+        std::forward_as_tuple(
+          std::tie(current_vertices, current_gpu, current_position, previous_vertices),
+          std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            thrust::make_zip_iterator(current_vertices.begin(),
+                                      current_gpu.begin(),
+                                      current_position.begin(),
+                                      previous_vertices->begin()),
+            thrust::make_zip_iterator(current_vertices.end(),
+                                      current_gpu.end(),
+                                      current_position.end(),
+                                      previous_vertices->end()),
+            [key_func =
+               cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+                 {vertex_partition_range_lasts.begin(), vertex_partition_range_lasts.size()},
+                 major_comm_size,
+                 minor_comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); },
+            handle.get_stream());
+      } else {
+        // Shuffle vertices to correct GPU to compute random indices
+        std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position),
+                              std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            thrust::make_zip_iterator(
+              current_vertices.begin(), current_gpu.begin(), current_position.begin()),
+            thrust::make_zip_iterator(
+              current_vertices.end(), current_gpu.end(), current_position.end()),
+            [key_func =
+               cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+                 {vertex_partition_range_lasts.begin(), vertex_partition_range_lasts.size()},
+                 major_comm_size,
+                 minor_comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); },
+            handle.get_stream());
+      }
+    }
+
+    // Sort for nbr_intersection, must sort all together
+    if (previous_vertices) {
+      if constexpr (multi_gpu) {
+        thrust::sort(handle.get_thrust_policy(),
+                     thrust::make_zip_iterator(current_vertices.begin(),
+                                               (*previous_vertices).begin(),
+                                               current_position.begin(),
+                                               current_gpu.begin()),
+                     thrust::make_zip_iterator(current_vertices.end(),
+                                               (*previous_vertices).end(),
+                                               current_position.end(),
+                                               current_gpu.end()));
+      } else {
+        thrust::sort(
+          handle.get_thrust_policy(),
           thrust::make_zip_iterator(
-            current_vertices.begin(), current_gpu.begin(), current_position.begin()),
+            current_vertices.begin(), (*previous_vertices).begin(), current_position.begin()),
           thrust::make_zip_iterator(
-            current_vertices.end(), current_gpu.end(), current_position.end()),
-          [key_func =
-             cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
-               {vertex_partition_range_lasts.begin(), vertex_partition_range_lasts.size()},
-               major_comm_size,
-               minor_comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); },
-          handle.get_stream());
+            current_vertices.end(), (*previous_vertices).end(), current_position.end()));
+      }
     }
 
-    std::tie(current_vertices, new_weights) =
-      random_selector.follow_random_edge(handle, graph_view, edge_weight_view, current_vertices);
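Both shuffle branches above route each tuple by the rank that owns its current vertex so the neighbor intersection can run where the adjacency lives. A simplified sketch of the key computation follows; the real compute_gpu_id_from_int_vertex_t additionally folds the major/minor communicator interleaving into the rank it returns, which this version omits.

#include <raft/core/device_span.hpp>

#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>

// Simplified owner lookup: the owning rank is the first vertex partition whose
// exclusive upper bound exceeds v (major/minor comm interleaving omitted).
template <typename vertex_t>
struct owner_gpu_sketch_t {
  raft::device_span<vertex_t const> vertex_partition_range_lasts{};

  __device__ int operator()(vertex_t v) const
  {
    return static_cast<int>(thrust::distance(
      vertex_partition_range_lasts.begin(),
      thrust::upper_bound(thrust::seq,
                          vertex_partition_range_lasts.begin(),
                          vertex_partition_range_lasts.end(),
                          v)));
  }
};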
+      std::tie(current_vertices, previous_vertices, new_weights) =
+        random_selector.follow_random_edge(handle,
+                                           graph_view,
+                                           edge_weight_view,
+                                           std::move(current_vertices),
+                                           std::move(previous_vertices));
 
     // FIXME: remove_if has a 32-bit overflow issue
     // (https://github.com/NVIDIA/thrust/issues/1302) Seems unlikely here (the goal of
@@ -281,164 +687,244 @@ random_walk_impl(raft::handle_t const& handle,
     CUGRAPH_EXPECTS(
       current_vertices.size() < static_cast<size_t>(std::numeric_limits<int32_t>::max()),
       "remove_if will fail, current_vertices.size() is too large");
-
+    size_t compacted_length{0};
     if constexpr (multi_gpu) {
       if (result_weights) {
-        auto input_iter = thrust::make_zip_iterator(current_vertices.begin(),
-                                                    new_weights->begin(),
-                                                    current_gpu.begin(),
-                                                    current_position.begin());
-
-        auto compacted_length = thrust::distance(
-          input_iter,
-          thrust::remove_if(handle.get_thrust_policy(),
-                            input_iter,
-                            input_iter + current_vertices.size(),
-                            current_vertices.begin(),
-                            [] __device__(auto dst) {
-                              return (dst == cugraph::invalid_vertex_id<vertex_t>::value);
-                            }));
-
-        current_vertices.resize(compacted_length, handle.get_stream());
-        new_weights->resize(compacted_length, handle.get_stream());
-        current_gpu.resize(compacted_length, handle.get_stream());
-        current_position.resize(compacted_length, handle.get_stream());
-
-        // Shuffle back to original GPU
-        auto current_iter = thrust::make_zip_iterator(current_vertices.begin(),
+        if (previous_vertices) {
+          auto input_iter = thrust::make_zip_iterator(current_vertices.begin(),
+                                                      new_weights->begin(),
+                                                      current_gpu.begin(),
+                                                      current_position.begin(),
+                                                      previous_vertices->begin());
+
+          compacted_length = thrust::distance(
+            input_iter,
+            thrust::remove_if(handle.get_thrust_policy(),
+                              input_iter,
+                              input_iter + current_vertices.size(),
+                              current_vertices.begin(),
+                              [] __device__(auto dst) {
+                                return (dst == cugraph::invalid_vertex_id<vertex_t>::value);
+                              }));
+        } else {
+          auto input_iter = thrust::make_zip_iterator(current_vertices.begin(),
                                                     new_weights->begin(),
                                                     current_gpu.begin(),
                                                     current_position.begin());
 
-        std::forward_as_tuple(
-          std::tie(current_vertices, *new_weights, current_gpu, current_position), std::ignore) =
-          cugraph::groupby_gpu_id_and_shuffle_values(
-            handle.get_comms(),
-            current_iter,
-            current_iter + current_vertices.size(),
-            [] __device__(auto val) { return thrust::get<2>(val); },
-            handle.get_stream());
-
-        thrust::for_each(
-          handle.get_thrust_policy(),
-          thrust::make_zip_iterator(
-            current_vertices.begin(), new_weights->begin(), current_position.begin()),
-          thrust::make_zip_iterator(
-            current_vertices.end(), new_weights->end(), current_position.end()),
-          [result_verts = result_vertices.data(),
-           result_wgts  = result_weights->data(),
-           level,
-           max_length] __device__(auto tuple) {
-            vertex_t v = thrust::get<0>(tuple);
-            weight_t w = thrust::get<1>(tuple);
-            size_t pos = thrust::get<2>(tuple);
-            result_verts[pos * (max_length + 1) + level + 1] = v;
-            result_wgts[pos * max_length + level]            = w;
-          });
+          compacted_length = thrust::distance(
+            input_iter,
+            thrust::remove_if(handle.get_thrust_policy(),
+                              input_iter,
+                              input_iter + current_vertices.size(),
+                              current_vertices.begin(),
+                              [] __device__(auto dst) {
+                                return (dst == cugraph::invalid_vertex_id<vertex_t>::value);
+                              }));
+        }
       } else {
-        auto input_iter = thrust::make_zip_iterator(
-          current_vertices.begin(), current_gpu.begin(), current_position.begin());
-
-        auto compacted_length = thrust::distance(
          input_iter,
          thrust::remove_if(handle.get_thrust_policy(),
                            input_iter,
                            input_iter +
current_vertices.size(), - current_vertices.begin(), - [] __device__(auto dst) { - return (dst == cugraph::invalid_vertex_id::value); - })); - - current_vertices.resize(compacted_length, handle.get_stream()); - current_gpu.resize(compacted_length, handle.get_stream()); - current_position.resize(compacted_length, handle.get_stream()); - - // Shuffle back to original GPU - auto current_iter = thrust::make_zip_iterator( - current_vertices.begin(), current_gpu.begin(), current_position.begin()); - - std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position), - std::ignore) = - cugraph::groupby_gpu_id_and_shuffle_values( - handle.get_comms(), - current_iter, - current_iter + current_vertices.size(), - [] __device__(auto val) { return thrust::get<1>(val); }, - handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()), - thrust::make_zip_iterator(current_vertices.end(), current_position.end()), - [result_verts = result_vertices.data(), level, max_length] __device__(auto tuple) { - vertex_t v = thrust::get<0>(tuple); - size_t pos = thrust::get<1>(tuple); - result_verts[pos * (max_length + 1) + level + 1] = v; - }); + if (previous_vertices) { + auto input_iter = thrust::make_zip_iterator(current_vertices.begin(), + current_gpu.begin(), + current_position.begin(), + previous_vertices->begin()); + + compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } else { + auto input_iter = thrust::make_zip_iterator( + current_vertices.begin(), current_gpu.begin(), current_position.begin()); + + compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } } } else { if (result_weights) { - auto input_iter = thrust::make_zip_iterator( - current_vertices.begin(), new_weights->begin(), current_position.begin()); - - auto compacted_length = thrust::distance( - input_iter, - thrust::remove_if(handle.get_thrust_policy(), - input_iter, - input_iter + current_vertices.size(), - current_vertices.begin(), - [] __device__(auto dst) { - return (dst == cugraph::invalid_vertex_id::value); - })); - - current_vertices.resize(compacted_length, handle.get_stream()); - new_weights->resize(compacted_length, handle.get_stream()); - current_position.resize(compacted_length, handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_zip_iterator( - current_vertices.begin(), new_weights->begin(), current_position.begin()), - thrust::make_zip_iterator( - current_vertices.end(), new_weights->end(), current_position.end()), - [result_verts = result_vertices.data(), - result_wgts = result_weights->data(), - level, - max_length] __device__(auto tuple) { - vertex_t v = thrust::get<0>(tuple); - weight_t w = thrust::get<1>(tuple); - size_t pos = thrust::get<2>(tuple); - result_verts[pos * (max_length + 1) + level + 1] = v; - result_wgts[pos * max_length + level] = w; - }); + if (previous_vertices) { + auto input_iter = thrust::make_zip_iterator(current_vertices.begin(), + new_weights->begin(), + current_position.begin(), + previous_vertices->begin()); + + compacted_length = 
thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } else { + auto input_iter = thrust::make_zip_iterator( + current_vertices.begin(), new_weights->begin(), current_position.begin()); + + compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } } else { - auto input_iter = - thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()); - - auto compacted_length = thrust::distance( - input_iter, - thrust::remove_if(handle.get_thrust_policy(), - input_iter, - input_iter + current_vertices.size(), - current_vertices.begin(), - [] __device__(auto dst) { - return (dst == cugraph::invalid_vertex_id::value); - })); - - current_vertices.resize(compacted_length, handle.get_stream()); - current_position.resize(compacted_length, handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()), - thrust::make_zip_iterator(current_vertices.end(), current_position.end()), - [result_verts = result_vertices.data(), level, max_length] __device__(auto tuple) { - vertex_t v = thrust::get<0>(tuple); - size_t pos = thrust::get<1>(tuple); - result_verts[pos * (max_length + 1) + level + 1] = v; - }); + if (previous_vertices) { + auto input_iter = thrust::make_zip_iterator( + current_vertices.begin(), current_position.begin(), previous_vertices->begin()); + + compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } else { + auto input_iter = + thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()); + + compacted_length = thrust::distance( + input_iter, + thrust::remove_if(handle.get_thrust_policy(), + input_iter, + input_iter + current_vertices.size(), + current_vertices.begin(), + [] __device__(auto dst) { + return (dst == cugraph::invalid_vertex_id::value); + })); + } + } + } + + // Moved out of if statements to cut down on code duplication + current_vertices.resize(compacted_length, handle.get_stream()); + current_vertices.shrink_to_fit(handle.get_stream()); + current_position.resize(compacted_length, handle.get_stream()); + current_position.shrink_to_fit(handle.get_stream()); + if (result_weights) { + new_weights->resize(compacted_length, handle.get_stream()); + new_weights->shrink_to_fit(handle.get_stream()); + } + if (previous_vertices) { + previous_vertices->resize(compacted_length, handle.get_stream()); + previous_vertices->shrink_to_fit(handle.get_stream()); + } + if constexpr (multi_gpu) { + current_gpu.resize(compacted_length, handle.get_stream()); + current_gpu.shrink_to_fit(handle.get_stream()); + + // Shuffle back to original GPU + if (previous_vertices) { + if (result_weights) { + auto current_iter = thrust::make_zip_iterator(current_vertices.begin(), + new_weights->begin(), + current_gpu.begin(), + current_position.begin(), + previous_vertices->begin()); + + std::forward_as_tuple( + std::tie( + current_vertices, *new_weights, current_gpu, 
current_position, *previous_vertices),
+          std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<2>(val); },
+            handle.get_stream());
+      } else {
+        auto current_iter = thrust::make_zip_iterator(current_vertices.begin(),
+                                                      current_gpu.begin(),
+                                                      current_position.begin(),
+                                                      previous_vertices->begin());
+
+        std::forward_as_tuple(
+          std::tie(current_vertices, current_gpu, current_position, *previous_vertices),
+          std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<1>(val); },
+            handle.get_stream());
+      }
+    } else {
+      if (result_weights) {
+        auto current_iter = thrust::make_zip_iterator(current_vertices.begin(),
+                                                      new_weights->begin(),
+                                                      current_gpu.begin(),
+                                                      current_position.begin());
+
+        std::forward_as_tuple(
+          std::tie(current_vertices, *new_weights, current_gpu, current_position), std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<2>(val); },
+            handle.get_stream());
+      } else {
+        auto current_iter = thrust::make_zip_iterator(
+          current_vertices.begin(), current_gpu.begin(), current_position.begin());
+
+        std::forward_as_tuple(std::tie(current_vertices, current_gpu, current_position),
+                              std::ignore) =
+          cugraph::groupby_gpu_id_and_shuffle_values(
+            handle.get_comms(),
+            current_iter,
+            current_iter + current_vertices.size(),
+            [] __device__(auto val) { return thrust::get<1>(val); },
+            handle.get_stream());
+      }
+    }
+  }
+
+  if (result_weights) {
+    thrust::for_each(handle.get_thrust_policy(),
+                     thrust::make_zip_iterator(
+                       current_vertices.begin(), new_weights->begin(), current_position.begin()),
+                     thrust::make_zip_iterator(
+                       current_vertices.end(), new_weights->end(), current_position.end()),
+                     [result_verts = result_vertices.data(),
+                      result_wgts  = result_weights->data(),
+                      level,
+                      max_length] __device__(auto tuple) {
+                       vertex_t v = thrust::get<0>(tuple);
+                       weight_t w = thrust::get<1>(tuple);
+                       size_t pos = thrust::get<2>(tuple);
+                       result_verts[pos * (max_length + 1) + level + 1] = v;
+                       result_wgts[pos * max_length + level]            = w;
+                     });
+  } else {
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(current_vertices.begin(), current_position.begin()),
+      thrust::make_zip_iterator(current_vertices.end(), current_position.end()),
+      [result_verts = result_vertices.data(), level, max_length] __device__(auto tuple) {
+        vertex_t v = thrust::get<0>(tuple);
+        size_t pos = thrust::get<1>(tuple);
+        result_verts[pos * (max_length + 1) + level + 1] = v;
+      });
+  }
 }
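The two thrust::for_each calls above scatter each surviving walker's new hop directly into flat, seed-major result buffers: the vertex matrix reserves max_length + 1 slots per seed (slot 0 is the seed itself), the weight matrix max_length slots. The same indexing, as a hedged host-side sketch with hypothetical names:

#include <cstddef>
#include <vector>

// Mirrors the device-side writes to result_verts / result_wgts (illustrative only).
template <typename vertex_t, typename weight_t>
void write_hop(std::vector<vertex_t>& verts,  // num_seeds * (max_length + 1) entries
               std::vector<weight_t>& wgts,   // num_seeds * max_length entries
               size_t max_length,
               size_t pos,    // walker's original seed position
               size_t level,  // hop being recorded
               vertex_t v,
               weight_t w)
{
  verts[pos * (max_length + 1) + level + 1] = v;  // slot 0 already holds the seed vertex
  wgts[pos * max_length + level]            = w;
}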
  return std::make_tuple(std::move(result_vertices), std::move(result_weights));
@@ -449,11 +935,11 @@ random_walk_impl(raft::handle_t const& handle,
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
 uniform_random_walks(raft::handle_t const& handle,
+                     raft::random::RngState& rng_state,
                      graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
                      std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
                      raft::device_span<vertex_t const> start_vertices,
-                     size_t max_length,
-                     uint64_t seed)
+                     size_t max_length)
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
@@ -462,18 +948,17 @@ uniform_random_walks(raft::handle_t const& handle,
     edge_weight_view,
     start_vertices,
     max_length,
-    detail::uniform_selector(
-      (seed == 0 ? detail::get_current_time_nanoseconds() : seed)));
+    detail::uniform_selector{rng_state});
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
 biased_random_walks(raft::handle_t const& handle,
+                    raft::random::RngState& rng_state,
                     graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
                     edge_property_view_t<edge_t, weight_t const*> edge_weight_view,
                     raft::device_span<vertex_t const> start_vertices,
-                    size_t max_length,
-                    uint64_t seed)
+                    size_t max_length)
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
@@ -483,30 +968,28 @@ biased_random_walks(raft::handle_t const& handle,
     std::optional<edge_property_view_t<edge_t, weight_t const*>>{edge_weight_view},
     start_vertices,
     max_length,
-    detail::biased_selector{(seed == 0 ? detail::get_current_time_nanoseconds() : seed)});
+    detail::biased_selector{rng_state});
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::tuple<rmm::device_uvector<vertex_t>, std::optional<rmm::device_uvector<weight_t>>>
 node2vec_random_walks(raft::handle_t const& handle,
+                      raft::random::RngState& rng_state,
                       graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
                       std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
                       raft::device_span<vertex_t const> start_vertices,
                       size_t max_length,
                       weight_t p,
-                      weight_t q,
-                      uint64_t seed)
+                      weight_t q)
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  return detail::random_walk_impl(
-    handle,
-    graph_view,
-    edge_weight_view,
-    start_vertices,
-    max_length,
-    detail::node2vec_selector{
-      p, q, (seed == 0 ? detail::get_current_time_nanoseconds() : seed)});
+  return detail::random_walk_impl(handle,
+                                  graph_view,
+                                  edge_weight_view,
+                                  start_vertices,
+                                  max_length,
+                                  detail::node2vec_selector{p, q, rng_state});
 }
 
 } // namespace cugraph
diff --git a/cpp/src/sampling/random_walks_mg_v32_e32.cu b/cpp/src/sampling/random_walks_mg_v32_e32.cu
index 421d3e9c818..abe5386da1c 100644
--- a/cpp/src/sampling/random_walks_mg_v32_e32.cu
+++ b/cpp/src/sampling/random_walks_mg_v32_e32.cu
@@ -22,54 +22,54 @@ namespace cugraph {
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<float>>>
 uniform_random_walks(raft::handle_t const& handle,
+                     raft::random::RngState& rng_state,
                      graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                      std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
                      raft::device_span<int32_t const> start_vertices,
-                     size_t max_length,
-                     uint64_t seed);
+                     size_t max_length);
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<double>>>
 uniform_random_walks(raft::handle_t const& handle,
+                     raft::random::RngState& rng_state,
                      graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                      std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
                      raft::device_span<int32_t const> start_vertices,
-                     size_t max_length,
-                     uint64_t seed);
+                     size_t max_length);
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<float>>>
 biased_random_walks(raft::handle_t const& handle,
+                    raft::random::RngState& rng_state,
                     graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                     edge_property_view_t<int32_t, float const*> edge_weight_view,
                     raft::device_span<int32_t const> start_vertices,
-                    size_t max_length,
-                    uint64_t seed);
+                    size_t max_length);
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<double>>>
 biased_random_walks(raft::handle_t const& handle,
+                    raft::random::RngState& rng_state,
                     graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                     edge_property_view_t<int32_t, double const*> edge_weight_view,
                     raft::device_span<int32_t const> start_vertices,
-                    size_t max_length,
-                    uint64_t seed);
+                    size_t max_length);
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<float>>>
 node2vec_random_walks(raft::handle_t const& handle,
+                      raft::random::RngState& rng_state,
                       graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                       std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
                       raft::device_span<int32_t const> start_vertices,
                       size_t max_length,
                       float p,
-                      float q,
-                      uint64_t seed);
+                      float q);
 
 template std::tuple<rmm::device_uvector<int32_t>, std::optional<rmm::device_uvector<double>>>
 node2vec_random_walks(raft::handle_t const& handle,
+                      raft::random::RngState& rng_state,
                       graph_view_t<int32_t, int32_t, false, true> const& graph_view,
                       std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
                       raft::device_span<int32_t const> start_vertices,
                       size_t max_length,
                       double p,
-                      double q,
-                      uint64_t seed);
+                      double q);
 
 } // namespace cugraph
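With this change callers own the random state explicitly; the old "seed == 0 means take the current time" fallback disappears along with the trailing seed parameter. A hedged caller-side sketch of the updated entry point (handle, graph_view, edge_weight_view, and d_starts are assumed to exist from earlier setup; the variable names are hypothetical):

#include <cugraph/algorithms.hpp>
#include <raft/random/rng_state.hpp>

// Seed the generator once, up front; the walk consumes and advances rng_state.
raft::random::RngState rng_state(42);
auto [vertex_paths, weight_paths] = cugraph::uniform_random_walks(
  handle,
  rng_state,  // replaces the removed uint64_t seed parameter
  graph_view,
  edge_weight_view,
  raft::device_span<int32_t const>(d_starts.data(), d_starts.size()),
  size_t{5} /* max_length */);

diff --git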
a/cpp/src/sampling/random_walks_mg_v32_e64.cu b/cpp/src/sampling/random_walks_mg_v32_e64.cu index d38af65a505..b1bf1a19b77 100644 --- a/cpp/src/sampling/random_walks_mg_v32_e64.cu +++ b/cpp/src/sampling/random_walks_mg_v32_e64.cu @@ -22,54 +22,54 @@ namespace cugraph { template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, float p, - float q, - uint64_t seed); + float q); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, double p, - double q, - uint64_t seed); + double q); } // namespace cugraph diff --git a/cpp/src/sampling/random_walks_mg_v64_e64.cu b/cpp/src/sampling/random_walks_mg_v64_e64.cu index 9dedc893242..13cc899e50d 100644 --- a/cpp/src/sampling/random_walks_mg_v64_e64.cu +++ b/cpp/src/sampling/random_walks_mg_v64_e64.cu @@ -22,54 +22,54 @@ namespace cugraph { template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, 
+ raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, float p, - float q, - uint64_t seed); + float q); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, double p, - double q, - uint64_t seed); + double q); } // namespace cugraph diff --git a/cpp/src/sampling/random_walks_sg_v32_e32.cu b/cpp/src/sampling/random_walks_sg_v32_e32.cu index 7b64d107250..383917c0248 100644 --- a/cpp/src/sampling/random_walks_sg_v32_e32.cu +++ b/cpp/src/sampling/random_walks_sg_v32_e32.cu @@ -22,54 +22,54 @@ namespace cugraph { template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, float p, - float q, - uint64_t seed); + float q); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, double p, - double q, - uint64_t seed); + double q); } // namespace cugraph diff --git a/cpp/src/sampling/random_walks_sg_v32_e64.cu b/cpp/src/sampling/random_walks_sg_v32_e64.cu index d9ea09f36ef..98d2bb02d88 100644 --- a/cpp/src/sampling/random_walks_sg_v32_e64.cu +++ b/cpp/src/sampling/random_walks_sg_v32_e64.cu @@ -22,54 +22,54 @@ namespace cugraph { template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, 
graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, float p, - float q, - uint64_t seed); + float q); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, double p, - double q, - uint64_t seed); + double q); } // namespace cugraph diff --git a/cpp/src/sampling/random_walks_sg_v64_e64.cu b/cpp/src/sampling/random_walks_sg_v64_e64.cu index 0b9be107276..c139acec4b7 100644 --- a/cpp/src/sampling/random_walks_sg_v64_e64.cu +++ b/cpp/src/sampling/random_walks_sg_v64_e64.cu @@ -22,54 +22,54 @@ namespace cugraph { template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> uniform_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> biased_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, edge_property_view_t edge_weight_view, raft::device_span start_vertices, - size_t max_length, - uint64_t seed); + size_t max_length); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, float p, - float q, - uint64_t seed); + float q); template std::tuple, std::optional>> node2vec_random_walks(raft::handle_t const& handle, + raft::random::RngState& rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span start_vertices, size_t max_length, double p, - double q, - uint64_t seed); + double q); } // namespace cugraph diff --git a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh deleted file mode 100644 index f5bc3ef6d2e..00000000000 --- a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "prims/kv_store.cuh" - -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -// FIXME: deprecated, to be deleted -namespace cugraph { - -namespace { - -// output sorted by (primary key:label_index, secondary key:vertex) -template -std::tuple> /* label indices */, - rmm::device_uvector /* vertices */, - std::optional> /* minimum hops for the vertices */, - std::optional> /* label offsets for the output */> -compute_min_hop_for_unique_label_vertex_pairs( - raft::handle_t const& handle, - raft::device_span vertices, - std::optional> hops, - std::optional> label_indices, - std::optional> label_offsets) -{ - auto approx_edges_to_sort_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * - (1 << 20) /* tuning parameter */; // for segmented sort - - if (label_indices) { - auto num_labels = (*label_offsets).size() - 1; - - rmm::device_uvector tmp_label_indices((*label_indices).size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - (*label_indices).begin(), - (*label_indices).end(), - tmp_label_indices.begin()); - - rmm::device_uvector tmp_vertices(0, handle.get_stream()); - std::optional> tmp_hops{std::nullopt}; - - if (hops) { - tmp_vertices.resize(vertices.size(), handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); - tmp_hops = rmm::device_uvector((*hops).size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), (*tmp_hops).begin()); - - auto triplet_first = thrust::make_zip_iterator( - tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); - thrust::sort( - handle.get_thrust_policy(), triplet_first, triplet_first + tmp_label_indices.size()); - auto key_first = thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); - auto num_uniques = static_cast( - thrust::distance(key_first, - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - key_first, - key_first + tmp_label_indices.size(), - (*tmp_hops).begin())))); - tmp_label_indices.resize(num_uniques, handle.get_stream()); - tmp_vertices.resize(num_uniques, handle.get_stream()); - (*tmp_hops).resize(num_uniques, handle.get_stream()); - tmp_label_indices.shrink_to_fit(handle.get_stream()); - tmp_vertices.shrink_to_fit(handle.get_stream()); - (*tmp_hops).shrink_to_fit(handle.get_stream()); - } else { - rmm::device_uvector segment_sorted_vertices(vertices.size(), handle.get_stream()); - - rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - - auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks( - handle, *label_offsets, vertices.size(), approx_edges_to_sort_per_iteration); - auto num_chunks = h_label_offsets.size() - 1; - - for (size_t i = 0; i < 
num_chunks; ++i) { - size_t tmp_storage_bytes{0}; - - auto offset_first = - thrust::make_transform_iterator((*label_offsets).data() + h_label_offsets[i], - detail::shift_left_t{h_edge_offsets[i]}); - cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), - tmp_storage_bytes, - vertices.begin() + h_edge_offsets[i], - segment_sorted_vertices.begin() + h_edge_offsets[i], - h_edge_offsets[i + 1] - h_edge_offsets[i], - h_label_offsets[i + 1] - h_label_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); - - if (tmp_storage_bytes > d_tmp_storage.size()) { - d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); - } - - cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), - tmp_storage_bytes, - vertices.begin() + h_edge_offsets[i], - segment_sorted_vertices.begin() + h_edge_offsets[i], - h_edge_offsets[i + 1] - h_edge_offsets[i], - h_label_offsets[i + 1] - h_label_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); - } - d_tmp_storage.resize(0, handle.get_stream()); - d_tmp_storage.shrink_to_fit(handle.get_stream()); - - auto pair_first = - thrust::make_zip_iterator(tmp_label_indices.begin(), segment_sorted_vertices.begin()); - auto num_uniques = static_cast(thrust::distance( - pair_first, - thrust::unique( - handle.get_thrust_policy(), pair_first, pair_first + tmp_label_indices.size()))); - tmp_label_indices.resize(num_uniques, handle.get_stream()); - segment_sorted_vertices.resize(num_uniques, handle.get_stream()); - tmp_label_indices.shrink_to_fit(handle.get_stream()); - segment_sorted_vertices.shrink_to_fit(handle.get_stream()); - - tmp_vertices = std::move(segment_sorted_vertices); - } - - rmm::device_uvector tmp_label_offsets(num_labels + 1, handle.get_stream()); - tmp_label_offsets.set_element_to_zero_async(0, handle.get_stream()); - thrust::upper_bound(handle.get_thrust_policy(), - tmp_label_indices.begin(), - tmp_label_indices.end(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(num_labels), - tmp_label_offsets.begin() + 1); - - return std::make_tuple(std::move(tmp_label_indices), - std::move(tmp_vertices), - std::move(tmp_hops), - std::move(tmp_label_offsets)); - } else { - rmm::device_uvector tmp_vertices(vertices.size(), handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); - - if (hops) { - rmm::device_uvector tmp_hops((*hops).size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), tmp_hops.begin()); - - auto pair_first = thrust::make_zip_iterator( - tmp_vertices.begin(), tmp_hops.begin()); // vertex is a primary key, hop is a secondary key - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_vertices.size()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - tmp_vertices.begin(), - tmp_vertices.end(), - tmp_hops.begin()))), - handle.get_stream()); - tmp_hops.resize(tmp_vertices.size(), handle.get_stream()); - - return std::make_tuple( - std::nullopt, std::move(tmp_vertices), std::move(tmp_hops), std::nullopt); - } else { - thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); - tmp_vertices.resize( - thrust::distance( - tmp_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), - handle.get_stream()); - tmp_vertices.shrink_to_fit(handle.get_stream()); - - return 
std::make_tuple(std::nullopt, std::move(tmp_vertices), std::nullopt, std::nullopt); - } - } -} - -template -std::tuple, std::optional>> -compute_renumber_map(raft::handle_t const& handle, - raft::device_span edgelist_srcs, - raft::device_span edgelist_dsts, - std::optional> edgelist_hops, - std::optional> label_offsets) -{ - auto approx_edges_to_sort_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * - (1 << 20) /* tuning parameter */; // for segmented sort - - std::optional> edgelist_label_indices{std::nullopt}; - if (label_offsets) { - edgelist_label_indices = - detail::expand_sparse_offsets(*label_offsets, label_index_t{0}, handle.get_stream()); - } - - auto [unique_label_src_pair_label_indices, - unique_label_src_pair_vertices, - unique_label_src_pair_hops, - unique_label_src_pair_label_offsets] = - compute_min_hop_for_unique_label_vertex_pairs( - handle, - edgelist_srcs, - edgelist_hops, - edgelist_label_indices ? std::make_optional>( - (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) - : std::nullopt, - label_offsets); - - auto [unique_label_dst_pair_label_indices, - unique_label_dst_pair_vertices, - unique_label_dst_pair_hops, - unique_label_dst_pair_label_offsets] = - compute_min_hop_for_unique_label_vertex_pairs( - handle, - edgelist_dsts, - edgelist_hops, - edgelist_label_indices ? std::make_optional>( - (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) - : std::nullopt, - label_offsets); - - edgelist_label_indices = std::nullopt; - - if (label_offsets) { - auto num_labels = (*label_offsets).size() - 1; - - rmm::device_uvector renumber_map(0, handle.get_stream()); - rmm::device_uvector renumber_map_label_indices(0, handle.get_stream()); - - renumber_map.reserve( - (*unique_label_src_pair_label_indices).size() + (*unique_label_dst_pair_label_indices).size(), - handle.get_stream()); - renumber_map_label_indices.reserve(renumber_map.capacity(), handle.get_stream()); - - auto num_chunks = (edgelist_srcs.size() + (approx_edges_to_sort_per_iteration - 1)) / - approx_edges_to_sort_per_iteration; - auto chunk_size = (num_chunks > 0) ? 
((num_labels + (num_chunks - 1)) / num_chunks) : 0; - - size_t copy_offset{0}; - for (size_t i = 0; i < num_chunks; ++i) { - auto src_start_offset = - (*unique_label_src_pair_label_offsets).element(chunk_size * i, handle.get_stream()); - auto src_end_offset = - (*unique_label_src_pair_label_offsets) - .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); - auto dst_start_offset = - (*unique_label_dst_pair_label_offsets).element(chunk_size * i, handle.get_stream()); - auto dst_end_offset = - (*unique_label_dst_pair_label_offsets) - .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); - - rmm::device_uvector merged_label_indices( - (src_end_offset - src_start_offset) + (dst_end_offset - dst_start_offset), - handle.get_stream()); - rmm::device_uvector merged_vertices(merged_label_indices.size(), - handle.get_stream()); - rmm::device_uvector merged_flags(merged_label_indices.size(), handle.get_stream()); - - if (edgelist_hops) { - rmm::device_uvector merged_hops(merged_label_indices.size(), handle.get_stream()); - auto src_quad_first = - thrust::make_zip_iterator((*unique_label_src_pair_label_indices).begin(), - unique_label_src_pair_vertices.begin(), - (*unique_label_src_pair_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_quad_first = - thrust::make_zip_iterator((*unique_label_dst_pair_label_indices).begin(), - unique_label_dst_pair_vertices.begin(), - (*unique_label_dst_pair_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - src_quad_first + src_start_offset, - src_quad_first + src_end_offset, - dst_quad_first + dst_start_offset, - dst_quad_first + dst_end_offset, - thrust::make_zip_iterator(merged_label_indices.begin(), - merged_vertices.begin(), - merged_hops.begin(), - merged_flags.begin())); - - auto unique_key_first = - thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); - merged_label_indices.resize( - thrust::distance( - unique_key_first, - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - unique_key_first, - unique_key_first + merged_label_indices.size(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); - merged_hops.resize(merged_label_indices.size(), handle.get_stream()); - merged_flags.resize(merged_label_indices.size(), handle.get_stream()); - auto sort_key_first = thrust::make_zip_iterator( - merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_label_indices.size(), - merged_vertices.begin()); - } else { - auto src_triplet_first = - thrust::make_zip_iterator((*unique_label_src_pair_label_indices).begin(), - unique_label_src_pair_vertices.begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_triplet_first = - thrust::make_zip_iterator((*unique_label_dst_pair_label_indices).begin(), - unique_label_dst_pair_vertices.begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge( - handle.get_thrust_policy(), - src_triplet_first + src_start_offset, - src_triplet_first + src_end_offset, - dst_triplet_first + dst_start_offset, - dst_triplet_first + dst_end_offset, - thrust::make_zip_iterator( - merged_label_indices.begin(), merged_vertices.begin(), merged_flags.begin())); - - auto unique_key_first = - thrust::make_zip_iterator(merged_label_indices.begin(), 
merged_vertices.begin()); - merged_label_indices.resize( - thrust::distance( - unique_key_first, - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_key_first, - unique_key_first + merged_label_indices.size(), - merged_flags.begin()))), - handle.get_stream()); - merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); - merged_flags.resize(merged_label_indices.size(), handle.get_stream()); - auto sort_key_first = - thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_label_indices.size(), - merged_vertices.begin()); - } - - renumber_map.resize(copy_offset + merged_vertices.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - renumber_map.begin() + copy_offset); - renumber_map_label_indices.resize(copy_offset + merged_label_indices.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - merged_label_indices.begin(), - merged_label_indices.end(), - renumber_map_label_indices.begin() + copy_offset); - - copy_offset += merged_vertices.size(); - } - - renumber_map.shrink_to_fit(handle.get_stream()); - renumber_map_label_indices.shrink_to_fit(handle.get_stream()); - - return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_indices)); - } else { - if (edgelist_hops) { - rmm::device_uvector merged_vertices( - unique_label_src_pair_vertices.size() + unique_label_dst_pair_vertices.size(), - handle.get_stream()); - rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); - rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); - auto src_triplet_first = thrust::make_zip_iterator(unique_label_src_pair_vertices.begin(), - (*unique_label_src_pair_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_triplet_first = thrust::make_zip_iterator(unique_label_dst_pair_vertices.begin(), - (*unique_label_dst_pair_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - src_triplet_first, - src_triplet_first + unique_label_src_pair_vertices.size(), - dst_triplet_first, - dst_triplet_first + unique_label_dst_pair_vertices.size(), - thrust::make_zip_iterator( - merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); - - unique_label_src_pair_vertices.resize(0, handle.get_stream()); - unique_label_src_pair_vertices.shrink_to_fit(handle.get_stream()); - unique_label_src_pair_hops = std::nullopt; - unique_label_dst_pair_vertices.resize(0, handle.get_stream()); - unique_label_dst_pair_vertices.shrink_to_fit(handle.get_stream()); - unique_label_dst_pair_hops = std::nullopt; - - merged_vertices.resize( - thrust::distance(merged_vertices.begin(), - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_hops.resize(merged_vertices.size(), handle.get_stream()); - merged_flags.resize(merged_vertices.size(), handle.get_stream()); - - auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); - - return std::make_tuple(std::move(merged_vertices), std::nullopt); - } else { - rmm::device_uvector 
output_vertices(unique_label_dst_pair_vertices.size(), - handle.get_stream()); - auto output_last = thrust::set_difference(handle.get_thrust_policy(), - unique_label_dst_pair_vertices.begin(), - unique_label_dst_pair_vertices.end(), - unique_label_src_pair_vertices.begin(), - unique_label_src_pair_vertices.end(), - output_vertices.begin()); - - auto num_unique_srcs = unique_label_src_pair_vertices.size(); - auto renumber_map = std::move(unique_label_src_pair_vertices); - renumber_map.resize( - renumber_map.size() + thrust::distance(output_vertices.begin(), output_last), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - output_vertices.begin(), - output_last, - renumber_map.begin() + num_unique_srcs); - - return std::make_tuple(std::move(renumber_map), std::nullopt); - } - } -} - -} // namespace - -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - std::optional>> -renumber_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional> edgelist_hops, - std::optional, raft::device_span>> - label_offsets, - bool do_expensive_check) -{ - using label_index_t = uint32_t; - - // 1. check input arguments - - CUGRAPH_EXPECTS(!label_offsets || (std::get<0>(*label_offsets).size() <= - std::numeric_limits::max()), - "Invalid input arguments: current implementation assumes that the number of " - "unique labels is no larger than std::numeric_limits::max()."); - - CUGRAPH_EXPECTS( - edgelist_srcs.size() == edgelist_dsts.size(), - "Invalid input arguments: edgelist_srcs.size() and edgelist_dsts.size() should coincide."); - CUGRAPH_EXPECTS(!edgelist_hops.has_value() || (edgelist_srcs.size() == (*edgelist_hops).size()), - "Invalid input arguments: if edgelist_hops is valid, (*edgelist_hops).size() and " - "edgelist_srcs.size() should coincide."); - CUGRAPH_EXPECTS(!label_offsets.has_value() || - (std::get<1>(*label_offsets).size() == std::get<0>(*label_offsets).size() + 1), - "Invalid input arguments: if label_offsets is valid, " - "std::get<1>(label_offsets).size() (size of the offset array) should be " - "std::get<0>(label_offsets).size() (number of unique labels) + 1."); - - if (do_expensive_check) { - if (label_offsets) { - CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), - std::get<1>(*label_offsets).begin(), - std::get<1>(*label_offsets).end()), - "Invalid input arguments: if label_offsets is valid, " - "std::get<1>(*label_offsets) should be sorted."); - size_t back_element{}; - raft::update_host( - &back_element, - std::get<1>(*label_offsets).data() + (std::get<1>(*label_offsets).size() - 1), - size_t{1}, - handle.get_stream()); - handle.get_stream(); - CUGRAPH_EXPECTS(back_element == edgelist_srcs.size(), - "Invalid input arguments: if label_offsets is valid, the last element of " - "std::get<1>(*label_offsets) and edgelist_srcs.size() should coincide."); - } - } - - // 2. compute renumber_map - - auto [renumber_map, renumber_map_label_indices] = compute_renumber_map( - handle, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()), - edgelist_hops, - label_offsets ? std::make_optional>(std::get<1>(*label_offsets)) - : std::nullopt); - - // 3. 
compute renumber map offsets for each label - - std::optional> renumber_map_label_offsets{}; - if (label_offsets) { - auto num_unique_labels = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator((*renumber_map_label_indices).size()), - detail::is_first_in_run_t{(*renumber_map_label_indices).data()}); - rmm::device_uvector unique_label_indices(num_unique_labels, handle.get_stream()); - rmm::device_uvector vertex_counts(num_unique_labels, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - (*renumber_map_label_indices).begin(), - (*renumber_map_label_indices).end(), - thrust::make_constant_iterator(size_t{1}), - unique_label_indices.begin(), - vertex_counts.begin()); - - renumber_map_label_offsets = - rmm::device_uvector(std::get<0>(*label_offsets).size() + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - (*renumber_map_label_offsets).begin(), - (*renumber_map_label_offsets).end(), - size_t{0}); - thrust::scatter(handle.get_thrust_policy(), - vertex_counts.begin(), - vertex_counts.end(), - unique_label_indices.begin(), - (*renumber_map_label_offsets).begin() + 1); - - thrust::inclusive_scan(handle.get_thrust_policy(), - (*renumber_map_label_offsets).begin(), - (*renumber_map_label_offsets).end(), - (*renumber_map_label_offsets).begin()); - } - - // 4. renumber input edges - - if (label_offsets) { - rmm::device_uvector new_vertices(renumber_map.size(), handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - new_vertices.begin(), - new_vertices.end(), - [label_indices = raft::device_span( - (*renumber_map_label_indices).data(), (*renumber_map_label_indices).size()), - renumber_map_label_offsets = raft::device_span( - (*renumber_map_label_offsets).data(), - (*renumber_map_label_offsets).size())] __device__(size_t i) { - auto label_index = label_indices[i]; - auto label_start_offset = renumber_map_label_offsets[label_index]; - return static_cast(i - label_start_offset); - }); - - (*renumber_map_label_indices).resize(0, handle.get_stream()); - (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); - - auto num_labels = std::get<0>(*label_offsets).size(); - - rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), - handle.get_stream()); - rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), - handle.get_stream()); - - rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - - auto approx_edges_to_sort_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * - (1 << 20) /* tuning parameter */; // for segmented sort - - auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks( - handle, - raft::device_span{(*renumber_map_label_offsets).data(), - (*renumber_map_label_offsets).size()}, - renumber_map.size(), - approx_edges_to_sort_per_iteration); - auto num_chunks = h_label_offsets.size() - 1; - - for (size_t i = 0; i < num_chunks; ++i) { - size_t tmp_storage_bytes{0}; - - auto offset_first = - thrust::make_transform_iterator((*renumber_map_label_offsets).data() + h_label_offsets[i], - detail::shift_left_t{h_edge_offsets[i]}); - cub::DeviceSegmentedSort::SortPairs(static_cast(nullptr), - tmp_storage_bytes, - renumber_map.begin() + h_edge_offsets[i], - segment_sorted_renumber_map.begin() + h_edge_offsets[i], - new_vertices.begin() + h_edge_offsets[i], - segment_sorted_new_vertices.begin() + h_edge_offsets[i], - h_edge_offsets[i + 1] - h_edge_offsets[i], - 
h_label_offsets[i + 1] - h_label_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); - - if (tmp_storage_bytes > d_tmp_storage.size()) { - d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); - } - - cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), - tmp_storage_bytes, - renumber_map.begin() + h_edge_offsets[i], - segment_sorted_renumber_map.begin() + h_edge_offsets[i], - new_vertices.begin() + h_edge_offsets[i], - segment_sorted_new_vertices.begin() + h_edge_offsets[i], - h_edge_offsets[i + 1] - h_edge_offsets[i], - h_label_offsets[i + 1] - h_label_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); - } - new_vertices.resize(0, handle.get_stream()); - d_tmp_storage.resize(0, handle.get_stream()); - new_vertices.shrink_to_fit(handle.get_stream()); - d_tmp_storage.shrink_to_fit(handle.get_stream()); - - auto edgelist_label_indices = detail::expand_sparse_offsets( - std::get<1>(*label_offsets), label_index_t{0}, handle.get_stream()); - - auto pair_first = - thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_label_indices.begin()); - thrust::transform( - handle.get_thrust_policy(), - pair_first, - pair_first + edgelist_srcs.size(), - edgelist_srcs.begin(), - [renumber_map_label_offsets = raft::device_span( - (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), - old_vertices = raft::device_span(segment_sorted_renumber_map.data(), - segment_sorted_renumber_map.size()), - new_vertices = raft::device_span( - segment_sorted_new_vertices.data(), - segment_sorted_new_vertices.size())] __device__(auto pair) { - auto old_vertex = thrust::get<0>(pair); - auto label_index = thrust::get<1>(pair); - auto label_start_offset = renumber_map_label_offsets[label_index]; - auto label_end_offset = renumber_map_label_offsets[label_index + 1]; - auto it = thrust::lower_bound(thrust::seq, - old_vertices.begin() + label_start_offset, - old_vertices.begin() + label_end_offset, - old_vertex); - assert(*it == old_vertex); - return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); - }); - - pair_first = thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_label_indices.begin()); - thrust::transform( - handle.get_thrust_policy(), - pair_first, - pair_first + edgelist_dsts.size(), - edgelist_dsts.begin(), - [renumber_map_label_offsets = raft::device_span( - (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), - old_vertices = raft::device_span(segment_sorted_renumber_map.data(), - segment_sorted_renumber_map.size()), - new_vertices = raft::device_span( - segment_sorted_new_vertices.data(), - segment_sorted_new_vertices.size())] __device__(auto pair) { - auto old_vertex = thrust::get<0>(pair); - auto label_index = thrust::get<1>(pair); - auto label_start_offset = renumber_map_label_offsets[label_index]; - auto label_end_offset = renumber_map_label_offsets[label_index + 1]; - auto it = thrust::lower_bound(thrust::seq, - old_vertices.begin() + label_start_offset, - old_vertices.begin() + label_end_offset, - old_vertex); - assert(*it == old_vertex); - return new_vertices[thrust::distance(old_vertices.begin(), it)]; - }); - } else { - kv_store_t kv_store(renumber_map.begin(), - renumber_map.end(), - thrust::make_counting_iterator(vertex_t{0}), - std::numeric_limits::max(), - std::numeric_limits::max(), - handle.get_stream()); - auto kv_store_view = kv_store.view(); - - kv_store_view.find( - edgelist_srcs.begin(), edgelist_srcs.end(), edgelist_srcs.begin(), 
handle.get_stream()); - kv_store_view.find( - edgelist_dsts.begin(), edgelist_dsts.end(), edgelist_dsts.begin(), handle.get_stream()); - } - - return std::make_tuple(std::move(edgelist_srcs), - std::move(edgelist_dsts), - std::move(renumber_map), - std::move(renumber_map_label_offsets)); -} - -} // namespace cugraph diff --git a/cpp/src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu b/cpp/src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu deleted file mode 100644 index dee28c593ad..00000000000 --- a/cpp/src/sampling/renumber_sampled_edgelist_sg_v32_e32.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "renumber_sampled_edgelist_impl.cuh" - -#include - -// FIXME: deprecated, to be deleted -namespace cugraph { - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - std::optional>> -renumber_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional> edgelist_hops, - std::optional, raft::device_span>> - label_offsets, - bool do_expensive_check); - -} // namespace cugraph diff --git a/cpp/src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu b/cpp/src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu deleted file mode 100644 index 99293c68f0c..00000000000 --- a/cpp/src/sampling/renumber_sampled_edgelist_sg_v64_e64.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
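The two .cu files deleted here existed only to explicitly instantiate the deprecated template for the 32-bit and 64-bit vertex/edge type combinations. A minimal sketch of that explicit-instantiation pattern (the function and file names are hypothetical; in the real tree the template body lives in an _impl.cuh header and each tiny .cu is its own translation unit, which keeps per-file compile times short):

// renumber_impl.cuh: template definition, included by every instantiation file
template <typename vertex_t>
vertex_t add_one(vertex_t v) { return v + 1; }

// renumber_sg_v32.cu: explicit instantiation for 32-bit vertex IDs
template int add_one<int>(int);

// renumber_sg_v64.cu: explicit instantiation for 64-bit vertex IDs
template long add_one<long>(long);

int main() { return add_one<int>(41) == 42 ? 0 : 1; }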
- */ - -#include "renumber_sampled_edgelist_impl.cuh" - -#include - -// FIXME: deprecated, to be deleted -namespace cugraph { - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - std::optional>> -renumber_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional> edgelist_hops, - std::optional, raft::device_span>> - label_offsets, - bool do_expensive_check); - -} // namespace cugraph diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp index 45cc1e54cb4..2c5658b32a5 100644 --- a/cpp/src/sampling/rw_traversals.hpp +++ b/cpp/src/sampling/rw_traversals.hpp @@ -18,8 +18,6 @@ // #pragma once -#include "utilities/graph_utils.cuh" - #include #include diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index b0b3bb5f4f2..4624e6d4a5e 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -49,9 +49,10 @@ namespace cugraph { namespace { -template +template struct edge_order_t { thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_edge_types{thrust::nullopt}; thrust::optional> edgelist_hops{thrust::nullopt}; raft::device_span edgelist_majors{}; raft::device_span edgelist_minors{}; @@ -72,6 +73,12 @@ struct edge_order_t { if (l_label != r_label) { return l_label < r_label; } } + if (edgelist_edge_types) { + auto l_type = (*edgelist_edge_types)[l_idx]; + auto r_type = (*edgelist_edge_types)[r_idx]; + if (l_type != r_type) { return l_type < r_type; } + } + if (edgelist_hops) { auto l_hop = (*edgelist_hops)[l_idx]; auto r_hop = (*edgelist_hops)[r_idx]; @@ -151,6 +158,7 @@ struct optionally_compute_label_index_t { template @@ -164,8 +172,11 @@ void check_input_edges(raft::handle_t const& handle, std::optional> seed_vertices, std::optional> seed_vertex_label_offsets, std::optional> edgelist_label_offsets, + std::optional> vertex_type_offsets, size_t num_labels, size_t num_hops, + size_t num_vertex_types, + std::optional num_edge_types, bool do_expensive_check) { CUGRAPH_EXPECTS( @@ -193,6 +204,7 @@ void check_input_edges(raft::handle_t const& handle, "(size of the offset array) should be num_labels + 1."); if (edgelist_majors.size() > 0) { + static_assert(std::is_same_v); CUGRAPH_EXPECTS((num_labels >= 1) && (num_labels <= std::numeric_limits::max()), "Invalid input arguments: num_labels should be a positive integer and the " "current implementation assumes that the number of unique labels is no larger " @@ -209,13 +221,16 @@ void check_input_edges(raft::handle_t const& handle, CUGRAPH_EXPECTS( (num_hops == 1) || edgelist_hops.has_value(), "Invalid input arguments: edgelist_hops.has_value() should be true if num_hops >= 2."); - } else { - CUGRAPH_EXPECTS( - "num_labels == 0", - "Invalid input arguments: num_labels should be 0 if the input edge list is empty."); + + static_assert(std::is_same_v); CUGRAPH_EXPECTS( - "num_hops == 0", - "Invalid input arguments: num_hops should be 0 if the input edge list is empty."); + (num_vertex_types >= 1) && (num_vertex_types <= std::numeric_limits::max()), + "Invalid input arguments: num_vertex_types should be a positive integer and the " + "current implementation assumes that the number of vertex types is no larger " + "than std::numeric_limits::max()."); + CUGRAPH_EXPECTS((num_vertex_types == 1) || vertex_type_offsets.has_value(), + "Invalid input arguments: 
vertex_type_offsets.has_value() should be true if " + "num_vertex_types >= 2."); } CUGRAPH_EXPECTS((!seed_vertices.has_value() && !seed_vertex_label_offsets.has_value()) || @@ -257,6 +272,174 @@ void check_input_edges(raft::handle_t const& handle, "*edgelist_label_offsets and edgelist_(srcs|dsts).size() should coincide."); } + if (edgelist_edge_types && num_edge_types) { + CUGRAPH_EXPECTS( + thrust::count_if(handle.get_thrust_policy(), + (*edgelist_edge_types).begin(), + (*edgelist_edge_types).end(), + [num_edge_types = static_cast(*num_edge_types)] __device__( + edge_type_t edge_type) { return edge_type >= num_edge_types; }) == 0, + "Invalid input arguments: edgelist_edge_type is valid but contains out-of-range edge type " + "values."); + if constexpr (std::is_signed_v) { + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), + (*edgelist_edge_types).begin(), + (*edgelist_edge_types).end(), + [] __device__(edge_type_t edge_type) { + return edge_type < edge_type_t{0}; + }) == 0, + "Invalid input arguments: edgelist_edge_type is valid but contains " + "negative edge type values."); + } + } + + if (vertex_type_offsets) { + CUGRAPH_EXPECTS( + thrust::is_sorted( + handle.get_thrust_policy(), (*vertex_type_offsets).begin(), (*vertex_type_offsets).end()), + "Invalid input arguments: if vertex_type_offsets is valid, " + "*vertex_type_offsets should be sorted."); + vertex_t front_element{}; + raft::update_host( + &front_element, (*vertex_type_offsets).data(), size_t{1}, handle.get_stream()); + vertex_t back_element{}; + raft::update_host(&back_element, + (*vertex_type_offsets).data() + num_vertex_types, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + CUGRAPH_EXPECTS( + front_element == vertex_t{0}, + "Invalid input arguments: if vertex_type_offsets is valid, the first element of " + "*vertex_type_offsets should be 0."); + vertex_t max_v = std::max(thrust::reduce(handle.get_thrust_policy(), + edgelist_majors.begin(), + edgelist_majors.end(), + vertex_t{0}, + thrust::maximum{}), + thrust::reduce(handle.get_thrust_policy(), + edgelist_minors.begin(), + edgelist_minors.end(), + vertex_t{0}, + thrust::maximum{})); + CUGRAPH_EXPECTS( + back_element > max_v, + "Invalid input arguments: if vertex_type_offsets is valid, the last element of " + "*vertex_type_offsets should be larger than the maximum vertex ID in edgelist_majors & " + "edgelist_minors."); + + rmm::device_uvector tmp_majors(edgelist_majors.size(), handle.get_stream()); + rmm::device_uvector tmp_minors(edgelist_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors.begin(), + edgelist_majors.end(), + tmp_majors.begin()); + thrust::copy(handle.get_thrust_policy(), + edgelist_minors.begin(), + edgelist_minors.end(), + tmp_minors.begin()); + if (edgelist_edge_types) { + rmm::device_uvector tmp_edge_types((*edgelist_edge_types).size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*edgelist_edge_types).begin(), + (*edgelist_edge_types).end(), + tmp_edge_types.begin()); + auto triplet_first = + thrust::make_zip_iterator(tmp_edge_types.begin(), tmp_majors.begin(), tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + tmp_majors.size()); + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [vertex_type_offsets = *vertex_type_offsets, triplet_first] __device__(size_t i) { + if (i > 0) { + auto prev = 
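A host-side sketch of the vertex_type_offsets sanity checks being added here, with std::vector stand-ins for the device spans: the offsets must be sorted, start at zero, and end past the largest vertex ID referenced by the edge list.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> vertex_type_offsets{0, 4, 9};
  std::vector<int> edgelist_majors{1, 5, 8};
  std::vector<int> edgelist_minors{0, 4, 2};

  assert(std::is_sorted(vertex_type_offsets.begin(), vertex_type_offsets.end()));
  assert(vertex_type_offsets.front() == 0);

  int max_v = std::max(*std::max_element(edgelist_majors.begin(), edgelist_majors.end()),
                       *std::max_element(edgelist_minors.begin(), edgelist_minors.end()));
  assert(vertex_type_offsets.back() > max_v);  // every vertex ID falls inside some type range
  return 0;
}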
*(triplet_first + i - 1); + auto cur = *(triplet_first + i); + if (thrust::get<0>(prev) == thrust::get<0>(cur)) { // same edge type + auto prev_major_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<1>(prev))); + auto cur_major_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<1>(cur))); + if (prev_major_v_type != cur_major_v_type) { return true; } + auto prev_minor_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<2>(prev))); + auto cur_minor_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<2>(cur))); + if (prev_minor_v_type != cur_minor_v_type) { return true; } + } + } + return false; + }) == 0, + "Invalid input arguments: if vertex_type_offsets and edgelist_edge_types are valid, the " + "entire set of input edge source vertices for each edge type should have an identical " + "vertex type, and the entire set of input edge destination vertices for each type should " + "have an identical vertex type."); + } else { + auto pair_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_majors.size()); + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [vertex_type_offsets = *vertex_type_offsets, pair_first] __device__(size_t i) { + if (i > 0) { + auto prev = *(pair_first + i - 1); + auto cur = *(pair_first + i); + auto prev_src_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<0>(prev))); + auto cur_src_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<0>(cur))); + if (prev_src_v_type != cur_src_v_type) { return true; } + auto prev_dst_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<1>(prev))); + auto cur_dst_v_type = + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<1>(cur))); + if (prev_dst_v_type != cur_dst_v_type) { return true; } + } + return false; + }) == 0, + "Invalid input arguments: if vertex_type_offsets is valid (but " + "edgelist_edge_types is invalid), the entire set of input edge source " + "vertices should have an identical vertex type, and the entire set of " + "input edge destination vertices should have an identical vertex type."); + } + } + if (seed_vertices) { for (size_t i = 0; i < num_labels; ++i) { rmm::device_uvector this_label_seed_vertices(0, handle.get_stream()); @@ -356,7 +539,7 @@ compute_min_hop_for_unique_label_vertex_pairs( std::optional> seed_vertex_label_offsets, std::optional> edgelist_label_offsets) { - auto approx_edges_to_sort_per_iteration = + auto approx_items_to_sort_per_iteration 
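These checks, and nearly every hunk that follows, derive a vertex's type by binary-searching the offsets array past its leading zero. A host-side sketch of that idiom: with offsets {0, 4, 9}, vertices [0, 4) are type 0 and [4, 9) are type 1.

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  using vertex_t = int;
  std::vector<vertex_t> vertex_type_offsets{0, 4, 9};  // num_vertex_types + 1 entries

  auto vertex_type = [&](vertex_t v) {
    // skip the leading 0; the first offset strictly greater than v marks v's bucket
    return static_cast<int>(std::distance(
      vertex_type_offsets.begin() + 1,
      std::upper_bound(vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), v)));
  };

  assert(vertex_type(0) == 0);
  assert(vertex_type(3) == 0);
  assert(vertex_type(4) == 1);
  assert(vertex_type(8) == 1);
  return 0;
}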
= static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 18) /* tuning parameter */; // for segmented sort @@ -369,7 +552,7 @@ compute_min_hop_for_unique_label_vertex_pairs( detail::compute_offset_aligned_element_chunks(handle, *edgelist_label_offsets, edgelist_vertices.size(), - approx_edges_to_sort_per_iteration); + approx_items_to_sort_per_iteration); auto num_chunks = h_label_offsets.size() - 1; if (edgelist_hops) { @@ -406,28 +589,28 @@ compute_min_hop_for_unique_label_vertex_pairs( } tmp_indices.resize( - thrust::distance( - tmp_indices.begin(), - thrust::unique(handle.get_thrust_policy(), - tmp_indices.begin(), - tmp_indices.end(), - [edgelist_label_offsets = *edgelist_label_offsets, - edgelist_vertices] __device__(size_t l_idx, size_t r_idx) { - auto l_it = thrust::upper_bound(thrust::seq, - edgelist_label_offsets.begin() + 1, - edgelist_label_offsets.end(), - l_idx); - auto r_it = thrust::upper_bound(thrust::seq, - edgelist_label_offsets.begin() + 1, - edgelist_label_offsets.end(), - r_idx); - if (l_it != r_it) { return false; } - - auto l_vertex = edgelist_vertices[l_idx]; - auto r_vertex = edgelist_vertices[r_idx]; - return l_vertex == r_vertex; - })), + thrust::distance(tmp_indices.begin(), + thrust::unique(handle.get_thrust_policy(), + tmp_indices.begin(), + tmp_indices.end(), + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_vertices] __device__(size_t l_idx, size_t r_idx) { + auto l_it = + thrust::upper_bound(thrust::seq, + edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + l_idx); + auto r_it = + thrust::upper_bound(thrust::seq, + edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + r_idx); + if (l_it != r_it) { return false; } + + auto l_vertex = edgelist_vertices[l_idx]; + auto r_vertex = edgelist_vertices[r_idx]; + return l_vertex == r_vertex; + })), handle.get_stream()); tmp_label_indices.resize(tmp_indices.size(), handle.get_stream()); @@ -859,17 +1042,23 @@ compute_min_hop_for_unique_label_vertex_pairs( } } -template -std::tuple, std::optional>> -compute_renumber_map(raft::handle_t const& handle, - raft::device_span edgelist_majors, - raft::device_span edgelist_minors, - std::optional> edgelist_hops, - std::optional> seed_vertices, - std::optional> seed_vertex_label_offsets, - std::optional> edgelist_label_offsets) +// returns renumber map & optional (label, type) offsets +// indices are non-descending +template +std::tuple, std::optional>> +compute_vertex_renumber_map( + raft::handle_t const& handle, + raft::device_span edgelist_majors, + raft::device_span edgelist_minors, + std::optional> edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + std::optional> vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types) { - auto approx_edges_to_sort_per_iteration = + auto approx_items_to_sort_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 20) /* tuning parameter */; // for segmented sort @@ -892,10 +1081,9 @@ compute_renumber_map(raft::handle_t const& handle, compute_min_hop_for_unique_label_vertex_pairs( handle, edgelist_minors, edgelist_hops, std::nullopt, std::nullopt, edgelist_label_offsets); + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_type_offsets{std::nullopt}; if (edgelist_label_offsets) { - auto num_labels = (*edgelist_label_offsets).size() - 1; - - rmm::device_uvector renumber_map(0,
handle.get_stream()); rmm::device_uvector renumber_map_label_indices(0, handle.get_stream()); renumber_map.reserve((*unique_label_major_pair_label_indices).size() + @@ -903,8 +1091,8 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); renumber_map_label_indices.reserve(renumber_map.capacity(), handle.get_stream()); - auto num_chunks = (edgelist_majors.size() + (approx_edges_to_sort_per_iteration - 1)) / - approx_edges_to_sort_per_iteration; + auto num_chunks = (edgelist_majors.size() + (approx_items_to_sort_per_iteration - 1)) / + approx_items_to_sort_per_iteration; auto chunk_size = (num_chunks > 0) ? ((num_labels + (num_chunks - 1)) / num_chunks) : 0; size_t copy_offset{0}; @@ -963,12 +1151,37 @@ compute_renumber_map(raft::handle_t const& handle, merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); merged_hops.resize(merged_label_indices.size(), handle.get_stream()); merged_flags.resize(merged_label_indices.size(), handle.get_stream()); - auto sort_key_first = thrust::make_zip_iterator( - merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_label_indices.size(), - merged_vertices.begin()); + if (vertex_type_offsets) { + auto quadraplet_first = thrust::make_zip_iterator(merged_label_indices.begin(), + merged_vertices.begin(), + merged_hops.begin(), + merged_flags.begin()); + thrust::sort( + handle.get_thrust_policy(), + quadraplet_first, + quadraplet_first + merged_vertices.size(), + [offsets = *vertex_type_offsets] __device__(auto lhs, auto rhs) { + auto lhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<1>(lhs))); + auto rhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<1>(rhs))); + return thrust::make_tuple( + thrust::get<0>(lhs), lhs_v_type, thrust::get<2>(lhs), thrust::get<3>(lhs)) < + thrust::make_tuple( + thrust::get<0>(rhs), rhs_v_type, thrust::get<2>(rhs), thrust::get<3>(rhs)); + }); + } else { + auto sort_key_first = thrust::make_zip_iterator( + merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } } else { auto major_triplet_first = thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), @@ -999,12 +1212,33 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); merged_flags.resize(merged_label_indices.size(), handle.get_stream()); - auto sort_key_first = - thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_label_indices.size(), - merged_vertices.begin()); + if (vertex_type_offsets) { + auto triplet_first = thrust::make_zip_iterator( + merged_label_indices.begin(), merged_vertices.begin(), merged_flags.begin()); + thrust::sort( + handle.get_thrust_policy(), + triplet_first, + triplet_first + merged_vertices.size(), + [offsets = *vertex_type_offsets] __device__(auto lhs, auto rhs) { + auto lhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<1>(lhs))); + auto 
rhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<1>(rhs))); + return thrust::make_tuple(thrust::get<0>(lhs), lhs_v_type, thrust::get<2>(lhs)) < + thrust::make_tuple(thrust::get<0>(rhs), rhs_v_type, thrust::get<2>(rhs)); + }); + } else { + auto sort_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } } renumber_map.resize(copy_offset + merged_vertices.size(), handle.get_stream()); @@ -1025,7 +1259,41 @@ compute_renumber_map(raft::handle_t const& handle, renumber_map.shrink_to_fit(handle.get_stream()); renumber_map_label_indices.shrink_to_fit(handle.get_stream()); - return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_indices)); + renumber_map_label_type_offsets = + rmm::device_uvector(num_labels * num_vertex_types + 1, handle.get_stream()); + (*renumber_map_label_type_offsets).set_element_to_zero_async(0, handle.get_stream()); + if (vertex_type_offsets) { + auto label_type_pair_first = thrust::make_zip_iterator( + renumber_map_label_indices.begin(), + thrust::make_transform_iterator( + renumber_map.begin(), + cuda::proclaim_return_type( + [offsets = *vertex_type_offsets] __device__(auto v) { + return static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v))); + }))); + auto value_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [num_vertex_types] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_vertex_types), + static_cast(i % num_vertex_types)); + })); + thrust::upper_bound(handle.get_thrust_policy(), + label_type_pair_first, + label_type_pair_first + renumber_map.size(), + value_first, + value_first + (num_labels * num_vertex_types), + (*renumber_map_label_type_offsets).begin() + 1); + } else { + thrust::upper_bound(handle.get_thrust_policy(), + renumber_map_label_indices.begin(), + renumber_map_label_indices.end(), + thrust::make_counting_iterator(label_index_t{0}), + thrust::make_counting_iterator(static_cast(num_labels)), + (*renumber_map_label_type_offsets).begin() + 1); + } } else { if (edgelist_hops) { rmm::device_uvector merged_vertices( @@ -1067,13 +1335,34 @@ compute_renumber_map(raft::handle_t const& handle, merged_hops.resize(merged_vertices.size(), handle.get_stream()); merged_flags.resize(merged_vertices.size(), handle.get_stream()); - auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); + if (vertex_type_offsets) { + auto triplet_first = thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin()); + thrust::sort( + handle.get_thrust_policy(), + triplet_first, + triplet_first + merged_vertices.size(), + [offsets = *vertex_type_offsets] __device__(auto lhs, auto rhs) { + auto lhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<0>(lhs))); + auto rhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<0>(rhs))); + return 
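A host-side sketch of how renumber_map_label_type_offsets is derived here: once the renumber map is sorted by (label, vertex type), the end offset of each flattened bucket (label * num_vertex_types + type) is a single upper_bound over the sorted keys, which is what the zip/transform-iterator thrust::upper_bound above vectorizes.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  size_t num_labels = 2, num_types = 2;
  // (label, type) key per renumber-map entry, already sorted lexicographically
  std::vector<std::pair<int, int>> keys{{0, 0}, {0, 0}, {0, 1}, {1, 1}, {1, 1}};

  std::vector<size_t> offsets(num_labels * num_types + 1, 0);
  for (size_t i = 0; i < num_labels * num_types; ++i) {
    std::pair<int, int> bucket{static_cast<int>(i / num_types),
                               static_cast<int>(i % num_types)};
    offsets[i + 1] =
      std::upper_bound(keys.begin(), keys.end(), bucket) - keys.begin();
  }
  for (auto o : offsets) { std::cout << o << ' '; }  // 0 2 3 3 5 (empty buckets repeat)
  return 0;
}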
thrust::make_tuple(lhs_v_type, thrust::get<1>(lhs), thrust::get<2>(lhs)) < + thrust::make_tuple(rhs_v_type, thrust::get<1>(rhs), thrust::get<2>(rhs)); + }); + } else { + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + } - return std::make_tuple(std::move(merged_vertices), std::nullopt); + renumber_map = std::move(merged_vertices); } else { rmm::device_uvector output_vertices(unique_label_minor_pair_vertices.size(), handle.get_stream()); @@ -1085,7 +1374,7 @@ compute_renumber_map(raft::handle_t const& handle, output_vertices.begin()); auto num_unique_majors = unique_label_major_pair_vertices.size(); - auto renumber_map = std::move(unique_label_major_pair_vertices); + renumber_map = std::move(unique_label_major_pair_vertices); renumber_map.resize( renumber_map.size() + thrust::distance(output_vertices.begin(), output_last), handle.get_stream()); @@ -1094,9 +1383,370 @@ compute_renumber_map(raft::handle_t const& handle, output_last, renumber_map.begin() + num_unique_majors); - return std::make_tuple(std::move(renumber_map), std::nullopt); + if (vertex_type_offsets) { + thrust::stable_sort( + handle.get_thrust_policy(), + renumber_map.begin(), + renumber_map.end(), + [offsets = *vertex_type_offsets] __device__(auto lhs, auto rhs) { + auto lhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<0>(lhs))); + auto rhs_v_type = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, offsets.begin() + 1, offsets.end(), thrust::get<0>(rhs))); + return lhs_v_type < rhs_v_type; + }); + } + } + + if (vertex_type_offsets) { + renumber_map_label_type_offsets = + rmm::device_uvector(num_vertex_types + 1, handle.get_stream()); + (*renumber_map_label_type_offsets).set_element_to_zero_async(0, handle.get_stream()); + auto type_first = thrust::make_transform_iterator( + renumber_map.begin(), + cuda::proclaim_return_type( + [offsets = *vertex_type_offsets] __device__(auto v) { + return static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v))); + })); + thrust::upper_bound( + handle.get_thrust_policy(), + type_first, + type_first + renumber_map.size(), + thrust::make_counting_iterator(vertex_type_t{0}), + thrust::make_counting_iterator(static_cast(num_vertex_types)), + (*renumber_map_label_type_offsets).begin() + 1); + } + } + + return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_type_offsets)); +} + +// returns renumber map & optional (label, type) offsets +template +std::tuple, std::optional>> +compute_edge_id_renumber_map( + raft::handle_t const& handle, + raft::device_span edgelist_edge_ids, + std::optional> edgelist_edge_types, + std::optional> edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_edge_types) +{ + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_type_offsets{std::nullopt}; + if (edgelist_label_offsets) { + auto approx_items_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + auto [h_label_offsets, h_edge_offsets] = + detail::compute_offset_aligned_element_chunks(handle, + *edgelist_label_offsets, + edgelist_edge_ids.size(), + 
approx_items_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + rmm::device_uvector tmp_indices(edgelist_edge_ids.size(), handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), tmp_indices.begin(), tmp_indices.end(), size_t{0}); + + // cub::DeviceSegmentedSort currently does not support thrust::tuple type keys; sorting in + chunks still helps in limiting the binary search range and improving memory locality + for (size_t i = 0; i < num_chunks; ++i) { + // sort by (label, (type), id, (hop)) + + thrust::sort( + handle.get_thrust_policy(), + tmp_indices.begin() + h_edge_offsets[i], + tmp_indices.begin() + h_edge_offsets[i + 1], + [edgelist_label_offsets = + raft::device_span((*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1), + edgelist_edge_types = detail::to_thrust_optional(edgelist_edge_types), + edgelist_edge_ids, + edgelist_hops = detail::to_thrust_optional(edgelist_hops)] __device__(size_t l_idx, + size_t r_idx) { + auto l_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), l_idx); + auto r_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), r_idx); + if (l_it != r_it) { return l_it < r_it; } + + if (edgelist_edge_types) { + auto l_type = (*edgelist_edge_types)[l_idx]; + auto r_type = (*edgelist_edge_types)[r_idx]; + if (l_type != r_type) { return l_type < r_type; } + } + + auto l_id = edgelist_edge_ids[l_idx]; + auto r_id = edgelist_edge_ids[r_idx]; + if (l_id != r_id) { return l_id < r_id; } + + if (edgelist_hops) { + auto l_hop = (*edgelist_hops)[l_idx]; + auto r_hop = (*edgelist_hops)[r_idx]; + return l_hop < r_hop; + } + + return false; + }); + + // find unique (label, (type), id, (min_hop)) tuples + + auto last = thrust::unique( + handle.get_thrust_policy(), + tmp_indices.begin() + h_edge_offsets[i], + tmp_indices.begin() + h_edge_offsets[i + 1], + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_edge_types = detail::to_thrust_optional(edgelist_edge_types), + edgelist_edge_ids] __device__(size_t l_idx, size_t r_idx) { + auto l_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), l_idx); + auto r_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), r_idx); + if (l_it != r_it) { return false; } + + if (edgelist_edge_types) { + auto l_type = (*edgelist_edge_types)[l_idx]; + auto r_type = (*edgelist_edge_types)[r_idx]; + if (l_type != r_type) { return false; } + } + + auto l_id = edgelist_edge_ids[l_idx]; + auto r_id = edgelist_edge_ids[r_idx]; + return l_id == r_id; + }); + + // sort by (label, (type), (min_hop), id) + + if (edgelist_hops) { + thrust::sort( + handle.get_thrust_policy(), + tmp_indices.begin() + h_edge_offsets[i], + last, + [edgelist_label_offsets = + raft::device_span((*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1), + edgelist_edge_types = detail::to_thrust_optional(edgelist_edge_types), + edgelist_edge_ids, + edgelist_hops = detail::to_thrust_optional(edgelist_hops)] __device__(size_t l_idx, + size_t r_idx) { + auto l_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), l_idx); + auto r_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), r_idx); + if (l_it != r_it) { return
l_it < r_it; } + + if (edgelist_edge_types) { + auto l_type = (*edgelist_edge_types)[l_idx]; + auto r_type = (*edgelist_edge_types)[r_idx]; + if (l_type != r_type) { return l_type < r_type; } + } + + if (edgelist_hops) { + auto l_hop = (*edgelist_hops)[l_idx]; + auto r_hop = (*edgelist_hops)[r_idx]; + return l_hop < r_hop; + } + + auto l_id = edgelist_edge_ids[l_idx]; + auto r_id = edgelist_edge_ids[r_idx]; + if (l_id != r_id) { return l_id < r_id; } + + return false; + }); + } + + // mark invalid indices + + thrust::fill(handle.get_thrust_policy(), + last, + tmp_indices.begin() + h_edge_offsets[i + 1], + std::numeric_limits::max()); + } + + tmp_indices.resize(thrust::distance(tmp_indices.begin(), + thrust::remove(handle.get_thrust_policy(), + tmp_indices.begin(), + tmp_indices.end(), + std::numeric_limits::max())), + handle.get_stream()); + + renumber_map = rmm::device_uvector(tmp_indices.size(), handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + tmp_indices.begin(), + tmp_indices.end(), + edgelist_edge_ids.begin(), + renumber_map.begin()); + + renumber_map_label_type_offsets = + rmm::device_uvector(num_labels * num_edge_types + 1, handle.get_stream()); + (*renumber_map_label_type_offsets).set_element_to_zero_async(0, handle.get_stream()); + if (edgelist_edge_types) { + auto label_type_pair_first = thrust::make_transform_iterator( + tmp_indices.begin(), + cuda::proclaim_return_type>( + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_edge_types = *edgelist_edge_types] __device__(size_t i) { + auto label_idx = thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i)); + return thrust::make_tuple(static_cast(label_idx), + edgelist_edge_types[i]); + })); + auto value_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [num_edge_types] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_edge_types), + static_cast(i % num_edge_types)); + })); + thrust::upper_bound(handle.get_thrust_policy(), + label_type_pair_first, + label_type_pair_first + renumber_map.size(), + value_first, + value_first + (num_labels * num_edge_types), + (*renumber_map_label_type_offsets).begin() + 1); + } else { + auto label_first = thrust::make_transform_iterator( + tmp_indices.begin(), + cuda::proclaim_return_type( + [edgelist_label_offsets = *edgelist_label_offsets] __device__(size_t i) { + auto label_idx = thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i)); + return static_cast(label_idx); + })); + auto value_first = thrust::make_counting_iterator(label_index_t{0}); + thrust::upper_bound(handle.get_thrust_policy(), + label_first, + label_first + renumber_map.size(), + value_first, + value_first + num_labels, + (*renumber_map_label_type_offsets).begin() + 1); + } + } else { + // copy + + std::optional> tmp_types{std::nullopt}; + if (edgelist_edge_types) { + tmp_types = + rmm::device_uvector((*edgelist_edge_types).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*edgelist_edge_types).begin(), + (*edgelist_edge_types).end(), + (*tmp_types).begin()); + } + rmm::device_uvector tmp_ids(edgelist_edge_ids.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_edge_ids.begin(), + edgelist_edge_ids.end(), + tmp_ids.begin()); + std::optional> 
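A host-side sketch of the sentinel trick used just above: per-chunk leftovers past the unique range are overwritten with the maximum representable index, so a single remove() pass afterwards compacts all chunks at once.

#include <algorithm>
#include <cassert>
#include <limits>
#include <vector>

int main() {
  constexpr size_t kInvalid = std::numeric_limits<size_t>::max();
  std::vector<size_t> indices{0, 2, kInvalid, 5, kInvalid};  // two dead slots from two chunks

  indices.erase(std::remove(indices.begin(), indices.end(), kInvalid),
                indices.end());

  assert((indices == std::vector<size_t>{0, 2, 5}));
  return 0;
}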
tmp_hops{std::nullopt}; + if (edgelist_hops) { + tmp_hops = rmm::device_uvector((*edgelist_hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*edgelist_hops).begin(), + (*edgelist_hops).end(), + (*tmp_hops).begin()); + } + + // sort by ((type), id, (hop)) + + if (tmp_types) { + if (tmp_hops) { + auto triplet_first = + thrust::make_zip_iterator((*tmp_types).begin(), tmp_ids.begin(), (*tmp_hops).begin()); + thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + tmp_ids.size()); + } else { + auto pair_first = thrust::make_zip_iterator((*tmp_types).begin(), tmp_ids.begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_ids.size()); + } + } else { + if (tmp_hops) { + auto pair_first = thrust::make_zip_iterator(tmp_ids.begin(), (*tmp_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_ids.size()); + } else { + thrust::sort(handle.get_thrust_policy(), tmp_ids.begin(), tmp_ids.end()); + } + } + + // find unique ((type), id, (min_hop)) tuples + + if (tmp_types) { + auto pair_first = thrust::make_zip_iterator((*tmp_types).begin(), tmp_ids.begin()); + if (tmp_hops) { + tmp_ids.resize( + thrust::distance(pair_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + pair_first, + pair_first + tmp_ids.size(), + (*tmp_hops).begin()))), + handle.get_stream()); + (*tmp_hops).resize(tmp_ids.size(), handle.get_stream()); + } else { + tmp_ids.resize( + thrust::distance( + pair_first, + thrust::unique(handle.get_thrust_policy(), pair_first, pair_first + tmp_ids.size())), + handle.get_stream()); + } + (*tmp_types).resize(tmp_ids.size(), handle.get_stream()); + } else { + if (tmp_hops) { + tmp_ids.resize( + thrust::distance( + tmp_ids.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), tmp_ids.begin(), tmp_ids.end(), (*tmp_hops).begin()))), + handle.get_stream()); + (*tmp_hops).resize(tmp_ids.size(), handle.get_stream()); + } else { + tmp_ids.resize( + thrust::distance( + tmp_ids.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_ids.begin(), tmp_ids.end())), + handle.get_stream()); + } + } + + // sort by ((type), (min_hop), id) + + if (tmp_hops) { + if (tmp_types) { + auto triplet_first = + thrust::make_zip_iterator((*tmp_types).begin(), (*tmp_hops).begin(), tmp_ids.begin()); + thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + tmp_ids.size()); + } else { + auto pair_first = thrust::make_zip_iterator((*tmp_hops).begin(), tmp_ids.begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_ids.size()); + } + } + + renumber_map = std::move(tmp_ids); + + if (tmp_types) { + renumber_map_label_type_offsets = + rmm::device_uvector(num_edge_types + 1, handle.get_stream()); + (*renumber_map_label_type_offsets).set_element_to_zero_async(0, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + (*tmp_types).begin(), + (*tmp_types).end(), + thrust::make_counting_iterator(edge_type_t{0}), + thrust::make_counting_iterator(static_cast(num_edge_types)), + (*renumber_map_label_type_offsets).begin() + 1); } } + + return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_type_offsets)); } // this function does not reorder edges (the i'th returned edge is the renumbered output of the @@ -1117,74 +1767,45 @@ renumber_sampled_edgelist(raft::handle_t const& handle, size_t num_labels, bool do_expensive_check) { - // 1. 
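A host-side sketch of the sort/unique sequence this branch performs: sorting with hop as the last key puts the minimum-hop occurrence first within each (type, id) group, so a unique() that compares only (type, id) keeps exactly that minimum-hop copy.

#include <algorithm>
#include <cassert>
#include <tuple>
#include <vector>

int main() {
  // (edge type, edge id, hop)
  std::vector<std::tuple<int, long, int>> edges{
    {0, 7, 2}, {0, 7, 1}, {1, 7, 3}, {0, 3, 0}};

  std::sort(edges.begin(), edges.end());  // (type, id, hop) ascending
  auto last = std::unique(edges.begin(), edges.end(),
                          [](auto const& l, auto const& r) {
                            return std::get<0>(l) == std::get<0>(r) &&
                                   std::get<1>(l) == std::get<1>(r);  // hop ignored
                          });
  edges.erase(last, edges.end());

  // kept: (0,3,0), (0,7,1), (1,7,3) -- the minimum hop survives per (type, id)
  assert(edges.size() == 3 && std::get<2>(edges[1]) == 1);
  return 0;
}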
compute renumber_map + using vertex_type_t = uint32_t; // dummy - auto [renumber_map, renumber_map_label_indices] = compute_renumber_map( - handle, - raft::device_span(edgelist_majors.data(), edgelist_majors.size()), - raft::device_span(edgelist_minors.data(), edgelist_minors.size()), - edgelist_hops, - seed_vertices ? std::make_optional>((*seed_vertices).data(), - (*seed_vertices).size()) - : std::nullopt, - seed_vertex_label_offsets, - edgelist_label_offsets); - - // 2. compute renumber map offsets for each label + // 1. compute renumber_map - std::optional> renumber_map_label_offsets{}; - if (edgelist_label_offsets) { - auto num_unique_labels = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator((*renumber_map_label_indices).size()), - detail::is_first_in_run_t{(*renumber_map_label_indices).data()}); - rmm::device_uvector unique_label_indices(num_unique_labels, handle.get_stream()); - rmm::device_uvector vertex_counts(num_unique_labels, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - (*renumber_map_label_indices).begin(), - (*renumber_map_label_indices).end(), - thrust::make_constant_iterator(size_t{1}), - unique_label_indices.begin(), - vertex_counts.begin()); - - renumber_map_label_offsets = rmm::device_uvector(num_labels + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - (*renumber_map_label_offsets).begin(), - (*renumber_map_label_offsets).end(), - size_t{0}); - thrust::scatter(handle.get_thrust_policy(), - vertex_counts.begin(), - vertex_counts.end(), - unique_label_indices.begin(), - (*renumber_map_label_offsets).begin() + 1); - - thrust::inclusive_scan(handle.get_thrust_policy(), - (*renumber_map_label_offsets).begin(), - (*renumber_map_label_offsets).end(), - (*renumber_map_label_offsets).begin()); - } + auto [renumber_map, renumber_map_label_offsets] = + compute_vertex_renumber_map( + handle, + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + raft::device_span(edgelist_minors.data(), edgelist_minors.size()), + edgelist_hops, + seed_vertices ? std::make_optional>((*seed_vertices).data(), + (*seed_vertices).size()) + : std::nullopt, + seed_vertex_label_offsets, + edgelist_label_offsets, + std::nullopt, + num_labels, + size_t{1}); - // 3. renumber input edges + // 2. 
renumber input edges if (edgelist_label_offsets) { rmm::device_uvector new_vertices(renumber_map.size(), handle.get_stream()); thrust::tabulate(handle.get_thrust_policy(), new_vertices.begin(), new_vertices.end(), - [label_indices = raft::device_span( - (*renumber_map_label_indices).data(), (*renumber_map_label_indices).size()), - renumber_map_label_offsets = raft::device_span( + [renumber_map_label_offsets = raft::device_span( (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size())] __device__(size_t i) { - auto label_index = label_indices[i]; + auto label_index = static_cast(thrust::distance( + renumber_map_label_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + renumber_map_label_offsets.begin() + 1, + renumber_map_label_offsets.end(), + i))); auto label_start_offset = renumber_map_label_offsets[label_index]; return static_cast(i - label_start_offset); }); - (*renumber_map_label_indices).resize(0, handle.get_stream()); - (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); - rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), handle.get_stream()); rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), @@ -1192,7 +1813,7 @@ renumber_sampled_edgelist(raft::handle_t const& handle, rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - auto approx_edges_to_sort_per_iteration = + auto approx_items_to_sort_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 20) /* tuning parameter */; // for segmented sort @@ -1201,7 +1822,7 @@ renumber_sampled_edgelist(raft::handle_t const& handle, raft::device_span{(*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()}, renumber_map.size(), - approx_edges_to_sort_per_iteration); + approx_items_to_sort_per_iteration); auto num_chunks = h_label_offsets.size() - 1; for (size_t i = 0; i < num_chunks; ++i) { @@ -1369,6 +1990,455 @@ renumber_sampled_edgelist(raft::handle_t const& handle, std::move(renumber_map_label_offsets)); } +// this function does not reorder edges (the i'th returned edge is the renumbered output of the +// i'th input edge) +template +std::tuple< + rmm::device_uvector, // edgelist_majors + rmm::device_uvector, // edgelist minors + std::optional>, // edgelist edge IDs + std::optional>, // seed_vertices, + rmm::device_uvector, // vertex renumber_map + rmm::device_uvector, // vertex renumber_map (label, vertex type) offsets + std::optional>, // edge ID renumber map + std::optional>> // edge ID renumber map (label, edge type) offsets +heterogeneous_renumber_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional> edgelist_hops, + std::optional>&& seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + bool do_expensive_check) +{ + // 1. compute vertex renumber map + + auto [vertex_renumber_map, vertex_renumber_map_label_type_offsets] = + compute_vertex_renumber_map( + handle, + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + raft::device_span(edgelist_minors.data(), edgelist_minors.size()), + edgelist_hops, + seed_vertices ? 
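A host-side sketch of the tabulate lambda above: position i within the renumber map becomes a per-label new vertex ID by locating i's bucket in the offsets array and subtracting the bucket's start offset.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <vector>

int main() {
  std::vector<size_t> offsets{0, 3, 5};  // label 0 -> [0, 3), label 1 -> [3, 5)
  std::vector<int> new_ids(5);
  for (size_t i = 0; i < new_ids.size(); ++i) {
    auto label = std::distance(
      offsets.begin() + 1,
      std::upper_bound(offsets.begin() + 1, offsets.end(), i));
    new_ids[i] = static_cast<int>(i - offsets[label]);  // ID restarts at 0 per label
  }
  for (auto v : new_ids) { std::cout << v << ' '; }  // 0 1 2 0 1
  return 0;
}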
std::make_optional>((*seed_vertices).data(), + (*seed_vertices).size()) + : std::nullopt, + seed_vertex_label_offsets, + edgelist_label_offsets, + std::make_optional(vertex_type_offsets), + num_labels, + num_vertex_types); + assert(vertex_renumber_map_label_type_offsets.has_value()); + + // 2. compute edge renumber map + + std::optional> edge_id_renumber_map{std::nullopt}; + std::optional> edge_id_renumber_map_label_type_offsets{std::nullopt}; + if (edgelist_edge_ids) { + std::tie(edge_id_renumber_map, edge_id_renumber_map_label_type_offsets) = + compute_edge_id_renumber_map( + handle, + raft::device_span((*edgelist_edge_ids).data(), + (*edgelist_edge_ids).size()), + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + num_labels, + num_edge_types); + } + + auto approx_items_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + // 3. renumber input edge source/destination vertices + + { + rmm::device_uvector new_vertices(vertex_renumber_map.size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + new_vertices.begin(), + new_vertices.end(), + [renumber_map_label_type_offsets = raft::device_span( + (*vertex_renumber_map_label_type_offsets).data(), + (*vertex_renumber_map_label_type_offsets).size())] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + renumber_map_label_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + renumber_map_label_type_offsets.begin() + 1, + renumber_map_label_type_offsets.end(), + i))); + auto start_offset = renumber_map_label_type_offsets[idx]; + return static_cast(i - start_offset); + }); + + rmm::device_uvector segment_sorted_vertex_renumber_map(vertex_renumber_map.size(), + handle.get_stream()); + rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span{(*vertex_renumber_map_label_type_offsets).data(), + (*vertex_renumber_map_label_type_offsets).size()}, + vertex_renumber_map.size(), + approx_items_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = thrust::make_transform_iterator( + (*vertex_renumber_map_label_type_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + vertex_renumber_map.begin() + h_edge_offsets[i], + segment_sorted_vertex_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + vertex_renumber_map.begin() + h_edge_offsets[i], + segment_sorted_vertex_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + 
offset_first, + offset_first + 1, + handle.get_stream()); + } + + new_vertices.resize(0, handle.get_stream()); + new_vertices.shrink_to_fit(handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(edgelist_majors.begin(), thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_majors.size(), + edgelist_majors.begin(), + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + vertex_type_offsets, + renumber_map_label_type_offsets = + raft::device_span((*vertex_renumber_map_label_type_offsets).data(), + (*vertex_renumber_map_label_type_offsets).size()), + old_vertices = raft::device_span(segment_sorted_vertex_renumber_map.data(), + segment_sorted_vertex_renumber_map.size()), + new_vertices = raft::device_span(segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size()), + num_vertex_types] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + label_index_t label_idx{0}; + if (edgelist_label_offsets) { + label_idx = static_cast( + thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + thrust::get<1>(pair)))); + } + auto v_type = static_cast(thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), old_vertex))); + auto start_offset = renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type]; + auto end_offset = + renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + start_offset, + old_vertices.begin() + end_offset, + old_vertex); + assert(*it == old_vertex); + return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); + }); + + pair_first = + thrust::make_zip_iterator(edgelist_minors.begin(), thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_minors.size(), + edgelist_minors.begin(), + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + vertex_type_offsets, + renumber_map_label_type_offsets = + raft::device_span((*vertex_renumber_map_label_type_offsets).data(), + (*vertex_renumber_map_label_type_offsets).size()), + old_vertices = raft::device_span(segment_sorted_vertex_renumber_map.data(), + segment_sorted_vertex_renumber_map.size()), + new_vertices = raft::device_span(segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size()), + num_vertex_types] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + label_index_t label_idx{0}; + if (edgelist_label_offsets) { + label_idx = static_cast( + thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + thrust::get<1>(pair)))); + } + auto v_type = static_cast(thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), old_vertex))); + auto start_offset = renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type]; + auto end_offset = + renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + start_offset, + old_vertices.begin() + end_offset, + 
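A host-side sketch of the endpoint lookup in the transform above: each old vertex is binary-searched (lower_bound) only inside its own (label, vertex type) segment of the segment-sorted renumber map, and the hit's position indexes the parallel new-ID array.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> old_vertices{4, 9, 2, 7};  // sorted within segments [0, 2) and [2, 4)
  std::vector<int> new_ids{0, 1, 0, 1};       // parallel to old_vertices
  std::vector<size_t> seg_offsets{0, 2, 4};   // one segment per (label, vertex type)

  auto lookup = [&](int old_v, size_t segment) {
    auto first = old_vertices.begin() + seg_offsets[segment];
    auto last = old_vertices.begin() + seg_offsets[segment + 1];
    auto it = std::lower_bound(first, last, old_v);
    assert(it != last && *it == old_v);  // every endpoint must appear in the map
    return new_ids[it - old_vertices.begin()];
  };

  assert(lookup(9, 0) == 1);
  assert(lookup(2, 1) == 0);
  return 0;
}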
old_vertex); + assert(*it == old_vertex); + return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); + }); + + if (seed_vertices) { + pair_first = thrust::make_zip_iterator((*seed_vertices).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + (*seed_vertices).size(), + (*seed_vertices).begin(), + [seed_vertex_label_offsets = detail::to_thrust_optional(seed_vertex_label_offsets), + vertex_type_offsets, + renumber_map_label_type_offsets = + raft::device_span((*vertex_renumber_map_label_type_offsets).data(), + (*vertex_renumber_map_label_type_offsets).size()), + old_vertices = raft::device_span( + segment_sorted_vertex_renumber_map.data(), segment_sorted_vertex_renumber_map.size()), + new_vertices = raft::device_span(segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size()), + num_vertex_types] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + label_index_t label_idx{0}; + if (seed_vertex_label_offsets) { + label_idx = static_cast( + thrust::distance((*seed_vertex_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*seed_vertex_label_offsets).begin() + 1, + (*seed_vertex_label_offsets).end(), + thrust::get<1>(pair)))); + } + auto v_type = static_cast( + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + old_vertex))); + auto start_offset = + renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type]; + auto end_offset = + renumber_map_label_type_offsets[label_idx * num_vertex_types + v_type + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + start_offset, + old_vertices.begin() + end_offset, + old_vertex); + assert(*it == old_vertex); + return new_vertices[thrust::distance(old_vertices.begin(), it)]; + }); + } + } + + // 4. 
renumber input edge IDs + + if (edgelist_edge_ids) { + rmm::device_uvector new_edge_ids((*edge_id_renumber_map).size(), + handle.get_stream()); + if (edge_id_renumber_map_label_type_offsets) { + thrust::tabulate(handle.get_thrust_policy(), + new_edge_ids.begin(), + new_edge_ids.end(), + [renumber_map_label_type_offsets = raft::device_span( + (*edge_id_renumber_map_label_type_offsets).data(), + (*edge_id_renumber_map_label_type_offsets).size())] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + renumber_map_label_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + renumber_map_label_type_offsets.begin() + 1, + renumber_map_label_type_offsets.end(), + i))); + auto start_offset = renumber_map_label_type_offsets[idx]; + return static_cast(i - start_offset); + }); + } else { + thrust::sequence( + handle.get_thrust_policy(), new_edge_ids.begin(), new_edge_ids.end(), edge_id_t{0}); + } + + rmm::device_uvector segment_sorted_edge_id_renumber_map( + (*edge_id_renumber_map).size(), handle.get_stream()); + rmm::device_uvector segment_sorted_new_edge_ids(new_edge_ids.size(), + handle.get_stream()); + + if (edge_id_renumber_map_label_type_offsets) { + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span{(*edge_id_renumber_map_label_type_offsets).data(), + (*edge_id_renumber_map_label_type_offsets).size()}, + (*edge_id_renumber_map).size(), + approx_items_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = thrust::make_transform_iterator( + (*edge_id_renumber_map_label_type_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + (*edge_id_renumber_map).begin() + h_edge_offsets[i], + segment_sorted_edge_id_renumber_map.begin() + h_edge_offsets[i], + new_edge_ids.begin() + h_edge_offsets[i], + segment_sorted_new_edge_ids.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + (*edge_id_renumber_map).begin() + h_edge_offsets[i], + segment_sorted_edge_id_renumber_map.begin() + h_edge_offsets[i], + new_edge_ids.begin() + h_edge_offsets[i], + segment_sorted_new_edge_ids.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + + new_edge_ids.resize(0, handle.get_stream()); + new_edge_ids.shrink_to_fit(handle.get_stream()); + } else { + thrust::copy(handle.get_thrust_policy(), + (*edge_id_renumber_map).begin(), + (*edge_id_renumber_map).end(), + segment_sorted_edge_id_renumber_map.begin()); + segment_sorted_new_edge_ids = std::move(new_edge_ids); + thrust::sort_by_key(handle.get_thrust_policy(), + segment_sorted_edge_id_renumber_map.begin(), + segment_sorted_edge_id_renumber_map.end(), + segment_sorted_new_edge_ids.begin()); + } + + if (edge_id_renumber_map_label_type_offsets) { + auto pair_first = thrust::make_zip_iterator((*edgelist_edge_ids).begin(), + 
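A host-side sketch of the no-segment fallback in this step: new edge IDs are assigned by position (thrust::sequence above), then the (old ID, new ID) pairs are sorted by old ID so later lookups can binary-search old-to-new, which is what the sort_by_key accomplishes.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  using edge_id_t = int64_t;
  std::vector<edge_id_t> renumber_map{42, 7, 19};  // dedup'ed old edge IDs

  // new ID == position in renumber_map
  std::vector<std::pair<edge_id_t, edge_id_t>> old_new;
  for (edge_id_t i = 0; i < static_cast<edge_id_t>(renumber_map.size()); ++i) {
    old_new.emplace_back(renumber_map[i], i);
  }
  std::sort(old_new.begin(), old_new.end());  // order by old ID for binary search

  assert(old_new[0] == std::make_pair(edge_id_t{7}, edge_id_t{1}));
  assert(old_new[2] == std::make_pair(edge_id_t{42}, edge_id_t{0}));
  return 0;
}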
thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + (*edgelist_edge_ids).size(), + (*edgelist_edge_ids).begin(), + cuda::proclaim_return_type( + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + edge_types = edgelist_edge_types + ? thrust::make_optional>( + (*edgelist_edge_types).data(), (*edgelist_edge_types).size()) + : thrust::nullopt, + renumber_map = + raft::device_span(segment_sorted_edge_id_renumber_map.data(), + segment_sorted_edge_id_renumber_map.size()), + new_edge_ids = raft::device_span(segment_sorted_new_edge_ids.data(), + segment_sorted_new_edge_ids.size()), + renumber_map_label_type_offsets = + raft::device_span((*edge_id_renumber_map_label_type_offsets).data(), + (*edge_id_renumber_map_label_type_offsets).size()), + num_edge_types] __device__(auto pair) { + auto old_edge_id = thrust::get<0>(pair); + auto edge_idx = thrust::get<1>(pair); + size_t label_idx{0}; + if (edgelist_label_offsets) { + label_idx = static_cast( + thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + edge_idx))); + } + edge_type_t edge_type{0}; + if (edge_types) { edge_type = (*edge_types)[edge_idx]; } + auto renumber_map_start_offset = + renumber_map_label_type_offsets[label_idx * num_edge_types + edge_type]; + auto renumber_map_end_offset = + renumber_map_label_type_offsets[label_idx * num_edge_types + edge_type + 1]; + auto it = thrust::lower_bound(thrust::seq, + renumber_map.begin() + renumber_map_start_offset, + renumber_map.begin() + renumber_map_end_offset, + old_edge_id); + assert(*it == old_edge_id); + return *(new_edge_ids.begin() + thrust::distance(renumber_map.begin(), it)); + })); + } else { + thrust::transform( + handle.get_thrust_policy(), + (*edgelist_edge_ids).begin(), + (*edgelist_edge_ids).end(), + (*edgelist_edge_ids).begin(), + cuda::proclaim_return_type( + [renumber_map = + raft::device_span(segment_sorted_edge_id_renumber_map.data(), + segment_sorted_edge_id_renumber_map.size()), + new_edge_ids = raft::device_span( + segment_sorted_new_edge_ids.data(), + segment_sorted_new_edge_ids.size())] __device__(edge_id_t old_edge_id) { + auto it = thrust::lower_bound( + thrust::seq, renumber_map.begin(), renumber_map.end(), old_edge_id); + assert(*it == old_edge_id); + return *(new_edge_ids.begin() + thrust::distance(renumber_map.begin(), it)); + })); + } + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_edge_ids), + std::move(seed_vertices), + std::move(vertex_renumber_map), + std::move(*vertex_renumber_map_label_type_offsets), + std::move(edge_id_renumber_map), + std::move(edge_id_renumber_map_label_type_offsets)); +} + template void permute_array(raft::handle_t const& handle, IndexIterator index_first, @@ -1390,7 +2460,9 @@ void permute_array(raft::handle_t const& handle, value_first); } -// key: ((label), (hop), major, minor) +// key: +// ((label), (edge type), (hop), major, minor) if use_edge_type_as_sort_key is true +// ((label), (hop), major, minor) if use_edge_type_as_sort_key is false template std::tuple, rmm::device_uvector, @@ -1405,7 +2477,8 @@ sort_sampled_edge_tuples(raft::handle_t const& handle, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, std::optional>&& edgelist_hops, - std::optional> edgelist_label_offsets) + std::optional> edgelist_label_offsets, + bool 
use_edge_type_as_sort_key) { std::vector h_label_offsets{}; std::vector h_edge_offsets{}; @@ -1427,11 +2500,15 @@ sort_sampled_edge_tuples(raft::handle_t const& handle, rmm::device_uvector indices(h_edge_offsets[i + 1] - h_edge_offsets[i], handle.get_stream()); thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); - edge_order_t edge_order_comp{ + edge_order_t edge_order_comp{ edgelist_label_offsets ? thrust::make_optional>( (*edgelist_label_offsets).data() + h_label_offsets[i], (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) : thrust::nullopt, + edgelist_edge_types && use_edge_type_as_sort_key + ? thrust::make_optional>( + (*edgelist_edge_types).data() + h_edge_offsets[i], indices.size()) + : thrust::nullopt, edgelist_hops ? thrust::make_optional>( (*edgelist_hops).data() + h_edge_offsets[i], indices.size()) : thrust::nullopt, @@ -1510,25 +2587,29 @@ renumber_and_compress_sampled_edgelist( bool do_expensive_check) { using label_index_t = uint32_t; + using vertex_type_t = uint32_t; // dummy auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); // 1. check input arguments - check_input_edges(handle, - edgelist_majors, - edgelist_minors, - edgelist_weights, - edgelist_edge_ids, - edgelist_edge_types, - edgelist_hops, - seed_vertices, - seed_vertex_label_offsets, - edgelist_label_offsets, - num_labels, - num_hops, - do_expensive_check); + check_input_edges(handle, + edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, + edgelist_label_offsets, + std::nullopt, + num_labels, + num_hops, + size_t{1}, + std::optional{std::nullopt}, + do_expensive_check); CUGRAPH_EXPECTS( !doubly_compress || !compress_per_hop, @@ -1582,7 +2663,8 @@ renumber_and_compress_sampled_edgelist( std::move(edgelist_edge_ids), std::move(edgelist_edge_types), std::move(edgelist_hops), - edgelist_label_offsets); + edgelist_label_offsets, + false); if (renumbered_seed_vertices) { if (seed_vertex_label_offsets) { @@ -2144,25 +3226,29 @@ renumber_and_sort_sampled_edgelist( bool do_expensive_check) { using label_index_t = uint32_t; + using vertex_type_t = uint32_t; // dummy auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); // 1. check input arguments - check_input_edges(handle, - edgelist_majors, - edgelist_minors, - edgelist_weights, - edgelist_edge_ids, - edgelist_edge_types, - edgelist_hops, - seed_vertices, - seed_vertex_label_offsets, - edgelist_label_offsets, - num_labels, - num_hops, - do_expensive_check); + check_input_edges(handle, + edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, + edgelist_label_offsets, + std::nullopt, + num_labels, + num_hops, + size_t{1}, + std::optional{std::nullopt}, + do_expensive_check); // 2. renumber @@ -2206,7 +3292,8 @@ renumber_and_sort_sampled_edgelist( std::move(edgelist_edge_ids), std::move(edgelist_edge_types), std::move(edgelist_hops), - edgelist_label_offsets); + edgelist_label_offsets, + false); // 4. 
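sort_sampled_edge_tuples is an argsort: it sorts an index vector with a composite comparator (edge_order_t, optionally including the edge type when use_edge_type_as_sort_key is set) and then permutes each attribute column once, rather than repeatedly sorting wide zipped tuples. A stripped-down two-key version of the idiom (hypothetical columns; the device lambda requires nvcc's extended-lambda mode, which cugraph builds enable):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cstdint>

void sort_edges_by_hop_then_major(thrust::device_vector<int32_t>& hops,
                                  thrust::device_vector<int64_t>& majors)
{
  thrust::device_vector<size_t> indices(hops.size());
  thrust::sequence(thrust::device, indices.begin(), indices.end(), size_t{0});

  auto key_first = thrust::make_zip_iterator(hops.begin(), majors.begin());
  // Sort indices only; zipped tuples compare lexicographically, i.e. (hop, major).
  thrust::sort(thrust::device, indices.begin(), indices.end(),
               [key_first] __device__(size_t l, size_t r) {
                 return *(key_first + l) < *(key_first + r);
               });

  // Apply the resulting permutation to each attribute column.
  thrust::device_vector<int32_t> tmp_hops(hops.size());
  thrust::device_vector<int64_t> tmp_majors(majors.size());
  thrust::gather(thrust::device, indices.begin(), indices.end(), hops.begin(), tmp_hops.begin());
  thrust::gather(thrust::device, indices.begin(), indices.end(), majors.begin(), tmp_majors.begin());
  hops.swap(tmp_hops);
  majors.swap(tmp_majors);
}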
compute edgelist_label_hop_offsets @@ -2274,6 +3361,218 @@ renumber_and_sort_sampled_edgelist( std::move(renumber_map_label_offsets)); } +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // (label, edge type, hop) offsets to the + // edges + rmm::device_uvector, // vertex renumber map + rmm::device_uvector, // (label, vertex type) offsets to the vertex renumber map + std::optional>, // edge ID renumber map + std::optional< + rmm::device_uvector>> // (label, edge type) offsets to the vertex renumber map +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + using vertex_type_t = uint32_t; + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + // 1. check input arguments + + check_input_edges(handle, + edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, + edgelist_label_offsets, + vertex_type_offsets, + num_labels, + num_hops, + num_vertex_types, + std::optional{num_edge_types}, + do_expensive_check); + + // 2. renumber + + std::optional> renumbered_seed_vertices{std::nullopt}; + if (seed_vertices) { + renumbered_seed_vertices = + rmm::device_uvector((*seed_vertices).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*seed_vertices).begin(), + (*seed_vertices).end(), + (*renumbered_seed_vertices).begin()); + } + rmm::device_uvector vertex_renumber_map(0, handle.get_stream()); + rmm::device_uvector vertex_renumber_map_label_type_offsets(0, handle.get_stream()); + std::optional> edge_id_renumber_map{std::nullopt}; + std::optional> edge_id_renumber_map_label_type_offsets{std::nullopt}; + std::tie(edgelist_majors, + edgelist_minors, + edgelist_edge_ids, + std::ignore, + vertex_renumber_map, + vertex_renumber_map_label_type_offsets, + edge_id_renumber_map, + edge_id_renumber_map_label_type_offsets) = + heterogeneous_renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_edge_ids), + edgelist_edge_types ? std::make_optional(raft::device_span( + (*edgelist_edge_types).data(), (*edgelist_edge_types).size())) + : std::nullopt, + edgelist_hops ? std::make_optional(raft::device_span((*edgelist_hops).data(), + (*edgelist_hops).size())) + : std::nullopt, + std::move(renumbered_seed_vertices), + seed_vertex_label_offsets, + edgelist_label_offsets, + vertex_type_offsets, + num_labels, + num_vertex_types, + num_edge_types, + do_expensive_check); + + // 3. 
sort by ((label), (edge type), (hop), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets, + true); + + // 4. compute edgelist (label, edge type, hop) offsets + + std::optional> edgelist_label_type_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_edge_types || edgelist_hops) { + edgelist_label_type_hop_offsets = + rmm::device_uvector(num_labels * num_edge_types * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_type_hop_offsets).begin(), + (*edgelist_label_type_hop_offsets).end(), + size_t{0}); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_edge_types * num_hops), + (*edgelist_label_type_hop_offsets).begin(), + cuda::proclaim_return_type( + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + edgelist_edge_types = edgelist_edge_types + ? thrust::make_optional>( + (*edgelist_edge_types).data(), (*edgelist_edge_types).size()) + : thrust::nullopt, + edgelist_hops = edgelist_hops ? thrust::make_optional>( + (*edgelist_hops).data(), (*edgelist_hops).size()) + : thrust::nullopt, + num_edge_types, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / (num_edge_types * num_hops)); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_edge_types) { + auto t = static_cast((i % (num_edge_types * num_hops)) / num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_edge_types).begin() + start_offset, + (*edgelist_edge_types).begin() + end_offset, + t); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_edge_types).begin() + start_offset, + (*edgelist_edge_types).begin() + end_offset, + t); + start_offset = + static_cast(thrust::distance((*edgelist_edge_types).begin(), lower_it)); + end_offset = + static_cast(thrust::distance((*edgelist_edge_types).begin(), upper_it)); + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = + static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + })); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_type_hop_offsets).begin(), + (*edgelist_label_type_hop_offsets).end(), + (*edgelist_label_type_hop_offsets).begin()); + } + + edgelist_edge_types = std::nullopt; + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? 
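The (label, edge type, hop) offsets above are built with a count-then-scan idiom: one thread per bucket counts its elements by binary search over the already-sorted columns, and a single exclusive scan converts counts into offsets, with the trailing zero becoming the grand total. The idiom in isolation, over one sorted key column (hypothetical names; extended-lambda mode assumed):

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/transform.h>
#include <cstdint>

// bucket_ids is sorted; returns num_buckets + 1 offsets such that bucket b
// occupies [offsets[b], offsets[b + 1]).
thrust::device_vector<size_t> compute_offsets(
  thrust::device_vector<int32_t> const& bucket_ids, size_t num_buckets)
{
  thrust::device_vector<size_t> offsets(num_buckets + 1, size_t{0});
  int32_t const* ids = thrust::raw_pointer_cast(bucket_ids.data());
  size_t n           = bucket_ids.size();
  thrust::transform(thrust::device,
                    thrust::make_counting_iterator(size_t{0}),
                    thrust::make_counting_iterator(num_buckets),
                    offsets.begin(),
                    [ids, n] __device__(size_t b) {
                      // count of elements equal to b in the sorted array
                      auto first = thrust::lower_bound(thrust::seq, ids, ids + n,
                                                       static_cast<int32_t>(b));
                      auto last  = thrust::upper_bound(thrust::seq, ids, ids + n,
                                                       static_cast<int32_t>(b));
                      return static_cast<size_t>(thrust::distance(first, last));
                    });
  // counts -> offsets; the untouched final zero becomes the total element count.
  thrust::exclusive_scan(thrust::device, offsets.begin(), offsets.end(), offsets.begin());
  return offsets;
}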
edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_label_type_hop_offsets), + std::move(vertex_renumber_map), + std::move(vertex_renumber_map_label_type_offsets), + std::move(edge_id_renumber_map), + std::move(edge_id_renumber_map_label_type_offsets)); +} + template (handle, - edgelist_majors, - edgelist_minors, - edgelist_weights, - edgelist_edge_ids, - edgelist_edge_types, - edgelist_hops, - std::nullopt, - std::nullopt, - edgelist_label_offsets, - num_labels, - num_hops, - do_expensive_check); + check_input_edges(handle, + edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + std::nullopt, + std::nullopt, + edgelist_label_offsets, + std::nullopt, + num_labels, + num_hops, + size_t{1}, + std::optional{std::nullopt}, + do_expensive_check); // 2. sort by ((l), (h), major, minor) @@ -2332,7 +3635,8 @@ sort_sampled_edgelist(raft::handle_t const& handle, std::move(edgelist_edge_ids), std::move(edgelist_edge_types), std::move(edgelist_hops), - edgelist_label_offsets); + edgelist_label_offsets, + false); // 3. compute edgelist_label_hop_offsets diff --git a/cpp/src/sampling/sampling_post_processing_sg_v32_e32.cu b/cpp/src/sampling/sampling_post_processing_sg_v32_e32.cu index 6b8d8a07d92..ff1add6a02a 100644 --- a/cpp/src/sampling/sampling_post_processing_sg_v32_e32.cu +++ b/cpp/src/sampling/sampling_post_processing_sg_v32_e32.cu @@ -122,6 +122,62 @@ renumber_and_sort_sampled_edgelist( bool src_is_major, bool do_expensive_check); +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + template std::tuple, rmm::device_uvector, std::optional>, diff --git a/cpp/src/sampling/sampling_post_processing_sg_v32_e64.cu b/cpp/src/sampling/sampling_post_processing_sg_v32_e64.cu index a4b083efd7c..7001dcfdaf3 100644 --- a/cpp/src/sampling/sampling_post_processing_sg_v32_e64.cu +++ b/cpp/src/sampling/sampling_post_processing_sg_v32_e64.cu @@ -122,6 +122,62 @@ 
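The sampling_post_processing_sg_v*_e*.cu files below contain no logic of their own; they are explicit instantiation units. The generic definitions live in the shared implementation header, and each translation unit stamps out one (vertex_t, edge_t, weight_t) combination so the library exports concrete, linkable symbols for every supported type mix. The mechanism reduced to a toy example (hypothetical function, not part of cugraph):

#include <cstdint>

// algo_impl.cuh -- the generic definition, included only by instantiation TUs.
template <typename vertex_t>
vertex_t add_one(vertex_t v) { return v + 1; }

// algo_v32.cu -- compiles the int32_t flavor into the library.
template int32_t add_one<int32_t>(int32_t);

// algo_v64.cu -- compiles the int64_t flavor into the library.
template int64_t add_one<int64_t>(int64_t);

// Callers see only a declaration plus extern-template suppressions, and link
// against the prebuilt instantiations:
extern template int32_t add_one<int32_t>(int32_t);
extern template int64_t add_one<int64_t>(int64_t);

This keeps compile times bounded: the heavy template body is compiled once per type combination rather than once per including translation unit.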
renumber_and_sort_sampled_edgelist( bool src_is_major, bool do_expensive_check); +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + template std::tuple, rmm::device_uvector, std::optional>, diff --git a/cpp/src/sampling/sampling_post_processing_sg_v64_e64.cu b/cpp/src/sampling/sampling_post_processing_sg_v64_e64.cu index a62ca2a0777..3b2b8144420 100644 --- a/cpp/src/sampling/sampling_post_processing_sg_v64_e64.cu +++ b/cpp/src/sampling/sampling_post_processing_sg_v64_e64.cu @@ -122,6 +122,62 @@ renumber_and_sort_sampled_edgelist( bool src_is_major, bool do_expensive_check); +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>> +heterogeneous_renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + raft::device_span vertex_type_offsets, 
+ size_t num_labels, + size_t num_hops, + size_t num_vertex_types, + size_t num_edge_types, + bool src_is_major, + bool do_expensive_check); + template std::tuple, rmm::device_uvector, std::optional>, diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 5371d53bcf0..f925a142737 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -803,7 +803,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), @@ -883,7 +883,7 @@ graph_view_t edge_srcs, raft::device_span edge_dsts, - bool do_expensive_check) + bool do_expensive_check) const { CUGRAPH_EXPECTS( edge_srcs.size() == edge_dsts.size(), diff --git a/cpp/src/structure/legacy/graph.cu b/cpp/src/structure/legacy/graph.cu index 7e1238e1558..a504125080b 100644 --- a/cpp/src/structure/legacy/graph.cu +++ b/cpp/src/structure/legacy/graph.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include "utilities/graph_utils.cuh" - #include #include +#include #include #include +#include #include #include +#include namespace { @@ -69,15 +70,40 @@ namespace legacy { template void GraphViewBase::get_vertex_identifiers(VT* identifiers) const { - cugraph::detail::sequence(number_of_vertices, identifiers); + thrust::sequence(thrust::device, + thrust::device_pointer_cast(identifiers), + thrust::device_pointer_cast(identifiers + number_of_vertices), + VT{0}); + RAFT_CHECK_CUDA(nullptr); } +// FIXME: Need to get rid of this function... still used in python template void GraphCompressedSparseBaseView::get_source_indices(VT* src_indices) const { CUGRAPH_EXPECTS(offsets != nullptr, "No graph specified"); - cugraph::detail::offsets_to_indices( - offsets, GraphViewBase::number_of_vertices, src_indices); + rmm::cuda_stream_view stream_view; + + raft::device_span indices_span(src_indices, GraphViewBase::number_of_edges); + + if (indices_span.size() > 0) { + thrust::fill(rmm::exec_policy(stream_view), indices_span.begin(), indices_span.end(), VT{0}); + + thrust::for_each(rmm::exec_policy(stream_view), + offsets + 1, + offsets + GraphViewBase::number_of_vertices, + [indices_span] __device__(ET offset) { + if (offset < static_cast(indices_span.size())) { + cuda::atomic_ref atomic_counter( + indices_span.data()[offset]); + atomic_counter.fetch_add(VT{1}, cuda::std::memory_order_relaxed); + } + }); + thrust::inclusive_scan(rmm::exec_policy(stream_view), + indices_span.begin(), + indices_span.end(), + indices_span.begin()); + } } template @@ -152,6 +178,4 @@ void GraphCompressedSparseBaseView::degree(ET* degree, DegreeDirecti } // namespace legacy } // namespace cugraph -#include "utilities/eidir_graph_utils.hpp" - #include diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp index e6857a5beda..d7502b3f6da 100644 --- a/cpp/src/structure/select_random_vertices_impl.hpp +++ b/cpp/src/structure/select_random_vertices_impl.hpp @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "from_cugraph_ops/sampling.hpp" #include #include @@ -30,10 +31,6 @@ #include #include -#ifndef NO_CUGRAPH_OPS -#include -#endif - #include #include #include diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 3790c0057cb..40030e2e39c 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ 
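The rewritten get_source_indices above expands CSR offsets into per-edge source indices with three primitives: zero the output, bump the slot at every interior row start, and inclusive-scan. The atomic increment matters because consecutive empty rows share the same start offset, so several increments can land on one slot. A compact Thrust-only restatement (hypothetical driver; it uses raw atomicAdd where the code above uses cuda::atomic_ref):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <thrust/scan.h>
#include <cstdint>

// offsets has num_rows + 1 entries; writes the row index of each of the
// offsets[num_rows] edges into `indices` (already sized to the edge count).
void offsets_to_indices(thrust::device_vector<int64_t> const& offsets,
                        thrust::device_vector<int32_t>& indices)
{
  thrust::fill(thrust::device, indices.begin(), indices.end(), int32_t{0});
  int32_t* ind     = thrust::raw_pointer_cast(indices.data());
  size_t num_edges = indices.size();
  // Mark interior row starts: row r (r >= 1) begins at offsets[r].
  thrust::for_each(thrust::device,
                   offsets.begin() + 1, offsets.end() - 1,
                   [ind, num_edges] __device__(int64_t offset) {
                     if (offset < static_cast<int64_t>(num_edges)) {
                       // Atomic, not plain increment: empty rows can repeat an offset.
                       atomicAdd(ind + offset, 1);
                     }
                   });
  // The prefix sum turns boundary markers into row indices for every edge.
  thrust::inclusive_scan(thrust::device, indices.begin(), indices.end(), indices.begin());
}

After the scan, edges in row 0 hold 0, and each marked boundary raises the running value by the number of rows that start there, so every edge ends up labeled with its source vertex.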
-17,7 +17,6 @@ #include "detail/graph_partition_utils.cuh" #include "utilities/collect_comm.cuh" -#include "utilities/graph_utils.cuh" #include #include diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp deleted file mode 100644 index abf026cbbfe..00000000000 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cugraph { -namespace detail { - -extern template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -extern template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -extern template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -extern template void offsets_to_indices(int const*, int, int*); -extern template void offsets_to_indices(long const*, int, int*); -extern template void offsets_to_indices(long const*, long, long*); - -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(int const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, int, int*); -extern template __attribute__((visibility("hidden"))) __global__ void -offsets_to_indices_kernel(long const*, long, long*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp deleted file mode 100644 index ba06c6f56ea..00000000000 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -namespace cugraph { -namespace detail { - -template __device__ float parallel_prefix_sum(int32_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int32_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int32_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int32_t const*, double const*); -template __device__ float parallel_prefix_sum(int64_t, int64_t const*, float const*); -template __device__ double parallel_prefix_sum(int64_t, int64_t const*, double const*); - -template void offsets_to_indices(int32_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int32_t, int32_t*); -template void offsets_to_indices(int64_t const*, int64_t, int64_t*); - -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int32_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int32_t, int32_t*); -template __global__ __attribute__((visibility("hidden"))) void -offsets_to_indices_kernel(int64_t const*, int64_t, int64_t*); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh deleted file mode 100644 index 0b257e7abde..00000000000 --- a/cpp/src/utilities/graph_utils.cuh +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Interanl helper functions -// Author: Alex Fender afender@nvidia.com -#pragma once - -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cugraph { -namespace detail { - -// #define DEBUG 1 -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block -#define US - -template -__inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const* ind, value_t const* w) -{ - count_t i, j, mn; - value_t v, last; - value_t sum = 0.0; - bool valid; - - // Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); // n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - // All threads (especially the last one) must always participate - // in the shfl instruction, otherwise their sum will be undefined. - // So, the loop stopping condition is based on multiple of n in loop increments, - // so that all threads enter into the loop and inside we make sure we do not - // read out of bounds memory checking for the actual size n. - - // check if the thread is valid - valid = i < n; - - // Notice that the last thread is used to propagate the prefix sum. - // For all the threads, in the first iteration the last is 0, in the following - // iterations it is the value at the last thread of the previous iterations. 
- - // get the value of the last thread - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? w[ind[i]] : 0.0; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = __shfl_up_sync(raft::warp_full_mask(), sum, j, blockDim.x); - if (threadIdx.x >= j) sum += v; - } - // shift by last - sum += last; - // notice that no __threadfence or __syncthreads are needed in this implementation - } - // get the value of the last thread (to all threads) - last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); - - return last; -} - -// axpy -template -struct axpy_functor : public thrust::binary_function { - const T a; - axpy_functor(T _a) : a(_a) {} - __host__ __device__ T operator()(const T& x, const T& y) const { return a * x + y; } -}; - -template -void axpy(size_t n, T a, T* x, T* y) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// norm -template -struct square { - __host__ __device__ T operator()(const T& x) const { return x * x; } -}; - -template -T nrm2(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T init = 0; - T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -T nrm1(size_t n, T* x) -{ - rmm::cuda_stream_view stream_view; - T result = thrust::reduce(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n)); - RAFT_CHECK_CUDA(stream_view.value()); - return result; -} - -template -void scal(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void addv(size_t n, T val, T* x) -{ - rmm::cuda_stream_view stream_view; - thrust::transform(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::plus()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void fill(size_t n, T* x, T value) -{ - rmm::cuda_stream_view stream_view; - thrust::fill(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - value); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void scatter(size_t n, T* src, T* dst, M* map) -{ - rmm::cuda_stream_view stream_view; - thrust::scatter(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(src), - thrust::device_pointer_cast(src + n), - thrust::device_pointer_cast(map), - thrust::device_pointer_cast(dst)); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -void printv(size_t n, T* vec, int offset) -{ - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy( - dev_ptr + offset, - dev_ptr + offset + n, - 
std::ostream_iterator( - std::cout, " ")); // Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) - RAFT_CHECK_CUDA(nullptr); - std::cout << std::endl; -} - -template -void copy(size_t n, T* x, T* res) -{ - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - rmm::cuda_stream_view stream_view; - thrust::copy_n(rmm::exec_policy(stream_view), dev_ptr, n, res_ptr); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -struct is_zero { - __host__ __device__ bool operator()(const T x) { return x == 0; } -}; - -template -struct dangling_functor : public thrust::unary_function { - const T val; - dangling_functor(T _val) : val(_val) {} - __host__ __device__ T operator()(const T& x) const { return val + x; } -}; - -template -void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) -{ - rmm::cuda_stream_view stream_view; - thrust::transform_if(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - RAFT_CHECK_CUDA(stream_view.value()); -} - -// google matrix kernels -template -__global__ static void degree_coo(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - atomicAdd(°ree[ind[i]], (ValueType)1.0); -} - -template -__global__ static void flag_leafs_kernel(const size_t n, - const IndexType* degree, - ValueType* bookmark) -{ - for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - if (degree[i] == 0) bookmark[i] = 1.0; -} - -template -__global__ static void degree_offsets(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - degree[i] += ind[i + 1] - ind[i]; -} - -template -__global__ static void type_convert(FromType* array, int n) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { - ToType val = array[i]; - ToType* vals = (ToType*)array; - vals[i] = val; - } -} - -template -__global__ static void equi_prob3(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int j, row, col; - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - val[j] = 1.0 / degree[col]; - // val[j] = 999; - } - } -} - -template -__global__ static void equi_prob2(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) -{ - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) { - int row_begin = csrPtr[row]; - int row_end = csrPtr[row + 1]; - int col; - for (int i = row_begin; i < row_end; i++) { - col = csrInd[i]; - val[i] = 1.0 / degree[col]; - } - } -} - -// compute the H^T values for an already transposed adjacency matrix, leveraging coo info -template -void HT_matrix_csc_coo(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - ValueType* bookmark) -{ - rmm::cuda_stream_view stream_view; - rmm::device_uvector degree(n, stream_view); - - dim3 nthreads, nblocks; - 
nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo - <<>>(n, e, csrInd, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - int y = 4; - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; - equi_prob3 - <<>>(n, e, csrPtr, csrInd, val, degree.data()); - RAFT_CHECK_CUDA(stream_view.value()); - - ValueType a = 0.0; - fill(n, bookmark, a); - RAFT_CHECK_CUDA(stream_view.value()); - - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - flag_leafs_kernel - <<>>(n, degree.data(), bookmark); - RAFT_CHECK_CUDA(stream_view.value()); -} - -template -__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel( - const offsets_t* offsets, index_t v, index_t* indices) -{ - auto tid{threadIdx.x}; - auto ctaStart{blockIdx.x}; - - for (index_t j = ctaStart; j < v; j += gridDim.x) { - offsets_t colStart = offsets[j]; - offsets_t colEnd = offsets[j + 1]; - offsets_t rowNnz = colEnd - colStart; - - for (offsets_t i = 0; i < rowNnz; i += blockDim.x) { - if ((colStart + tid + i) < colEnd) { indices[colStart + tid + i] = j; } - } - } -} - -template -void offsets_to_indices(const offsets_t* offsets, index_t v, index_t* indices) -{ - cudaStream_t stream{nullptr}; - index_t nthreads = min(v, (index_t)CUDA_MAX_KERNEL_THREADS); - index_t nblocks = min((v + nthreads - 1) / nthreads, (index_t)CUDA_MAX_BLOCKS); - offsets_to_indices_kernel<<>>(offsets, v, indices); - RAFT_CHECK_CUDA(stream); -} - -template -void sequence(IndexType n, IndexType* vec, IndexType init = 0) -{ - thrust::sequence( - thrust::device, thrust::device_pointer_cast(vec), thrust::device_pointer_cast(vec + n), init); - RAFT_CHECK_CUDA(nullptr); -} - -template -bool has_negative_val(DistType* arr, size_t n) -{ - // custom kernel with boolean bitwise reduce may be - // faster. 
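The parallel_prefix_sum being deleted above is built on the classic warp-scan idiom: each lane pulls the value held j lanes below it with __shfl_up_sync and accumulates, doubling j every round, so a 32-wide inclusive prefix sum finishes in five steps with no shared memory or synchronization. The core of that idiom in isolation (assumes a full 32-thread warp; the kernel is a toy for illustration only):

#include <cstdio>

__device__ float warp_inclusive_scan(float v)
{
  // After round j, each lane holds the sum of its own value and the j values
  // immediately below it in the warp.
  for (int j = 1; j < 32; j *= 2) {
    float up = __shfl_up_sync(0xffffffff, v, j, 32);
    if ((threadIdx.x & 31) >= j) v += up;  // lanes below j have no source lane
  }
  return v;  // lane i now holds v_0 + ... + v_i
}

__global__ void demo()
{
  float v = warp_inclusive_scan(1.0f);
  if (threadIdx.x == 31) printf("sum over warp = %f\n", v);  // prints 32
}

With the move to Thrust and CUB device-wide scans elsewhere in this diff, hand-rolled warp scans like this one no longer need to live in the codebase.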
- rmm::cuda_stream_view stream_view; - DistType result = *thrust::min_element(rmm::exec_policy(stream_view), - thrust::device_pointer_cast(arr), - thrust::device_pointer_cast(arr + n)); - - RAFT_CHECK_CUDA(stream_view.value()); - - return (result < 0); -} - -} // namespace detail -} // namespace cugraph - -#include "eidecl_graph_utils.hpp" diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 52d257b9bea..3752e823659 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(cugraphtestutil STATIC utilities/misc_utilities.cpp utilities/conversion_utilities_sg.cu utilities/debug_utilities_sg.cpp + utilities/validation_utilities.cu link_prediction/similarity_compare.cpp centrality/betweenness_centrality_validate.cu community/egonet_validate.cu @@ -46,6 +47,7 @@ add_library(cugraphtestutil STATIC structure/induced_subgraph_validate.cu sampling/random_walks_check_sg.cu sampling/detail/nbr_sampling_validate.cu + sampling/detail/sampling_post_processing_validate.cu ../../thirdparty/mmio/mmio.c) target_compile_options(cugraphtestutil @@ -218,6 +220,7 @@ function(ConfigureTestMG CMAKE_TEST_NAME) GPUS ${GPU_COUNT} PERCENT 100 INSTALL_COMPONENT_SET testing_mg + INSTALL_TARGET ${CMAKE_TEST_NAME} ) set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "CUGRAPH_MG") @@ -300,6 +303,7 @@ function(ConfigureCTestMG CMAKE_TEST_NAME) GPUS ${GPU_COUNT} PERCENT 100 INSTALL_COMPONENT_SET testing_mg + INSTALL_TARGET ${CMAKE_TEST_NAME} ) set_tests_properties(${CMAKE_TEST_NAME} PROPERTIES LABELS "CUGRAPH_C_MG") @@ -485,7 +489,16 @@ ConfigureTest(BIASED_NEIGHBOR_SAMPLING_TEST sampling/biased_neighbor_sampling.cp ################################################################################################### # - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- -ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cpp) + +################################################################################################### +# - SAMPLING_HETEROGENEOUS_POST_PROCESSING tests -------------------------------------------------- +ConfigureTest(SAMPLING_HETEROGENEOUS_POST_PROCESSING_TEST + sampling/sampling_heterogeneous_post_processing_test.cpp) + +################################################################################################### +# - NEGATIVE SAMPLING tests -------------------------------------------------------------------- +ConfigureTest(NEGATIVE_SAMPLING_TEST sampling/negative_sampling.cpp PERCENT 100) ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- @@ -576,7 +589,8 @@ if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### # - MG BETWEENNESS CENTRALITY tests ----------------------------------------------------------- ConfigureTestMG(MG_BETWEENNESS_CENTRALITY_TEST centrality/mg_betweenness_centrality_test.cpp) - ConfigureTestMG(MG_EDGE_BETWEENNESS_CENTRALITY_TEST centrality/mg_edge_betweenness_centrality_test.cpp) + ConfigureTestMG(MG_EDGE_BETWEENNESS_CENTRALITY_TEST + centrality/mg_edge_betweenness_centrality_test.cpp) ############################################################################################### # - MG BFS tests 
------------------------------------------------------------------------------ @@ -741,6 +755,11 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG BIASED NBR SAMPLING tests -------------------------------------------------------------- ConfigureTestMG(MG_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_biased_neighbor_sampling.cpp) + ################################################################################################### + # - NEGATIVE SAMPLING tests -------------------------------------------------------------------- + ConfigureTestMG(MG_NEGATIVE_SAMPLING_TEST sampling/mg_negative_sampling.cpp) + + ############################################################################################### # - MG RANDOM_WALKS tests --------------------------------------------------------------------- ConfigureTestMG(MG_RANDOM_WALKS_TEST sampling/mg_random_walks_test.cpp) @@ -773,6 +792,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_HITS_TEST c_api/mg_hits_test.c) ConfigureCTestMG(MG_CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/mg_uniform_neighbor_sample_test.c) ConfigureCTestMG(MG_CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/mg_biased_neighbor_sample_test.c) + ConfigureCTestMG(MG_CAPI_NEGATIVE_SAMPLING_TEST c_api/mg_negative_sampling_test.c) ConfigureCTestMG(MG_CAPI_LOOKUP_SRC_DST_TEST c_api/mg_lookup_src_dst_test.c) ConfigureCTestMG(MG_CAPI_RANDOM_WALKS_TEST c_api/mg_random_walks_test.c) ConfigureCTestMG(MG_CAPI_TRIANGLE_COUNT_TEST c_api/mg_triangle_count_test.c) @@ -812,6 +832,7 @@ ConfigureCTest(CAPI_WEAKLY_CONNECTED_COMPONENTS_TEST c_api/weakly_connected_comp ConfigureCTest(CAPI_STRONGLY_CONNECTED_COMPONENTS_TEST c_api/strongly_connected_components_test.c) ConfigureCTest(CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/uniform_neighbor_sample_test.c) ConfigureCTest(CAPI_BIASED_NEIGHBOR_SAMPLE_TEST c_api/biased_neighbor_sample_test.c) +ConfigureCTest(CAPI_NEGATIVE_SAMPLING_TEST c_api/negative_sampling_test.c) ConfigureCTest(CAPI_RANDOM_WALKS_TEST c_api/sg_random_walks_test.c) ConfigureCTest(CAPI_TRIANGLE_COUNT_TEST c_api/triangle_count_test.c) ConfigureCTest(CAPI_LOUVAIN_TEST c_api/louvain_test.c) diff --git a/cpp/tests/c_api/mg_negative_sampling_test.c b/cpp/tests/c_api/mg_negative_sampling_test.c new file mode 100644 index 00000000000..3289206d8db --- /dev/null +++ b/cpp/tests/c_api/mg_negative_sampling_test.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mg_test_utils.h" /* RUN_MG_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_mg_test_graph_new(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + int rank = cugraph_resource_handle_get_rank(handle); + + if (num_biases > 0) { + if (rank == 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } else { + d_vertices_view = cugraph_type_erased_device_array_view_create(NULL, 0, vertex_tid); + d_src_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + d_dst_bias_view = cugraph_type_erased_device_array_view_create(NULL, 0, weight_tid); + } + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + 
d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // First, check that all edges are actually part of the graph + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge that with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge that with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); + cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_type_erased_device_array_free(d_dst_bias); + cugraph_coo_free(result); + cugraph_mg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + 
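The checks above build dense num_vertices x num_vertices existence and duplicate matrices, which is fine for these six-vertex fixtures but quadratic in general. For larger fixtures, the same false-negative check can be sketched with a hash set keyed on packed (src, dst) pairs (a C++ sketch assuming 32-bit vertex IDs; the actual tests are plain C):

#include <cstdint>
#include <unordered_set>
#include <vector>

using vertex_t = int32_t;

// Packs (src, dst) into one 64-bit key; assumes vertex IDs fit in 32 bits.
inline uint64_t edge_key(vertex_t s, vertex_t d)
{
  return (static_cast<uint64_t>(static_cast<uint32_t>(s)) << 32) |
         static_cast<uint32_t>(d);
}

// Returns true iff no sampled edge also exists in the graph.
bool check_no_false_negatives(std::vector<vertex_t> const& graph_srcs,
                              std::vector<vertex_t> const& graph_dsts,
                              std::vector<vertex_t> const& sample_srcs,
                              std::vector<vertex_t> const& sample_dsts)
{
  std::unordered_set<uint64_t> existing{};
  for (size_t i = 0; i < graph_srcs.size(); ++i) {
    existing.insert(edge_key(graph_srcs[i], graph_dsts[i]));
  }
  for (size_t i = 0; i < sample_srcs.size(); ++i) {
    if (existing.count(edge_key(sample_srcs[i], sample_dsts[i])) > 0) { return false; }
  }
  return true;
}

The duplicate check can reuse the same structure: insert each sampled edge as it is visited and fail if the insertion reports the key was already present.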
remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_negative_sampling_uniform, handle); + result |= RUN_MG_TEST(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/c_api/negative_sampling_test.c b/cpp/tests/c_api/negative_sampling_test.c new file mode 100644 index 00000000000..5e8d3f7e765 --- /dev/null +++ b/cpp/tests/c_api/negative_sampling_test.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include +#include +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +data_type_id_t vertex_tid = INT32; +data_type_id_t edge_tid = INT32; +data_type_id_t weight_tid = FLOAT32; +data_type_id_t edge_id_tid = INT32; +data_type_id_t edge_type_tid = INT32; + +int generic_negative_sampling_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + size_t num_vertices, + size_t num_edges, + size_t num_samples, + vertex_t* h_vertices, + weight_t* h_src_bias, + weight_t* h_dst_bias, + size_t num_biases, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples) +{ + // Create graph + int test_ret_value = 0; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + cugraph_graph_t* graph = NULL; + cugraph_coo_t* result = NULL; + + ret_code = create_sg_test_graph(handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + NULL, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + FALSE, + TRUE, + FALSE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_t* d_vertices = NULL; + cugraph_type_erased_device_array_view_t* d_vertices_view = NULL; + cugraph_type_erased_device_array_t* d_src_bias = NULL; + cugraph_type_erased_device_array_view_t* d_src_bias_view = NULL; + cugraph_type_erased_device_array_t* d_dst_bias = NULL; + cugraph_type_erased_device_array_view_t* d_dst_bias_view = NULL; + + if (num_biases > 0) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, vertex_tid, &d_vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_vertices create failed."); + + d_vertices_view = cugraph_type_erased_device_array_view(d_vertices); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_vertices_view, (byte_t*)h_vertices, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_src_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_src_bias create failed."); + + d_src_bias_view = cugraph_type_erased_device_array_view(d_src_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_src_bias_view, (byte_t*)h_src_bias, &ret_error); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_biases, weight_tid, &d_dst_bias, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_dst_bias create failed."); + + d_dst_bias_view = cugraph_type_erased_device_array_view(d_dst_bias); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_dst_bias_view, (byte_t*)h_dst_bias, &ret_error); + } + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + + ret_code = cugraph_negative_sampling(handle, + rng_state, + graph, + d_vertices_view, + d_src_bias_view, + d_dst_bias_view, + num_samples, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples, + FALSE, + &result, + &ret_error); + + cugraph_type_erased_device_array_view_t* result_srcs = NULL; + cugraph_type_erased_device_array_view_t* result_dsts = NULL; + + result_srcs = cugraph_coo_get_sources(result); + result_dsts = 
cugraph_coo_get_destinations(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); + + vertex_t h_result_srcs[result_size]; + vertex_t h_result_dsts[result_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_srcs, result_srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_dsts, result_dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + // First, check that all edges are actually part of the graph + int32_t M_exists[num_vertices][num_vertices]; + int32_t M_duplicates[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M_exists[i][j] = 0; + M_duplicates[i][j] = 0; + } + + for (int i = 0; i < num_edges; ++i) { + M_exists[h_src[i]][h_dst[i]] = 1; + } + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + (h_result_srcs[i] >= 0) && (h_result_srcs[i] < num_vertices), + "negative_sampling generated an edge that with an invalid vertex"); + TEST_ASSERT(test_ret_value, + (h_result_dsts[i] >= 0) && (h_result_dsts[i] < num_vertices), + "negative_sampling generated an edge that with an invalid vertex"); + if (remove_false_negatives == TRUE) { + TEST_ASSERT(test_ret_value, + M_exists[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a false negative edge that should be suppressed"); + } + + if (remove_duplicates == TRUE) { + TEST_ASSERT(test_ret_value, + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] == 0, + "negative_sampling generated a duplicate edge that should be suppressed"); + M_duplicates[h_result_srcs[i]][h_result_dsts[i]] = 1; + } + } + + if (exact_number_of_samples == TRUE) + TEST_ASSERT(test_ret_value, + result_size == num_samples, + "negative_sampling generated a result with an incorrect number of samples"); + + cugraph_type_erased_device_array_view_free(d_vertices_view); + cugraph_type_erased_device_array_view_free(d_src_bias_view); + cugraph_type_erased_device_array_view_free(d_dst_bias_view); + cugraph_type_erased_device_array_free(d_vertices); + cugraph_type_erased_device_array_free(d_src_bias); + cugraph_coo_free(result); + cugraph_sg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_negative_sampling_uniform(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 0; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + NULL, + NULL, + NULL, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int test_negative_sampling_biased(const cugraph_resource_handle_t* handle) +{ + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + size_t num_edges = 9; + size_t num_vertices = 6; + size_t num_biases = 6; + size_t num_samples = 10; + + vertex_t src[] = {0, 0, 1, 1, 2, 
2, 2, 3, 4}; + vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; + weight_t src_bias[] = {1, 1, 2, 2, 1, 1}; + weight_t dst_bias[] = {2, 2, 1, 1, 1, 1}; + vertex_t vertices[] = {0, 1, 2, 3, 4, 5}; + + bool_t remove_duplicates = FALSE; + bool_t remove_false_negatives = TRUE; + bool_t exact_number_of_samples = FALSE; + + return generic_negative_sampling_test(handle, + src, + dst, + num_vertices, + num_edges, + num_samples, + vertices, + src_bias, + dst_bias, + num_biases, + remove_duplicates, + remove_false_negatives, + exact_number_of_samples); +} + +int main(int argc, char** argv) +{ + cugraph_resource_handle_t* handle = NULL; + + handle = cugraph_create_resource_handle(NULL); + + int result = 0; + result |= RUN_TEST_NEW(test_negative_sampling_uniform, handle); + result |= RUN_TEST_NEW(test_negative_sampling_biased, handle); + + cugraph_free_resource_handle(handle); + + return result; +} diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c index 14108d91c04..a4a77b5775a 100644 --- a/cpp/tests/c_api/sg_random_walks_test.c +++ b/cpp/tests/c_api/sg_random_walks_test.c @@ -192,9 +192,6 @@ int generic_biased_random_walks_test(vertex_t* h_src, ret_code = cugraph_biased_random_walks(handle, graph, d_start_view, max_depth, &result, &ret_error); -#if 1 - TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "biased_random_walks should have failed") -#else TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "biased_random_walks failed."); @@ -208,10 +205,10 @@ int generic_biased_random_walks_test(vertex_t* h_src, size_t wgts_size = cugraph_type_erased_device_array_view_size(wgts); vertex_t h_result_verts[verts_size]; - vertex_t h_result_wgts[wgts_size]; + weight_t h_result_wgts[wgts_size]; - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_verts, verts, &ret_error); + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_verts, verts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -231,23 +228,35 @@ int generic_biased_random_walks_test(vertex_t* h_src, M[h_src[i]][h_dst[i]] = h_wgt[i]; TEST_ASSERT(test_ret_value, - cugraph_random_walk_result_get_max_path_length() == max_depth, + cugraph_random_walk_result_get_max_path_length(result) == max_depth, "path length does not match"); for (int i = 0; (i < num_starts) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_start[i]][h_result_verts[i * (max_depth + 1)]] == h_result_wgts[i * max_depth], - "biased_random_walks got edge that doesn't exist"); - for (size_t j = 1; j < cugraph_random_walk_result_get_max_path_length(); ++j) - TEST_ASSERT( - test_ret_value, - M[h_start[i * (max_depth + 1) + j - 1]][h_result_verts[i * (max_depth + 1) + j]] == - h_result_wgts[i * max_depth + j - 1], - "biased_random_walks got edge that doesn't exist"); + TEST_ASSERT( + test_ret_value, h_start[i] == h_result_verts[i * (max_depth + 1)], "start of path not found"); + for (size_t j = 0; j < max_depth; ++j) { + int src_index = i * (max_depth + 1) + j; + int dst_index = src_index + 1; + if (h_result_verts[dst_index] < 0) { + if (h_result_verts[src_index] >= 0) { + int departing_count = 0; + for (int k = 0; k < num_vertices; ++k) { + if (M[h_result_verts[src_index]][k] >= 0) departing_count++; + } + TEST_ASSERT(test_ret_value, + departing_count == 
0, + "biased_random_walks found no edge when an edge exists"); + } + } else { + TEST_ASSERT(test_ret_value, + M[h_result_verts[src_index]][h_result_verts[dst_index]] == + h_result_wgts[i * max_depth + j], + "biased_random_walks got edge that doesn't exist"); + } + } } cugraph_random_walk_result_free(result); -#endif cugraph_sg_graph_free(graph); cugraph_free_resource_handle(handle); @@ -302,10 +311,6 @@ int generic_node2vec_random_walks_test(vertex_t* h_src, ret_code = cugraph_node2vec_random_walks( handle, graph, d_start_view, max_depth, p, q, &result, &ret_error); -#if 1 - TEST_ASSERT( - test_ret_value, ret_code != CUGRAPH_SUCCESS, "node2vec_random_walks should have failed") -#else TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "node2vec_random_walks failed."); @@ -319,10 +324,10 @@ int generic_node2vec_random_walks_test(vertex_t* h_src, size_t wgts_size = cugraph_type_erased_device_array_view_size(wgts); vertex_t h_result_verts[verts_size]; - vertex_t h_result_wgts[wgts_size]; + weight_t h_result_wgts[wgts_size]; - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_verts, verts, &ret_error); + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_verts, verts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -342,23 +347,35 @@ int generic_node2vec_random_walks_test(vertex_t* h_src, M[h_src[i]][h_dst[i]] = h_wgt[i]; TEST_ASSERT(test_ret_value, - cugraph_random_walk_result_get_max_path_length() == max_depth, + cugraph_random_walk_result_get_max_path_length(result) == max_depth, "path length does not match"); for (int i = 0; (i < num_starts) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_start[i]][h_result_verts[i * (max_depth + 1)]] == h_result_wgts[i * max_depth], - "node2vec_random_walks got edge that doesn't exist"); - for (size_t j = 1; j < max_depth; ++j) - TEST_ASSERT( - test_ret_value, - M[h_start[i * (max_depth + 1) + j - 1]][h_result_verts[i * (max_depth + 1) + j]] == - h_result_wgts[i * max_depth + j - 1], - "node2vec_random_walks got edge that doesn't exist"); + TEST_ASSERT( + test_ret_value, h_start[i] == h_result_verts[i * (max_depth + 1)], "start of path not found"); + for (size_t j = 0; j < max_depth; ++j) { + int src_index = i * (max_depth + 1) + j; + int dst_index = src_index + 1; + if (h_result_verts[dst_index] < 0) { + if (h_result_verts[src_index] >= 0) { + int departing_count = 0; + for (int k = 0; k < num_vertices; ++k) { + if (M[h_result_verts[src_index]][k] >= 0) departing_count++; + } + TEST_ASSERT(test_ret_value, + departing_count == 0, + "node2vec_random_walks found no edge when an edge exists"); + } + } else { + TEST_ASSERT(test_ret_value, + M[h_result_verts[src_index]][h_result_verts[dst_index]] == + h_result_wgts[i * max_depth + j], + "node2vec_random_walks got edge that doesn't exist"); + } + } } cugraph_random_walk_result_free(result); -#endif cugraph_sg_graph_free(graph); cugraph_free_resource_handle(handle); @@ -390,7 +407,7 @@ int test_biased_random_walks() vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {0, 1, 2, 3, 4, 5, 6, 7}; + weight_t wgt[] = {1, 2, 3, 4, 5, 6, 7, 8}; vertex_t start[] = {2, 2}; return generic_biased_random_walks_test( diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu 
b/cpp/tests/mtmg/multi_node_threaded_test.cu index 06ccd4a7fa1..374c432aac5 100644 --- a/cpp/tests/mtmg/multi_node_threaded_test.cu +++ b/cpp/tests/mtmg/multi_node_threaded_test.cu @@ -39,6 +39,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu index d77d8a7659e..f698701eb08 100644 --- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu @@ -15,6 +15,7 @@ */ #include "prims/per_v_random_select_transform_outgoing_e.cuh" +#include "prims/transform_e.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -103,6 +104,7 @@ struct Prims_Usecase { bool with_replacement{false}; bool use_invalid_value{false}; bool use_weight_as_bias{false}; + bool inject_zero_bias{false}; // valid only when use_weight_as_bias is true bool edge_masking{false}; bool check_correctness{true}; }; @@ -159,6 +161,23 @@ class Tests_MGPerVRandomSelectTransformOutgoingE mg_graph_view.attach_edge_mask((*edge_mask).view()); } + if (mg_edge_weight_view && prims_usecase.inject_zero_bias) { + cugraph::transform_e( + *handle_, + mg_graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *mg_edge_weight_view, + [] __device__(auto src, auto dst, auto, auto, auto w) { + if ((src % 2) == 0 && (dst % 2) == 0) { + return weight_t{0.0}; + } else { + return w; + } + }, + (*mg_edge_weights).mutable_view()); + } + // 2. run MG per_v_random_select_transform_outgoing_e primitive const int hash_bin_count = 5; @@ -324,11 +343,14 @@ class Tests_MGPerVRandomSelectTransformOutgoingE } cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, - std::optional>{std::nullopt}, + mg_edge_weight_view, std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), @@ -347,6 +369,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE } auto sg_graph_view = sg_graph.view(); + auto sg_edge_weight_view = + sg_edge_weights ? 
std::make_optional((*sg_edge_weights).view()) : std::nullopt; rmm::device_uvector sg_offsets(sg_graph_view.number_of_vertices() + vertex_t{1}, handle_->get_stream()); @@ -361,6 +385,17 @@ class Tests_MGPerVRandomSelectTransformOutgoingE sg_graph_view.local_edge_partition_view().indices().end(), sg_indices.begin()); + std::optional> sg_biases{std::nullopt}; + if (sg_edge_weight_view) { + auto firsts = (*sg_edge_weight_view).value_firsts(); + auto counts = (*sg_edge_weight_view).edge_counts(); + assert(firsts.size() == 1); + assert(counts.size() == 1); + sg_biases = rmm::device_uvector(counts[0], handle_->get_stream()); + thrust::copy( + handle_->get_thrust_policy(), firsts[0], firsts[0] + counts[0], (*sg_biases).begin()); + } + auto num_invalids = static_cast(thrust::count_if( handle_->get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -371,9 +406,10 @@ class Tests_MGPerVRandomSelectTransformOutgoingE : thrust::nullopt, sample_e_op_result_first = cugraph::get_dataframe_buffer_begin(mg_aggregate_sample_e_op_results), - sg_offsets = sg_offsets.begin(), - sg_indices = sg_indices.begin(), - K = prims_usecase.K, + sg_offsets = sg_offsets.begin(), + sg_indices = sg_indices.begin(), + sg_biases = sg_biases ? thrust::make_optional((*sg_biases).begin()) : thrust::nullopt, + K = prims_usecase.K, with_replacement = prims_usecase.with_replacement, invalid_value = invalid_value ? thrust::make_optional(*invalid_value) : thrust::nullopt, @@ -402,6 +438,12 @@ class Tests_MGPerVRandomSelectTransformOutgoingE auto count = offset_last - offset_first; auto out_degree = *(sg_offsets + v + 1) - *(sg_offsets + v); + if (sg_biases) { + out_degree = thrust::count_if(thrust::seq, + *sg_biases + *(sg_offsets + v), + *sg_biases + *(sg_offsets + v + 1), + [] __device__(auto bias) { return bias > 0.0; }); + } if (with_replacement) { if ((out_degree > 0 && count != K) || (out_degree == 0 && count != 0)) { return true; @@ -418,12 +460,33 @@ class Tests_MGPerVRandomSelectTransformOutgoingE auto sg_dst = thrust::get<1>(e_op_result); auto sg_nbr_first = sg_indices + *(sg_offsets + sg_src); auto sg_nbr_last = sg_indices + *(sg_offsets + (sg_src + vertex_t{1})); - if (!thrust::binary_search(thrust::seq, - sg_nbr_first, - sg_nbr_last, - sg_dst)) { // assumed neighbor lists are sorted - return true; + auto sg_nbr_bias_first = + sg_biases ? 
thrust::make_optional((*sg_biases) + *(sg_offsets + sg_src)) + : thrust::nullopt; + if (sg_src != v) { return true; } + + if (sg_nbr_bias_first) { + auto lower_it = thrust::lower_bound(thrust::seq, sg_nbr_first, sg_nbr_last, sg_dst); + auto upper_it = thrust::upper_bound(thrust::seq, sg_nbr_first, sg_nbr_last, sg_dst); + bool found = false; + for (auto it = (*sg_nbr_bias_first + thrust::distance(sg_nbr_first, lower_it)); + it != (*sg_nbr_bias_first + thrust::distance(sg_nbr_first, upper_it)); + ++it) { + if (*it > 0.0) { + found = true; + break; + } + } + if (!found) { return true; } + } else { + if (!thrust::binary_search(thrust::seq, + sg_nbr_first, + sg_nbr_last, + sg_dst)) { // assumed neighbor lists are sorted + return true; + } } + property_t src_val{}; property_t dst_val{}; if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value) { @@ -443,20 +506,25 @@ class Tests_MGPerVRandomSelectTransformOutgoingE thrust::get<1>(sample_e_op_result_first.get_iterator_tuple()) + offset_first; auto sg_dst_last = thrust::get<1>(sample_e_op_result_first.get_iterator_tuple()) + offset_last; - auto dst_count = - thrust::count(thrust::seq, - sg_dst_first, - sg_dst_last, - sg_dst); // this could be inefficient for high-degree vertices, if - // we sort [sg_dst_first, sg_dst_last) we can use binary - // search but we may better not modify the sampling output - // and allow inefficiency as this is just for testing - auto multiplicity = thrust::distance( - thrust::lower_bound(thrust::seq, sg_nbr_first, sg_nbr_last, sg_dst), + auto dst_count = thrust::count(thrust::seq, sg_dst_first, sg_dst_last, sg_dst); + auto lower_it = + thrust::lower_bound(thrust::seq, + sg_nbr_first, + sg_nbr_last, + sg_dst); // this assumes neighbor lists are sorted + auto upper_it = thrust::upper_bound(thrust::seq, sg_nbr_first, sg_nbr_last, - sg_dst)); // this assumes neighbor lists are sorted + sg_dst); // this assumes neighbor lists are sorted + auto multiplicity = + sg_nbr_bias_first + ? 
thrust::count_if( + thrust::seq, + *sg_nbr_bias_first + thrust::distance(sg_nbr_first, lower_it), + *sg_nbr_bias_first + thrust::distance(sg_nbr_first, upper_it), + [] __device__(auto bias) { return bias > 0.0; }) + : thrust::distance(lower_it, upper_it); if (dst_count > multiplicity) { return true; } } } @@ -547,44 +615,60 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVRandomSelectTransformOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( file_large_test, Tests_MGPerVRandomSelectTransformOutgoingE_File, ::testing::Combine( - 
::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -593,22 +677,30 @@ INSTANTIATE_TEST_SUITE_P( rmat_small_test, Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true}, - 
Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -620,22 +712,30 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, ::testing::Combine( ::testing::Values( - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, false}, - Prims_Usecase{size_t{10000000}, 
size_t{25}, true, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, false}), + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, true, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/detail/nbr_sampling_validate.cu b/cpp/tests/sampling/detail/nbr_sampling_validate.cu index 61731e2e15c..70828e559f1 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_validate.cu +++ b/cpp/tests/sampling/detail/nbr_sampling_validate.cu @@ -75,6 +75,8 @@ struct ArithmeticZipLess { } else { return thrust::get<1>(left) < thrust::get<1>(right); } + } else { + return false; } } }; diff --git a/cpp/tests/sampling/detail/sampling_post_processing_validate.cu b/cpp/tests/sampling/detail/sampling_post_processing_validate.cu new file mode 100644 index 00000000000..a0babc3b921 --- /dev/null +++ b/cpp/tests/sampling/detail/sampling_post_processing_validate.cu @@ -0,0 +1,1738 @@ +/* + * Copyright (c) 2022-2024, NVIDIA 
CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+template <typename index_t>
+bool check_offsets(raft::handle_t const& handle,
+                   raft::device_span<index_t const> offsets,
+                   index_t num_segments,
+                   index_t num_elements)
+{
+  if (offsets.size() != num_segments + 1) { return false; }
+
+  if (!thrust::is_sorted(handle.get_thrust_policy(), offsets.begin(), offsets.end())) {
+    return false;
+  }
+
+  index_t front_element{};
+  index_t back_element{};
+  raft::update_host(&front_element, offsets.data(), index_t{1}, handle.get_stream());
+  raft::update_host(
+    &back_element, offsets.data() + offsets.size() - 1, index_t{1}, handle.get_stream());
+  handle.sync_stream();
+
+  if (front_element != index_t{0}) { return false; }
+
+  if (back_element != num_elements) { return false; }
+
+  return true;
+}
+
+template bool check_offsets<size_t>(raft::handle_t const& handle,
+                                    raft::device_span<size_t const> offsets,
+                                    size_t num_segments,
+                                    size_t num_elements);
+
+template <typename vertex_t>
+bool check_edgelist_is_sorted(raft::handle_t const& handle,
+                              raft::device_span<vertex_t const> edgelist_majors,
+                              raft::device_span<vertex_t const> edgelist_minors)
+{
+  auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin());
+  return thrust::is_sorted(
+    handle.get_thrust_policy(), edge_first, edge_first + edgelist_majors.size());
+}
+
+template bool check_edgelist_is_sorted<int32_t>(
+  raft::handle_t const& handle,
+  raft::device_span<int32_t const> edgelist_majors,
+  raft::device_span<int32_t const> edgelist_minors);
+
+template bool check_edgelist_is_sorted<int64_t>(
+  raft::handle_t const& handle,
+  raft::device_span<int64_t const> edgelist_majors,
+  raft::device_span<int64_t const> edgelist_minors);
+
+// unrenumber the renumbered edge list and check whether the original & unrenumbered edge lists
+// are identical
+template <typename vertex_t, typename weight_t>
+bool compare_edgelist(raft::handle_t const& handle,
+                      raft::device_span<vertex_t const> org_edgelist_srcs,
+                      raft::device_span<vertex_t const> org_edgelist_dsts,
+                      std::optional<raft::device_span<weight_t const>> org_edgelist_weights,
+                      std::optional<raft::device_span<size_t const>> org_edgelist_label_offsets,
+                      raft::device_span<vertex_t const> renumbered_edgelist_srcs,
+                      raft::device_span<vertex_t const> renumbered_edgelist_dsts,
+                      std::optional<raft::device_span<weight_t const>> renumbered_edgelist_weights,
+                      std::optional<raft::device_span<vertex_t const>> renumber_map,
+                      std::optional<raft::device_span<size_t const>> renumber_map_label_offsets,
+                      size_t num_labels)
+{
+  if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; }
+
+  for (size_t i = 0; i < num_labels; ++i) {
+    size_t label_start_offset{0};
+    size_t label_end_offset = org_edgelist_srcs.size();
+    if (org_edgelist_label_offsets) {
+      raft::update_host(&label_start_offset,
+                        (*org_edgelist_label_offsets).data() + i,
+                        size_t{1},
+                        handle.get_stream());
+      raft::update_host(&label_end_offset,
+                        (*org_edgelist_label_offsets).data() + i + 1,
+                        size_t{1},
+                        handle.get_stream());
+      handle.sync_stream();
+    }
+
+    if (label_start_offset == label_end_offset) { continue; }
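+    // For intuition (hypothetical numbers): with 2 labels over 7 sampled
+    // edges, org_edgelist_label_offsets would look like {0, 3, 7}, so label 0
+    // owns rows [0, 3) and label 1 owns rows [3, 7); that is exactly the
+    // [label_start_offset, label_end_offset) slice copied and sorted below.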
+    rmm::device_uvector<vertex_t> this_label_sorted_org_edgelist_srcs(
+      label_end_offset - label_start_offset, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(),
+                 org_edgelist_srcs.begin() + label_start_offset,
+                 org_edgelist_srcs.begin() + label_end_offset,
+                 this_label_sorted_org_edgelist_srcs.begin());
+    rmm::device_uvector<vertex_t> this_label_sorted_org_edgelist_dsts(
+      label_end_offset - label_start_offset, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(),
+                 org_edgelist_dsts.begin() + label_start_offset,
+                 org_edgelist_dsts.begin() + label_end_offset,
+                 this_label_sorted_org_edgelist_dsts.begin());
+    auto this_label_sorted_org_edgelist_weights =
+      org_edgelist_weights ? std::make_optional<rmm::device_uvector<weight_t>>(
+                               label_end_offset - label_start_offset, handle.get_stream())
+                           : std::nullopt;
+    if (this_label_sorted_org_edgelist_weights) {
+      thrust::copy(handle.get_thrust_policy(),
+                   (*org_edgelist_weights).begin() + label_start_offset,
+                   (*org_edgelist_weights).begin() + label_end_offset,
+                   (*this_label_sorted_org_edgelist_weights).begin());
+    }
+
+    if (this_label_sorted_org_edgelist_weights) {
+      auto sorted_org_edge_first =
+        thrust::make_zip_iterator(this_label_sorted_org_edgelist_srcs.begin(),
+                                  this_label_sorted_org_edgelist_dsts.begin(),
+                                  (*this_label_sorted_org_edgelist_weights).begin());
+      thrust::sort(handle.get_thrust_policy(),
+                   sorted_org_edge_first,
+                   sorted_org_edge_first + this_label_sorted_org_edgelist_srcs.size());
+    } else {
+      auto sorted_org_edge_first = thrust::make_zip_iterator(
+        this_label_sorted_org_edgelist_srcs.begin(), this_label_sorted_org_edgelist_dsts.begin());
+      thrust::sort(handle.get_thrust_policy(),
+                   sorted_org_edge_first,
+                   sorted_org_edge_first + this_label_sorted_org_edgelist_srcs.size());
+    }
+
+    rmm::device_uvector<vertex_t> this_label_sorted_unrenumbered_edgelist_srcs(
+      label_end_offset - label_start_offset, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(),
+                 renumbered_edgelist_srcs.begin() + label_start_offset,
+                 renumbered_edgelist_srcs.begin() + label_end_offset,
+                 this_label_sorted_unrenumbered_edgelist_srcs.begin());
+    rmm::device_uvector<vertex_t> this_label_sorted_unrenumbered_edgelist_dsts(
+      label_end_offset - label_start_offset, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(),
+                 renumbered_edgelist_dsts.begin() + label_start_offset,
+                 renumbered_edgelist_dsts.begin() + label_end_offset,
+                 this_label_sorted_unrenumbered_edgelist_dsts.begin());
+    auto this_label_sorted_unrenumbered_edgelist_weights =
+      renumbered_edgelist_weights ?
std::make_optional>( + label_end_offset - label_start_offset, handle.get_stream()) + : std::nullopt; + if (this_label_sorted_unrenumbered_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_edgelist_weights).begin() + label_start_offset, + (*renumbered_edgelist_weights).begin() + label_end_offset, + (*this_label_sorted_unrenumbered_edgelist_weights).begin()); + } + + if (renumber_map) { + size_t renumber_map_label_start_offset{0}; + size_t renumber_map_label_end_offset = (*renumber_map).size(); + if (renumber_map_label_offsets) { + raft::update_host(&renumber_map_label_start_offset, + (*renumber_map_label_offsets).data() + i, + size_t{1}, + handle.get_stream()); + raft::update_host(&renumber_map_label_end_offset, + (*renumber_map_label_offsets).data() + i + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + cugraph::unrenumber_int_vertices( + handle, + this_label_sorted_unrenumbered_edgelist_srcs.data(), + this_label_sorted_unrenumbered_edgelist_srcs.size(), + (*renumber_map).data() + renumber_map_label_start_offset, + std::vector{ + static_cast(renumber_map_label_end_offset - renumber_map_label_start_offset)}); + cugraph::unrenumber_int_vertices( + handle, + this_label_sorted_unrenumbered_edgelist_dsts.data(), + this_label_sorted_unrenumbered_edgelist_dsts.size(), + (*renumber_map).data() + renumber_map_label_start_offset, + std::vector{ + static_cast(renumber_map_label_end_offset - renumber_map_label_start_offset)}); + } + + if (this_label_sorted_unrenumbered_edgelist_weights) { + auto sorted_unrenumbered_edge_first = + thrust::make_zip_iterator(this_label_sorted_unrenumbered_edgelist_srcs.begin(), + this_label_sorted_unrenumbered_edgelist_dsts.begin(), + (*this_label_sorted_unrenumbered_edgelist_weights).begin()); + thrust::sort( + handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + this_label_sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = + thrust::make_zip_iterator(this_label_sorted_org_edgelist_srcs.begin(), + this_label_sorted_org_edgelist_dsts.begin(), + (*this_label_sorted_org_edgelist_weights).begin()); + if (!thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + this_label_sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first)) { + return false; + } + } else { + auto sorted_unrenumbered_edge_first = + thrust::make_zip_iterator(this_label_sorted_unrenumbered_edgelist_srcs.begin(), + this_label_sorted_unrenumbered_edgelist_dsts.begin()); + thrust::sort( + handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + this_label_sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = thrust::make_zip_iterator( + this_label_sorted_org_edgelist_srcs.begin(), this_label_sorted_org_edgelist_dsts.begin()); + if (!thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + this_label_sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first)) { + return false; + } + } + } + + return true; +} + +template bool compare_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map, + std::optional> renumber_map_label_offsets, + 
size_t num_labels); + +template bool compare_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map, + std::optional> renumber_map_label_offsets, + size_t num_labels); + +template bool compare_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map, + std::optional> renumber_map_label_offsets, + size_t num_labels); + +template bool compare_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map, + std::optional> renumber_map_label_offsets, + size_t num_labels); + +// unrenumber the renumbered edge list and check whether the original & unrenumbered edge lists +// are identical +template +bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops) +{ + if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; } + + for (size_t i = 0; i < num_labels; ++i) { + size_t label_start_offset{0}; + size_t label_end_offset = org_edgelist_srcs.size(); + if (org_edgelist_label_offsets) { + raft::update_host(&label_start_offset, + (*org_edgelist_label_offsets).data() + i, + size_t{1}, + handle.get_stream()); + raft::update_host(&label_end_offset, + (*org_edgelist_label_offsets).data() + i + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (label_start_offset == label_end_offset) { continue; } + + if (renumbered_edgelist_label_edge_type_hop_offsets) { + size_t renumbered_label_start_offset{0}; + size_t renumbered_label_end_offset{0}; + raft::update_host( + &renumbered_label_start_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + i * num_edge_types * num_hops, + size_t{1}, + handle.get_stream()); + raft::update_host(&renumbered_label_end_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + (i + 1) * num_edge_types * num_hops, + 
size_t{1}, + handle.get_stream()); + handle.sync_stream(); + if (renumbered_label_start_offset != label_start_offset) { return false; } + if (renumbered_label_end_offset != label_end_offset) { return false; } + } + + // sort org edgelist by ((edge_type), (hop), src, dst, (weight), (edge ID)) + + rmm::device_uvector this_label_org_sorted_indices(label_end_offset - label_start_offset, + handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), + this_label_org_sorted_indices.begin(), + this_label_org_sorted_indices.end(), + size_t{0}); + + thrust::sort( + handle.get_thrust_policy(), + this_label_org_sorted_indices.begin(), + this_label_org_sorted_indices.end(), + [edge_types = org_edgelist_edge_types + ? thrust::make_optional>( + (*org_edgelist_edge_types).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt, + hops = org_edgelist_hops ? thrust::make_optional>( + (*org_edgelist_hops).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt, + srcs = raft::device_span(org_edgelist_srcs.data() + label_start_offset, + label_end_offset - label_start_offset), + dsts = raft::device_span(org_edgelist_dsts.data() + label_start_offset, + label_end_offset - label_start_offset), + weights = org_edgelist_weights ? thrust::make_optional>( + (*org_edgelist_weights).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt, + edge_ids = org_edgelist_edge_ids ? thrust::make_optional>( + (*org_edgelist_edge_ids).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt] __device__(size_t l_idx, size_t r_idx) { + edge_type_t l_edge_type{0}; + edge_type_t r_edge_type{0}; + if (edge_types) { + l_edge_type = (*edge_types)[l_idx]; + r_edge_type = (*edge_types)[r_idx]; + } + + int32_t l_hop{0}; + int32_t r_hop{0}; + if (hops) { + l_hop = (*hops)[l_idx]; + r_hop = (*hops)[r_idx]; + } + + vertex_t l_src = srcs[l_idx]; + vertex_t r_src = srcs[r_idx]; + + vertex_t l_dst = dsts[l_idx]; + vertex_t r_dst = dsts[r_idx]; + + weight_t l_weight{0.0}; + weight_t r_weight{0.0}; + if (weights) { + l_weight = (*weights)[l_idx]; + r_weight = (*weights)[r_idx]; + } + + edge_id_t l_edge_id{0}; + edge_id_t r_edge_id{0}; + if (edge_ids) { + l_edge_id = (*edge_ids)[l_idx]; + r_edge_id = (*edge_ids)[r_idx]; + } + + return thrust::make_tuple(l_edge_type, l_hop, l_src, l_dst, l_weight, l_edge_id) < + thrust::make_tuple(r_edge_type, r_hop, r_src, r_dst, r_weight, r_edge_id); + }); + + for (size_t j = 0; j < num_edge_types; ++j) { + auto edge_type_start_offset = label_start_offset; + auto edge_type_end_offset = label_end_offset; + if (renumbered_edgelist_label_edge_type_hop_offsets) { + raft::update_host(&edge_type_start_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + j * num_hops, + size_t{1}, + handle.get_stream()); + raft::update_host(&edge_type_end_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + (j + 1) * num_hops, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (edge_type_start_offset == edge_type_end_offset) { continue; } + + if (org_edgelist_edge_types) { + if (static_cast(thrust::count_if( + handle.get_thrust_policy(), + this_label_org_sorted_indices.begin() + (edge_type_start_offset - label_start_offset), + this_label_org_sorted_indices.begin() + (edge_type_end_offset - label_start_offset), + [edge_types = raft::device_span( + 
(*org_edgelist_edge_types).data() + label_start_offset, + label_end_offset - label_start_offset), + edge_type = static_cast(j)] __device__(auto i) { + return edge_types[i] == edge_type; + })) != edge_type_end_offset - edge_type_start_offset) { + return false; + } + } + + if (org_edgelist_hops) { + for (size_t k = 0; k < num_hops; ++k) { + auto hop_start_offset = edge_type_start_offset; + auto hop_end_offset = edge_type_end_offset; + if (renumbered_edgelist_label_edge_type_hop_offsets) { + raft::update_host(&hop_start_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + j * num_hops + k, + size_t{1}, + handle.get_stream()); + raft::update_host(&hop_end_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + j * num_hops + k + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (hop_start_offset == hop_end_offset) { continue; } + + if (static_cast(thrust::count_if( + handle.get_thrust_policy(), + this_label_org_sorted_indices.begin() + (hop_start_offset - label_start_offset), + this_label_org_sorted_indices.begin() + (hop_end_offset - label_start_offset), + [hops = raft::device_span( + (*org_edgelist_hops).data() + label_start_offset, + label_end_offset - label_start_offset), + hop = static_cast(k)] __device__(auto i) { return hops[i] == hop; })) != + hop_end_offset - hop_start_offset) { + return false; + } + } + } + + // unrenumber source vertex IDs + + rmm::device_uvector this_edge_type_unrenumbered_edgelist_srcs( + edge_type_end_offset - edge_type_start_offset, handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_srcs.begin() + edge_type_start_offset, + renumbered_edgelist_srcs.begin() + edge_type_end_offset, + this_edge_type_unrenumbered_edgelist_srcs.begin()); + { + vertex_t org_src{}; + raft::update_host(&org_src, + org_edgelist_srcs.data() + label_start_offset + + this_label_org_sorted_indices.element( + edge_type_start_offset - label_start_offset, handle.get_stream()), + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + auto vertex_type = thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(handle.get_thrust_policy(), + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + org_src)); + size_t renumber_map_label_start_offset{}; + size_t renumber_map_label_end_offset{}; + raft::update_host( + &renumber_map_label_start_offset, + vertex_renumber_map_label_type_offsets.data() + i * num_vertex_types + vertex_type, + size_t{1}, + handle.get_stream()); + raft::update_host( + &renumber_map_label_end_offset, + vertex_renumber_map_label_type_offsets.data() + i * num_vertex_types + vertex_type + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + auto renumber_map = raft::device_span( + vertex_renumber_map.data() + renumber_map_label_start_offset, + renumber_map_label_end_offset - renumber_map_label_start_offset); + cugraph::unrenumber_int_vertices( + handle, + this_edge_type_unrenumbered_edgelist_srcs.data(), + edge_type_end_offset - edge_type_start_offset, + renumber_map.data(), + std::vector{static_cast(renumber_map.size())}); + } + + // unrenumber destination vertex IDs + + rmm::device_uvector this_edge_type_unrenumbered_edgelist_dsts( + edge_type_end_offset - edge_type_start_offset, handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_dsts.begin() + edge_type_start_offset, + renumbered_edgelist_dsts.begin() + edge_type_end_offset, + 
this_edge_type_unrenumbered_edgelist_dsts.begin()); + { + vertex_t org_dst{}; + raft::update_host(&org_dst, + org_edgelist_dsts.data() + label_start_offset + + this_label_org_sorted_indices.element( + edge_type_start_offset - label_start_offset, handle.get_stream()), + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + auto vertex_type = thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(handle.get_thrust_policy(), + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + org_dst)); + size_t renumber_map_label_start_offset{0}; + size_t renumber_map_label_end_offset{}; + raft::update_host( + &renumber_map_label_start_offset, + vertex_renumber_map_label_type_offsets.data() + i * num_vertex_types + vertex_type, + size_t{1}, + handle.get_stream()); + raft::update_host( + &renumber_map_label_end_offset, + vertex_renumber_map_label_type_offsets.data() + i * num_vertex_types + vertex_type + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + auto renumber_map = raft::device_span( + vertex_renumber_map.data() + renumber_map_label_start_offset, + renumber_map_label_end_offset - renumber_map_label_start_offset); + cugraph::unrenumber_int_vertices( + handle, + this_edge_type_unrenumbered_edgelist_dsts.data(), + edge_type_end_offset - edge_type_start_offset, + renumber_map.data(), + std::vector{static_cast(renumber_map.size())}); + } + + // unrenumber edge IDs + + std::optional> unrenumbered_edgelist_edge_ids{std::nullopt}; + if (renumbered_edgelist_edge_ids) { + unrenumbered_edgelist_edge_ids = rmm::device_uvector( + edge_type_end_offset - edge_type_start_offset, handle.get_stream()); + size_t renumber_map_type_start_offset{0}; + size_t renumber_map_type_end_offset = (*edge_id_renumber_map).size(); + if (edge_id_renumber_map_label_type_offsets) { + raft::update_host(&renumber_map_type_start_offset, + (*edge_id_renumber_map_label_type_offsets).data() + i * num_edge_types + + static_cast(j), + size_t{1}, + handle.get_stream()); + raft::update_host(&renumber_map_type_end_offset, + (*edge_id_renumber_map_label_type_offsets).data() + i * num_edge_types + + static_cast(j) + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + auto renumber_map = raft::device_span( + (*edge_id_renumber_map).data() + renumber_map_type_start_offset, + renumber_map_type_end_offset - renumber_map_type_start_offset); + thrust::gather(handle.get_thrust_policy(), + (*renumbered_edgelist_edge_ids).begin() + edge_type_start_offset, + (*renumbered_edgelist_edge_ids).begin() + edge_type_end_offset, + renumber_map.begin(), + (*unrenumbered_edgelist_edge_ids).begin()); + } + + // sort sorted & renumbered edgelist by (src, dst, (weight), (edge ID)) + + rmm::device_uvector this_edge_type_unrenumbered_sorted_indices( + edge_type_end_offset - edge_type_start_offset, handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), + this_edge_type_unrenumbered_sorted_indices.begin(), + this_edge_type_unrenumbered_sorted_indices.end(), + size_t{0}); + + for (size_t k = 0; k < num_hops; ++k) { + auto hop_start_offset = edge_type_start_offset; + auto hop_end_offset = edge_type_end_offset; + if (renumbered_edgelist_label_edge_type_hop_offsets) { + raft::update_host(&hop_start_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + j * num_hops + k, + size_t{1}, + handle.get_stream()); + raft::update_host(&hop_end_offset, + (*renumbered_edgelist_label_edge_type_hop_offsets).data() + + i * num_edge_types * num_hops + j * 
num_hops + k + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (hop_start_offset == hop_end_offset) { continue; } + + thrust::sort( + handle.get_thrust_policy(), + this_edge_type_unrenumbered_sorted_indices.begin() + + (hop_start_offset - edge_type_start_offset), + this_edge_type_unrenumbered_sorted_indices.begin() + + (hop_end_offset - edge_type_start_offset), + [srcs = + raft::device_span(this_edge_type_unrenumbered_edgelist_srcs.data(), + this_edge_type_unrenumbered_edgelist_srcs.size()), + dsts = + raft::device_span(this_edge_type_unrenumbered_edgelist_dsts.data(), + this_edge_type_unrenumbered_edgelist_dsts.size()), + weights = renumbered_edgelist_weights + ? thrust::make_optional>( + (*renumbered_edgelist_weights).data() + edge_type_start_offset, + edge_type_end_offset - edge_type_start_offset) + : thrust::nullopt, + edge_ids = renumbered_edgelist_edge_ids + ? thrust::make_optional>( + (*renumbered_edgelist_edge_ids).data() + edge_type_start_offset, + edge_type_end_offset - edge_type_start_offset) + : thrust::nullopt] __device__(size_t l_idx, size_t r_idx) { + vertex_t l_src = srcs[l_idx]; + vertex_t r_src = srcs[r_idx]; + + vertex_t l_dst = dsts[l_idx]; + vertex_t r_dst = dsts[r_idx]; + + weight_t l_weight{0.0}; + weight_t r_weight{0.0}; + if (weights) { + l_weight = (*weights)[l_idx]; + r_weight = (*weights)[r_idx]; + } + + edge_id_t l_edge_id{0}; + edge_id_t r_edge_id{0}; + if (edge_ids) { + l_edge_id = (*edge_ids)[l_idx]; + r_edge_id = (*edge_ids)[r_idx]; + } + + return thrust::make_tuple(l_src, l_dst, l_weight, l_edge_id) < + thrust::make_tuple(r_src, r_dst, r_weight, r_edge_id); + }); + } + + // compare + + if (!thrust::equal( + handle.get_thrust_policy(), + this_label_org_sorted_indices.begin() + (edge_type_start_offset - label_start_offset), + this_label_org_sorted_indices.begin() + (edge_type_end_offset - label_start_offset), + this_edge_type_unrenumbered_sorted_indices.begin(), + [org_srcs = + raft::device_span(org_edgelist_srcs.data() + label_start_offset, + label_end_offset - label_start_offset), + org_dsts = + raft::device_span(org_edgelist_dsts.data() + label_start_offset, + label_end_offset - label_start_offset), + org_weights = org_edgelist_weights + ? thrust::make_optional>( + (*org_edgelist_weights).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt, + org_edge_ids = org_edgelist_edge_ids + ? thrust::make_optional>( + (*org_edgelist_edge_ids).data() + label_start_offset, + label_end_offset - label_start_offset) + : thrust::nullopt, + unrenumbered_srcs = + raft::device_span(this_edge_type_unrenumbered_edgelist_srcs.data(), + this_edge_type_unrenumbered_edgelist_srcs.size()), + unrenumbered_dsts = + raft::device_span(this_edge_type_unrenumbered_edgelist_dsts.data(), + this_edge_type_unrenumbered_edgelist_dsts.size()), + unrenumbered_weights = + renumbered_edgelist_weights + ? thrust::make_optional>( + (*renumbered_edgelist_weights).data() + edge_type_start_offset, + edge_type_end_offset - edge_type_start_offset) + : thrust::nullopt, + unrenumbered_edge_ids = + unrenumbered_edgelist_edge_ids + ? 
thrust::make_optional>( + (*unrenumbered_edgelist_edge_ids).data(), + (*unrenumbered_edgelist_edge_ids).size()) + : thrust:: + nullopt] __device__(size_t org_idx /* from label_start_offset */, + size_t + unrenumbered_idx /* from edge_type_start_offset */) { + auto org_src = org_srcs[org_idx]; + auto unrenumbered_src = unrenumbered_srcs[unrenumbered_idx]; + if (org_src != unrenumbered_src) { return false; } + + auto org_dst = org_dsts[org_idx]; + auto unrenumbered_dst = unrenumbered_dsts[unrenumbered_idx]; + if (org_dst != unrenumbered_dst) { return false; } + + weight_t org_weight{0.0}; + if (org_weights) { org_weight = (*org_weights)[org_idx]; } + weight_t unrenumbered_weight{0.0}; + if (unrenumbered_weights) { + unrenumbered_weight = (*unrenumbered_weights)[unrenumbered_idx]; + } + if (org_weight != unrenumbered_weight) { return false; } + + edge_id_t org_edge_id{0}; + if (org_edge_ids) { org_edge_id = (*org_edge_ids)[org_idx]; } + edge_id_t unrenumbered_edge_id{0}; + if (unrenumbered_edge_ids) { + unrenumbered_edge_id = (*unrenumbered_edge_ids)[unrenumbered_idx]; + } + + return org_edge_id == unrenumbered_edge_id; + })) { + return false; + } + } + } + + return true; +} + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> 
renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template +bool check_vertex_renumber_map_invariants( + raft::handle_t const& handle, + std::optional> starting_vertices, + std::optional> starting_vertex_label_offsets, + raft::device_span org_edgelist_srcs, + raft::device_span 
org_edgelist_dsts, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + std::optional> vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + bool src_is_major) +{ + // Check the invariants in renumber_map. + // Say we found the minimum (primary key: hop, secondary key: flag) pair for every unique vertex, + // where the flag is 0 for majors and 1 for minors. Then, vertices with smaller (hop, flag) + // pairs should be renumbered to smaller numbers than vertices with larger (hop, flag) pairs. + auto org_edgelist_majors = src_is_major ? org_edgelist_srcs : org_edgelist_dsts; + auto org_edgelist_minors = src_is_major ? org_edgelist_dsts : org_edgelist_srcs; + + for (size_t i = 0; i < num_labels; ++i) { + size_t label_start_offset{0}; + auto label_end_offset = org_edgelist_majors.size(); + if (org_edgelist_label_offsets) { + raft::update_host(&label_start_offset, + (*org_edgelist_label_offsets).data() + i, + size_t{1}, + handle.get_stream()); + raft::update_host(&label_end_offset, + (*org_edgelist_label_offsets).data() + i + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (label_start_offset == label_end_offset) { continue; } + + // compute (unique major, min_hop) pairs + + rmm::device_uvector this_label_unique_majors(label_end_offset - label_start_offset, + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_majors.begin() + label_start_offset, + org_edgelist_majors.begin() + label_end_offset, + this_label_unique_majors.begin()); + if (starting_vertices) { + size_t starting_vertex_label_start_offset{0}; + auto starting_vertex_label_end_offset = (*starting_vertices).size(); + if (starting_vertex_label_offsets) { + raft::update_host(&starting_vertex_label_start_offset, + (*starting_vertex_label_offsets).data() + i, + size_t{1}, + handle.get_stream()); + raft::update_host(&starting_vertex_label_end_offset, + (*starting_vertex_label_offsets).data() + i + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + auto old_size = this_label_unique_majors.size(); + this_label_unique_majors.resize( + old_size + starting_vertex_label_end_offset - starting_vertex_label_start_offset, + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*starting_vertices).begin() + starting_vertex_label_start_offset, + (*starting_vertices).begin() + starting_vertex_label_end_offset, + this_label_unique_majors.begin() + old_size); + } + + std::optional> this_label_unique_major_hops = + org_edgelist_hops ?
std::make_optional>( + label_end_offset - label_start_offset, handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin() + label_start_offset, + (*org_edgelist_hops).begin() + label_end_offset, + (*this_label_unique_major_hops).begin()); + if (starting_vertices) { + auto old_size = (*this_label_unique_major_hops).size(); + (*this_label_unique_major_hops) + .resize(this_label_unique_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*this_label_unique_major_hops).begin() + old_size, + (*this_label_unique_major_hops).end(), + int32_t{0}); + } + + auto pair_first = thrust::make_zip_iterator(this_label_unique_majors.begin(), + (*this_label_unique_major_hops).begin()); + thrust::sort( + handle.get_thrust_policy(), pair_first, pair_first + this_label_unique_majors.size()); + this_label_unique_majors.resize(thrust::distance(this_label_unique_majors.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + this_label_unique_majors.begin(), + this_label_unique_majors.end(), + (*this_label_unique_major_hops).begin()))), + handle.get_stream()); + (*this_label_unique_major_hops).resize(this_label_unique_majors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), + this_label_unique_majors.begin(), + this_label_unique_majors.end()); + this_label_unique_majors.resize( + thrust::distance(this_label_unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_label_unique_majors.begin(), + this_label_unique_majors.end())), + handle.get_stream()); + } + + // compute (unique minor, min_hop) pairs + + rmm::device_uvector this_label_unique_minors(label_end_offset - label_start_offset, + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_minors.begin() + label_start_offset, + org_edgelist_minors.begin() + label_end_offset, + this_label_unique_minors.begin()); + std::optional> this_label_unique_minor_hops = + org_edgelist_hops ? 
std::make_optional>( + label_end_offset - label_start_offset, handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin() + label_start_offset, + (*org_edgelist_hops).begin() + label_end_offset, + (*this_label_unique_minor_hops).begin()); + + auto pair_first = thrust::make_zip_iterator(this_label_unique_minors.begin(), + (*this_label_unique_minor_hops).begin()); + thrust::sort( + handle.get_thrust_policy(), pair_first, pair_first + this_label_unique_minors.size()); + this_label_unique_minors.resize(thrust::distance(this_label_unique_minors.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + this_label_unique_minors.begin(), + this_label_unique_minors.end(), + (*this_label_unique_minor_hops).begin()))), + handle.get_stream()); + (*this_label_unique_minor_hops).resize(this_label_unique_minors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), + this_label_unique_minors.begin(), + this_label_unique_minors.end()); + this_label_unique_minors.resize( + thrust::distance(this_label_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_label_unique_minors.begin(), + this_label_unique_minors.end())), + handle.get_stream()); + } + + for (size_t j = 0; j < num_vertex_types; ++j) { + size_t renumber_map_type_start_offset{0}; + auto renumber_map_type_end_offset = renumber_map.size(); + if (renumber_map_label_type_offsets) { + raft::update_host(&renumber_map_type_start_offset, + (*renumber_map_label_type_offsets).data() + i * num_vertex_types + j, + size_t{1}, + handle.get_stream()); + raft::update_host(&renumber_map_type_end_offset, + (*renumber_map_label_type_offsets).data() + i * num_vertex_types + j + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + rmm::device_uvector this_type_sorted_org_vertices( + renumber_map_type_end_offset - renumber_map_type_start_offset, handle.get_stream()); + rmm::device_uvector this_type_matching_renumbered_vertices( + this_type_sorted_org_vertices.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumber_map.begin() + renumber_map_type_start_offset, + renumber_map.begin() + renumber_map_type_end_offset, + this_type_sorted_org_vertices.begin()); + thrust::sequence(handle.get_thrust_policy(), + this_type_matching_renumbered_vertices.begin(), + this_type_matching_renumbered_vertices.end(), + vertex_t{0}); + thrust::sort_by_key(handle.get_thrust_policy(), + this_type_sorted_org_vertices.begin(), + this_type_sorted_org_vertices.end(), + this_type_matching_renumbered_vertices.begin()); + + rmm::device_uvector this_type_unique_majors(this_label_unique_majors.size(), + handle.get_stream()); + auto this_type_unique_major_hops = + this_label_unique_major_hops + ? std::make_optional>((*this_label_unique_major_hops).size(), + handle.get_stream()) + : std::nullopt; + rmm::device_uvector this_type_unique_minors(this_label_unique_minors.size(), + handle.get_stream()); + auto this_type_unique_minor_hops = + this_label_unique_minor_hops + ? 
std::make_optional>((*this_label_unique_minor_hops).size(), + handle.get_stream()) + : std::nullopt; + + if (org_edgelist_hops) { + if (vertex_type_offsets) { + auto input_pair_first = thrust::make_zip_iterator( + this_label_unique_majors.begin(), (*this_label_unique_major_hops).begin()); + auto output_pair_first = thrust::make_zip_iterator( + this_type_unique_majors.begin(), (*this_type_unique_major_hops).begin()); + this_type_unique_majors.resize( + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + this_label_unique_majors.size(), + output_pair_first, + [vertex_type_offsets = *vertex_type_offsets, + vertex_type = j] __device__(auto pair) { + auto type_idx = thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<0>(pair))); + return static_cast(thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<0>(pair)))) == vertex_type; + })), + handle.get_stream()); + (*this_type_unique_major_hops) + .resize(this_type_unique_majors.size(), handle.get_stream()); + + input_pair_first = thrust::make_zip_iterator(this_label_unique_minors.begin(), + (*this_label_unique_minor_hops).begin()); + output_pair_first = thrust::make_zip_iterator(this_type_unique_minors.begin(), + (*this_type_unique_minor_hops).begin()); + this_type_unique_minors.resize( + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + this_label_unique_minors.size(), + output_pair_first, + [vertex_type_offsets = *vertex_type_offsets, + vertex_type = j] __device__(auto pair) { + return static_cast(thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + thrust::get<0>(pair)))) == vertex_type; + })), + handle.get_stream()); + (*this_type_unique_minor_hops) + .resize(this_type_unique_minors.size(), handle.get_stream()); + } else { + auto input_pair_first = thrust::make_zip_iterator( + this_label_unique_majors.begin(), (*this_label_unique_major_hops).begin()); + thrust::copy(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + this_label_unique_majors.size(), + thrust::make_zip_iterator(this_type_unique_majors.begin(), + (*this_type_unique_major_hops).begin())); + input_pair_first = thrust::make_zip_iterator(this_label_unique_minors.begin(), + (*this_label_unique_minor_hops).begin()); + thrust::copy(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + this_label_unique_minors.size(), + thrust::make_zip_iterator(this_type_unique_minors.begin(), + (*this_type_unique_minor_hops).begin())); + } + + if (this_type_unique_majors.size() + this_type_unique_minors.size() == 0) { continue; } + + rmm::device_uvector merged_vertices( + this_type_unique_majors.size() + this_type_unique_minors.size(), handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + + auto major_triplet_first = + thrust::make_zip_iterator(this_type_unique_majors.begin(), + (*this_type_unique_major_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator(this_type_unique_minors.begin(), + 
(*this_type_unique_minor_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + this_type_unique_majors.size(), + minor_triplet_first, + minor_triplet_first + this_type_unique_minors.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + merged_vertices.resize( + thrust::distance( + merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + if ((renumber_map_type_end_offset - renumber_map_type_start_offset) != + merged_vertices.size()) { // renumber map size == # unique vertices + return false; + } + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + auto num_unique_keys = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(merged_hops.size()), + cugraph::detail::is_first_in_run_t{sort_key_first}); + rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); + rmm::device_uvector max_vertices(num_unique_keys, handle.get_stream()); + + auto renumbered_merged_vertex_first = thrust::make_transform_iterator( + merged_vertices.begin(), + cuda::proclaim_return_type( + [this_type_sorted_org_vertices = raft::device_span( + this_type_sorted_org_vertices.data(), this_type_sorted_org_vertices.size()), + this_type_matching_renumbered_vertices = raft::device_span( + this_type_matching_renumbered_vertices.data(), + this_type_matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound(thrust::seq, + this_type_sorted_org_vertices.begin(), + this_type_sorted_org_vertices.end(), + major); + return this_type_matching_renumbered_vertices[thrust::distance( + this_type_sorted_org_vertices.begin(), it)]; + })); + + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + + auto num_violations = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(min_vertices.size()), + [min_vertices = raft::device_span(min_vertices.data(), + min_vertices.size()), + max_vertices = raft::device_span( + max_vertices.data(), max_vertices.size())] __device__(size_t i) { + return min_vertices[i] <= max_vertices[i - 1]; + }); + + if (num_violations != 0) { return false; } + } else { + if (vertex_type_offsets) { + this_type_unique_majors.resize( + thrust::distance( + this_type_unique_majors.begin(), + thrust::copy_if( + handle.get_thrust_policy(), + this_label_unique_majors.begin(), + this_label_unique_majors.end(), + this_type_unique_majors.begin(), + 
[vertex_type_offsets = *vertex_type_offsets, vertex_type = j] __device__(auto v) { + auto type_idx = thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), v)); + return static_cast( + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + v))) == vertex_type; + })), + handle.get_stream()); + + this_type_unique_minors.resize( + thrust::distance( + this_type_unique_minors.begin(), + thrust::copy_if( + handle.get_thrust_policy(), + this_label_unique_minors.begin(), + this_label_unique_minors.end(), + this_type_unique_minors.begin(), + [vertex_type_offsets = *vertex_type_offsets, vertex_type = j] __device__(auto v) { + return static_cast( + thrust::distance(vertex_type_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + vertex_type_offsets.begin() + 1, + vertex_type_offsets.end(), + v))) == vertex_type; + })), + handle.get_stream()); + (*this_type_unique_minor_hops) + .resize(this_type_unique_minors.size(), handle.get_stream()); + } else { + thrust::copy(handle.get_thrust_policy(), + this_label_unique_majors.begin(), + this_label_unique_majors.end(), + this_type_unique_majors.begin()); + thrust::copy(handle.get_thrust_policy(), + this_label_unique_minors.begin(), + this_label_unique_minors.end(), + this_type_unique_minors.begin()); + } + + this_type_unique_minors.resize( + thrust::distance( + this_type_unique_minors.begin(), + thrust::remove_if(handle.get_thrust_policy(), + this_type_unique_minors.begin(), + this_type_unique_minors.end(), + [sorted_unique_majors = raft::device_span( + this_type_unique_majors.data(), + this_type_unique_majors.size())] __device__(auto minor) { + return thrust::binary_search(thrust::seq, + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + minor); + })), + handle.get_stream()); + + if ((renumber_map_type_end_offset - renumber_map_type_start_offset) != + (this_type_unique_majors.size() + + this_type_unique_minors.size())) { // renumber map size == # unique vertices + return false; + } + + auto max_major_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + this_type_unique_majors.begin(), + this_type_unique_majors.end(), + cuda::proclaim_return_type( + [this_type_sorted_org_vertices = raft::device_span( + this_type_sorted_org_vertices.data(), this_type_sorted_org_vertices.size()), + this_type_matching_renumbered_vertices = raft::device_span( + this_type_matching_renumbered_vertices.data(), + this_type_matching_renumbered_vertices.size())] __device__(vertex_t major) + -> vertex_t { + auto it = thrust::lower_bound(thrust::seq, + this_type_sorted_org_vertices.begin(), + this_type_sorted_org_vertices.end(), + major); + return this_type_matching_renumbered_vertices[thrust::distance( + this_type_sorted_org_vertices.begin(), it)]; + }), + std::numeric_limits::lowest(), + thrust::maximum{}); + + auto min_minor_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + this_type_unique_minors.begin(), + this_type_unique_minors.end(), + cuda::proclaim_return_type( + [this_type_sorted_org_vertices = raft::device_span( + this_type_sorted_org_vertices.data(), this_type_sorted_org_vertices.size()), + this_type_matching_renumbered_vertices = raft::device_span( + this_type_matching_renumbered_vertices.data(), + this_type_matching_renumbered_vertices.size())] __device__(vertex_t minor) + -> vertex_t { + auto it = 
thrust::lower_bound(thrust::seq, + this_type_sorted_org_vertices.begin(), + this_type_sorted_org_vertices.end(), + minor); + return this_type_matching_renumbered_vertices[thrust::distance( + this_type_sorted_org_vertices.begin(), it)]; + }), + std::numeric_limits::max(), + thrust::minimum{}); + + if (max_major_renumbered_vertex >= min_minor_renumbered_vertex) { return false; } + } + } + + return true; +} + +template bool check_vertex_renumber_map_invariants( + raft::handle_t const& handle, + std::optional> starting_vertices, + std::optional> starting_vertex_label_offsets, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + std::optional> vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + bool src_is_major); + +template bool check_vertex_renumber_map_invariants( + raft::handle_t const& handle, + std::optional> starting_vertices, + std::optional> starting_vertex_label_offsets, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + std::optional> vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + bool src_is_major); + +template +bool check_edge_id_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + size_t num_labels, + size_t num_edge_types) +{ + // Check the invariants in renumber_map. + // Say we found the minimum hop for every unique (edge type, edge ID) key. Then, within each + // edge type, edge IDs with smaller minimum hops should be renumbered to smaller numbers than + // edge IDs with larger minimum hops. + + for (size_t i = 0; i < num_labels; ++i) { + size_t label_start_offset{0}; + auto label_end_offset = org_edgelist_edge_ids.size(); + if (org_edgelist_label_offsets) { + raft::update_host(&label_start_offset, + (*org_edgelist_label_offsets).data() + i, + size_t{1}, + handle.get_stream()); + raft::update_host(&label_end_offset, + (*org_edgelist_label_offsets).data() + i + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + if (label_start_offset == label_end_offset) { continue; } + + // compute unique key (edge type, edge ID), value (min. hop) pairs + + std::optional> this_label_unique_key_edge_types = + org_edgelist_edge_types ?
std::make_optional>( + label_end_offset - label_start_offset, handle.get_stream()) + : std::nullopt; + if (org_edgelist_edge_types) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_edge_types).begin() + label_start_offset, + (*org_edgelist_edge_types).begin() + label_end_offset, + (*this_label_unique_key_edge_types).begin()); + } + + rmm::device_uvector this_label_unique_key_edge_ids( + label_end_offset - label_start_offset, handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_edge_ids.begin() + label_start_offset, + org_edgelist_edge_ids.begin() + label_end_offset, + this_label_unique_key_edge_ids.begin()); + + std::optional> this_label_unique_key_hops = + org_edgelist_hops ? std::make_optional>( + label_end_offset - label_start_offset, handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin() + label_start_offset, + (*org_edgelist_hops).begin() + label_end_offset, + (*this_label_unique_key_hops).begin()); + } + + if (org_edgelist_edge_types) { + if (org_edgelist_hops) { + auto triplet_first = thrust::make_zip_iterator((*this_label_unique_key_edge_types).begin(), + this_label_unique_key_edge_ids.begin(), + (*this_label_unique_key_hops).begin()); + thrust::sort(handle.get_thrust_policy(), + triplet_first, + triplet_first + this_label_unique_key_edge_ids.size()); + auto key_first = thrust::make_zip_iterator((*this_label_unique_key_edge_types).begin(), + this_label_unique_key_edge_ids.begin()); + this_label_unique_key_edge_ids.resize( + thrust::distance( + key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + key_first, + key_first + this_label_unique_key_edge_ids.size(), + (*this_label_unique_key_hops).begin()))), + handle.get_stream()); + (*this_label_unique_key_edge_types) + .resize(this_label_unique_key_edge_ids.size(), handle.get_stream()); + (*this_label_unique_key_hops) + .resize(this_label_unique_key_edge_ids.size(), handle.get_stream()); + } else { + auto pair_first = thrust::make_zip_iterator((*this_label_unique_key_edge_types).begin(), + this_label_unique_key_edge_ids.begin()); + thrust::sort(handle.get_thrust_policy(), + pair_first, + pair_first + this_label_unique_key_edge_ids.size()); + this_label_unique_key_edge_ids.resize( + thrust::distance(pair_first, + thrust::unique(handle.get_thrust_policy(), + pair_first, + pair_first + this_label_unique_key_edge_ids.size())), + handle.get_stream()); + (*this_label_unique_key_edge_types) + .resize(this_label_unique_key_edge_ids.size(), handle.get_stream()); + } + } else { + if (org_edgelist_hops) { + auto pair_first = thrust::make_zip_iterator(this_label_unique_key_edge_ids.begin(), + (*this_label_unique_key_hops).begin()); + thrust::sort(handle.get_thrust_policy(), + pair_first, + pair_first + this_label_unique_key_edge_ids.size()); + this_label_unique_key_edge_ids.resize( + thrust::distance( + this_label_unique_key_edge_ids.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + this_label_unique_key_edge_ids.begin(), + this_label_unique_key_edge_ids.end(), + (*this_label_unique_key_hops).begin()))), + handle.get_stream()); + (*this_label_unique_key_hops) + .resize(this_label_unique_key_edge_ids.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), + this_label_unique_key_edge_ids.begin(), + this_label_unique_key_edge_ids.end()); + this_label_unique_key_edge_ids.resize( + thrust::distance(this_label_unique_key_edge_ids.begin(), + 
thrust::unique(handle.get_thrust_policy(), + this_label_unique_key_edge_ids.begin(), + this_label_unique_key_edge_ids.end())), + handle.get_stream()); + } + } + + for (size_t j = 0; j < num_edge_types; ++j) { + size_t renumber_map_type_start_offset{0}; + auto renumber_map_type_end_offset = renumber_map.size(); + if (renumber_map_label_type_offsets) { + raft::update_host(&renumber_map_type_start_offset, + (*renumber_map_label_type_offsets).data() + i * num_edge_types + j, + size_t{1}, + handle.get_stream()); + raft::update_host(&renumber_map_type_end_offset, + (*renumber_map_label_type_offsets).data() + i * num_edge_types + j + 1, + size_t{1}, + handle.get_stream()); + handle.sync_stream(); + } + + rmm::device_uvector this_type_sorted_org_edge_ids( + renumber_map_type_end_offset - renumber_map_type_start_offset, handle.get_stream()); + rmm::device_uvector this_type_matching_renumbered_edge_ids( + this_type_sorted_org_edge_ids.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumber_map.begin() + renumber_map_type_start_offset, + renumber_map.begin() + renumber_map_type_end_offset, + this_type_sorted_org_edge_ids.begin()); + thrust::sequence(handle.get_thrust_policy(), + this_type_matching_renumbered_edge_ids.begin(), + this_type_matching_renumbered_edge_ids.end(), + edge_id_t{0}); + thrust::sort_by_key(handle.get_thrust_policy(), + this_type_sorted_org_edge_ids.begin(), + this_type_sorted_org_edge_ids.end(), + this_type_matching_renumbered_edge_ids.begin()); + + size_t type_start_offset{0}; + auto type_end_offset = this_label_unique_key_edge_ids.size(); + if (this_label_unique_key_edge_types) { + type_start_offset = static_cast( + thrust::distance((*this_label_unique_key_edge_types).begin(), + thrust::lower_bound(handle.get_thrust_policy(), + (*this_label_unique_key_edge_types).begin(), + (*this_label_unique_key_edge_types).end(), + static_cast(j)))); + type_end_offset = static_cast( + thrust::distance((*this_label_unique_key_edge_types).begin(), + thrust::upper_bound(handle.get_thrust_policy(), + (*this_label_unique_key_edge_types).begin(), + (*this_label_unique_key_edge_types).end(), + static_cast(j)))); + } + + if ((renumber_map_type_end_offset - renumber_map_type_start_offset) != + (type_end_offset - type_start_offset)) { // renumber map size == # unique edge IDs + return false; + } + + if (org_edgelist_hops) { + if (type_start_offset == type_end_offset) { continue; } + + auto sort_key_first = (*this_label_unique_key_hops).begin(); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first + type_start_offset, + sort_key_first + type_end_offset, + this_label_unique_key_edge_ids.begin() + type_start_offset); + + auto num_unique_keys = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(type_end_offset - type_start_offset), + cugraph::detail::is_first_in_run_t{ + sort_key_first + type_start_offset}); + rmm::device_uvector min_edge_ids(num_unique_keys, handle.get_stream()); + rmm::device_uvector max_edge_ids(num_unique_keys, handle.get_stream()); + + auto renumbered_edge_id_first = thrust::make_transform_iterator( + this_label_unique_key_edge_ids.begin(), + cuda::proclaim_return_type( + [this_type_sorted_org_edge_ids = raft::device_span( + this_type_sorted_org_edge_ids.data(), this_type_sorted_org_edge_ids.size()), + this_type_matching_renumbered_edge_ids = raft::device_span( + this_type_matching_renumbered_edge_ids.data(), + this_type_matching_renumbered_edge_ids.size())] 
__device__(edge_id_t id) { + auto it = thrust::lower_bound(thrust::seq, + this_type_sorted_org_edge_ids.begin(), + this_type_sorted_org_edge_ids.end(), + id); + return this_type_matching_renumbered_edge_ids[thrust::distance( + this_type_sorted_org_edge_ids.begin(), it)]; + })); + + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first + type_start_offset, + sort_key_first + type_end_offset, + renumbered_edge_id_first + type_start_offset, + thrust::make_discard_iterator(), + min_edge_ids.begin(), + thrust::equal_to{}, + thrust::minimum{}); + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first + type_start_offset, + sort_key_first + type_end_offset, + renumbered_edge_id_first + type_start_offset, + thrust::make_discard_iterator(), + max_edge_ids.begin(), + thrust::equal_to{}, + thrust::maximum{}); + + auto num_violations = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(min_edge_ids.size()), + [min_edge_ids = raft::device_span(min_edge_ids.data(), + min_edge_ids.size()), + max_edge_ids = raft::device_span( + max_edge_ids.data(), max_edge_ids.size())] __device__(size_t i) { + return min_edge_ids[i] <= max_edge_ids[i - 1]; + }); + + if (num_violations != 0) { return false; } + } + } + } + + return true; +} + +template bool check_edge_id_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + size_t num_labels, + size_t num_edge_types); + +template bool check_edge_id_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + size_t num_labels, + size_t num_edge_types); diff --git a/cpp/tests/sampling/detail/sampling_post_processing_validate.hpp b/cpp/tests/sampling/detail/sampling_post_processing_validate.hpp new file mode 100644 index 00000000000..986265b368f --- /dev/null +++ b/cpp/tests/sampling/detail/sampling_post_processing_validate.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
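Both renumber-map invariant checkers above boil down to one rule: keys (vertices or edge IDs) first seen at an earlier hop must be assigned strictly smaller renumbered IDs than keys first seen at a later hop. A minimal host-side sketch of that rule, assuming per-key minimum hops and renumbered IDs have already been gathered (all names here are illustrative, not part of this diff):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Illustrative only: verify that renumbered IDs of keys with smaller min. hops never overlap
// the ID range of keys with larger min. hops. Hops are assumed non-negative, and min_hops[i]
// and renumbered[i] describe the same unique key.
bool hop_ordering_holds(std::vector<int32_t> const& min_hops,
                        std::vector<int64_t> const& renumbered)
{
  if (min_hops.empty()) { return true; }
  auto max_hop = *std::max_element(min_hops.begin(), min_hops.end());
  std::vector<int64_t> grp_min(max_hop + 1, std::numeric_limits<int64_t>::max());
  std::vector<int64_t> grp_max(max_hop + 1, std::numeric_limits<int64_t>::lowest());
  for (std::size_t i = 0; i < min_hops.size(); ++i) {
    grp_min[min_hops[i]] = std::min(grp_min[min_hops[i]], renumbered[i]);
    grp_max[min_hops[i]] = std::max(grp_max[min_hops[i]], renumbered[i]);
  }
  auto prev_max = std::numeric_limits<int64_t>::lowest();
  for (int32_t h = 0; h <= max_hop; ++h) {
    if (grp_max[h] == std::numeric_limits<int64_t>::lowest()) { continue; }  // no keys at hop h
    if (grp_min[h] <= prev_max) { return false; }  // ID ranges overlap: violation
    prev_max = grp_max[h];
  }
  return true;
}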
+ */ + +#include +#include + +#include + +template +bool check_offsets(raft::handle_t const& handle, + raft::device_span offsets, + index_t num_segments, + index_t num_elements); + +template +bool check_edgelist_is_sorted(raft::handle_t const& handle, + raft::device_span edgelist_majors, + raft::device_span edgelist_minors); + +// unrenumber the renumbered edge list and check whether the original & unrenumbered edge lists are +// identical +template +bool compare_edgelist(raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map, + std::optional> renumber_map_label_offsets, + size_t num_labels); + +// unrenumber the renumbered edge list and check whether the original & unrenumbered edge lists +// are identical +template +bool compare_heterogeneous_edgelist( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + std::optional> org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumbered_edgelist_edge_ids, + std::optional> renumbered_edgelist_label_edge_type_hop_offsets, + raft::device_span vertex_renumber_map, + raft::device_span vertex_renumber_map_label_type_offsets, + std::optional> edge_id_renumber_map, + std::optional> edge_id_renumber_map_label_type_offsets, + raft::device_span vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + size_t num_edge_types, + size_t num_hops); + +template +bool check_vertex_renumber_map_invariants( + raft::handle_t const& handle, + std::optional> starting_vertices, + std::optional> starting_vertex_label_offsets, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + std::optional> vertex_type_offsets, + size_t num_labels, + size_t num_vertex_types, + bool src_is_major); + +template +bool check_edge_id_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_edge_ids, + std::optional> org_edgelist_edge_types, + std::optional> org_edgelist_hops, + std::optional> org_edgelist_label_offsets, + raft::device_span renumber_map, + std::optional> renumber_map_label_type_offsets, + size_t num_labels, + size_t num_edge_types); diff --git a/cpp/tests/sampling/mg_negative_sampling.cpp b/cpp/tests/sampling/mg_negative_sampling.cpp new file mode 100644 index 00000000000..7c64bb7fbbb --- /dev/null +++ b/cpp/tests/sampling/mg_negative_sampling.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
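check_offsets is declared above without a body; by its name and parameters it presumably enforces the usual offsets-array contract. A hedged host-side restatement of that contract (the device implementation operates on spans and may differ in detail):

#include <cstddef>
#include <vector>

// Illustrative only: an offsets array over num_segments segments has num_segments + 1
// entries, starts at 0, ends at num_elements, and is non-decreasing.
bool offsets_are_valid(std::vector<std::size_t> const& offsets,
                       std::size_t num_segments,
                       std::size_t num_elements)
{
  if (offsets.size() != num_segments + 1) { return false; }
  if (offsets.front() != 0 || offsets.back() != num_elements) { return false; }
  for (std::size_t i = 1; i < offsets.size(); ++i) {
    if (offsets[i] < offsets[i - 1]) { return false; }
  }
  return true;
}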
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_MGNegative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_MGNegative_Sampling() : graph_(*handle_) {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + template + void load_graph(input_t const& param) + { + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + *handle_, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(*handle_, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(*handle_) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle_->get_stream()); + rmm::device_uvector dst_bias_v(0, handle_->get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + cugraph::detail::uniform_random_fill(handle_->get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + 
hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(*handle_, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(*handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + // TODO: Move this to validation_utilities... + auto h_vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); + rmm::device_uvector d_vertex_partition_range_lasts( + h_vertex_partition_range_lasts.size(), handle_->get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.data(), + h_vertex_partition_range_lasts.size(), + handle_->get_stream()); + + size_t error_count = cugraph::test::count_edges_on_wrong_int_gpu( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + raft::device_span{d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()}); + + ASSERT_EQ(error_count, 0) << "Generated edges mapped to the wrong GPU"; + + if ((negative_sampling_usecase.remove_duplicates) && (src_out.size() > 0)) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + *handle_, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle_->get_stream()); + rmm::device_uvector graph_dst(0, handle_->get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + *handle_, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + *handle_, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + size_t sz = cugraph::host_scalar_allreduce( + handle_->get_comms(), src_out.size(), raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(sz, num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases?
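cugraph::test::count_duplicate_vertex_pairs_sorted is used here as a black box; on sorted (src, dst) pairs a duplicate check reduces to one adjacent-comparison pass. A sketch of that idea (illustrative, not the utility's actual implementation; needs nvcc's --extended-lambda):

#include <thrust/count.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>

#include <cstddef>

// Illustrative only: on sorted pairs, duplicates are exactly the elements equal to their
// immediate predecessor. srcs and dsts are device pointers of length n.
template <typename vertex_t>
std::size_t count_sorted_pair_duplicates(vertex_t const* srcs, vertex_t const* dsts, std::size_t n)
{
  if (n < 2) { return 0; }
  return thrust::count_if(thrust::device,
                          thrust::make_counting_iterator(std::size_t{1}),
                          thrust::make_counting_iterator(n),
                          [srcs, dsts] __device__(std::size_t i) {
                            return (srcs[i] == srcs[i - 1]) && (dsts[i] == dsts[i - 1]);
                          });
}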
+ } + } + + public: + static std::unique_ptr handle_; + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +template +std::unique_ptr + Tests_MGNegative_Sampling::handle_ = nullptr; + +using Tests_MGNegative_Sampling_File_i64_i64_float = + Tests_MGNegative_Sampling; + +using Tests_MGNegative_Sampling_Rmat_i64_i64_float = + Tests_MGNegative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{ + static_cast(current_test->handle_->get_comms().get_rank())}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_MGNegative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_MGNegative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_MGNegative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGNegative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/mg_random_walks_test.cpp b/cpp/tests/sampling/mg_random_walks_test.cpp index c2ad5c37e9e..e2415c08e60 100644 --- a/cpp/tests/sampling/mg_random_walks_test.cpp +++ b/cpp/tests/sampling/mg_random_walks_test.cpp @@ -44,8 +44,10 @@ struct UniformRandomWalks_Usecase { raft::device_span start_vertices, size_t max_depth) { + raft::random::RngState rng_state(static_cast(handle.get_comms().get_rank())); + return cugraph::uniform_random_walks( - handle, graph_view, edge_weight_view, start_vertices, max_depth, seed); + handle, rng_state, graph_view, edge_weight_view, start_vertices, max_depth); } bool expect_throw() { return false; } @@ -66,12 +68,13 @@ struct BiasedRandomWalks_Usecase { { CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Biased random walk requires edge weights."); + 
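CUGRAPH_EXPECTS failures surface as cugraph::logic_error, which is what the expect_throw() flags in these use-case structs let the test body (outside this hunk) anticipate. A sketch of that dispatch, with run_walks() as a hypothetical stand-in for the actual invocation:

if (usecase.expect_throw()) {
  EXPECT_THROW(run_walks(), cugraph::logic_error);  // this configuration should fail loudly
} else {
  EXPECT_NO_THROW(run_walks());
}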
raft::random::RngState rng_state(static_cast(handle.get_comms().get_rank())); + return cugraph::biased_random_walks( - handle, graph_view, *edge_weight_view, start_vertices, max_depth, seed); + handle, rng_state, graph_view, *edge_weight_view, start_vertices, max_depth); } - // FIXME: Not currently implemented - bool expect_throw() { return true; } + bool expect_throw() { return !test_weighted; } }; struct Node2VecRandomWalks_Usecase { @@ -89,18 +92,19 @@ struct Node2VecRandomWalks_Usecase { raft::device_span start_vertices, size_t max_depth) { + raft::random::RngState rng_state(static_cast(handle.get_comms().get_rank())); + return cugraph::node2vec_random_walks(handle, + rng_state, graph_view, edge_weight_view, start_vertices, max_depth, static_cast(p), - static_cast(q), - seed); + static_cast(q)); } - // FIXME: Not currently implemented - bool expect_throw() { return true; } + bool expect_throw() { return false; } }; template diff --git a/cpp/tests/sampling/negative_sampling.cpp b/cpp/tests/sampling/negative_sampling.cpp new file mode 100644 index 00000000000..ba929c63e9b --- /dev/null +++ b/cpp/tests/sampling/negative_sampling.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/validation_utilities.hpp" + +#include +#include + +#include + +struct Negative_Sampling_Usecase { + float sample_multiplier{2}; + bool use_src_bias{false}; + bool use_dst_bias{false}; + bool remove_duplicates{false}; + bool remove_existing_edges{false}; + bool exact_number_of_samples{false}; + bool edge_masking{false}; + bool check_correctness{true}; +}; + +template +class Tests_Negative_Sampling : public ::testing::TestWithParam { + public: + using graph_t = cugraph::graph_t; + using graph_view_t = cugraph::graph_view_t; + + Tests_Negative_Sampling() : graph_(raft::handle_t{}) {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + template + void load_graph(input_t const& param) + { + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + std::tie(graph_, edge_weights_, renumber_map_labels_) = + cugraph::test::construct_graph( + handle, param, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + edge_mask_ = + cugraph::test::generate::edge_property(handle, graph_.view(), 2); + } + + virtual void SetUp() {} + virtual void TearDown() {} + + void run_current_test(raft::random::RngState& rng_state, + Negative_Sampling_Usecase const& negative_sampling_usecase) + { + constexpr bool do_expensive_check{false}; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + auto graph_view = graph_.view(); + + if (negative_sampling_usecase.edge_masking) { graph_view.attach_edge_mask(edge_mask_->view()); } + + size_t num_samples = + graph_view.compute_number_of_edges(handle) * negative_sampling_usecase.sample_multiplier; + + rmm::device_uvector src_bias_v(0, handle.get_stream()); + rmm::device_uvector dst_bias_v(0, handle.get_stream()); + + std::optional> src_bias{std::nullopt}; + std::optional> dst_bias{std::nullopt}; + + if (negative_sampling_usecase.use_src_bias) { + src_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + src_bias_v.data(), + src_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + src_bias = raft::device_span{src_bias_v.data(), src_bias_v.size()}; + } + + if (negative_sampling_usecase.use_dst_bias) { + dst_bias_v.resize(graph_view.number_of_vertices(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + dst_bias_v.data(), + dst_bias_v.size(), + weight_t{1}, + weight_t{10}, + rng_state); + + dst_bias = raft::device_span{dst_bias_v.data(), dst_bias_v.size()}; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Negative sampling"); + } + + auto&& [src_out, dst_out] = + cugraph::negative_sampling(handle, + rng_state, + graph_view, + src_bias, + dst_bias, + num_samples, + negative_sampling_usecase.remove_duplicates, + negative_sampling_usecase.remove_existing_edges, + negative_sampling_usecase.exact_number_of_samples, + do_expensive_check); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + 
hr_timer.display_and_clear(std::cout); + } + + if (negative_sampling_usecase.check_correctness) { + ASSERT_EQ(src_out.size(), dst_out.size()) << "Result size (src, dst) mismatch"; + + cugraph::test::sort(handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + + size_t error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{src_out.data(), src_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Source vertices out of range > 0"; + + error_count = cugraph::test::count_invalid_vertices( + handle, + raft::device_span{dst_out.data(), dst_out.size()}, + graph_view.local_vertex_partition_view()); + ASSERT_EQ(error_count, 0) << "Dest vertices out of range > 0"; + + if (negative_sampling_usecase.remove_duplicates) { + error_count = cugraph::test::count_duplicate_vertex_pairs_sorted( + handle, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}); + ASSERT_EQ(error_count, 0) << "Remove duplicates specified, found duplicate entries"; + } + + if (negative_sampling_usecase.remove_existing_edges) { + rmm::device_uvector graph_src(0, handle.get_stream()); + rmm::device_uvector graph_dst(0, handle.get_stream()); + + std::tie(graph_src, graph_dst, std::ignore, std::ignore, std::ignore) = + cugraph::decompress_to_edgelist( + handle, graph_view, std::nullopt, std::nullopt, std::nullopt, std::nullopt); + + error_count = cugraph::test::count_intersection( + handle, + raft::device_span{graph_src.data(), graph_src.size()}, + raft::device_span{graph_dst.data(), graph_dst.size()}, + std::nullopt, + std::nullopt, + std::nullopt, + raft::device_span{src_out.data(), src_out.size()}, + raft::device_span{dst_out.data(), dst_out.size()}, + std::nullopt, + std::nullopt, + std::nullopt); + + ASSERT_EQ(error_count, 0) << "Remove existing edges specified, found existing edges"; + } + + if (negative_sampling_usecase.exact_number_of_samples) { + ASSERT_EQ(src_out.size(), num_samples) << "Expected exact number of samples"; + } + + // TBD: How do we determine if we have properly reflected the biases? 
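One plausible answer to the TBD above, valid only when duplicates are kept so draws are roughly independent: with per-vertex source bias b_v, each sample's source is a draw with probability b_v / Σb, so per-vertex counts should land within a few binomial standard deviations of expectation. A host-side sketch under that assumption (names and tolerance are illustrative, not part of this diff):

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative only: treat each vertex's sample count as binomial(num_samples, p_v) with
// p_v = bias[v] / sum(bias) and flag counts far from expectation. counts.size() must equal
// bias.size().
bool biases_plausible(std::vector<std::size_t> const& counts,
                      std::vector<double> const& bias,
                      std::size_t num_samples,
                      double num_std_devs = 4.0)
{
  double bias_sum = 0.0;
  for (auto b : bias) { bias_sum += b; }
  for (std::size_t v = 0; v < bias.size(); ++v) {
    double p        = bias[v] / bias_sum;
    double expected = p * static_cast<double>(num_samples);
    double stddev   = std::sqrt(static_cast<double>(num_samples) * p * (1.0 - p));
    if (std::abs(static_cast<double>(counts[v]) - expected) > num_std_devs * stddev + 1.0) {
      return false;
    }
  }
  return true;
}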
+ } + } + + private: + graph_t graph_; + std::optional> edge_weights_{std::nullopt}; + std::optional> edge_mask_{std::nullopt}; + std::optional> renumber_map_labels_{std::nullopt}; +}; + +using Tests_Negative_Sampling_File_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_File_i64_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i32_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i32_i64_float = + Tests_Negative_Sampling; + +using Tests_Negative_Sampling_Rmat_i64_i64_float = + Tests_Negative_Sampling; + +template +void run_all_tests(CurrentTest* current_test) +{ + raft::random::RngState rng_state{0}; + + for (bool use_src_bias : {false, true}) + for (bool use_dst_bias : {false, true}) + for (bool remove_duplicates : {false, true}) + for (bool remove_existing_edges : {false, true}) + for (bool exact_number_of_samples : {false, true}) + for (bool edge_masking : {false, true}) + current_test->run_current_test(rng_state, + Negative_Sampling_Usecase{2, + use_src_bias, + use_dst_bias, + remove_duplicates, + remove_existing_edges, + exact_number_of_samples, + edge_masking}); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_File_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_File_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i32_float, CheckInt32Int32Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i32_i64_float, CheckInt32Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +TEST_P(Tests_Negative_Sampling_Rmat_i64_i64_float, CheckInt64Int64Float) +{ + load_graph(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); + run_all_tests(this); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i32_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i32_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_Negative_Sampling_File_i64_i64_float, + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_Negative_Sampling_File_i64_i64_float, + 
::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i32_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i32_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Negative_Sampling_Rmat_i64_i64_float, + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/random_walks_check.cuh b/cpp/tests/sampling/random_walks_check.cuh index 0fd73b5bba7..380b97a5b84 100644 --- a/cpp/tests/sampling/random_walks_check.cuh +++ b/cpp/tests/sampling/random_walks_check.cuh @@ -108,7 +108,7 @@ void random_walks_validate( (int)d, (float)w); } else { - printf("edge (%d,%d) NOT FOUND\n", (int)s, (int)d); + printf("edge (%d,%d), weight %g NOT FOUND\n", (int)s, (int)d, (float)w); } return 1; diff --git a/cpp/tests/sampling/sampling_heterogeneous_post_processing_test.cpp b/cpp/tests/sampling/sampling_heterogeneous_post_processing_test.cpp new file mode 100644 index 00000000000..2b2049dc8db --- /dev/null +++ b/cpp/tests/sampling/sampling_heterogeneous_post_processing_test.cpp @@ -0,0 +1,828 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/sampling_post_processing_validate.hpp" +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" + +#include +#include +#include + +#include + +#include + +#include + +struct SamplingHeterogeneousPostProcessing_Usecase { + size_t num_labels{}; + size_t num_seeds_per_label{}; + size_t num_vertex_types{}; + std::vector<int32_t> fanouts{{-1}}; + bool sample_with_replacement{false}; + + bool src_is_major{true}; + bool renumber_with_seeds{false}; + bool check_correctness{true}; +}; + +template <typename input_usecase_t> +class Tests_SamplingHeterogeneousPostProcessing + : public ::testing::TestWithParam< + std::tuple<SamplingHeterogeneousPostProcessing_Usecase, input_usecase_t>> { + public: + Tests_SamplingHeterogeneousPostProcessing() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template <typename vertex_t, typename edge_t> + void run_current_test(std::tuple<SamplingHeterogeneousPostProcessing_Usecase, input_usecase_t> const& param) + { + using label_t = int32_t; + using weight_t = float; + using edge_id_t = edge_t; + using edge_type_t = int32_t; + + bool constexpr store_transposed = false; + bool constexpr renumber = true; + bool constexpr test_weighted = true; + + auto [sampling_heterogeneous_post_processing_usecase, input_usecase] = param; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + // 1. create a graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + auto [graph, edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, false>( + handle, input_usecase, test_weighted, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + auto edge_weight_view = + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; + + // 2. vertex type offsets + + raft::random::RngState rng_state(0); + + rmm::device_uvector<vertex_t> vertex_type_offsets( + sampling_heterogeneous_post_processing_usecase.num_vertex_types + 1, handle.get_stream()); + { + auto num_vertices = graph_view.number_of_vertices(); + vertex_type_offsets.set_element_to_zero_async(0, handle.get_stream()); + vertex_type_offsets.set_element_async( + vertex_type_offsets.size() - 1, num_vertices, handle.get_stream()); + auto tmp = cugraph::select_random_vertices( + handle, + graph_view, + std::nullopt, + rng_state, + sampling_heterogeneous_post_processing_usecase.num_vertex_types - 1, + false /* with_replacement */, + true /* sort_vertices */); + raft::copy(vertex_type_offsets.data() + 1, tmp.data(), tmp.size(), handle.get_stream()); + } + + // 3. seed vertices (& labels) + + rmm::device_uvector<vertex_t> starting_vertices( + sampling_heterogeneous_post_processing_usecase.num_labels * + sampling_heterogeneous_post_processing_usecase.num_seeds_per_label, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + starting_vertices.data(), + starting_vertices.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + auto starting_vertex_labels = (sampling_heterogeneous_post_processing_usecase.num_labels > 1) + ? std::make_optional<rmm::device_uvector<label_t>>( + starting_vertices.size(), handle.get_stream()) + : std::nullopt; + auto starting_vertex_label_offsets = + (sampling_heterogeneous_post_processing_usecase.num_labels > 1) + ? 
std::make_optional<rmm::device_uvector<size_t>>( + sampling_heterogeneous_post_processing_usecase.num_labels + 1, handle.get_stream()) + : std::nullopt; + if (starting_vertex_labels) { + auto num_seeds_per_label = sampling_heterogeneous_post_processing_usecase.num_seeds_per_label; + for (size_t i = 0; i < sampling_heterogeneous_post_processing_usecase.num_labels; ++i) { + cugraph::detail::scalar_fill(handle.get_stream(), + (*starting_vertex_labels).data() + i * num_seeds_per_label, + num_seeds_per_label, + static_cast<label_t>(i)); + } + cugraph::detail::stride_fill(handle.get_stream(), + (*starting_vertex_label_offsets).data(), + (*starting_vertex_label_offsets).size(), + size_t{0}, + num_seeds_per_label); + } + + // 4. generate edge IDs and types + + auto num_edge_types = + sampling_heterogeneous_post_processing_usecase.num_vertex_types * + sampling_heterogeneous_post_processing_usecase + .num_vertex_types; // necessary to enforce that edge type dictates edge source vertex type + // and edge destination vertex type. + + std::optional<cugraph::edge_property_t<decltype(graph_view), edge_type_t>> edge_types{ + std::nullopt}; + if (num_edge_types > 1) { + edge_types = + cugraph::test::generate<decltype(graph_view), edge_type_t>::edge_property_by_src_dst_types( + handle, + graph_view, + raft::device_span<vertex_t const>(vertex_type_offsets.data(), vertex_type_offsets.size()), + num_edge_types); + } + + cugraph::edge_property_t<decltype(graph_view), edge_id_t> edge_ids(handle); + if (edge_types) { + static_assert(std::is_same_v<edge_id_t, edge_t>); + edge_ids = + cugraph::test::generate<decltype(graph_view), edge_id_t>::unique_edge_property_per_type( + handle, graph_view, (*edge_types).view(), static_cast<edge_type_t>(num_edge_types)); + } else { + edge_ids = cugraph::test::generate<decltype(graph_view), edge_id_t>::unique_edge_property( + handle, graph_view); + } + + // 5. sampling + + rmm::device_uvector<vertex_t> org_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector<vertex_t> org_edgelist_dsts(0, handle.get_stream()); + std::optional<rmm::device_uvector<weight_t>> org_edgelist_weights{std::nullopt}; + std::optional<rmm::device_uvector<edge_id_t>> org_edgelist_edge_ids{std::nullopt}; + std::optional<rmm::device_uvector<edge_type_t>> org_edgelist_edge_types{std::nullopt}; + std::optional<rmm::device_uvector<int32_t>> org_edgelist_hops{std::nullopt}; + std::optional<rmm::device_uvector<label_t>> org_labels{std::nullopt}; + std::optional<rmm::device_uvector<size_t>> org_edgelist_label_offsets{std::nullopt}; + std::tie(org_edgelist_srcs, + org_edgelist_dsts, + org_edgelist_weights, + org_edgelist_edge_ids, + org_edgelist_edge_types, + org_edgelist_hops, + org_labels, + org_edgelist_label_offsets) = cugraph::uniform_neighbor_sample( + handle, + graph_view, + edge_weight_view, + std::optional<cugraph::edge_property_view_t<edge_t, edge_id_t const*>>{edge_ids.view()}, + edge_types + ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types) + .view()} + : std::nullopt, + raft::device_span<vertex_t const>(starting_vertices.data(), starting_vertices.size()), + starting_vertex_labels ? std::make_optional<raft::device_span<label_t const>>( + (*starting_vertex_labels).data(), (*starting_vertex_labels).size()) + : std::nullopt, + std::nullopt, + raft::host_span<int32_t const>(sampling_heterogeneous_post_processing_usecase.fanouts.data(), + sampling_heterogeneous_post_processing_usecase.fanouts.size()), + rng_state, + sampling_heterogeneous_post_processing_usecase.fanouts.size() > 1, + sampling_heterogeneous_post_processing_usecase.sample_with_replacement, + cugraph::prior_sources_behavior_t::EXCLUDE, + false); + + if (!sampling_heterogeneous_post_processing_usecase.src_is_major) { + std::swap(org_edgelist_srcs, org_edgelist_dsts); + } + + // 6. post processing: renumber & sort + + { + rmm::device_uvector<vertex_t> renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector<vertex_t> renumbered_and_sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto renumbered_and_sorted_edgelist_weights = + org_edgelist_weights ? 
std::make_optional<rmm::device_uvector<weight_t>>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + auto renumbered_and_sorted_edgelist_edge_ids = + org_edgelist_edge_ids ? std::make_optional<rmm::device_uvector<edge_id_t>>( + (*org_edgelist_edge_ids).size(), handle.get_stream()) + : std::nullopt; + auto renumbered_and_sorted_edgelist_edge_types = + org_edgelist_edge_types ? std::make_optional<rmm::device_uvector<edge_type_t>>( + (*org_edgelist_edge_types).size(), handle.get_stream()) + : std::nullopt; + auto renumbered_and_sorted_edgelist_hops = + org_edgelist_hops ? std::make_optional(rmm::device_uvector<int32_t>( + (*org_edgelist_hops).size(), handle.get_stream())) + : std::nullopt; + + raft::copy(renumbered_and_sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_sorted_edgelist_weights) { + raft::copy((*renumbered_and_sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_edge_ids) { + raft::copy((*renumbered_and_sorted_edgelist_edge_ids).data(), + (*org_edgelist_edge_ids).data(), + (*org_edgelist_edge_ids).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_edge_types) { + raft::copy((*renumbered_and_sorted_edgelist_edge_types).data(), + (*org_edgelist_edge_types).data(), + (*org_edgelist_edge_types).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_hops) { + raft::copy((*renumbered_and_sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional<rmm::device_uvector<size_t>> + renumbered_and_sorted_edgelist_label_type_hop_offsets{std::nullopt}; + rmm::device_uvector<vertex_t> renumbered_and_sorted_vertex_renumber_map(0, + handle.get_stream()); + rmm::device_uvector<size_t> renumbered_and_sorted_vertex_renumber_map_label_type_offsets( + 0, handle.get_stream()); + std::optional<rmm::device_uvector<edge_id_t>> renumbered_and_sorted_edge_id_renumber_map{ + std::nullopt}; + std::optional<rmm::device_uvector<size_t>> + renumbered_and_sorted_edge_id_renumber_map_label_type_offsets{std::nullopt}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and sort sampled edgelist"); + } + + std::tie(renumbered_and_sorted_edgelist_srcs, + renumbered_and_sorted_edgelist_dsts, + renumbered_and_sorted_edgelist_weights, + renumbered_and_sorted_edgelist_edge_ids, + renumbered_and_sorted_edgelist_label_type_hop_offsets, + renumbered_and_sorted_vertex_renumber_map, + renumbered_and_sorted_vertex_renumber_map_label_type_offsets, + renumbered_and_sorted_edge_id_renumber_map, + renumbered_and_sorted_edge_id_renumber_map_label_type_offsets) = + cugraph::heterogeneous_renumber_and_sort_sampled_edgelist( + handle, + std::move(renumbered_and_sorted_edgelist_srcs), + std::move(renumbered_and_sorted_edgelist_dsts), + std::move(renumbered_and_sorted_edgelist_weights), + std::move(renumbered_and_sorted_edgelist_edge_ids), + std::move(renumbered_and_sorted_edgelist_edge_types), + std::move(renumbered_and_sorted_edgelist_hops), + sampling_heterogeneous_post_processing_usecase.renumber_with_seeds + ? std::make_optional<raft::device_span<vertex_t const>>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_heterogeneous_post_processing_usecase.renumber_with_seeds && + starting_vertex_label_offsets) + ? 
std::make_optional<raft::device_span<size_t const>>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional(raft::device_span<size_t const>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) + : std::nullopt, + raft::device_span<vertex_t const>(vertex_type_offsets.data(), vertex_type_offsets.size()), + sampling_heterogeneous_post_processing_usecase.num_labels, + sampling_heterogeneous_post_processing_usecase.fanouts.size(), + sampling_heterogeneous_post_processing_usecase.num_vertex_types, + num_edge_types, + sampling_heterogeneous_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_heterogeneous_post_processing_usecase.check_correctness) { + if (renumbered_and_sorted_edgelist_label_type_hop_offsets) { + ASSERT_TRUE(check_offsets( + handle, + raft::device_span<size_t const>( + (*renumbered_and_sorted_edgelist_label_type_hop_offsets).data(), + (*renumbered_and_sorted_edgelist_label_type_hop_offsets).size()), + sampling_heterogeneous_post_processing_usecase.num_labels * num_edge_types * + sampling_heterogeneous_post_processing_usecase.fanouts.size(), + renumbered_and_sorted_edgelist_srcs.size())) + << "Renumbered and sorted edge (label, edge type, hop) offset array is invalid."; + } + + ASSERT_TRUE( + check_offsets(handle, + raft::device_span<size_t const>( + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.data(), + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.size()), + sampling_heterogeneous_post_processing_usecase.num_labels * + sampling_heterogeneous_post_processing_usecase.num_vertex_types, + renumbered_and_sorted_vertex_renumber_map.size())) + << "Renumbered and sorted vertex renumber map (label, vertex type) offset array is " + "invalid."; + + if (renumbered_and_sorted_edge_id_renumber_map_label_type_offsets) { + ASSERT_TRUE(check_offsets( + handle, + raft::device_span<size_t const>( + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).data(), + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).size()), + sampling_heterogeneous_post_processing_usecase.num_labels * num_edge_types, + (*renumbered_and_sorted_edge_id_renumber_map).size())) + << "Renumbered and sorted edge renumber map (label, edge type) offset array is " + "invalid."; + } + + // check whether the edges are properly sorted + + auto renumbered_and_sorted_edgelist_majors = + sampling_heterogeneous_post_processing_usecase.src_is_major + ? raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()) + : raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()); + auto renumbered_and_sorted_edgelist_minors = + sampling_heterogeneous_post_processing_usecase.src_is_major + ? 
raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()) + : raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()); + + if (renumbered_and_sorted_edgelist_label_type_hop_offsets) { + for (size_t i = 0; + i < sampling_heterogeneous_post_processing_usecase.num_labels * num_edge_types * + sampling_heterogeneous_post_processing_usecase.fanouts.size(); + ++i) { + auto hop_start_offset = (*renumbered_and_sorted_edgelist_label_type_hop_offsets) + .element(i, handle.get_stream()); + auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_type_hop_offsets) + .element(i + 1, handle.get_stream()); + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span<vertex_t const>( + renumbered_and_sorted_edgelist_majors.data() + hop_start_offset, + hop_end_offset - hop_start_offset), + raft::device_span<vertex_t const>( + renumbered_and_sorted_edgelist_minors.data() + hop_start_offset, + hop_end_offset - hop_start_offset))) + << "Renumbered and sorted edge list is not properly sorted."; + } + } else { + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_majors.data(), + renumbered_and_sorted_edgelist_majors.size()), + raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_minors.data(), + renumbered_and_sorted_edgelist_minors.size()))) + << "Renumbered and sorted edge list is not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_heterogeneous_edgelist( + handle, + raft::device_span<vertex_t const>(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span<vertex_t const>(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_weights ? std::make_optional<raft::device_span<weight_t const>>( + (*org_edgelist_weights).data(), (*org_edgelist_weights).size()) + : std::nullopt, + org_edgelist_edge_ids + ? std::make_optional<raft::device_span<edge_id_t const>>( + (*org_edgelist_edge_ids).data(), (*org_edgelist_edge_ids).size()) + : std::nullopt, + org_edgelist_edge_types + ? std::make_optional<raft::device_span<edge_type_t const>>( + (*org_edgelist_edge_types).data(), (*org_edgelist_edge_types).size()) + : std::nullopt, + org_edgelist_hops ? std::make_optional<raft::device_span<int32_t const>>( + (*org_edgelist_hops).data(), (*org_edgelist_hops).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional<raft::device_span<size_t const>>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size()) + : std::nullopt, + raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()), + raft::device_span<vertex_t const>(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()), + renumbered_and_sorted_edgelist_weights + ? std::make_optional<raft::device_span<weight_t const>>( + (*renumbered_and_sorted_edgelist_weights).data(), + (*renumbered_and_sorted_edgelist_weights).size()) + : std::nullopt, + renumbered_and_sorted_edgelist_edge_ids + ? std::make_optional<raft::device_span<edge_id_t const>>( + (*renumbered_and_sorted_edgelist_edge_ids).data(), + (*renumbered_and_sorted_edgelist_edge_ids).size()) + : std::nullopt, + renumbered_and_sorted_edgelist_label_type_hop_offsets + ? std::make_optional<raft::device_span<size_t const>>( + (*renumbered_and_sorted_edgelist_label_type_hop_offsets).data(), + (*renumbered_and_sorted_edgelist_label_type_hop_offsets).size()) + : std::nullopt, + raft::device_span<vertex_t const>(renumbered_and_sorted_vertex_renumber_map.data(), + renumbered_and_sorted_vertex_renumber_map.size()), + raft::device_span<size_t const>( + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.data(), + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.size()), + renumbered_and_sorted_edge_id_renumber_map + ? 
std::make_optional<raft::device_span<edge_id_t const>>( + (*renumbered_and_sorted_edge_id_renumber_map).data(), + (*renumbered_and_sorted_edge_id_renumber_map).size()) + : std::nullopt, + renumbered_and_sorted_edge_id_renumber_map_label_type_offsets + ? std::make_optional<raft::device_span<size_t const>>( + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).data(), + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).size()) + : std::nullopt, + raft::device_span<vertex_t const>(vertex_type_offsets.data(), vertex_type_offsets.size()), + sampling_heterogeneous_post_processing_usecase.num_labels, + sampling_heterogeneous_post_processing_usecase.num_vertex_types, + num_edge_types, + sampling_heterogeneous_post_processing_usecase.fanouts.size())) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in vertex renumber_map + + ASSERT_TRUE(check_vertex_renumber_map_invariants( + handle, + sampling_heterogeneous_post_processing_usecase.renumber_with_seeds + ? std::make_optional<raft::device_span<vertex_t const>>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_heterogeneous_post_processing_usecase.renumber_with_seeds && + starting_vertex_label_offsets) + ? std::make_optional<raft::device_span<size_t const>>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, + raft::device_span<vertex_t const>(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span<vertex_t const>(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_hops ? std::make_optional<raft::device_span<int32_t const>>( + (*org_edgelist_hops).data(), (*org_edgelist_hops).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional<raft::device_span<size_t const>>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size()) + : std::nullopt, + raft::device_span<vertex_t const>(renumbered_and_sorted_vertex_renumber_map.data(), + renumbered_and_sorted_vertex_renumber_map.size()), + std::make_optional<raft::device_span<size_t const>>( + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.data(), + renumbered_and_sorted_vertex_renumber_map_label_type_offsets.size()), + raft::device_span<vertex_t const>(vertex_type_offsets.data(), vertex_type_offsets.size()), + sampling_heterogeneous_post_processing_usecase.num_labels, + sampling_heterogeneous_post_processing_usecase.num_vertex_types, + sampling_heterogeneous_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output vertex renumber map violates invariants."; + + // Check the invariants in edge renumber_map + + if (org_edgelist_edge_ids) { + ASSERT_TRUE(check_edge_id_renumber_map_invariants( + handle, + raft::device_span<edge_id_t const>((*org_edgelist_edge_ids).data(), + (*org_edgelist_edge_ids).size()), + org_edgelist_edge_types + ? std::make_optional<raft::device_span<edge_type_t const>>( + (*org_edgelist_edge_types).data(), (*org_edgelist_edge_types).size()) + : std::nullopt, + org_edgelist_hops ? std::make_optional<raft::device_span<int32_t const>>( + (*org_edgelist_hops).data(), (*org_edgelist_hops).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional<raft::device_span<size_t const>>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size()) + : std::nullopt, + raft::device_span<edge_id_t const>( + (*renumbered_and_sorted_edge_id_renumber_map).data(), + (*renumbered_and_sorted_edge_id_renumber_map).size()), + renumbered_and_sorted_edge_id_renumber_map_label_type_offsets + ? 
std::make_optional<raft::device_span<size_t const>>( + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).data(), + (*renumbered_and_sorted_edge_id_renumber_map_label_type_offsets).size()) + : std::nullopt, + sampling_heterogeneous_post_processing_usecase.num_labels, + num_edge_types)) + << "Renumbered and sorted output edge ID renumber map violates invariants."; + } + } + } + } +}; + +using Tests_SamplingHeterogeneousPostProcessing_File = + Tests_SamplingHeterogeneousPostProcessing<cugraph::test::File_Usecase>; +using Tests_SamplingHeterogeneousPostProcessing_Rmat = + Tests_SamplingHeterogeneousPostProcessing<cugraph::test::Rmat_Usecase>; + +TEST_P(Tests_SamplingHeterogeneousPostProcessing_File, CheckInt32Int32) +{ + run_current_test<int32_t, int32_t>(override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingHeterogeneousPostProcessing_Rmat, CheckInt32Int32) +{ + run_current_test<int32_t, int32_t>(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingHeterogeneousPostProcessing_Rmat, CheckInt32Int64) +{ + run_current_test<int32_t, int64_t>(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingHeterogeneousPostProcessing_Rmat, CheckInt64Int64) +{ + run_current_test<int64_t, int64_t>(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SamplingHeterogeneousPostProcessing_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 15}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, false, false}, + 
SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, true, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); + 
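+// Note: the brace-initialized values above map, in order, to the
+// SamplingHeterogeneousPostProcessing_Usecase fields num_labels, num_seeds_per_label,
+// num_vertex_types, fanouts, sample_with_replacement, src_is_major, and renumber_with_seeds;
+// check_correctness keeps its default of true unless an eighth value is supplied (as in
+// rmat_benchmark_test below, which passes false).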
+INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_SamplingHeterogeneousPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 1, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{1, 16, 4, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, false, true, true}, + 
SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {10}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, false, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, false, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 1, {5, 10, 25}, true, true, true}, + SamplingHeterogeneousPostProcessing_Usecase{32, 16, 4, {5, 10, 25}, true, true, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, + Tests_SamplingHeterogeneousPostProcessing_Rmat, + ::testing::Combine( + // disable correctness checks (the eighth Usecase field) for benchmarking + ::testing::Values( + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, true, false, true, false}, + 
SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {10}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {10}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 1, 64, 1, {5, 10, 15}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 1, 64, 16, {5, 10, 15}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 1, 64, 16, {5, 10, 15}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 1, 64, 16, {5, 10, 15}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {5, 10, 15}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 1, 64, 16, {5, 10, 15}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {5, 10, 15}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {5, 10, 15}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 1, {5, 10, 15}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{1, 64, 16, {5, 10, 15}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {10}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 16, {10}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 
16, {5, 10, 15}, false, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, false, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, false, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, false, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, true, false, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, true, false, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 1, {5, 10, 15}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, true, true, false, false}, + SamplingHeterogeneousPostProcessing_Usecase{128, 64, 1, {5, 10, 15}, true, true, true, false}, + SamplingHeterogeneousPostProcessing_Usecase{ + 128, 64, 16, {5, 10, 15}, true, true, true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cpp similarity index 52% rename from cpp/tests/sampling/sampling_post_processing_test.cu rename to cpp/tests/sampling/sampling_post_processing_test.cpp index ecec1d0ed89..b262794d26d 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cpp @@ -14,30 +14,17 @@ * limitations under the License. */ +#include "detail/sampling_post_processing_validate.hpp" #include "utilities/base_fixture.hpp" -#include -#include #include #include -#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include struct SamplingPostProcessing_Usecase { @@ -53,385 +40,6 @@ struct SamplingPostProcessing_Usecase { bool check_correctness{true}; }; -template <typename vertex_t, typename weight_t> -bool compare_edgelist(raft::handle_t const& handle, - raft::device_span<vertex_t const> org_edgelist_srcs, - raft::device_span<vertex_t const> org_edgelist_dsts, - std::optional<raft::device_span<weight_t const>> org_edgelist_weights, - raft::device_span<vertex_t const> renumbered_edgelist_srcs, - raft::device_span<vertex_t const> renumbered_edgelist_dsts, - std::optional<raft::device_span<weight_t const>> renumbered_edgelist_weights, - std::optional<raft::device_span<vertex_t const>> renumber_map) -{ - if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; } - - rmm::device_uvector<vertex_t> sorted_org_edgelist_srcs(org_edgelist_srcs.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_srcs.begin(), - org_edgelist_srcs.end(), - sorted_org_edgelist_srcs.begin()); - rmm::device_uvector<vertex_t> sorted_org_edgelist_dsts(org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_dsts.begin(), - org_edgelist_dsts.end(), - sorted_org_edgelist_dsts.begin()); - auto sorted_org_edgelist_weights = org_edgelist_weights - ? 
std::make_optional<rmm::device_uvector<weight_t>>( - (*org_edgelist_weights).size(), handle.get_stream()) - : std::nullopt; - if (sorted_org_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*org_edgelist_weights).begin(), - (*org_edgelist_weights).end(), - (*sorted_org_edgelist_weights).begin()); - } - - if (sorted_org_edgelist_weights) { - auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), - sorted_org_edgelist_dsts.begin(), - (*sorted_org_edgelist_weights).begin()); - thrust::sort(handle.get_thrust_policy(), - sorted_org_edge_first, - sorted_org_edge_first + sorted_org_edgelist_srcs.size()); - } else { - auto sorted_org_edge_first = - thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); - thrust::sort(handle.get_thrust_policy(), - sorted_org_edge_first, - sorted_org_edge_first + sorted_org_edgelist_srcs.size()); - } - - rmm::device_uvector<vertex_t> sorted_unrenumbered_edgelist_srcs(renumbered_edgelist_srcs.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - renumbered_edgelist_srcs.begin(), - renumbered_edgelist_srcs.end(), - sorted_unrenumbered_edgelist_srcs.begin()); - rmm::device_uvector<vertex_t> sorted_unrenumbered_edgelist_dsts(renumbered_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - renumbered_edgelist_dsts.begin(), - renumbered_edgelist_dsts.end(), - sorted_unrenumbered_edgelist_dsts.begin()); - auto sorted_unrenumbered_edgelist_weights = - renumbered_edgelist_weights ? std::make_optional<rmm::device_uvector<weight_t>>( - (*renumbered_edgelist_weights).size(), handle.get_stream()) - : std::nullopt; - if (sorted_unrenumbered_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*renumbered_edgelist_weights).begin(), - (*renumbered_edgelist_weights).end(), - (*sorted_unrenumbered_edgelist_weights).begin()); - } - - if (renumber_map) { - cugraph::unrenumber_int_vertices<vertex_t, false>( - handle, - sorted_unrenumbered_edgelist_srcs.data(), - sorted_unrenumbered_edgelist_srcs.size(), - (*renumber_map).data(), - std::vector<vertex_t>{static_cast<vertex_t>((*renumber_map).size())}); - cugraph::unrenumber_int_vertices<vertex_t, false>( - handle, - sorted_unrenumbered_edgelist_dsts.data(), - sorted_unrenumbered_edgelist_dsts.size(), - (*renumber_map).data(), - std::vector<vertex_t>{static_cast<vertex_t>((*renumber_map).size())}); - } - - if (sorted_unrenumbered_edgelist_weights) { - auto sorted_unrenumbered_edge_first = - thrust::make_zip_iterator(sorted_unrenumbered_edgelist_srcs.begin(), - sorted_unrenumbered_edgelist_dsts.begin(), - (*sorted_unrenumbered_edgelist_weights).begin()); - thrust::sort(handle.get_thrust_policy(), - sorted_unrenumbered_edge_first, - sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); - - auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), - sorted_org_edgelist_dsts.begin(), - (*sorted_org_edgelist_weights).begin()); - return thrust::equal(handle.get_thrust_policy(), - sorted_org_edge_first, - sorted_org_edge_first + sorted_org_edgelist_srcs.size(), - sorted_unrenumbered_edge_first); - } else { - auto sorted_unrenumbered_edge_first = thrust::make_zip_iterator( - sorted_unrenumbered_edgelist_srcs.begin(), sorted_unrenumbered_edgelist_dsts.begin()); - thrust::sort(handle.get_thrust_policy(), - sorted_unrenumbered_edge_first, - sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); - - auto sorted_org_edge_first = - thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); - return 
thrust::equal(handle.get_thrust_policy(), - sorted_org_edge_first, - sorted_org_edge_first + sorted_org_edgelist_srcs.size(), - sorted_unrenumbered_edge_first); - } -} - -template <typename vertex_t> -bool check_renumber_map_invariants( - raft::handle_t const& handle, - std::optional<raft::device_span<vertex_t const>> starting_vertices, - raft::device_span<vertex_t const> org_edgelist_srcs, - raft::device_span<vertex_t const> org_edgelist_dsts, - std::optional<raft::device_span<int32_t const>> org_edgelist_hops, - raft::device_span<vertex_t const> renumber_map, - bool src_is_major) -{ - // Check the invariants in renumber_map - // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique vertices, - // where flag is 0 for sources and 1 for destinations. Then, vertices with smaller (hop, flag) - // pairs should be renumbered to smaller numbers than vertices with larger (hop, flag) pairs. - auto org_edgelist_majors = src_is_major ? org_edgelist_srcs : org_edgelist_dsts; - auto org_edgelist_minors = src_is_major ? org_edgelist_dsts : org_edgelist_srcs; - - rmm::device_uvector<vertex_t> unique_majors(org_edgelist_majors.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_majors.begin(), - org_edgelist_majors.end(), - unique_majors.begin()); - if (starting_vertices) { - auto old_size = unique_majors.size(); - unique_majors.resize(old_size + (*starting_vertices).size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - (*starting_vertices).begin(), - (*starting_vertices).end(), - unique_majors.begin() + old_size); - } - - std::optional<rmm::device_uvector<int32_t>> unique_major_hops = - org_edgelist_hops ? std::make_optional<rmm::device_uvector<int32_t>>( - (*org_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (org_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*org_edgelist_hops).begin(), - (*org_edgelist_hops).end(), - (*unique_major_hops).begin()); - if (starting_vertices) { - auto old_size = (*unique_major_hops).size(); - (*unique_major_hops).resize(old_size + (*starting_vertices).size(), handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - (*unique_major_hops).begin() + old_size, - (*unique_major_hops).end(), - int32_t{0}); - } - - auto pair_first = - thrust::make_zip_iterator(unique_majors.begin(), (*unique_major_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_majors.size()); - unique_majors.resize( - thrust::distance(unique_majors.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_majors.begin(), - unique_majors.end(), - (*unique_major_hops).begin()))), - handle.get_stream()); - (*unique_major_hops).resize(unique_majors.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end()); - unique_majors.resize( - thrust::distance( - unique_majors.begin(), - thrust::unique(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end())), - handle.get_stream()); - } - - rmm::device_uvector<vertex_t> unique_minors(org_edgelist_minors.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_minors.begin(), - org_edgelist_minors.end(), - unique_minors.begin()); - std::optional<rmm::device_uvector<int32_t>> unique_minor_hops = - org_edgelist_hops ? 
std::make_optional<rmm::device_uvector<int32_t>>( - (*org_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (org_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*org_edgelist_hops).begin(), - (*org_edgelist_hops).end(), - (*unique_minor_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_minors.begin(), (*unique_minor_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_minors.size()); - unique_minors.resize( - thrust::distance(unique_minors.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_minors.begin(), - unique_minors.end(), - (*unique_minor_hops).begin()))), - handle.get_stream()); - (*unique_minor_hops).resize(unique_minors.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end()); - unique_minors.resize( - thrust::distance( - unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end())), - handle.get_stream()); - } - - rmm::device_uvector<vertex_t> sorted_org_vertices(renumber_map.size(), handle.get_stream()); - rmm::device_uvector<vertex_t> matching_renumbered_vertices(sorted_org_vertices.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - renumber_map.begin(), - renumber_map.end(), - sorted_org_vertices.begin()); - thrust::sequence(handle.get_thrust_policy(), - matching_renumbered_vertices.begin(), - matching_renumbered_vertices.end(), - vertex_t{0}); - thrust::sort_by_key(handle.get_thrust_policy(), - sorted_org_vertices.begin(), - sorted_org_vertices.end(), - matching_renumbered_vertices.begin()); - - if (org_edgelist_hops) { - rmm::device_uvector<vertex_t> merged_vertices(unique_majors.size() + unique_minors.size(), - handle.get_stream()); - rmm::device_uvector<int32_t> merged_hops(merged_vertices.size(), handle.get_stream()); - rmm::device_uvector<int8_t> merged_flags(merged_vertices.size(), handle.get_stream()); - - auto major_triplet_first = thrust::make_zip_iterator(unique_majors.begin(), - (*unique_major_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto minor_triplet_first = thrust::make_zip_iterator(unique_minors.begin(), - (*unique_minor_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - major_triplet_first, - major_triplet_first + unique_majors.size(), - minor_triplet_first, - minor_triplet_first + unique_minors.size(), - thrust::make_zip_iterator( - merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); - merged_vertices.resize( - thrust::distance(merged_vertices.begin(), - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_hops.resize(merged_vertices.size(), handle.get_stream()); - merged_flags.resize(merged_vertices.size(), handle.get_stream()); - - auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); - - auto num_unique_keys = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(merged_hops.size()), - cugraph::detail::is_first_in_run_t<decltype(sort_key_first)>{sort_key_first}); - rmm::device_uvector<vertex_t> min_vertices(num_unique_keys, handle.get_stream()); - rmm::device_uvector<vertex_t> 
max_vertices(num_unique_keys, handle.get_stream()); - - auto renumbered_merged_vertex_first = thrust::make_transform_iterator( - merged_vertices.begin(), - cuda::proclaim_return_type<vertex_t>( - [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span<vertex_t const>( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t major) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; - })); - - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - min_vertices.begin(), - thrust::equal_to<thrust::tuple<int32_t, int8_t>>{}, - thrust::minimum<vertex_t>{}); - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - max_vertices.begin(), - thrust::equal_to<thrust::tuple<int32_t, int8_t>>{}, - thrust::maximum<vertex_t>{}); - - auto num_violations = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{1}), - thrust::make_counting_iterator(min_vertices.size()), - [min_vertices = raft::device_span<vertex_t const>(min_vertices.data(), min_vertices.size()), - max_vertices = raft::device_span<vertex_t const>(max_vertices.data(), - max_vertices.size())] __device__(size_t i) { - return min_vertices[i] <= max_vertices[i - 1]; - }); - - return (num_violations == 0); - } else { - unique_minors.resize( - thrust::distance( - unique_minors.begin(), - thrust::remove_if(handle.get_thrust_policy(), - unique_minors.begin(), - unique_minors.end(), - [sorted_unique_majors = raft::device_span<vertex_t const>( - unique_majors.data(), unique_majors.size())] __device__(auto minor) { - return thrust::binary_search(thrust::seq, - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - minor); - })), - handle.get_stream()); - - auto max_major_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_majors.begin(), - unique_majors.end(), - cuda::proclaim_return_type<vertex_t>( - [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span<vertex_t const>( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t major) -> vertex_t { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; - }), - std::numeric_limits<vertex_t>::lowest(), - thrust::maximum<vertex_t>{}); - - auto min_minor_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_minors.begin(), - unique_minors.end(), - cuda::proclaim_return_type<vertex_t>( - [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span<vertex_t const>( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t minor) -> vertex_t { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; - }), - std::numeric_limits<vertex_t>::max(), - thrust::minimum<vertex_t>{}); - - return (max_major_renumbered_vertex < min_minor_renumbered_vertex); - } -} - template <typename input_usecase_t> class 
Tests_SamplingPostProcessing : public ::testing::TestWithParam<std::tuple<SamplingPostProcessing_Usecase, input_usecase_t>> { @@ -450,7 +58,7 @@ class Tests_SamplingPostProcessing { using label_t = int32_t; using weight_t = float; - using edge_id_t = vertex_t; + using edge_id_t = edge_t; using edge_type_t = int32_t; bool constexpr store_transposed = false; @@ -462,6 +70,8 @@ class Tests_SamplingPostProcessing raft::handle_t handle{}; HighResTimer hr_timer{}; + // 1. create a graph + if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement hr_timer.start("Construct graph"); @@ -481,6 +91,8 @@ class Tests_SamplingPostProcessing auto edge_weight_view = edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; + // 2. seed vertices (& labels) + raft::random::RngState rng_state(0); rmm::device_uvector<vertex_t> starting_vertices( @@ -503,20 +115,22 @@ class Tests_SamplingPostProcessing sampling_post_processing_usecase.num_labels + 1, handle.get_stream()) : std::nullopt; if (starting_vertex_labels) { - thrust::tabulate( - handle.get_thrust_policy(), - (*starting_vertex_labels).begin(), - (*starting_vertex_labels).end(), - [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( - size_t i) { return static_cast<label_t>(i / num_seeds_per_label); }); - thrust::tabulate( - handle.get_thrust_policy(), - (*starting_vertex_label_offsets).begin(), - (*starting_vertex_label_offsets).end(), - [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( - size_t i) { return num_seeds_per_label * i; }); + auto num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label; + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + cugraph::detail::scalar_fill(handle.get_stream(), + (*starting_vertex_labels).data() + i * num_seeds_per_label, + num_seeds_per_label, + static_cast<label_t>(i)); + } + cugraph::detail::stride_fill(handle.get_stream(), + (*starting_vertex_label_offsets).data(), + (*starting_vertex_label_offsets).size(), + size_t{0}, + num_seeds_per_label); } + // 3. sampling + rmm::device_uvector<vertex_t> org_edgelist_srcs(0, handle.get_stream()); rmm::device_uvector<vertex_t> org_edgelist_dsts(0, handle.get_stream()); std::optional<rmm::device_uvector<weight_t>> org_edgelist_weights{std::nullopt}; @@ -562,6 +176,8 @@ class Tests_SamplingPostProcessing std::swap(org_edgelist_srcs, org_edgelist_dsts); }
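// Editor's sketch (illustrative, not part of the diff): with num_labels = 2 and
// num_seeds_per_label = 3, the scalar_fill/stride_fill calls above produce
//   starting_vertex_labels        = [0, 0, 0, 1, 1, 1]
//   starting_vertex_label_offsets = [0, 3, 6]
// i.e. the same contents the removed thrust::tabulate calls generated, but via two
// purpose-built fills instead of per-element device lambdas.

+ // 4.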
post processing: renumber & sort + { rmm::device_uvector renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), handle.get_stream()); @@ -652,178 +268,138 @@ class Tests_SamplingPostProcessing if (sampling_post_processing_usecase.check_correctness) { if (renumbered_and_sorted_edgelist_label_hop_offsets) { - ASSERT_TRUE((*renumbered_and_sorted_edgelist_label_hop_offsets).size() == - sampling_post_processing_usecase.num_labels * - sampling_post_processing_usecase.fanouts.size() + - 1) - << "Renumbered and sorted edge list (label,hop) offset array size should coincide with " - "the number of labels * the number of hops + 1."; - - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - (*renumbered_and_sorted_edgelist_label_hop_offsets).begin(), - (*renumbered_and_sorted_edgelist_label_hop_offsets).end())) - << "Renumbered and sorted edge list (label,hop) offset array values should be " - "non-decreasing."; - - ASSERT_TRUE( - (*renumbered_and_sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == - renumbered_and_sorted_edgelist_srcs.size()) - << "Renumbered and sorted edge list (label,hop) offset array's last element should " - "coincide with the number of edges."; + ASSERT_TRUE(check_offsets(handle, + raft::device_span( + (*renumbered_and_sorted_edgelist_label_hop_offsets).data(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).size()), + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size(), + renumbered_and_sorted_edgelist_srcs.size())) + << "Renumbered and sorted edge (label, hop) offset array is invalid."; } if (renumbered_and_sorted_renumber_map_label_offsets) { - ASSERT_TRUE((*renumbered_and_sorted_renumber_map_label_offsets).size() == - sampling_post_processing_usecase.num_labels + 1) - << "Renumbered and sorted offset (label, hop) offset array size should coincide with " - "the number of labels + 1."; - - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - (*renumbered_and_sorted_renumber_map_label_offsets).begin(), - (*renumbered_and_sorted_renumber_map_label_offsets).end())) - << "Renumbered and sorted renumber map label offset array values should be " - "non-decreasing."; - - ASSERT_TRUE( - (*renumbered_and_sorted_renumber_map_label_offsets).back_element(handle.get_stream()) == - renumbered_and_sorted_renumber_map.size()) - << "Renumbered and sorted renumber map label offset array's last value should coincide " - "with the renumber map size."; + ASSERT_TRUE(check_offsets(handle, + raft::device_span( + (*renumbered_and_sorted_renumber_map_label_offsets).data(), + (*renumbered_and_sorted_renumber_map_label_offsets).size()), + sampling_post_processing_usecase.num_labels, + renumbered_and_sorted_renumber_map.size())) + << "Renumbered and sorted renumber map label offset array is invalid."; } - for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { - size_t starting_vertex_start_offset = - starting_vertex_label_offsets - ? (*starting_vertex_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t starting_vertex_end_offset = - starting_vertex_label_offsets - ? (*starting_vertex_label_offsets).element(i + 1, handle.get_stream()) - : starting_vertices.size(); + // check whether the edges are properly sorted + + auto renumbered_and_sorted_edgelist_majors = + sampling_post_processing_usecase.src_is_major + ? 
raft::device_span(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()) + : raft::device_span(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()); + auto renumbered_and_sorted_edgelist_minors = + sampling_post_processing_usecase.src_is_major + ? raft::device_span(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()) + : raft::device_span(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()); - size_t edgelist_start_offset = - org_edgelist_label_offsets - ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t edgelist_end_offset = - org_edgelist_label_offsets - ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) - : org_edgelist_srcs.size(); - if (edgelist_start_offset == edgelist_end_offset) continue; - - auto this_label_starting_vertices = raft::device_span( - starting_vertices.data() + starting_vertex_start_offset, - starting_vertex_end_offset - starting_vertex_start_offset); - - auto this_label_org_edgelist_srcs = - raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_dsts = - raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_hops = - org_edgelist_hops ? std::make_optional>( - (*org_edgelist_hops).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - auto this_label_org_edgelist_weights = - org_edgelist_weights ? std::make_optional>( - (*org_edgelist_weights).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - - auto this_label_output_edgelist_srcs = raft::device_span( - renumbered_and_sorted_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_output_edgelist_dsts = raft::device_span( - renumbered_and_sorted_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_output_edgelist_weights = - renumbered_and_sorted_edgelist_weights - ? std::make_optional>( - (*renumbered_and_sorted_edgelist_weights).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - - size_t renumber_map_start_offset = - renumbered_and_sorted_renumber_map_label_offsets - ? (*renumbered_and_sorted_renumber_map_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t renumber_map_end_offset = renumbered_and_sorted_renumber_map_label_offsets - ? (*renumbered_and_sorted_renumber_map_label_offsets) - .element(i + 1, handle.get_stream()) - : renumbered_and_sorted_renumber_map.size(); - auto this_label_output_renumber_map = raft::device_span( - renumbered_and_sorted_renumber_map.data() + renumber_map_start_offset, - renumber_map_end_offset - renumber_map_start_offset); - - // check whether the edges are properly sorted - - auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major - ? this_label_output_edgelist_srcs - : this_label_output_edgelist_dsts; - auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major - ? 
this_label_output_edgelist_dsts - : this_label_output_edgelist_srcs; - - if (this_label_org_edgelist_hops) { - auto num_hops = sampling_post_processing_usecase.fanouts.size(); - auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), - this_label_output_edgelist_minors.begin()); - for (size_t j = 0; j < num_hops; ++j) { - auto hop_start_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) - .element(i * num_hops + j, handle.get_stream()) - - (*renumbered_and_sorted_edgelist_label_hop_offsets) - .element(i * num_hops, handle.get_stream()); - auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) - .element(i * num_hops + j + 1, handle.get_stream()) - - (*renumbered_and_sorted_edgelist_label_hop_offsets) - .element(i * num_hops, handle.get_stream()); - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - edge_first + hop_start_offset, - edge_first + hop_end_offset)) - << "Renumbered and sorted output edges are not properly sorted."; - } - } else { - auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), - this_label_output_edgelist_minors.begin()); - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - edge_first, - edge_first + this_label_output_edgelist_majors.size())) - << "Renumbered and sorted output edges are not properly sorted."; + if (renumbered_and_sorted_edgelist_label_hop_offsets) { + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size(); + ++i) { + auto hop_start_offset = + (*renumbered_and_sorted_edgelist_label_hop_offsets).element(i, handle.get_stream()); + auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i + 1, handle.get_stream()); + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span( + renumbered_and_sorted_edgelist_majors.data() + hop_start_offset, + hop_end_offset - hop_start_offset), + raft::device_span( + renumbered_and_sorted_edgelist_minors.data() + hop_start_offset, + hop_end_offset - hop_start_offset))) + << "Renumbered and sorted edge list is not properly sorted."; } + } else { + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span(renumbered_and_sorted_edgelist_majors.data(), + renumbered_and_sorted_edgelist_majors.size()), + raft::device_span(renumbered_and_sorted_edgelist_minors.data(), + renumbered_and_sorted_edgelist_minors.size()))) + << "Renumbered and sorted edge list is not properly sorted."; + } - // check whether renumbering recovers the original edge list - - ASSERT_TRUE(compare_edgelist(handle, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_weights, - this_label_output_edgelist_srcs, - this_label_output_edgelist_dsts, - this_label_output_edgelist_weights, - std::make_optional(this_label_output_renumber_map))) - << "Unrenumbering the renumbered and sorted edge list does not recover the original " - "edgelist."; + ASSERT_TRUE(compare_edgelist( + handle, + raft::device_span(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data(), (*org_edgelist_weights).size()) + : std::nullopt, + org_edgelist_label_offsets + ? 
std::make_optional>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size()) + : std::nullopt, + raft::device_span(renumbered_and_sorted_edgelist_srcs.data(), + renumbered_and_sorted_edgelist_srcs.size()), + raft::device_span(renumbered_and_sorted_edgelist_dsts.data(), + renumbered_and_sorted_edgelist_dsts.size()), + renumbered_and_sorted_edgelist_weights + ? std::make_optional>( + (*renumbered_and_sorted_edgelist_weights).data(), + (*renumbered_and_sorted_edgelist_weights).size()) + : std::nullopt, + std::make_optional>( + renumbered_and_sorted_renumber_map.data(), renumbered_and_sorted_renumber_map.size()), + renumbered_and_sorted_renumber_map_label_offsets + ? std::make_optional>( + (*renumbered_and_sorted_renumber_map_label_offsets).data(), + (*renumbered_and_sorted_renumber_map_label_offsets).size()) + : std::nullopt, + sampling_post_processing_usecase.num_labels)) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; - // Check the invariants in renumber_map + // Check the invariants in renumber_map - ASSERT_TRUE(check_renumber_map_invariants( - handle, - sampling_post_processing_usecase.renumber_with_seeds - ? std::make_optional>( - this_label_starting_vertices.data(), this_label_starting_vertices.size()) - : std::nullopt, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_hops, - this_label_output_renumber_map, - sampling_post_processing_usecase.src_is_major)) - << "Renumbered and sorted output renumber map violates invariants."; - } + ASSERT_TRUE(check_vertex_renumber_map_invariants( + handle, + sampling_post_processing_usecase.renumber_with_seeds + ? std::make_optional>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_post_processing_usecase.renumber_with_seeds && starting_vertex_label_offsets) + ? std::make_optional>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, + raft::device_span(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data(), (*org_edgelist_hops).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional>( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size()) + : std::nullopt, + raft::device_span(renumbered_and_sorted_renumber_map.data(), + renumbered_and_sorted_renumber_map.size()), + renumbered_and_sorted_renumber_map_label_offsets + ? std::make_optional>( + (*renumbered_and_sorted_renumber_map_label_offsets).data(), + (*renumbered_and_sorted_renumber_map_label_offsets).size()) + : std::nullopt, + std::nullopt, + sampling_post_processing_usecase.num_labels, + 1, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; } } + // 5. 
post processing: renumber & compress + { rmm::device_uvector renumbered_and_compressed_edgelist_srcs( org_edgelist_srcs.size(), handle.get_stream()); @@ -921,126 +497,52 @@ class Tests_SamplingPostProcessing } if (sampling_post_processing_usecase.check_correctness) { - if (renumbered_and_compressed_nzd_vertices) { - ASSERT_TRUE(renumbered_and_compressed_offsets.size() == - (*renumbered_and_compressed_nzd_vertices).size() + 1) - << "Renumbered and compressed offset array size should coincide with the number of " - "non-zero-degree vertices + 1."; - } - - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - renumbered_and_compressed_offsets.begin(), - renumbered_and_compressed_offsets.end())) - << "Renumbered and compressed offset array values should be non-decreasing."; - - ASSERT_TRUE(renumbered_and_compressed_offsets.back_element(handle.get_stream()) == - renumbered_and_compressed_edgelist_minors.size()) - << "Renumbered and compressed offset array's last value should coincide with the number " - "of " - "edges."; + ASSERT_TRUE(check_offsets( + handle, + raft::device_span(renumbered_and_compressed_offsets.data(), + renumbered_and_compressed_offsets.size()), + renumbered_and_compressed_nzd_vertices ? (*renumbered_and_compressed_nzd_vertices).size() + : renumbered_and_compressed_offsets.size() - 1, + renumbered_and_compressed_edgelist_minors.size())) + << "Renumbered and compressed offset array is invalid"; if (renumbered_and_compressed_offset_label_hop_offsets) { - ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets).size() == - sampling_post_processing_usecase.num_labels * - sampling_post_processing_usecase.fanouts.size() + - 1) - << "Renumbered and compressed offset (label,hop) offset array size should coincide " - "with " - "the number of labels * the number of hops + 1."; - - ASSERT_TRUE( - thrust::is_sorted(handle.get_thrust_policy(), - (*renumbered_and_compressed_offset_label_hop_offsets).begin(), - (*renumbered_and_compressed_offset_label_hop_offsets).end())) - << "Renumbered and compressed offset (label,hop) offset array values should be " - "non-decreasing."; - - ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets) - .back_element(handle.get_stream()) == - renumbered_and_compressed_offsets.size() - 1) - << "Renumbered and compressed offset (label,hop) offset array's last value should " - "coincide with the offset array size - 1."; + ASSERT_TRUE(check_offsets(handle, + raft::device_span( + (*renumbered_and_compressed_offset_label_hop_offsets).data(), + (*renumbered_and_compressed_offset_label_hop_offsets).size()), + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size(), + renumbered_and_compressed_offsets.size() - 1)) + << "Renumbered and compressed offset (label, hop) offset array is invalid"; } if (renumbered_and_compressed_renumber_map_label_offsets) { - ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets).size() == - sampling_post_processing_usecase.num_labels + 1) - << "Renumbered and compressed offset (label, hop) offset array size should coincide " - "with " - "the number of labels + 1."; - ASSERT_TRUE( - thrust::is_sorted(handle.get_thrust_policy(), - (*renumbered_and_compressed_renumber_map_label_offsets).begin(), - (*renumbered_and_compressed_renumber_map_label_offsets).end())) - << "Renumbered and compressed renumber map label offset array values should be " - "non-decreasing."; - - ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets) - 
.back_element(handle.get_stream()) == - renumbered_and_compressed_renumber_map.size()) - << "Renumbered and compressed renumber map label offset array's last value should " - "coincide with the renumber map size."; + check_offsets(handle, + raft::device_span( + (*renumbered_and_compressed_renumber_map_label_offsets).data(), + (*renumbered_and_compressed_renumber_map_label_offsets).size()), + sampling_post_processing_usecase.num_labels, + renumbered_and_compressed_renumber_map.size())) + << "Renumbered and compressed renumber map label offset array is invalid"; } - for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { - size_t starting_vertex_start_offset = - starting_vertex_label_offsets - ? (*starting_vertex_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t starting_vertex_end_offset = - starting_vertex_label_offsets - ? (*starting_vertex_label_offsets).element(i + 1, handle.get_stream()) - : starting_vertices.size(); - - size_t edgelist_start_offset = - org_edgelist_label_offsets - ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t edgelist_end_offset = - org_edgelist_label_offsets - ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) - : org_edgelist_srcs.size(); - if (edgelist_start_offset == edgelist_end_offset) continue; - - auto this_label_starting_vertices = raft::device_span( - starting_vertices.data() + starting_vertex_start_offset, - starting_vertex_end_offset - starting_vertex_start_offset); - - auto this_label_org_edgelist_srcs = - raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_dsts = - raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_hops = - org_edgelist_hops ? std::make_optional>( - (*org_edgelist_hops).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - auto this_label_org_edgelist_weights = - org_edgelist_weights ? std::make_optional>( - (*org_edgelist_weights).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - - rmm::device_uvector this_label_output_edgelist_srcs(0, handle.get_stream()); - rmm::device_uvector this_label_output_edgelist_dsts(0, handle.get_stream()); - auto this_label_output_edgelist_weights = - renumbered_and_compressed_edgelist_weights - ? std::make_optional>(0, handle.get_stream()) - : std::nullopt; - this_label_output_edgelist_srcs.reserve(edgelist_end_offset - edgelist_start_offset, - handle.get_stream()); - this_label_output_edgelist_dsts.reserve(edgelist_end_offset - edgelist_start_offset, - handle.get_stream()); - if (this_label_output_edgelist_weights) { - (*this_label_output_edgelist_weights) - .reserve(edgelist_end_offset - edgelist_start_offset, handle.get_stream()); - } - - // decompress + // check whether renumbering recovers the original edge list + + rmm::device_uvector output_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector output_edgelist_dsts(0, handle.get_stream()); + auto output_edgelist_weights = + renumbered_and_compressed_edgelist_weights + ? 
std::make_optional>(0, handle.get_stream()) + : std::nullopt; + output_edgelist_srcs.reserve(org_edgelist_srcs.size(), handle.get_stream()); + output_edgelist_dsts.reserve(org_edgelist_srcs.capacity(), handle.get_stream()); + if (output_edgelist_weights) { + (*output_edgelist_weights).reserve(org_edgelist_srcs.capacity(), handle.get_stream()); + } + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { auto num_hops = sampling_post_processing_usecase.fanouts.size(); for (size_t j = 0; j < num_hops; ++j) { auto offset_start_offset = renumbered_and_compressed_offset_label_hop_offsets @@ -1069,108 +571,123 @@ class Tests_SamplingPostProcessing h_offsets.data(), d_offsets.data(), h_offsets.size(), handle.get_stream()); handle.sync_stream(); - auto old_size = this_label_output_edgelist_srcs.size(); - this_label_output_edgelist_srcs.resize(old_size + (h_offsets.back() - h_offsets[0]), - handle.get_stream()); - this_label_output_edgelist_dsts.resize(this_label_output_edgelist_srcs.size(), - handle.get_stream()); - if (this_label_output_edgelist_weights) { - (*this_label_output_edgelist_weights) - .resize(this_label_output_edgelist_srcs.size(), handle.get_stream()); + auto old_size = output_edgelist_srcs.size(); + output_edgelist_srcs.resize(old_size + (h_offsets.back() - h_offsets[0]), + handle.get_stream()); + output_edgelist_dsts.resize(output_edgelist_srcs.size(), handle.get_stream()); + if (output_edgelist_weights) { + (*output_edgelist_weights).resize(output_edgelist_srcs.size(), handle.get_stream()); + } + if (renumbered_and_compressed_nzd_vertices) { + cugraph::test::expand_hypersparse_offsets( + handle, + raft::device_span(d_offsets.data(), d_offsets.size()), + raft::device_span( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1), + raft::device_span( + (sampling_post_processing_usecase.src_is_major ? output_edgelist_srcs.data() + : output_edgelist_dsts.data()) + + old_size, + h_offsets.back() - h_offsets[0]), + h_offsets[0]); + } else { + cugraph::test::expand_sparse_offsets( + handle, + raft::device_span(d_offsets.data(), d_offsets.size()), + raft::device_span( + (sampling_post_processing_usecase.src_is_major ? output_edgelist_srcs.data() + : output_edgelist_dsts.data()) + + old_size, + h_offsets.back() - h_offsets[0]), + h_offsets[0], + base_v); } - thrust::transform( - handle.get_thrust_policy(), - thrust::make_counting_iterator(h_offsets[0]), - thrust::make_counting_iterator(h_offsets.back()), - (sampling_post_processing_usecase.src_is_major - ? this_label_output_edgelist_srcs.begin() - : this_label_output_edgelist_dsts.begin()) + + raft::copy( + (sampling_post_processing_usecase.src_is_major ? output_edgelist_dsts.begin() + : output_edgelist_srcs.begin()) + old_size, - cuda::proclaim_return_type( - [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), - nzd_vertices = - renumbered_and_compressed_nzd_vertices - ? 
thrust::make_optional>( - (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, - (offset_end_offset - offset_start_offset) - 1) - : thrust::nullopt, - base_v] __device__(size_t i) { - auto idx = static_cast(thrust::distance( - offsets.begin() + 1, - thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); - if (nzd_vertices) { - return (*nzd_vertices)[idx]; - } else { - return base_v + static_cast(idx); - } - })); - thrust::copy(handle.get_thrust_policy(), - renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], - renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), - (sampling_post_processing_usecase.src_is_major - ? this_label_output_edgelist_dsts.begin() - : this_label_output_edgelist_srcs.begin()) + - old_size); - if (this_label_output_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets[0], - (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets.back(), - (*this_label_output_edgelist_weights).begin() + old_size); + renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], + h_offsets.back() - h_offsets[0], + handle.get_stream()); + if (output_edgelist_weights) { + raft::copy((*output_edgelist_weights).begin() + old_size, + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets[0], + h_offsets.back() - h_offsets[0], + handle.get_stream()); } } - - size_t renumber_map_start_offset = - renumbered_and_compressed_renumber_map_label_offsets - ? (*renumbered_and_compressed_renumber_map_label_offsets) - .element(i, handle.get_stream()) - : size_t{0}; - size_t renumber_map_end_offset = - renumbered_and_compressed_renumber_map_label_offsets - ? (*renumbered_and_compressed_renumber_map_label_offsets) - .element(i + 1, handle.get_stream()) - : renumbered_and_compressed_renumber_map.size(); - auto this_label_output_renumber_map = raft::device_span( - renumbered_and_compressed_renumber_map.data() + renumber_map_start_offset, - renumber_map_end_offset - renumber_map_start_offset); - - // check whether renumbering recovers the original edge list - - ASSERT_TRUE(compare_edgelist( - handle, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_weights, - raft::device_span(this_label_output_edgelist_srcs.data(), - this_label_output_edgelist_srcs.size()), - raft::device_span(this_label_output_edgelist_dsts.data(), - this_label_output_edgelist_dsts.size()), - this_label_output_edgelist_weights - ? std::make_optional>( - (*this_label_output_edgelist_weights).data(), - (*this_label_output_edgelist_weights).size()) - : std::nullopt, - std::make_optional(this_label_output_renumber_map))) - << "Unrenumbering the renumbered and sorted edge list does not recover the original " - "edgelist."; - - // Check the invariants in renumber_map - - ASSERT_TRUE(check_renumber_map_invariants( - handle, - sampling_post_processing_usecase.renumber_with_seeds - ? 
std::make_optional>( - this_label_starting_vertices.data(), this_label_starting_vertices.size()) - : std::nullopt, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_hops, - this_label_output_renumber_map, - sampling_post_processing_usecase.src_is_major)) - << "Renumbered and sorted output renumber map violates invariants."; } + + ASSERT_TRUE(compare_edgelist( + handle, + raft::device_span(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data(), (*org_edgelist_weights).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) + : std::nullopt, + raft::device_span(output_edgelist_srcs.data(), + output_edgelist_srcs.size()), + raft::device_span(output_edgelist_dsts.data(), + output_edgelist_dsts.size()), + output_edgelist_weights + ? std::make_optional>( + (*output_edgelist_weights).data(), (*output_edgelist_weights).size()) + : std::nullopt, + std::make_optional>( + renumbered_and_compressed_renumber_map.data(), + renumbered_and_compressed_renumber_map.size()), + renumbered_and_compressed_renumber_map_label_offsets + ? std::make_optional>( + (*renumbered_and_compressed_renumber_map_label_offsets).data(), + (*renumbered_and_compressed_renumber_map_label_offsets).size()) + : std::nullopt, + sampling_post_processing_usecase.num_labels)) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + ASSERT_TRUE(check_vertex_renumber_map_invariants( + handle, + sampling_post_processing_usecase.renumber_with_seeds + ? std::make_optional>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_post_processing_usecase.renumber_with_seeds && starting_vertex_label_offsets) + ? std::make_optional>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, + raft::device_span(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data(), (*org_edgelist_hops).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) + : std::nullopt, + raft::device_span(renumbered_and_compressed_renumber_map.data(), + renumbered_and_compressed_renumber_map.size()), + renumbered_and_compressed_renumber_map_label_offsets + ? std::make_optional>( + (*renumbered_and_compressed_renumber_map_label_offsets).data(), + (*renumbered_and_compressed_renumber_map_label_offsets).size()) + : std::nullopt, + std::nullopt, + sampling_post_processing_usecase.num_labels, + 1, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; } } + // 6. 
post processing: sort only + { rmm::device_uvector sorted_edgelist_srcs(org_edgelist_srcs.size(), handle.get_stream()); @@ -1245,25 +762,42 @@ class Tests_SamplingPostProcessing if (sampling_post_processing_usecase.check_correctness) { if (sorted_edgelist_label_hop_offsets) { - ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).size() == - sampling_post_processing_usecase.num_labels * - sampling_post_processing_usecase.fanouts.size() + - 1) - << "Sorted edge list (label,hop) offset array size should coincide with " - "the number of labels * the number of hops + 1."; - - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - (*sorted_edgelist_label_hop_offsets).begin(), - (*sorted_edgelist_label_hop_offsets).end())) - << "Sorted edge list (label,hop) offset array values should be " - "non-decreasing."; - - ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == - sorted_edgelist_srcs.size()) - << "Sorted edge list (label,hop) offset array's last element should coincide with the " - "number of edges."; + ASSERT_TRUE(check_offsets( + handle, + raft::device_span((*sorted_edgelist_label_hop_offsets).data(), + (*sorted_edgelist_label_hop_offsets).size()), + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size(), + sorted_edgelist_srcs.size())) + << "Sorted edge list (label, hop) offset array is invalid."; } + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist( + handle, + raft::device_span(org_edgelist_srcs.data(), org_edgelist_srcs.size()), + raft::device_span(org_edgelist_dsts.data(), org_edgelist_dsts.size()), + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data(), (*org_edgelist_weights).size()) + : std::nullopt, + org_edgelist_label_offsets + ? std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) + : std::nullopt, + raft::device_span(sorted_edgelist_srcs.data(), + sorted_edgelist_srcs.size()), + raft::device_span(sorted_edgelist_dsts.data(), + sorted_edgelist_dsts.size()), + sorted_edgelist_weights + ? 
std::make_optional>( + (*sorted_edgelist_weights).data(), (*sorted_edgelist_weights).size()) + : std::nullopt, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}, + sampling_post_processing_usecase.num_labels)) + << "Sorted edge list does not coincide with the original edgelist."; + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { size_t edgelist_start_offset = org_edgelist_label_offsets @@ -1314,9 +848,7 @@ class Tests_SamplingPostProcessing : this_label_output_edgelist_srcs; if (this_label_org_edgelist_hops) { - auto num_hops = sampling_post_processing_usecase.fanouts.size(); - auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), - this_label_output_edgelist_minors.begin()); + auto num_hops = sampling_post_processing_usecase.fanouts.size(); for (size_t j = 0; j < num_hops; ++j) { auto hop_start_offset = (*sorted_edgelist_label_hop_offsets) @@ -1326,32 +858,25 @@ class Tests_SamplingPostProcessing (*sorted_edgelist_label_hop_offsets) .element(i * num_hops + j + 1, handle.get_stream()) - (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - edge_first + hop_start_offset, - edge_first + hop_end_offset)) - << "Renumbered and sorted output edges are not properly sorted."; + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span( + this_label_output_edgelist_majors.data() + hop_start_offset, + hop_end_offset - hop_start_offset), + raft::device_span( + this_label_output_edgelist_minors.data() + hop_start_offset, + hop_end_offset - hop_start_offset))) + << "Sorted edge list is not properly sorted."; } } else { - auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), - this_label_output_edgelist_minors.begin()); - ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), - edge_first, - edge_first + this_label_output_edgelist_majors.size())) - << "Renumbered and sorted output edges are not properly sorted."; + ASSERT_TRUE(check_edgelist_is_sorted( + handle, + raft::device_span(this_label_output_edgelist_majors.data(), + this_label_output_edgelist_majors.size()), + raft::device_span(this_label_output_edgelist_minors.data(), + this_label_output_edgelist_minors.size()))) + << "Sorted edge list is not properly sorted."; } - - // check whether renumbering recovers the original edge list - - ASSERT_TRUE( - compare_edgelist(handle, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_weights, - this_label_output_edgelist_srcs, - this_label_output_edgelist_dsts, - this_label_output_edgelist_weights, - std::optional>{std::nullopt})) - << "Sorted edge list does not coincide with the original edgelist."; } } } diff --git a/cpp/tests/sampling/sg_random_walks_test.cpp b/cpp/tests/sampling/sg_random_walks_test.cpp index 7409c2ab758..4bcfebc6d51 100644 --- a/cpp/tests/sampling/sg_random_walks_test.cpp +++ b/cpp/tests/sampling/sg_random_walks_test.cpp @@ -40,8 +40,10 @@ struct UniformRandomWalks_Usecase { raft::device_span start_vertices, size_t num_paths) { + raft::random::RngState rng_state(0); + return cugraph::uniform_random_walks( - handle, graph_view, edge_weight_view, start_vertices, num_paths, seed); + handle, rng_state, graph_view, edge_weight_view, start_vertices, num_paths); } bool expect_throw() { return false; } @@ -62,12 +64,13 @@ struct BiasedRandomWalks_Usecase { { CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Biased random walk requires edge weights."); + 
raft::random::RngState rng_state(0); + return cugraph::biased_random_walks( - handle, graph_view, *edge_weight_view, start_vertices, num_paths, seed); + handle, rng_state, graph_view, *edge_weight_view, start_vertices, num_paths); } - // FIXME: Not currently implemented - bool expect_throw() { return true; } + bool expect_throw() { return !test_weighted; } }; struct Node2VecRandomWalks_Usecase { @@ -85,18 +88,19 @@ struct Node2VecRandomWalks_Usecase { raft::device_span start_vertices, size_t num_paths) { + raft::random::RngState rng_state(0); + return cugraph::node2vec_random_walks(handle, + rng_state, graph_view, edge_weight_view, start_vertices, num_paths, static_cast(p), - static_cast(q), - seed); + static_cast(q)); } - // FIXME: Not currently implemented - bool expect_throw() { return true; } + bool expect_throw() { return false; } }; template @@ -197,9 +201,6 @@ using Tests_Node2VecRandomWalks_File = using Tests_Node2VecRandomWalks_Rmat = Tests_RandomWalks>; -#if 0 -// FIXME: We should use these tests, gtest-1.11.0 makes it a runtime error -// to define and not instantiate these. TEST_P(Tests_UniformRandomWalks_File, Initialize_i32_i32_f) { run_current_test( @@ -211,7 +212,6 @@ TEST_P(Tests_UniformRandomWalks_Rmat, Initialize_i32_i32_f) run_current_test( override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); } -#endif TEST_P(Tests_BiasedRandomWalks_File, Initialize_i32_i32_f) { @@ -237,19 +237,12 @@ TEST_P(Tests_Node2VecRandomWalks_Rmat, Initialize_i32_i32_f) override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); } -#if 0 -// FIXME: Not sure why these are failing, but we're refactoring anyway. INSTANTIATE_TEST_SUITE_P( simple_test, Tests_UniformRandomWalks_File, - ::testing::Combine( - ::testing::Values(UniformRandomWalks_Usecase{false, 0, true}, - UniformRandomWalks_Usecase{true, 0, true}), - ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), - cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), - cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), - cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); -#endif + ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{false, 0, true}, + UniformRandomWalks_Usecase{true, 0, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( file_test, @@ -265,6 +258,16 @@ INSTANTIATE_TEST_SUITE_P( Node2VecRandomWalks_Usecase{4, 8, true, 0, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); +INSTANTIATE_TEST_SUITE_P( + file_large_test, + Tests_UniformRandomWalks_File, + ::testing::Combine( + ::testing::Values(UniformRandomWalks_Usecase{false, 0, true}, + UniformRandomWalks_Usecase{true, 0, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + INSTANTIATE_TEST_SUITE_P( file_large_test, Tests_BiasedRandomWalks_File, @@ -285,23 +288,20 @@ INSTANTIATE_TEST_SUITE_P( cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); -#if 0 -// FIXME: Not sure why these are failing, but we're refactoring anyway. 
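// Editor's sketch (illustrative, not part of this diff): as the updated wrappers above
// show, the random walk entry points now take a caller-owned raft::random::RngState in
// place of a seed argument, e.g.
//
//   raft::random::RngState rng_state(0);
//   auto [vertex_paths, edge_weight_paths] = cugraph::uniform_random_walks(
//     handle, rng_state, graph_view, edge_weight_view, start_vertices, num_paths);
//
// (The structured-binding return shape is an assumption for illustration.) Passing the
// RngState by reference lets repeated calls advance a single random stream, keeping a
// test run reproducible for a fixed initial seed.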
INSTANTIATE_TEST_SUITE_P( rmat_small_test, Tests_UniformRandomWalks_Rmat, - ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{false, 0, true}, - UniformRandomWalks_Usecase{true, 0, true}), - ::testing::Values(cugraph::test::Rmat_Usecase( - 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Combine( + ::testing::Values(UniformRandomWalks_Usecase{false, 0, true}, + UniformRandomWalks_Usecase{true, 0, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( rmat_benchmark_test, Tests_UniformRandomWalks_Rmat, - ::testing::Combine(::testing::Values(UniformRandomWalks_Usecase{true, 0, false}), - ::testing::Values(cugraph::test::Rmat_Usecase( - 20, 32, 0.57, 0.19, 0.19, 0, false, false)))); -#endif + ::testing::Combine( + ::testing::Values(UniformRandomWalks_Usecase{true, 0, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( rmat_small_test, diff --git a/cpp/tests/utilities/property_generator_utilities.hpp b/cpp/tests/utilities/property_generator_utilities.hpp index 6bd22da1f75..f907501cc7c 100644 --- a/cpp/tests/utilities/property_generator_utilities.hpp +++ b/cpp/tests/utilities/property_generator_utilities.hpp @@ -34,6 +34,7 @@ template <typename GraphViewType, typename property_t> struct generate { private: using vertex_type = typename GraphViewType::vertex_type; + using edge_type_t = int32_t; using property_buffer_type = std::decay_t<decltype(cugraph::allocate_dataframe_buffer<property_t>( size_t{0}, rmm::cuda_stream_view{}))>; @@ -62,6 +63,28 @@ struct generate { static cugraph::edge_property_t<GraphViewType, property_t> edge_property( raft::handle_t const& handle, GraphViewType const& graph_view, int32_t hash_bin_count); + + static cugraph::edge_property_t<GraphViewType, property_t> edge_property_by_src_dst_types( + raft::handle_t const& handle, + GraphViewType const& graph_view, + raft::device_span<vertex_type const> vertex_type_offsets, + int32_t hash_bin_count); + + // generate unique edge property values (in [0, # edges in the graph)); if property_t is an + // integral type, this function requires std::numeric_limits<property_t>::max() to be no smaller + // than the number of edges in the input graph. + static cugraph::edge_property_t<GraphViewType, property_t> unique_edge_property( + raft::handle_t const& handle, GraphViewType const& graph_view); + + // generate unique (edge property value, edge type) pairs; if property_t is an integral type, edge + // property values for each type are consecutive integers starting from 0, and this function + // requires std::numeric_limits<property_t>::max() to be no smaller than the number of edges in + // the input graph.
+ static cugraph::edge_property_t unique_edge_property_per_type( + raft::handle_t const& handle, + GraphViewType const& graph_view, + cugraph::edge_property_view_t edge_type_view, + int32_t num_edge_types); }; } // namespace test diff --git a/cpp/tests/utilities/property_generator_utilities_impl.cuh b/cpp/tests/utilities/property_generator_utilities_impl.cuh index a46009f95e3..61a861b6670 100644 --- a/cpp/tests/utilities/property_generator_utilities_impl.cuh +++ b/cpp/tests/utilities/property_generator_utilities_impl.cuh @@ -26,6 +26,7 @@ #include +#include #include #include @@ -127,5 +128,102 @@ generate::edge_property(raft::handle_t const& handle, return output_property; } +template +cugraph::edge_property_t +generate::edge_property_by_src_dst_types( + raft::handle_t const& handle, + GraphViewType const& graph_view, + raft::device_span vertex_type_offsets, + int32_t hash_bin_count) +{ + auto output_property = cugraph::edge_property_t(handle, graph_view); + + cugraph::transform_e( + handle, + graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [vertex_type_offsets, hash_bin_count] __device__(auto src, auto dst, auto, auto, auto) { + auto src_v_type = thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), src)); + auto dst_v_type = thrust::distance( + vertex_type_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, vertex_type_offsets.begin() + 1, vertex_type_offsets.end(), dst)); + auto num_v_types = vertex_type_offsets.size() - 1; + return detail::make_property_value((src_v_type * num_v_types + dst_v_type) % + hash_bin_count); + }, + output_property.mutable_view()); + + return output_property; +} + +template +cugraph::edge_property_t +generate::unique_edge_property(raft::handle_t const& handle, + GraphViewType const& graph_view) +{ + auto output_property = cugraph::edge_property_t(handle, graph_view); + if constexpr (std::is_integral_v && !std::is_same_v) { + CUGRAPH_EXPECTS( + graph_view.compute_number_of_edges(handle) <= std::numeric_limits::max(), + "std::numeric_limits::max() is smaller than the number of edges."); + rmm::device_scalar counter(property_t{0}, handle.get_stream()); + cugraph::transform_e( + handle, + graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [counter = counter.data()] __device__(auto, auto, auto, auto, auto) { + cuda::atomic_ref atomic_counter(*counter); + return atomic_counter.fetch_add(property_t{1}, cuda::std::memory_order_relaxed); + }, + output_property.mutable_view()); + if constexpr (GraphViewType::is_multi_gpu) { CUGRAPH_FAIL("unimplemented."); } + } else { + CUGRAPH_FAIL("unimplemented."); + } + return output_property; +} + +template +cugraph::edge_property_t +generate::unique_edge_property_per_type( + raft::handle_t const& handle, + GraphViewType const& graph_view, + cugraph::edge_property_view_t edge_type_view, + int32_t num_edge_types) +{ + auto output_property = cugraph::edge_property_t(handle, graph_view); + if constexpr (std::is_integral_v && !std::is_same_v) { + CUGRAPH_EXPECTS( + graph_view.compute_number_of_edges(handle) <= std::numeric_limits::max(), + "std::numeric_limits::max() is smaller than the number of edges."); + rmm::device_uvector counters(num_edge_types, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), counters.begin(), 
counters.end(), property_t{0}); + cugraph::transform_e( + handle, + graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_type_view, + [counters = raft::device_span(counters.data(), counters.size())] __device__( + auto, auto, auto, auto, int32_t edge_type) { + cuda::atomic_ref atomic_counter(counters[edge_type]); + return atomic_counter.fetch_add(property_t{1}, cuda::std::memory_order_relaxed); + }, + output_property.mutable_view()); + if constexpr (GraphViewType::is_multi_gpu) { CUGRAPH_FAIL("unimplemented."); } + } else { + CUGRAPH_FAIL("unimplemented."); + } + return output_property; +} + } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu index 8d26ac1f2fe..ef1c4f831eb 100644 --- a/cpp/tests/utilities/thrust_wrapper.cu +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -16,11 +16,15 @@ #include "utilities/thrust_wrapper.hpp" +#include +#include + #include #include #include #include +#include #include #include #include @@ -477,5 +481,70 @@ template void populate_vertex_ids(raft::handle_t const& handle, rmm::device_uvector& d_vertices_v, int64_t vertex_id_offset); +template +void expand_sparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span indices, + offset_t base_offset, + idx_t base_idx) +{ + rmm::device_uvector tmp_offsets(offsets.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + offsets.begin(), + offsets.end(), + tmp_offsets.begin(), + cugraph::detail::shift_left_t{base_offset}); + auto tmp = cugraph::detail::expand_sparse_offsets( + raft::device_span(tmp_offsets.data(), tmp_offsets.size()), + base_idx, + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), tmp.begin(), tmp.end(), indices.begin()); +} + +template void expand_sparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span indices, + size_t base_offset, + int32_t base_idx); + +template void expand_sparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span indices, + size_t base_offset, + int64_t base_idx); + +template +void expand_hypersparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span nzd_indices, + raft::device_span indices, + offset_t base_offset) +{ + rmm::device_uvector tmp_offsets(offsets.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + offsets.begin(), + offsets.end(), + tmp_offsets.begin(), + cugraph::detail::shift_left_t{base_offset}); + auto tmp = cugraph::detail::expand_sparse_offsets( + raft::device_span(tmp_offsets.data(), tmp_offsets.size()), + idx_t{0}, + handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), tmp.begin(), tmp.end(), nzd_indices.begin(), indices.begin()); +} + +template void expand_hypersparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span nzd_indices, + raft::device_span indices, + size_t base_offset); + +template void expand_hypersparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span nzd_indices, + raft::device_span indices, + size_t base_offset); + } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp index cd8bc33308f..afdff33d80a 100644 --- a/cpp/tests/utilities/thrust_wrapper.hpp +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -93,5 +93,19 @@ void 
populate_vertex_ids(raft::handle_t const& handle, rmm::device_uvector& d_vertices_v /* [INOUT] */, vertex_t vertex_id_offset); +template +void expand_sparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span indices, + offset_t base_offset, + idx_t base_idx); + +template +void expand_hypersparse_offsets(raft::handle_t const& handle, + raft::device_span offsets, + raft::device_span nzd_indices, + raft::device_span indices, + offset_t base_offset); + } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/validation_utilities.cu b/cpp/tests/utilities/validation_utilities.cu new file mode 100644 index 00000000000..3da998ad626 --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.cu @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "detail/graph_partition_utils.cuh" +#include "utilities/validation_utilities.hpp" + +#include + +#include +#include +#include +#include + +namespace cugraph::test { + +template +size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view) +{ + return thrust::count_if( + handle.get_thrust_policy(), + vertices.begin(), + vertices.end(), + [vertex_partition = cugraph::vertex_partition_device_view_t{ + vertex_partition_view}] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.in_local_vertex_partition_range_nocheck(val)); + }); +} + +template +size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst) +{ + return thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(src.size()), + [src, dst] __device__(size_t index) { + return (src[index - 1] == src[index]) && (dst[index - 1] == dst[index]); + }); +} + +// FIXME: Resolve this with dataframe_buffer variations in thrust_wrappers.cu +template +void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts) +{ + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end())); +} + +template +size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2) +{ + // FIXME: Add support for wgts, edgeids and edge_types... + // Added to the API for future support. 
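+ // Editor's note (illustrative): thrust::set_intersection requires both zipped (src, dst)
+ // ranges to already be sorted lexicographically (e.g. via cugraph::test::sort above).
+ // For example:
+ //   edge list 1: (0,1) (0,2) (1,3)
+ //   edge list 2: (0,2) (1,3) (2,4)   =>   intersection size == 2
+ // Writing to a discard_iterator stores nothing; the distance from the initial output
+ // iterator to the returned end iterator is exactly the number of common edges.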
+ + auto iter1 = thrust::make_zip_iterator(srcs1.begin(), dsts1.begin()); + auto iter2 = thrust::make_zip_iterator(srcs2.begin(), dsts2.begin()); + auto output_iter = thrust::make_discard_iterator(); + + return thrust::distance(output_iter, + thrust::set_intersection(handle.get_thrust_policy(), + iter1, + iter1 + srcs1.size(), + iter2, + iter2 + srcs2.size(), + output_iter)); +#if 0 + // OLD Approach + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(src_out.begin(), dst_out.begin()), + thrust::make_zip_iterator(src_out.end(), dst_out.end()), + cuda::proclaim_return_type( + [src = raft::device_span{graph_src.data(), graph_src.size()}, + dst = raft::device_span{graph_dst.data(), + graph_dst.size()}] __device__(auto tuple) { +#if 0 + // FIXME: This fails on rocky linux CUDA 11.8, works on CUDA 12 + return thrust::binary_search(thrust::seq, + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end()), + tuple) ? size_t{1} : size_t{0}; +#else + auto lb = thrust::distance( + src.begin(), + thrust::lower_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + auto ub = thrust::distance( + src.begin(), + thrust::upper_bound(thrust::seq, src.begin(), src.end(), thrust::get<0>(tuple))); + + if (src.data()[lb] == thrust::get<0>(tuple)) { + return thrust::binary_search( + thrust::seq, dst.begin() + lb, dst.begin() + ub, thrust::get<1>(tuple)) + ? size_t{1} + : size_t{0}; + } else { + return size_t{0}; + } +#endif + })); +#endif +} + +template +size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts) +{ + return thrust::count_if( + handle.get_thrust_policy(), + thrust::make_zip_iterator(srcs.begin(), dsts.begin()), + thrust::make_zip_iterator(srcs.end(), dsts.end()), + [comm_rank = handle.get_comms().get_rank(), + gpu_id_key_func = cugraph::detail::compute_gpu_id_from_int_edge_endpoints_t{ + vertex_partition_range_lasts, + handle.get_comms().get_size(), + handle.get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(), + handle.get_subcomm(cugraph::partition_manager::minor_comm_name()) + .get_size()}] __device__(auto e) { + return (gpu_id_key_func(thrust::get<0>(e), thrust::get<1>(e)) != comm_rank); + }); +} + +// TODO: Split SG from MG? 
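+// Editor's sketch (hypothetical usage, not part of this diff): a caller would typically
+// assert that each helper reports zero violations, e.g.
+//
+//   cugraph::test::sort(handle,
+//                       raft::device_span<vertex_t>(srcs.data(), srcs.size()),
+//                       raft::device_span<vertex_t>(dsts.data(), dsts.size()));
+//   ASSERT_EQ(0,
+//             cugraph::test::count_duplicate_vertex_pairs_sorted(
+//               handle,
+//               raft::device_span<vertex_t const>(srcs.data(), srcs.size()),
+//               raft::device_span<vertex_t const>(dsts.data(), dsts.size())));
+//
+// count_edges_on_wrong_int_gpu additionally uses handle.get_comms() and the partition
+// sub-communicators, so it is only meaningful in multi-GPU tests.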
+template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_invalid_vertices( + raft::handle_t const& handle, + raft::device_span vertices, + cugraph::vertex_partition_view_t const& vertex_partition_view); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst); + +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); +template void sort(raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_intersection(raft::handle_t const& handle, + raft::device_span srcs1, + raft::device_span dsts1, + std::optional> wgts1, + std::optional> edge_ids1, + std::optional> edge_types1, + raft::device_span srcs2, + raft::device_span dsts2, + std::optional> wgts2, + std::optional> edge_ids2, + std::optional> edge_types2); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +template size_t count_edges_on_wrong_int_gpu( + raft::handle_t const& handle, + raft::device_span srcs, + raft::device_span dsts, + raft::device_span vertex_partition_range_lasts); + +} // namespace cugraph::test diff --git a/cpp/tests/utilities/validation_utilities.hpp b/cpp/tests/utilities/validation_utilities.hpp new file mode 100644 index 00000000000..b94ceaf68be --- /dev/null +++ b/cpp/tests/utilities/validation_utilities.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cugraph/graph_view.hpp>
+
+#include <raft/core/device_span.hpp>
+#include <raft/core/handle.hpp>
+
+namespace cugraph::test {
+template <typename vertex_t, bool multi_gpu>
+size_t count_invalid_vertices(
+  raft::handle_t const& handle,
+  raft::device_span<vertex_t const> vertices,
+  cugraph::vertex_partition_view_t<vertex_t, multi_gpu> const& vertex_partition);
+
+template <typename vertex_t>
+size_t count_duplicate_vertex_pairs_sorted(raft::handle_t const& handle,
+                                           raft::device_span<vertex_t const> src,
+                                           raft::device_span<vertex_t const> dst);
+
+template <typename vertex_t>
+void sort(raft::handle_t const& handle,
+          raft::device_span<vertex_t> srcs,
+          raft::device_span<vertex_t> dsts);
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
+size_t count_intersection(raft::handle_t const& handle,
+                          raft::device_span<vertex_t const> srcs1,
+                          raft::device_span<vertex_t const> dsts1,
+                          std::optional<raft::device_span<weight_t const>> wgts1,
+                          std::optional<raft::device_span<edge_t const>> edge_ids1,
+                          std::optional<raft::device_span<edge_type_t const>> edge_types1,
+                          raft::device_span<vertex_t const> srcs2,
+                          raft::device_span<vertex_t const> dsts2,
+                          std::optional<raft::device_span<weight_t const>> wgts2,
+                          std::optional<raft::device_span<edge_t const>> edge_ids2,
+                          std::optional<raft::device_span<edge_type_t const>> edge_types2);
+
+template <typename vertex_t>
+size_t count_edges_on_wrong_int_gpu(raft::handle_t const& handle,
+                                    raft::device_span<vertex_t const> srcs,
+                                    raft::device_span<vertex_t const> dsts,
+                                    raft::device_span<vertex_t const> vertex_partition_range_lasts);
+
+} // namespace cugraph::test
diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh
index eea789ef3e3..6778166ab6e 100755
--- a/datasets/get_test_data.sh
+++ b/datasets/get_test_data.sh
@@ -27,6 +27,12 @@ cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )";
 #
 # FIXME: some test data needs to be extracted to "benchmarks", which is
 # confusing now that there's dedicated datasets for benchmarks.
+CPP_CI_DATASET_DATA=" +# ~10s download +https://data.rapids.ai/cugraph/test/cpp_ci_datasets.tgz +test +" + BASE_DATASET_DATA=" # ~22s download https://data.rapids.ai/cugraph/test/datasets.tgz @@ -89,6 +95,8 @@ if hasArg "--benchmark"; then DATASET_DATA="${BENCHMARK_DATASET_DATA}" elif hasArg "--subset"; then DATASET_DATA="${BASE_DATASET_DATA}" +elif hasArg "--cpp_ci_subset"; then + DATASET_DATA="${CPP_CI_DATASET_DATA}" elif hasArg "--self_loops"; then DATASET_DATA="${SELF_LOOPS_DATASET_DATA}" # Do not include benchmark datasets by default - too big diff --git a/dependencies.yaml b/dependencies.yaml index 6bb728a2aae..640adf8099f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -20,6 +20,7 @@ files: - depends_on_dask_cudf - depends_on_pylibraft - depends_on_raft_dask + # Deprecate pylibcugraphops - depends_on_pylibcugraphops - depends_on_pylibwholegraph - depends_on_cupy @@ -44,6 +45,7 @@ files: - cuda_version - docs - py_version + # Deprecate pylibcugraphops - depends_on_pylibcugraphops test_cpp: output: none @@ -135,6 +137,7 @@ files: extras: table: project includes: + - cuda_wheels - depends_on_rmm - depends_on_pylibraft py_test_pylibcugraph: @@ -188,6 +191,7 @@ files: table: project includes: - python_run_cugraph_dgl + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_dgl: output: pyproject @@ -214,6 +218,7 @@ files: table: project includes: - python_run_cugraph_pyg + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_pyg: output: pyproject @@ -239,6 +244,7 @@ files: extras: table: project includes: + # Deprecate pylibcugraphops - depends_on_pylibcugraphops py_test_cugraph_equivariant: output: pyproject @@ -298,6 +304,7 @@ files: conda_dir: python/cugraph-dgl/conda includes: - checks + # Deprecate pylibcugraphops - depends_on_pylibcugraphops - cugraph_dgl_dev - test_python_common @@ -308,6 +315,7 @@ files: conda_dir: python/cugraph-pyg/conda includes: - checks + # Deprecate pylibcugraphops - depends_on_pylibcugraphops - cugraph_pyg_dev - test_python_common @@ -376,6 +384,36 @@ dependencies: packages: - cudatoolkit - cuda-nvtx + cuda_wheels: + specific: + - output_types: pyproject + matrices: + - matrix: + cuda: "12.*" + use_cuda_wheels: "true" + packages: + - nvidia-cublas-cu12 + - nvidia-curand-cu12 + - nvidia-cusolver-cu12 + - nvidia-cusparse-cu12 + # CUDA 11 does not provide wheels, so use the system libraries instead + - matrix: + cuda: "11.*" + use_cuda_wheels: "true" + packages: + # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels + # (e.g. 
for DLFW and pip devcontainers) + - matrix: + use_cuda_wheels: "false" + packages: + # if no matching matrix selectors passed, list the unsuffixed packages + # (just as a source of documentation, as this populates pyproject.toml in source control) + - matrix: + packages: + - nvidia-cublas + - nvidia-curand + - nvidia-cusolver + - nvidia-cusparse common_build: common: - output_types: [conda, pyproject] @@ -388,11 +426,12 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcudf==24.10.*,>=0.0.0a0 - - libcugraphops==24.10.*,>=0.0.0a0 - - libraft-headers==24.10.*,>=0.0.0a0 - - libraft==24.10.*,>=0.0.0a0 - - librmm==24.10.*,>=0.0.0a0 + - libcudf==24.12.*,>=0.0.0a0 + # Deprecate libcugraphops + - libcugraphops==24.12.*,>=0.0.0a0 + - libraft-headers==24.12.*,>=0.0.0a0 + - libraft==24.12.*,>=0.0.0a0 + - librmm==24.12.*,>=0.0.0a0 - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -441,10 +480,6 @@ dependencies: specific: - output_types: [conda] matrices: - - matrix: - py: "3.9" - packages: - - python=3.9 - matrix: py: "3.10" packages: @@ -453,9 +488,13 @@ dependencies: py: "3.11" packages: - python=3.11 + - matrix: + py: "3.12" + packages: + - python=3.12 - matrix: packages: - - python>=3.9,<3.12 + - python>=3.10,<3.13 python_build_rapids: common: - output_types: [conda, pyproject, requirements] @@ -474,26 +513,26 @@ dependencies: - cython>=3.0.0 - output_types: conda packages: - - scikit-build-core>=0.7.0 + - scikit-build-core>=0.10.0 - output_types: [pyproject, requirements] packages: - - scikit-build-core[pyproject]>=0.7.0 + - scikit-build-core[pyproject]>=0.10.0 python_run_cugraph: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==24.10.*,>=0.0.0a0 - - &dask_cuda dask-cuda==24.10.*,>=0.0.0a0 + - &dask rapids-dask-dependency==24.12.*,>=0.0.0a0 + - &dask_cuda dask-cuda==24.12.*,>=0.0.0a0 - &numba numba>=0.57 - - &numpy numpy>=1.23,<2.0a0 + - &numpy numpy>=1.23,<3.0a0 - output_types: conda packages: - aiohttp - fsspec>=0.6.0 - requests - - nccl>=2.9.9 + - nccl>=2.19 - ucx-proc=*=gpu - - &ucx_py_unsuffixed ucx-py==0.40.*,>=0.0.0a0 + - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0 - output_types: pyproject packages: # cudf uses fsspec but is protocol independent. 
cugraph @@ -506,12 +545,12 @@ dependencies: cuda: "11.*" cuda_suffixed: "true" packages: - - &ucx_py_cu11 ucx-py-cu11==0.40.*,>=0.0.0a0 + - &ucx_py_cu11 ucx-py-cu11==0.41.*,>=0.0.0a0 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - &ucx_py_cu12 ucx-py-cu12==0.40.*,>=0.0.0a0 + - &ucx_py_cu12 ucx-py-cu12==0.41.*,>=0.0.0a0 - matrix: packages: - *ucx_py_unsuffixed @@ -534,15 +573,15 @@ dependencies: cuda: "11.*" cuda_suffixed: "true" packages: - - &cugraph_cu11 cugraph-cu11==24.10.*,>=0.0.0a0 + - &cugraph_cu11 cugraph-cu11==24.12.*,>=0.0.0a0 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - &cugraph_cu12 cugraph-cu12==24.10.*,>=0.0.0a0 + - &cugraph_cu12 cugraph-cu12==24.12.*,>=0.0.0a0 - matrix: packages: - - &cugraph_unsuffixed cugraph==24.10.*,>=0.0.0a0 + - &cugraph_unsuffixed cugraph==24.12.*,>=0.0.0a0 python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -590,19 +629,19 @@ dependencies: cuda_suffixed: "true" packages: - *cugraph_cu11 - - cugraph-service-client-cu11==24.10.*,>=0.0.0a0 + - cugraph-service-client-cu11==24.12.*,>=0.0.0a0 - *ucx_py_cu11 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - *cugraph_cu12 - - cugraph-service-client-cu12==24.10.*,>=0.0.0a0 + - cugraph-service-client-cu12==24.12.*,>=0.0.0a0 - *ucx_py_cu12 - matrix: packages: - *cugraph_unsuffixed - - cugraph-service-client==24.10.*,>=0.0.0a0 + - cugraph-service-client==24.12.*,>=0.0.0a0 - *ucx_py_unsuffixed test_cpp: common: @@ -638,7 +677,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - &pylibwholegraph_unsuffixed pylibwholegraph==24.10.*,>=0.0.0a0 + - &pylibwholegraph_unsuffixed pylibwholegraph==24.12.*,>=0.0.0a0 - *thrift test_python_pylibcugraph: common: @@ -649,7 +688,6 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - packaging>=21 # not needed by nx-cugraph tests, but is required for running networkx tests - pytest-mpl cugraph_dgl_dev: @@ -657,7 +695,9 @@ dependencies: - output_types: [conda] packages: - *cugraph_unsuffixed - - pytorch>=2.0 + # ceiling could be removed when this is fixed: + # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/254 + - &pytorch_conda pytorch>=2.3,<2.4.0a0 - pytorch-cuda==11.8 - &tensordict tensordict>=0.1.2 - dgl>=1.1.0.cu* @@ -666,7 +706,7 @@ dependencies: - output_types: [conda] packages: - *cugraph_unsuffixed - - pytorch>=2.0 + - *pytorch_conda - pytorch-cuda==11.8 - *tensordict - pyg>=2.5,<2.6 @@ -675,9 +715,11 @@ dependencies: common: - output_types: [conda] packages: - - &pytorch_unsuffixed pytorch>=2.0,<2.2.0a0 + - *pytorch_conda - torchdata - pydantic + - ogb + - torchmetrics specific: - output_types: [requirements] @@ -693,7 +735,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - &pytorch_pip torch>=2.0,<2.2.0a0 + - &pytorch_pip torch>=2.3,<2.4.0a0 - *tensordict - matrix: {cuda: "11.*"} packages: @@ -718,19 +760,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibwholegraph-cu12==24.10.*,>=0.0.0a0 + - pylibwholegraph-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibwholegraph-cu11==24.10.*,>=0.0.0a0 + - pylibwholegraph-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibwholegraph_unsuffixed]} depends_on_rmm: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -743,19 +785,19 @@ dependencies: cuda: 
"12.*" cuda_suffixed: "true" packages: - - rmm-cu12==24.10.*,>=0.0.0a0 + - rmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==24.10.*,>=0.0.0a0 + - rmm-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -768,19 +810,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==24.10.*,>=0.0.0a0 + - cudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf-cu11==24.10.*,>=0.0.0a0 + - cudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_dask_cudf: common: - output_types: conda packages: - - &dask_cudf_unsuffixed dask-cudf==24.10.*,>=0.0.0a0 + - &dask_cudf_unsuffixed dask-cudf==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -793,19 +835,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - dask-cudf-cu12==24.10.*,>=0.0.0a0 + - dask-cudf-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - dask-cudf-cu11==24.10.*,>=0.0.0a0 + - dask-cudf-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*dask_cudf_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -818,19 +860,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibraft-cu12==24.10.*,>=0.0.0a0 + - pylibraft-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibraft-cu11==24.10.*,>=0.0.0a0 + - pylibraft-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_unsuffixed]} depends_on_raft_dask: common: - output_types: conda packages: - - &raft_dask_unsuffixed raft-dask==24.10.*,>=0.0.0a0 + - &raft_dask_unsuffixed raft-dask==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -843,19 +885,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - raft-dask-cu12==24.10.*,>=0.0.0a0 + - raft-dask-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - raft-dask-cu11==24.10.*,>=0.0.0a0 + - raft-dask-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*raft_dask_unsuffixed]} depends_on_pylibcugraph: common: - output_types: conda packages: - - &pylibcugraph_unsuffixed pylibcugraph==24.10.*,>=0.0.0a0 + - &pylibcugraph_unsuffixed pylibcugraph==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -868,19 +910,20 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcugraph-cu12==24.10.*,>=0.0.0a0 + - pylibcugraph-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcugraph-cu11==24.10.*,>=0.0.0a0 + - pylibcugraph-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcugraph_unsuffixed]} + # deprecate pylibcugraphops depends_on_pylibcugraphops: common: - output_types: conda packages: - - &pylibcugraphops_unsuffixed pylibcugraphops==24.10.*,>=0.0.0a0 + - 
&pylibcugraphops_unsuffixed pylibcugraphops==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -893,12 +936,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcugraphops-cu12==24.10.*,>=0.0.0a0 + - pylibcugraphops-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcugraphops-cu11==24.10.*,>=0.0.0a0 + - pylibcugraphops-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcugraphops_unsuffixed]} depends_on_cupy: diff --git a/docs/cugraph/source/_static/bc_benchmark.png b/docs/cugraph/source/_static/bc_benchmark.png new file mode 100644 index 00000000000..9e385c97e99 Binary files /dev/null and b/docs/cugraph/source/_static/bc_benchmark.png differ diff --git a/docs/cugraph/source/_static/colab.png b/docs/cugraph/source/_static/colab.png new file mode 100644 index 00000000000..c4c3f5b46e1 Binary files /dev/null and b/docs/cugraph/source/_static/colab.png differ diff --git a/docs/cugraph/source/_static/nxcg-execution-diagram.jpg b/docs/cugraph/source/_static/nxcg-execution-diagram.jpg new file mode 100644 index 00000000000..48136289af9 Binary files /dev/null and b/docs/cugraph/source/_static/nxcg-execution-diagram.jpg differ diff --git a/docs/cugraph/source/installation/getting_cugraph.md b/docs/cugraph/source/installation/getting_cugraph.md index 126325c09af..01bc9e379c9 100644 --- a/docs/cugraph/source/installation/getting_cugraph.md +++ b/docs/cugraph/source/installation/getting_cugraph.md @@ -21,7 +21,7 @@ The RAPIDS Docker containers contain all RAPIDS packages, including all from cuG ## Conda -It is easy to install cuGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download). +It is easy to install cuGraph using conda. You can get a minimal conda installation with [miniforge](https://github.com/conda-forge/miniforge). cuGraph Conda packages * cugraph - this will also import: @@ -45,7 +45,7 @@ conda install -c rapidsai -c conda-forge -c nvidia cugraph cuda-version=12.0 Alternatively, use `cuda-version=11.8` for packages supporting CUDA 11. -Note: This conda installation only applies to Linux and Python versions 3.9/3.10/3.11. +Note: This conda installation only applies to Linux and Python versions 3.10/3.11/3.12.
diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md index 89e63badef8..243a62e5c81 100644 --- a/docs/cugraph/source/installation/source_build.md +++ b/docs/cugraph/source/installation/source_build.md @@ -12,8 +12,7 @@ __Compilers:__ * `nvcc` version 11.5+ __CUDA:__ -* CUDA 11.2+ -* NVIDIA driver 470.42.01 or newer +* CUDA 11.8+ * NVIDIA GPU, Volta architecture or later, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+ Further details and download links for these prerequisites are available on the @@ -178,7 +177,7 @@ Run either the C++ or the Python tests with datasets make test ``` -Note: This conda installation only applies to Linux and Python versions 3.8/3.11. +Note: This conda installation only applies to Linux and Python versions 3.10, 3.11, and 3.12. ### (OPTIONAL) Set environment variable on activation diff --git a/docs/cugraph/source/nx_cugraph/benchmarks.md b/docs/cugraph/source/nx_cugraph/benchmarks.md new file mode 100644 index 00000000000..31d5e5b09eb --- /dev/null +++ b/docs/cugraph/source/nx_cugraph/benchmarks.md @@ -0,0 +1,28 @@ +# Benchmarks + +## NetworkX vs. nx-cugraph +We ran several commonly used graph algorithms on both `networkx` and `nx-cugraph`. Here are the results + + +
+<figure>
+
+![bench-image](../_static/bc_benchmark.png)
+
+<figcaption>Results from running this Benchmark</figcaption>
+</figure>
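The comparison shown above can also be sanity-checked informally before running the full suite. A minimal sketch (illustrative only; the graph and timing harness here are stand-ins, not the pytest-based harness referenced below):

```python
import time

import networkx as nx

# Substitute a large graph (e.g. cit-Patents) for meaningful numbers;
# karate_club_graph is only a small placeholder here.
G = nx.karate_club_graph()

start = time.perf_counter()
nx.betweenness_centrality(G, k=10)
print(f"default backend: {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
nx.betweenness_centrality(G, k=10, backend="cugraph")
print(f"cugraph backend: {time.perf_counter() - start:.3f}s")
```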
+
+## Reproducing Benchmarks
+
+Below are the steps to reproduce the results on your workstation. These are documented in this [README](https://github.com/rapidsai/cugraph/blob/HEAD/benchmarks/nx-cugraph/pytest-based).
+
+1. Clone the latest cuGraph repository
+
+2. Follow the instructions to build an environment
+
+3. Activate the environment
+
+4. Install the latest `nx-cugraph` by following the [guide](installation.md)
+
+5. Follow the instructions written in the README here: `cugraph/benchmarks/nx-cugraph/pytest-based/`
diff --git a/docs/cugraph/source/nx_cugraph/faqs.md b/docs/cugraph/source/nx_cugraph/faqs.md
new file mode 100644
index 00000000000..dee943d1908
--- /dev/null
+++ b/docs/cugraph/source/nx_cugraph/faqs.md
@@ -0,0 +1,5 @@
+# FAQ
+
+ > **1. Is `nx-cugraph` able to run across multiple GPUs?**
+
+nx-cugraph currently does not support multi-GPU. Multi-GPU support may be added to a future release of nx-cugraph, but consider [cugraph](https://docs.rapids.ai/api/cugraph/stable) for multi-GPU accelerated graph analytics in Python today.
diff --git a/docs/cugraph/source/nx_cugraph/how-it-works.md b/docs/cugraph/source/nx_cugraph/how-it-works.md
new file mode 100644
index 00000000000..f9dc5af67ac
--- /dev/null
+++ b/docs/cugraph/source/nx_cugraph/how-it-works.md
@@ -0,0 +1,114 @@
+# How it Works
+
+NetworkX has the ability to **dispatch function calls to separately-installed third-party backends**.
+
+NetworkX backends let users experience improved performance and/or additional functionality without changing their NetworkX Python code. Examples include backends that provide algorithm acceleration using GPUs, parallel processing, graph database integration, and more.
+
+While NetworkX is a pure-Python implementation with minimal to no dependencies, backends may be written in other languages and require specialized hardware and/or OS support, additional software dependencies, or even separate services. Installation instructions vary based on the backend, and additional information can be found from the individual backend project pages listed in the NetworkX Backend Gallery.
+
+
+![nxcg-execution-flow](../_static/nxcg-execution-diagram.jpg)
+
+## Enabling nx-cugraph
+
+NetworkX will use nx-cugraph as the graph analytics backend if any of the
+following are used:
+
+### `NETWORKX_BACKEND_PRIORITY` environment variable
+
+The `NETWORKX_BACKEND_PRIORITY` environment variable can be used to have NetworkX automatically dispatch to specified backends. This variable can be set to a single backend name, or to a comma-separated list of backends ordered by the priority in which NetworkX should try them. If a NetworkX function is called that nx-cugraph supports, NetworkX will redirect the function call to nx-cugraph automatically, or fall back to the next backend in the list if provided, or run using the default NetworkX implementation. See [NetworkX Backends and Configs](https://networkx.org/documentation/stable/reference/backends.html).
+
+For example, this setting will have NetworkX use nx-cugraph for any function called by the script that nx-cugraph supports, and the default NetworkX implementation for all others.
+```
+bash> NETWORKX_BACKEND_PRIORITY=cugraph python my_networkx_script.py
+```
+
+This example will have NetworkX use nx-cugraph for functions it supports, then try other_backend if nx-cugraph does not support them, and finally the default NetworkX implementation if not supported by either backend:
+```
+bash> NETWORKX_BACKEND_PRIORITY="cugraph,other_backend" python my_networkx_script.py
+```
+
+### `backend=` keyword argument
+
+To explicitly specify a particular backend for an API, use the `backend=`
+keyword argument. This argument takes precedence over the
+`NETWORKX_BACKEND_PRIORITY` environment variable. This requires anyone
+running code that uses the `backend=` keyword argument to have the specified
+backend installed.
+
+Example:
+```python
+nx.betweenness_centrality(cit_patents_graph, k=k, backend="cugraph")
+```
+
+### Type-based dispatching
+
+NetworkX also supports automatically dispatching to backends associated with
+specific graph types. Like the `backend=` keyword argument example above, this
+requires the user to write code for a specific backend, and therefore requires
+the backend to be installed, but has the advantage of ensuring a particular
+behavior without the potential for runtime conversions.
+
+To use type-based dispatching with nx-cugraph, the user must import the backend
+directly in their code to access the utilities provided to create a Graph
+instance specifically for the nx-cugraph backend.
+
+Example:
+```python
+import networkx as nx
+import nx_cugraph as nxcg
+
+G = nx.Graph()
+...
+nxcg_G = nxcg.from_networkx(G)             # conversion happens once here
+nx.betweenness_centrality(nxcg_G, k=1000)  # nxcg Graph type causes cugraph backend
+                                           # to be used, no conversion necessary
+```
+
+## Command Line Example
+
+---
+
+Create `bc_demo.ipy` and paste the code below.
+
+```python
+import pandas as pd
+import networkx as nx
+
+url = "https://data.rapids.ai/cugraph/datasets/cit-Patents.csv"
+df = pd.read_csv(url, sep=" ", names=["src", "dst"], dtype="int32")
+G = nx.from_pandas_edgelist(df, source="src", target="dst")
+
+%time result = nx.betweenness_centrality(G, k=10)
+```
+Run the command:
+```
+user@machine:/# ipython bc_demo.ipy
+```
+
+You will observe a run time of approximately 7 minutes, more or less depending on your CPU.
+
+Run the command again, this time specifying cugraph as the NetworkX backend.
+```
+user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy
+```
+This run will be much faster, typically around 20 seconds depending on your GPU.
+
+There is also an option to cache the graph conversion to GPU. This can dramatically improve performance when running multiple algorithms on the same graph. Caching is enabled by default for NetworkX versions 3.4 and later, but if using an older version, set `NETWORKX_CACHE_CONVERTED_GRAPHS=True`:
+```
+NETWORKX_BACKEND_PRIORITY=cugraph NETWORKX_CACHE_CONVERTED_GRAPHS=True ipython bc_demo.ipy
+```
+
+When running Python interactively, the cugraph backend can be specified as an argument in the algorithm call.
+
+For example:
+```
+nx.betweenness_centrality(cit_patents_graph, k=k, backend="cugraph")
+```
+
+
+The latest list of algorithms supported by nx-cugraph can be found [here](https://github.com/rapidsai/cugraph/blob/HEAD/python/nx-cugraph/README.md#algorithms) or in the next section.
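In addition to the environment variable and the `backend=` keyword, NetworkX 3.3 and later also expose these controls programmatically through the `nx.config` object. A minimal sketch, assuming NetworkX 3.3+:

```python
import networkx as nx

# Equivalent to NETWORKX_BACKEND_PRIORITY=cugraph
nx.config.backend_priority = ["cugraph"]

# Equivalent to NETWORKX_CACHE_CONVERTED_GRAPHS=True (the default in NetworkX 3.4+)
nx.config.cache_converted_graphs = True
```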
+ +--- diff --git a/docs/cugraph/source/nx_cugraph/index.rst b/docs/cugraph/source/nx_cugraph/index.rst index ef6f51601ab..110300c1836 100644 --- a/docs/cugraph/source/nx_cugraph/index.rst +++ b/docs/cugraph/source/nx_cugraph/index.rst @@ -1,9 +1,48 @@ -=============================== -nxCugraph as a NetworkX Backend -=============================== +nx-cugraph +----------- +nx-cugraph is a `NetworkX backend `_ that provides **GPU acceleration** to many popular NetworkX algorithms. + +By simply `installing and enabling nx-cugraph `_, users can see significant speedup on workflows where performance is hindered by the default NetworkX implementation. With ``nx-cugraph``, users can have GPU-based, large-scale performance **without** changing their familiar and easy-to-use NetworkX code. + +.. code-block:: python + + import pandas as pd + import networkx as nx + + url = "https://data.rapids.ai/cugraph/datasets/cit-Patents.csv" + df = pd.read_csv(url, sep=" ", names=["src", "dst"], dtype="int32") + G = nx.from_pandas_edgelist(df, source="src", target="dst") + + %time result = nx.betweenness_centrality(G, k=10) + +.. figure:: ../_static/colab.png + :width: 200px + :target: https://nvda.ws/4drM4re + + Try it on Google Colab! + + ++------------------------------------------------------------------------------------------------------------------------+ +| **Zero Code Change Acceleration** | +| | +| Just ``nx.config.backend_priority=["cugraph"]`` in Jupyter, or set ``NETWORKX_BACKEND_PRIORITY=cugraph`` in the shell. | ++------------------------------------------------------------------------------------------------------------------------+ +| **Run the same code on CPU or GPU** | +| | +| Nothing changes, not even your `import` statements, when going from CPU to GPU. | ++------------------------------------------------------------------------------------------------------------------------+ + + +``nx-cugraph`` is now Generally Available (GA) as part of the ``RAPIDS`` package. See `RAPIDS +Quick Start `_ to get up-and-running with ``nx-cugraph``. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: Contents: - nx_cugraph.md + how-it-works + supported-algorithms + installation + benchmarks + faqs diff --git a/docs/cugraph/source/nx_cugraph/installation.md b/docs/cugraph/source/nx_cugraph/installation.md new file mode 100644 index 00000000000..8d221f16fec --- /dev/null +++ b/docs/cugraph/source/nx_cugraph/installation.md @@ -0,0 +1,50 @@ +# Getting Started + +This guide describes how to install ``nx-cugraph`` and use it in your workflows. + + +## System Requirements + +`nx-cugraph` requires the following: + + - **Volta architecture or later NVIDIA GPU, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+** + - **[CUDA](https://docs.nvidia.com/cuda/index.html) 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5** + - **Python >= 3.10** + - **[NetworkX](https://networkx.org/documentation/stable/install.html#) >= 3.0 (version 3.2 or higher recommended)** + +More details about system requirements can be found in the [RAPIDS System Requirements Documentation](https://docs.rapids.ai/install#system-req). + +## Installing nx-cugraph + +Read the [RAPIDS Quick Start Guide](https://docs.rapids.ai/install) to learn more about installing all RAPIDS libraries. + +`nx-cugraph` can be installed using conda or pip. It is included in the RAPIDS metapackage, or can be installed separately. 
+ +### Conda +**Nightly version** +```bash +conda install -c rapidsai-nightly -c conda-forge -c nvidia nx-cugraph +``` + +**Stable version** +```bash +conda install -c rapidsai -c conda-forge -c nvidia nx-cugraph +``` + +### pip +**Nightly version** +```bash +pip install nx-cugraph-cu11 --extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple +``` + +**Stable version** +```bash +pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com +``` + +
+
+**Note:**
+ - The `pip install` examples above are for CUDA 11. To install for CUDA 12, replace `-cu11` with `-cu12`.
+
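After installing, a quick import check confirms the package is usable (a minimal sketch; the `__version__` attribute is assumed to be present, as it is for other RAPIDS packages):

```python
import nx_cugraph

# Should print the installed version string
print(nx_cugraph.__version__)
```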
diff --git a/docs/cugraph/source/nx_cugraph/nx_cugraph.md b/docs/cugraph/source/nx_cugraph/nx_cugraph.md index 75a30b0be5c..900362a6e2b 100644 --- a/docs/cugraph/source/nx_cugraph/nx_cugraph.md +++ b/docs/cugraph/source/nx_cugraph/nx_cugraph.md @@ -1,18 +1,10 @@ ### nx_cugraph -nx-cugraph is a [NetworkX -backend]() that provides GPU acceleration to many popular NetworkX algorithms. - -By simply [installing and enabling nx-cugraph](), users can see significant speedup on workflows where performance is hindered by the default NetworkX implementation. With nx-cugraph, users can have GPU-based, large-scale performance without changing their familiar and easy-to-use NetworkX code. - -Let's look at some examples of algorithm speedups comparing NetworkX with and without GPU acceleration using nx-cugraph. - -Each chart has three measurements. -* NX - default NetworkX, no GPU acceleration -* nx-cugraph - GPU-accelerated NetworkX using nx-cugraph. This involves an internal conversion/transfer of graph data from CPU to GPU memory -* nx-cugraph (preconvert) - GPU-accelerated NetworkX using nx-cugraph with the graph data pre-converted/transferred to GPU +`nx-cugraph` is a [NetworkX backend]() that accelerates many popular NetworkX functions using cuGraph and NVIDIA GPUs. +Users simply [install and enable nx-cugraph](installation.md) to experience GPU speedups. +Let's look at some examples of algorithm speedups comparing CPU-based NetworkX to dispatched versions run on GPU with nx-cugraph. ![Ancestors](../images/ancestors.png) ![BFS Tree](../images/bfs_tree.png) @@ -22,46 +14,3 @@ Each chart has three measurements. ![Pagerank](../images/pagerank.png) ![Single Source Shortest Path](../images/sssp.png) ![Weakly Connected Components](../images/wcc.png) - -### Command line example -Open bc_demo.ipy and paste the code below. - -``` -import pandas as pd -import networkx as nx - -url = "https://data.rapids.ai/cugraph/datasets/cit-Patents.csv" -df = pd.read_csv(url, sep=" ", names=["src", "dst"], dtype="int32") -G = nx.from_pandas_edgelist(df, source="src", target="dst") - -%time result = nx.betweenness_centrality(G, k=10) -``` -Run the command: -``` -user@machine:/# ipython bc_demo.ipy -``` - -You will observe a run time of approximately 7 minutes...more or less depending on your cpu. - -Run the command again, this time specifying cugraph as the NetworkX backend. -``` -user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy -``` -This run will be much faster, typically around 20 seconds depending on your GPU. -``` -user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy -``` -There is also an option to cache the graph conversion to GPU. This can dramatically improve performance when running multiple algorithms on the same graph. -``` -NETWORKX_BACKEND_PRIORITY=cugraph NETWORKX_CACHE_CONVERTED_GRAPHS=True ipython bc_demo.ipy -``` - -When running Python interactively, the cugraph backend can be specified as an argument in the algorithm call. - -For example: -``` -nx.betweenness_centrality(cit_patents_graph, k=k, backend="cugraph") -``` - - -The latest list of algorithms supported by nx-cugraph can be found [here](https://github.com/rapidsai/cugraph/blob/main/python/nx-cugraph/README.md#algorithms).
diff --git a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst new file mode 100644 index 00000000000..b21ef7bb668 --- /dev/null +++ b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst @@ -0,0 +1,354 @@ +Supported Algorithms +===================== + +The nx-cugraph backend to NetworkX connects +`pylibcugraph <../../readme_pages/pylibcugraph.md>`_ (cuGraph's low-level Python +interface to its CUDA-based graph analytics library) and +`CuPy <https://cupy.dev/>`_ (a GPU-accelerated array library) to NetworkX's +familiar and easy-to-use API. + +Below is the list of algorithms that are currently supported in nx-cugraph. + + +Algorithms +---------- + ++-----------------------------+ +| **Centrality** | ++=============================+ +| betweenness_centrality | ++-----------------------------+ +| edge_betweenness_centrality | ++-----------------------------+ +| degree_centrality | ++-----------------------------+ +| in_degree_centrality | ++-----------------------------+ +| out_degree_centrality | ++-----------------------------+ +| eigenvector_centrality | ++-----------------------------+ +| katz_centrality | ++-----------------------------+ + ++---------------------+ +| **Cluster** | ++=====================+ +| average_clustering | ++---------------------+ +| clustering | ++---------------------+ +| transitivity | ++---------------------+ +| triangles | ++---------------------+ + ++--------------------------+ +| **Community** | ++==========================+ +| louvain_communities | ++--------------------------+ + ++--------------------------+ +| **Bipartite** | ++==========================+ +| complete_bipartite_graph | ++--------------------------+ + ++------------------------------------+ +| **Components** | ++====================================+ +| connected_components | ++------------------------------------+ +| is_connected | ++------------------------------------+ +| node_connected_component | ++------------------------------------+ +| number_connected_components | ++------------------------------------+ +| weakly_connected | ++------------------------------------+ +| is_weakly_connected | ++------------------------------------+ +| number_weakly_connected_components | ++------------------------------------+ +| weakly_connected_components | ++------------------------------------+ + ++-------------+ +| **Core** | ++=============+ +| core_number | ++-------------+ +| k_truss | ++-------------+ + ++-------------+ +| **DAG** | ++=============+ +| ancestors | ++-------------+ +| descendants | ++-------------+ + ++--------------------+ +| **Isolate** | ++====================+ +| is_isolate | ++--------------------+ +| isolates | ++--------------------+ +| number_of_isolates | ++--------------------+ + ++-------------------+ +| **Link analysis** | ++===================+ +| hits | ++-------------------+ +| pagerank | ++-------------------+ + ++----------------+ +| **Operators** | ++================+ +| complement | ++----------------+ +| reverse | ++----------------+ + ++----------------------+ +| **Reciprocity** | ++======================+ +| overall_reciprocity | ++----------------------+ +| reciprocity | ++----------------------+ + ++---------------------------------------+ +| **Shortest Paths** | ++=======================================+ +| has_path | ++---------------------------------------+ +| shortest_path | ++---------------------------------------+ +| shortest_path_length | ++---------------------------------------+ +| 
all_pairs_shortest_path | +---------------------------------------+ +| all_pairs_shortest_path_length | +---------------------------------------+ +| bidirectional_shortest_path | +---------------------------------------+ +| single_source_shortest_path | +---------------------------------------+ +| single_source_shortest_path_length | +---------------------------------------+ +| single_target_shortest_path | +---------------------------------------+ +| single_target_shortest_path_length | +---------------------------------------+ +| all_pairs_bellman_ford_path | +---------------------------------------+ +| all_pairs_bellman_ford_path_length | +---------------------------------------+ +| all_pairs_dijkstra | +---------------------------------------+ +| all_pairs_dijkstra_path | +---------------------------------------+ +| all_pairs_dijkstra_path_length | +---------------------------------------+ +| bellman_ford_path | +---------------------------------------+ +| bellman_ford_path_length | +---------------------------------------+ +| dijkstra_path | +---------------------------------------+ +| dijkstra_path_length | +---------------------------------------+ +| single_source_bellman_ford | +---------------------------------------+ +| single_source_bellman_ford_path | +---------------------------------------+ +| single_source_bellman_ford_path_length| +---------------------------------------+ +| single_source_dijkstra | +---------------------------------------+ +| single_source_dijkstra_path | +---------------------------------------+ +| single_source_dijkstra_path_length | +---------------------------------------+ + ++---------------------------+ +| **Traversal** | +===========================+ +| bfs_edges | +---------------------------+ +| bfs_layers | +---------------------------+ +| bfs_predecessors | +---------------------------+ +| bfs_successors | +---------------------------+ +| bfs_tree | +---------------------------+ +| descendants_at_distance | +---------------------------+ +| generic_bfs_edges | +---------------------------+ + ++---------------------+ +| **Tree** | +=====================+ +| is_arborescence | +---------------------+ +| is_branching | +---------------------+ +| is_forest | +---------------------+ +| is_tree | +---------------------+ + +Generators +------------ + ++-------------------------------+ +| **Classic** | +===============================+ +| barbell_graph | +-------------------------------+ +| circular_ladder_graph | +-------------------------------+ +| complete_graph | +-------------------------------+ +| complete_multipartite_graph | +-------------------------------+ +| cycle_graph | +-------------------------------+ +| empty_graph | +-------------------------------+ +| ladder_graph | +-------------------------------+ +| lollipop_graph | +-------------------------------+ +| null_graph | +-------------------------------+ +| path_graph | +-------------------------------+ +| star_graph | +-------------------------------+ +| tadpole_graph | +-------------------------------+ +| trivial_graph | +-------------------------------+ +| turan_graph | +-------------------------------+ +| wheel_graph | +-------------------------------+ + ++-----------------+ +| **Community** | ++=================+ +| caveman_graph | ++-----------------+ + ++------------+ +| **Ego** | ++============+ +| ego_graph | ++------------+ + ++------------------------------+ +| **small** | ++==============================+ +| bull_graph | 
++------------------------------+ +| chvatal_graph | ++------------------------------+ +| cubical_graph | ++------------------------------+ +| desargues_graph | ++------------------------------+ +| diamond_graph | ++------------------------------+ +| dodecahedral_graph | ++------------------------------+ +| frucht_graph | ++------------------------------+ +| heawood_graph | ++------------------------------+ +| house_graph | ++------------------------------+ +| house_x_graph | ++------------------------------+ +| icosahedral_graph | ++------------------------------+ +| krackhardt_kite_graph | ++------------------------------+ +| moebius_kantor_graph | ++------------------------------+ +| octahedral_graph | ++------------------------------+ +| pappus_graph | ++------------------------------+ +| petersen_graph | ++------------------------------+ +| sedgewick_maze_graph | ++------------------------------+ +| tetrahedral_graph | ++------------------------------+ +| truncated_cube_graph | ++------------------------------+ +| truncated_tetrahedron_graph | ++------------------------------+ +| tutte_graph | ++------------------------------+ + ++-------------------------------+ +| **Social** | ++===============================+ +| davis_southern_women_graph | ++-------------------------------+ +| florentine_families_graph | ++-------------------------------+ +| karate_club_graph | ++-------------------------------+ +| les_miserables_graph | ++-------------------------------+ + +Other +------- + ++-------------------------+ +| **Classes** | ++=========================+ +| is_negatively_weighted | ++-------------------------+ + ++----------------------+ +| **Convert** | ++======================+ +| from_dict_of_lists | ++----------------------+ +| to_dict_of_lists | ++----------------------+ + ++--------------------------+ +| **Convert Matrix** | ++==========================+ +| from_pandas_edgelist | ++--------------------------+ +| from_scipy_sparse_array | ++--------------------------+ + ++-----------------------------------+ +| **Relabel** | ++===================================+ +| convert_node_labels_to_integers | ++-----------------------------------+ +| relabel_nodes | ++-----------------------------------+ + + +To request nx-cugraph backend support for a NetworkX API that is not listed above, visit the `cuGraph GitHub repo <https://github.com/rapidsai/cugraph>`_.
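One way to check at runtime whether a particular function dispatches to nx-cugraph is to force the backend with the ``backend=`` keyword, which fails when the function is unsupported (a minimal sketch; the exact exception raised may vary across NetworkX versions):

.. code-block:: python

    import networkx as nx

    G = nx.karate_club_graph()

    try:
        nx.pagerank(G, backend="cugraph")
        print("pagerank dispatched to nx-cugraph")
    except NotImplementedError:
        print("pagerank is not supported by nx-cugraph")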
diff --git a/docs/cugraph/source/tutorials/basic_cugraph.md b/docs/cugraph/source/tutorials/basic_cugraph.md index 78325472489..a0c9ad576b2 100644 --- a/docs/cugraph/source/tutorials/basic_cugraph.md +++ b/docs/cugraph/source/tutorials/basic_cugraph.md @@ -4,8 +4,8 @@ CuGraph is part of [Rapids](https://docs.rapids.ai/user-guide) and has the following system requirements: * NVIDIA GPU, Volta architecture or later, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+ - * CUDA 11.2, 11.4, 11.5, 11.8, 12.0 or 12.2 - * Python version 3.9, 3.10, or 3.11 + * CUDA 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5 + * Python version 3.10, 3.11, or 3.12 * NetworkX >= version 3.3 or newer in order to use [NetworkX Configs](https://networkx.org/documentation/stable/reference/backends.html#module-networkx.utils.configs) **This is required for use of nx-cuGraph, [see below](#cugraph-using-networkx-code).** ## Installation diff --git a/docs/cugraph/source/tutorials/cugraph_notebooks.md b/docs/cugraph/source/tutorials/cugraph_notebooks.md index 559ba36e97e..6d7840dc3c4 100644 --- a/docs/cugraph/source/tutorials/cugraph_notebooks.md +++ b/docs/cugraph/source/tutorials/cugraph_notebooks.md @@ -55,10 +55,9 @@ Running the example in these notebooks requires: * Download via Docker, Conda (See [__Getting Started__](https://rapids.ai/start.html)) * cuGraph is dependent on the latest version of cuDF. Please install all components of RAPIDS -* Python 3.8+ -* A system with an NVIDIA GPU: Pascal architecture or better +* Python 3.10+ +* A system with an NVIDIA GPU: Volta architecture or newer * CUDA 11.4+ -* NVIDIA driver 450.51+ ## Copyright diff --git a/docs/cugraph/source/wholegraph/installation/getting_wholegraph.md b/docs/cugraph/source/wholegraph/installation/getting_wholegraph.md index 57314dcd426..80c666d6593 100644 --- a/docs/cugraph/source/wholegraph/installation/getting_wholegraph.md +++ b/docs/cugraph/source/wholegraph/installation/getting_wholegraph.md @@ -21,7 +21,7 @@ The RAPIDS Docker containers (as of Release 23.10) contain all RAPIDS packages, ## Conda -It is easy to install WholeGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download). +It is easy to install WholeGraph using conda. You can get a minimal conda installation with [miniforge](https://github.com/conda-forge/miniforge). WholeGraph conda packages * libwholegraph diff --git a/docs/cugraph/source/wholegraph/installation/source_build.md b/docs/cugraph/source/wholegraph/installation/source_build.md index a7727ac4052..7213cbfb096 100644 --- a/docs/cugraph/source/wholegraph/installation/source_build.md +++ b/docs/cugraph/source/wholegraph/installation/source_build.md @@ -16,8 +16,7 @@ __Compiler__: __CUDA__: * CUDA 11.8+ -* NVIDIA driver 450.80.02+ -* Pascal architecture or better +* Volta architecture or better You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). @@ -177,7 +176,7 @@ Run either the C++ or the Python tests with datasets ``` -Note: This conda installation only applies to Linux and Python versions 3.8/3.10. +Note: This conda installation only applies to Linux and Python versions 3.10, 3.11, and 3.12.
## Creating documentation diff --git a/notebooks/README.md b/notebooks/README.md index 818382f35a7..f0d0a25b9dd 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -56,10 +56,9 @@ Running the example in these notebooks requires: * Download via Docker, Conda (See [__Getting Started__](https://rapids.ai/start.html)) * cuGraph is dependent on the latest version of cuDF. Please install all components of RAPIDS -* Python 3.8+ -* A system with an NVIDIA GPU: Pascal architecture or better +* Python 3.10+ +* A system with an NVIDIA GPU: Volta architecture or newer * CUDA 11.4+ -* NVIDIA driver 450.51+ ### QuickStart @@ -67,13 +66,13 @@ The easiest way to run the notebooks is to get the latest [rapidsai/notebooks](h For example, get the latest (as of writing the document) nightly image (`a` after the version number indicates that an image is nightly) with cuda 12.0 using ```sh -docker pull rapidsai/notebooks:24.10a-cuda12.0-py3.9 +docker pull rapidsai/notebooks:24.12a-cuda12.0-py3.10 ``` And, then run a container based on the image using ```sh -docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.10a-cuda12.0-py3.9 +docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.12a-cuda12.0-py3.10 ``` You are all set. Run and edit cugraph notebooks from a browser at url http://127.0.0.1:8888/lab/tree/cugraph/cugraph_benchmarks @@ -89,8 +88,8 @@ ssh -L 127.0.0.1:8888:127.0.0.1:8888 [USER_NAME@][REMOTE_HOST_NAME or REMOTE_HO and then run the container in your remote machine. ```sh -docker pull rapidsai/notebooks:24.10a-cuda12.0-py3.9 -docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.10a-cuda12.0-py3.9 +docker pull rapidsai/notebooks:24.12a-cuda12.0-py3.10 +docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.12a-cuda12.0-py3.10 ``` You can run and edit cugraph notebooks at url http://127.0.0.1:8888/lab/tree/cugraph/cugraph_benchmarks as if they are running locally. diff --git a/notebooks/demo/accelerating_networkx.ipynb b/notebooks/demo/accelerating_networkx.ipynb new file mode 100644 index 00000000000..1a6c6cfb3f6 --- /dev/null +++ b/notebooks/demo/accelerating_networkx.ipynb @@ -0,0 +1,614 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "R2cpVp2WdOsp" + }, + "source": [ + "# NetworkX - Easy Graph Analytics\n", + "\n", + "NetworkX is the most popular library for graph analytics available in Python, or quite possibly any language. To illustrate this, NetworkX was downloaded more than 71 million times in September of 2024 alone, which is roughly 71 times more than the next most popular graph analytics library! [*](https://en.wikipedia.org/wiki/NetworkX) NetworkX has earned this popularity from its very easy-to-use API, the wealth of documentation and examples available, the large (and friendly) community behind it, and its easy installation which requires nothing more than Python.\n", + "\n", + "However, NetworkX users are familiar with the tradeoff that comes with those benefits. The pure-Python implementation often results in poor performance when graph data starts to reach larger scales, limiting the usefulness of the library for many real-world problems.\n", + "\n", + "# Accelerated NetworkX - Easy (and fast!) 
Graph Analytics\n", + "\n", + "To address the performance problem, NetworkX 3.0 introduced a mechanism to dispatch algorithm calls to alternate implementations. The NetworkX Python API remains the same but NetworkX will use more capable algorithm implementations provided by one or more backends. This approach means users don't have to give up NetworkX -or even change their code- in order to take advantage of GPU performance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xkg10FrNThrK" + }, + "source": [ + "# Let's Get the Environment Setup\n", + "This notebook will demonstrate NetworkX both with and without GPU acceleration provided by the `nx-cugraph` backend.\n", + "\n", + "`nx-cugraph` is available as a package installable using `pip`, `conda`, and [from source](https://github.com/rapidsai/nx-cugraph). Before importing `networkx`, let's install `nx-cugraph` so it can be registered as an available backend by NetworkX when needed. We'll use `pip` to install.\n", + "\n", + "NOTES:\n", + "* `nx-cugraph` requires a compatible NVIDIA GPU, NVIDIA CUDA and associated drivers, and a supported OS. Details about these and other installation prerequisites can be seen [here](https://docs.rapids.ai/install#system-req).\n", + "* The `nx-cugraph` package is currently hosted by NVIDIA and therefore the `--extra-index-url` option must be used.\n", + "* `nx-cugraph` is supported on specific 11.x and 12.x CUDA versions, and the major version number must be known in order to install the correct build (this is determined automatically when using `conda`).\n", + "\n", + "To find the CUDA major version on your system, run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NMFwzc1I95BS" + }, + "outputs": [], + "source": [ + "!nvcc --version" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i91Yj-yZ-nGS" + }, + "source": [ + "From the above output we can see we're using CUDA 12.x so we'll be installing `nx-cugraph-cu12`. If we were using CUDA 11.x, the package name would be `nx-cugraph-cu11`. We'll also be adding `https://pypi.nvidia.com` as an `--extra-index-url`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mYYN9EpnWphu" + }, + "outputs": [], + "source": [ + "!pip install nx-cugraph-cu12 --extra-index-url=https://pypi.nvidia.com" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0h1K-7tI_AZH" + }, + "source": [ + "Of course, we'll also be using `networkx`, which is already provided in the Colab environment. This notebook will be using features added in version 3.3, so we'll import it here to verify we have a compatible version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YTV0ZTME2tV6" + }, + "outputs": [], + "source": [ + "import networkx as nx\n", + "nx.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UiZKOa3WC7be" + }, + "source": [ + "# Let's Start with Something Simple\n", + "\n", + "To begin, we'll compare NetworkX results without a backend to results of the same algorithm using the `nx-cugraph` backend on a small graph. `nx.karate_club_graph()` returns an instance of the famous example graph consisting of 34 nodes and 78 edges from Zachary's paper, described [here](https://en.wikipedia.org/wiki/Zachary%27s_karate_club)."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3atL3tI0frYm" + }, + "source": [ + "## Betweenness Centrality\n", + "[Betweenness Centrality](https://en.wikipedia.org/wiki/Betweenness_centrality) is a graph algorithm that computes a centrality score for each node (`v`) based on how many of the shortest paths between pairs of nodes in the graph pass through `v`. A higher centrality score represents a node that \"connects\" other nodes in a network more than a node with a lower score does.\n", + "\n", + "First, let's create a NetworkX Graph instance of the Karate Club graph and inspect it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JSw7EZ46-kRu" + }, + "outputs": [], + "source": [ + "G = nx.karate_club_graph()\n", + "G.number_of_nodes(), G.number_of_edges()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_-E17u2gKgbC" + }, + "source": [ + "Next, let's run betweenness centrality and save the results. Because the Karate Club graph is so small, this should not take long." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qjxXXKJhKQ4s" + }, + "outputs": [], + "source": [ + "%%time\n", + "nx_bc_results = nx.betweenness_centrality(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ClrR3z9XMfLr" + }, + "source": [ + "Now, let's run the same algorithm on the same data using the `nx-cugraph` backend.\n", + "\n", + "There are several ways to instruct NetworkX to use a particular backend instead of the default implementation. Here, we will use the `config` API, which was added in NetworkX version 3.3.\n", + "\n", + "The following two lines set the backend to \"cugraph\" and enable graph conversion caching.\n", + "\n", + "Some notes:\n", + "* The standard convention for NetworkX backends is to name the package with a `nx-` prefix to denote that these are packages intended to be used with NetworkX, but the `nx-` prefix is not included when referring to them in NetworkX API calls. Here, `nx-cugraph` is the name of the backend package, and `\"cugraph\"` is the name NetworkX will use to refer to it.\n", + "* NetworkX can use multiple backends! `nx.config.backend_priority` is a list that can contain several backends, ordered based on priority. If a backend in the list cannot run a particular algorithm (either because it isn't supported in the backend, the algorithm doesn't support a particular option, or some other reason), NetworkX will try the next backend in the list. If no specified backend is able to run the algorithm, NetworkX will fall back to the default implementation.\n", + "* Many backends have their own data structures for representing an input graph, often optimized for that backend's implementation. Prior to running a backend algorithm, NetworkX will have the backend convert the standard NetworkX Graph instance to the backend-specific type. This conversion can be expensive, and rather than repeat it as part of each algorithm call, NetworkX can cache the conversion so it can be skipped on future calls if the graph doesn't change. This caching can save significant time and improve overall performance."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oFHwNqqsNsqS" + }, + "outputs": [], + "source": [ + "nx.config.backend_priority=[\"cugraph\"] # NETWORKX_BACKEND_PRIORITY=cugraph\n", + "nx.config.cache_converted_graphs=True # NETWORKX_CACHE_CONVERTED_GRAPHS=True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HrUeWRRQRzFP" + }, + "outputs": [], + "source": [ + "%%time\n", + "nxcg_bc_results = nx.betweenness_centrality(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1hxut3GTj5A" + }, + "source": [ + "You may have noticed that using the `nx-cugraph` backend resulted in a slightly slower execution time. This is not surprising when working with a graph this small, since the overhead of converting the graph for the first time and launching the algorithm kernel on the GPU is actually significantly more than the computation time itself. We'll see later that this overhead is negligible when compared to the time saved when running on a GPU for larger graphs.\n", + "\n", + "Since we've enabled graph conversion caching, we can see that if we re-run the same call the execution time is noticeably shorter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7a0XvpUOr9Ju" + }, + "outputs": [], + "source": [ + "%%time\n", + "nxcg_bc_results = nx.betweenness_centrality(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ppjE5J5RscOe" + }, + "source": [ + "Notice the warning above about using the cache. This will only be raised **once** per graph instance (it can also be easily disabled), but its purpose is to point out that the cache should not be used if the Graph object will have its attribute dictionary modified directly. In this case and many others, we won't be modifying the dictionaries directly. Instead, we will use APIs such as `nx.set_node_attributes` which properly clear the cache, so it's safe for us to use the cache. Because of that, we'll disable the warning so we don't see it on other graphs in this session." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Namb5JLvwS-q" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\", message=\"Using cached graph for 'cugraph' backend\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BzGAphcILFsT" + }, + "source": [ + "Smaller graphs are also easy to visualize with NetworkX's plotting utilities. The flexibility of NetworkX's `Graph` instances make it trivial to add the betweenness centrality scores back to the graph object as node attributes. This will allow us to use those values for the visualization.\n", + "\n", + "In this case, we'll create new attributes for each node called \"nx_bc\" for the default NetworkX results, and \"nxcg_bc\" for the nx-cugraph results. We'll use those values to assign the color for each node and plot two graphs side-by-side. This will make it easy to visually validate that the nodes with the higher centrality scores for both implementations match and do indeed appear to be more \"central\" to other nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1coV6ZfcUoqI" + }, + "outputs": [], + "source": [ + "nx.set_node_attributes(G, nx_bc_results, \"nx_bc\")\n", + "nx.set_node_attributes(G, nxcg_bc_results, \"nxcg_bc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sba2iYJgLoN2" + }, + "outputs": [], + "source": [ + "# Configure plot size and layout/position for each node\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams['figure.figsize'] = [12, 8]\n", + "pos = nx.spring_layout(G)\n", + "\n", + "# Assign colors for each set of betweenness centrality results\n", + "nx_colors = [G.nodes[n][\"nx_bc\"] for n in G.nodes()]\n", + "nxcg_colors = [G.nodes[n][\"nxcg_bc\"] for n in G.nodes()]\n", + "\n", + "# Plot the graph and color each node corresponding to NetworkX betweenness centrality values\n", + "plt.subplot(1, 2, 1)\n", + "nx.draw(G, pos=pos, with_labels=True, node_color=nx_colors)\n", + "\n", + "# Plot the graph and color each node corresponding to nx-cugraph betweenness centrality values\n", + "plt.subplot(1, 2, 2)\n", + "nx.draw(G, pos=pos, with_labels=True, node_color=nxcg_colors)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dJXH4Zn5VNSg" + }, + "source": [ + "As we can see, the same two nodes (`0` and `33`) are the two most central in both graphs, followed by `2`, `31`, and `32`.\n", + "\n", + "## PageRank\n", + "Another popular algorithm is [PageRank](https://en.wikipedia.org/wiki/PageRank). PageRank also assigns scores to each node, but these scores are based on analyzing links to each node to determine relative \"importance\" within the graph.\n", + "\n", + "Let's update the config to use the default NetworkX implementation and run `nx.pagerank`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9CdYNk62E1v_" + }, + "outputs": [], + "source": [ + "nx.config.backend_priority=[]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Jo39YxVmYolq" + }, + "outputs": [], + "source": [ + "%%time\n", + "nx_pr_results = nx.pagerank(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sV6dM8ToZDiC" + }, + "source": [ + "We could set `nx.config.backend_priority` again to list `\"cugraph\"` as the backend, but let's instead show how the `backend` kwarg can be used to override the priority list and force a specific backend to be used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oMSvQVGKY0rn" + }, + "outputs": [], + "source": [ + "%%time\n", + "nxcg_pr_results = nx.pagerank(G, backend=\"cugraph\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZGux_8xFZneI" + }, + "source": [ + "In this example, instead of plotting the graph to show that the results are identical, we can compare them directly using the saved values from both runs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RcmtdFy4Zw7p" + }, + "outputs": [], + "source": [ + "sorted(nx_pr_results) == sorted(nxcg_pr_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mefjUEAnZ4pq" + }, + "source": [ + "# Working with Bigger Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yLY-yl6PuNYo" + }, + "source": [ + "Now we'll look at a larger dataset from https://snap.stanford.edu/data/cit-Patents.html which contains citations across different U.S. patents granted from January 1, 1963 to December 30, 1999. 
The dataset represents 16.5M citations (edges) between 3.77M patents (nodes).\n", + "\n", + "This will demonstrate that data of this size starts to push the limits of the default pure-Python NetworkX implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lyYF0LbtFwjh" + }, + "outputs": [], + "source": [ + "# The locale encoding may have been modified from the plots above, reset here to run shell commands\n", + "import locale\n", + "locale.getpreferredencoding = lambda: \"UTF-8\"\n", + "!wget https://data.rapids.ai/cugraph/datasets/cit-Patents.csv # Skip if cit-Patents.csv already exists.\n", + "# !wget https://snap.stanford.edu/data/cit-Patents.txt.gz # Skip if cit-Patents.txt.gz already exists." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kjGINYphQSQ2" + }, + "outputs": [], + "source": [ + "%load_ext cudf.pandas\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iV4DieGZOalc" + }, + "outputs": [], + "source": [ + "%%time\n", + "df = pd.read_csv(\"cit-Patents.csv\",\n", + " sep=\" \",\n", + " names=[\"src\", \"dst\"],\n", + " dtype=\"int32\",\n", + ")\n", + "# df = pd.read_csv(\"cit-Patents.txt.gz\",\n", + "# compression=\"gzip\",\n", + "# skiprows=4,\n", + "# sep=\"\\t\",\n", + "# names=[\"src\", \"dst\"],\n", + "# dtype=\"int32\",\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PREA67u4eKat" + }, + "outputs": [], + "source": [ + "%%time\n", + "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")\n", + "G.number_of_nodes(), G.number_of_edges()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NcsUxBqpu4zY" + }, + "source": [ + "By default, `nx.betweenness_centrality` will perform an all-pairs shortest path analysis when determining the centrality scores for each node. However, due to the much larger size of this graph, determining the shortest path for all pairs of nodes in the graph is not feasible. Instead, we'll use the parameter `k` to limit the number of shortest path computations used for determining the centrality scores, at the expense of accuracy. As we'll see when using a dataset this size with `nx.betweenness_centrality`, we have to limit `k` to `1` which is not practical but is sufficient here for demonstration purposes (since anything larger than `1` will result in many minutes of execution time)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gNDWbj3kAk3j" + }, + "outputs": [], + "source": [ + "%%time\n", + "bc_results = nx.betweenness_centrality(G, k=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NB8xmxMd1PlX" + }, + "source": [ + "Now we'll configure NetworkX to use the `nx-cugraph` backend (again, using the name convention that drops the package name's `nx-` prefix) and run the same call. Because this is a Graph that `nx-cugraph` hasn't seen before, the runtime will include the time to convert and cache a GPU-based graph." 
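+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Optional) After running the next few cells, a small sketch like the following -- using the `backend=` keyword argument shown earlier -- can show how runtime grows with the sample size `k`. Exact timings depend on your GPU, and running it before the cells below would convert and cache the graph early." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: run this AFTER the cells below so the timed first call there\n", + "# still shows the one-time conversion cost. Timings are illustrative.\n", + "import time\n", + "for k in (1, 10, 100):\n", + "    t0 = time.perf_counter()\n", + "    nx.betweenness_centrality(G, k=k, backend=\"cugraph\")\n", + "    print(f\"k={k:>3}: {time.perf_counter() - t0:.2f} s\")"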
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xUYNG1xhvbWc" + }, + "outputs": [], + "source": [ + "nx.config.backend_priority = [\"cugraph\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cmK8ZuQGvfPo" + }, + "outputs": [], + "source": [ + "%%time\n", + "bc_results = nx.betweenness_centrality(G, k=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vdHb1YXP15TZ" + }, + "source": [ + "Let's run betweenness centrality again, now with a more useful number of samples by setting `k=100`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fKjIrzL-vrGS" + }, + "outputs": [], + "source": [ + "%%time\n", + "bc_results = nx.betweenness_centrality(G, k=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QeMcrAX2HZSM" + }, + "source": [ + "Let's also run pagerank on the same dataset to compare." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gR8ID6ekHgHt" + }, + "outputs": [], + "source": [ + "nx.config.backend_priority = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rTFuvX5wb_c1" + }, + "outputs": [], + "source": [ + "%%time\n", + "nx_pr_results = nx.pagerank(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8sJx9aeJV9hv" + }, + "outputs": [], + "source": [ + "%%time\n", + "nxcg_pr_results = nx.pagerank(G, backend=\"cugraph\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wGOVQ6ZyY4Ih" + }, + "outputs": [], + "source": [ + "sorted(nx_pr_results) == sorted(nxcg_pr_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k2DfAaZaDIBj" + }, + "source": [ + "---\n", + "\n", + "Information on the U.S. Patent Citation Network dataset used in this notebook is as follows:\n", + "
Authors: Jure Leskovec and Andrej Krevl\n", + "
Title: SNAP Datasets, Stanford Large Network Dataset Collection\n", + "
URL: http://snap.stanford.edu/data\n", + "
Date: June 2014\n", + "
\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/demo/mg_pagerank.ipynb b/notebooks/demo/mg_pagerank.ipynb index bb333048450..e3314f80b3c 100644 --- a/notebooks/demo/mg_pagerank.ipynb +++ b/notebooks/demo/mg_pagerank.ipynb @@ -219,250 +219,250 @@ "text": [ "2023-05-12 09:25:01,974 - distributed.sizeof - WARNING - Sizeof calculation failed. Defaulting to 0.95 MiB\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", " return sizeof(obj)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", " return meth(arg, *args, **kwargs)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", " + df._index.memory_usage()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", " if self.levels:\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", " self._compute_levels_and_codes()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", + " File 
\"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", " code, cats = cudf.Series._from_data({None: col}).factorize()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", " return cudf.core.algorithms.factorize(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", " labels = values._column._label_encoding(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", " order = order.take(left_gather_map, check_bounds=False).argsort()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", " return self.as_frame()._get_sorted_inds(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", " return libcudf.sort.order_by(to_sort, ascending, na_position)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", " File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n", - "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", + "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniforge/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", "2023-05-12 09:25:01,976 - distributed.sizeof - WARNING - Sizeof calculation failed. 
Defaulting to 0.95 MiB\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", " return sizeof(obj)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", " return meth(arg, *args, **kwargs)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", " + df._index.memory_usage()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", " if self.levels:\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", " self._compute_levels_and_codes()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", " code, cats = cudf.Series._from_data({None: col}).factorize()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", " return cudf.core.algorithms.factorize(\n", - " File 
\"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", " labels = values._column._label_encoding(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", " order = order.take(left_gather_map, check_bounds=False).argsort()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", " return self.as_frame()._get_sorted_inds(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", " return libcudf.sort.order_by(to_sort, ascending, na_position)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", " File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n", - "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", + "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniforge/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", "2023-05-12 09:25:03,767 - distributed.sizeof - WARNING - Sizeof calculation failed. 
Defaulting to 0.95 MiB\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", " return sizeof(obj)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", " return meth(arg, *args, **kwargs)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", " + df._index.memory_usage()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", " if self.levels:\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", " self._compute_levels_and_codes()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", " code, cats = cudf.Series._from_data({None: col}).factorize()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", " return cudf.core.algorithms.factorize(\n", - " File 
\"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", " labels = values._column._label_encoding(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", " order = order.take(left_gather_map, check_bounds=False).argsort()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", " return self.as_frame()._get_sorted_inds(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", " return libcudf.sort.order_by(to_sort, ascending, na_position)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", " File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n", - "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", + "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniforge/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", "2023-05-12 09:25:03,768 - distributed.sizeof - WARNING - Sizeof calculation failed. 
Defaulting to 0.95 MiB\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n", " return sizeof(obj)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n", " return meth(arg, *args, **kwargs)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n", " + df._index.memory_usage()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n", " if self.levels:\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n", " self._compute_levels_and_codes()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n", " code, cats = cudf.Series._from_data({None: col}).factorize()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n", " return cudf.core.algorithms.factorize(\n", - " File 
\"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n", " labels = values._column._label_encoding(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n", " order = order.take(left_gather_map, check_bounds=False).argsort()\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n", " return self.as_frame()._get_sorted_inds(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n", " return libcudf.sort.order_by(to_sort, ascending, na_position)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", " File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n", - "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", + "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniforge/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n", "2023-05-12 09:25:03,820 - distributed.worker - ERROR - Could not deserialize task ('len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d', 1)\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n", " result = cache_loads[bytes_object]\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n", " value = super().__getitem__(key)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n", " raise KeyError(key)\n", "KeyError: 
b'\\x80\\x05\\x95>\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x11dask.optimization\\x94\\x8c\\x10SubgraphCallable\\x94\\x93\\x94(}\\x94(\\x8cKlen-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8cZassign-getitem-len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8c*rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c;getitem-to_frame-rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c+getitem-3499fd71ac25ebbc1a06991edea6067c_.0\\x94\\x8c\\t_operator\\x94\\x8c\\x07getitem\\x94\\x93\\x94\\x8c/reset_index-f4c18304ca92859ccd09f44cf89b4b43_.0\\x94\\x8c\\x13__dask_blockwise__1\\x94\\x87\\x94h\\x0c(\\x8c\\ndask.utils\\x94\\x8c\\x05apply\\x94\\x93\\x94h\\x0f\\x8c\\x0cmethodcaller\\x94\\x93\\x94\\x8c\\x0breset_index\\x94\\x85\\x94R\\x94]\\x94\\x8c\\x13__dask_blockwise__5\\x94a\\x8c\\x08builtins\\x94\\x8c\\x04dict\\x94\\x93\\x94]\\x94]\\x94(\\x8c\\x04drop\\x94\\x89ea\\x86\\x94t\\x94h\\x07(h\\x11\\x8c\\x13dask.dataframe.core\\x94\\x8c\\x11apply_and_enforce\\x94\\x93\\x94]\\x94((h\\x11h#]\\x94h\\x0bh\\x0c\\x8c\\x13__dask_blockwise__0\\x94\\x87\\x94ah\\x1b]\\x94(]\\x94(\\x8c\\x05_func\\x94h\\x13\\x8c\\x08to_frame\\x94\\x85\\x94R\\x94e]\\x94(\\x8c\\x05_meta\\x94\\x8c\\x08builtins\\x94\\x8c\\x07getattr\\x94\\x93\\x94\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94\\x8c\\x10host_deserialize\\x94\\x86\\x94R\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94\\x8c\\x0ccolumn_names\\x94C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94\\x8c\\x07columns\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94\\x8c\\x05dtype\\x94CB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x18dtype-is-cudf-serialized\\x94\\x89\\x8c\\x04data\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94CI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94\\x8c\\x0bframe_count\\x94K\\x01u\\x8c\\x04mask\\x94}\\x94(hGCD\\x80\\x04\\x959\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x0fSpillableBuffer\\x94\\x93\\x94.\\x94hIK\\x01u\\x8c\\x04size\\x94K\\x00hIK\\x02u\\x85\\x94\\x8c\\x05index\\x94}\\x94(\\x8c\\x0cindex_column\\x94}\\x94(\\x8c\\x05start\\x94K\\x00\\x8c\\x04stop\\x94K\\x00\\x8c\\x04step\\x94K\\x01u\\x8c\\x04name\\x94C\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x0ftype-serialized\\x94C-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00u\\x8c\\x11index_frame_count\\x94K\\x00\\x8c\\x07is-cuda\\x94]\\x94(\\x88\\x88e\\x8c\\x07lengths\\x94]\\x94(K\\x00K\\x00e\\x8c\\twriteable\\x94NN\\x86\\x94u]\\x94(\\x8c\\x12numpy.core.numeric\\x94\\x8c\\x0b_frombuffer\\x94\\x93\\x94(C\\x00\\x94\\x8c\\x05numpy\\x94hB\\x93\\x94\\x8c\\x02u1\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01|\\x94NN
NJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94bK\\x00\\x85\\x94\\x8c\\x01C\\x94t\\x94R\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94e\\x86\\x94R\\x94ee\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__2\\x94eh\\x1b]\\x94(]\\x94(h*h\\x13\\x8c\\x06rename\\x94\\x85\\x94R\\x94e]\\x94(h/h2h5h6\\x86\\x94R\\x94}\\x94(h:C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94h}\\x94(h@C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hD\\x89hE}\\x94(hGCI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94hIK\\x01uhMK\\x00hIK\\x01u\\x85\\x94hO}\\x94(hQ}\\x94(hSK\\x00hTK\\x00hUK\\x01uhVC\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hYC-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00uh[K\\x00h\\\\]\\x94\\x88ah^]\\x94K\\x00ah`N\\x85\\x94u]\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94a\\x86\\x94R\\x94e]\\x94(h>h\\x1b]\\x94]\\x94(\\x8c\\x03src\\x94h\\x9eea\\x86\\x94ee\\x86\\x94t\\x94h\\x05(h\\x11h!\\x8c\\x10_reduction_chunk\\x94\\x93\\x94]\\x94h\\x0b(\\x8c\\x16dask.dataframe.methods\\x94\\x8c\\x06assign\\x94\\x93\\x94h\\x06h\\rh\\x08t\\x94h&\\x87\\x94ah\\x1b]\\x94]\\x94(\\x8c\\taca_chunk\\x94h0\\x8c\\x03len\\x94\\x93\\x94ea\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__0\\x94h\\x9e\\x8c\\x13__dask_blockwise__1\\x94\\x8c\\x03dst\\x94\\x8c\\x13__dask_blockwise__2\\x94N\\x8c\\x13__dask_blockwise__3\\x94\\x8c)to_frame-804980ae30b71d28f0a6bd3d5b7610f9\\x94\\x8c\\x13__dask_blockwise__4\\x94\\x8c(getitem-15414b72be12e28054238b44933937ab\\x94\\x8c\\x13__dask_blockwise__6\\x94\\x8c3cudf-aggregate-agg-c50c2d97de169ca4f41e43a92a042630\\x94uh\\x04\\x8c\\x13__dask_blockwise__5\\x94\\x85\\x94\\x8c6subgraph_callable-b4ca530e-8895-432e-b553-40a7b5892ab2\\x94t\\x94R\\x94.'\n", "\n", "During handling of the above exception, another exception occurred:\n", "\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n", " function, args, kwargs = await self._maybe_deserialize_task(ts)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n", " function, args, kwargs = _deserialize(*ts.run_spec)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File 
\"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n", " function = loads_function(function)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n", " result = pickle.loads(bytes_object)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n", " return pickle.loads(x)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n", " obj = cls.device_deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n", " return typ.deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n", " obj = super().deserialize(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n", " columns = deserialize_columns(header[\"columns\"], frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n", " colobj = col_typ.deserialize(meta, frames[:col_frame_count])\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n", " data, frames = unpack(header[\"data\"], frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n", " obj = klass.deserialize(header, frames[:count])\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in 
deserialize\n", " return SpillableBuffer.deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n", " return cls._from_device_memory(frame)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n", " ret._finalize_init(ptr_desc={\"type\": \"gpu\"}, exposed=exposed)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n", " raise ValueError(\n", "ValueError: cannot create without a global spill manager\n", "2023-05-12 09:25:03,817 - distributed.worker - ERROR - Could not deserialize task ('len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d', 0)\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n", " result = cache_loads[bytes_object]\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n", " value = super().__getitem__(key)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n", " raise KeyError(key)\n", "KeyError: 
b'\\x80\\x05\\x95>\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x11dask.optimization\\x94\\x8c\\x10SubgraphCallable\\x94\\x93\\x94(}\\x94(\\x8cKlen-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8cZassign-getitem-len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8c*rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c;getitem-to_frame-rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c+getitem-3499fd71ac25ebbc1a06991edea6067c_.0\\x94\\x8c\\t_operator\\x94\\x8c\\x07getitem\\x94\\x93\\x94\\x8c/reset_index-f4c18304ca92859ccd09f44cf89b4b43_.0\\x94\\x8c\\x13__dask_blockwise__1\\x94\\x87\\x94h\\x0c(\\x8c\\ndask.utils\\x94\\x8c\\x05apply\\x94\\x93\\x94h\\x0f\\x8c\\x0cmethodcaller\\x94\\x93\\x94\\x8c\\x0breset_index\\x94\\x85\\x94R\\x94]\\x94\\x8c\\x13__dask_blockwise__5\\x94a\\x8c\\x08builtins\\x94\\x8c\\x04dict\\x94\\x93\\x94]\\x94]\\x94(\\x8c\\x04drop\\x94\\x89ea\\x86\\x94t\\x94h\\x07(h\\x11\\x8c\\x13dask.dataframe.core\\x94\\x8c\\x11apply_and_enforce\\x94\\x93\\x94]\\x94((h\\x11h#]\\x94h\\x0bh\\x0c\\x8c\\x13__dask_blockwise__0\\x94\\x87\\x94ah\\x1b]\\x94(]\\x94(\\x8c\\x05_func\\x94h\\x13\\x8c\\x08to_frame\\x94\\x85\\x94R\\x94e]\\x94(\\x8c\\x05_meta\\x94\\x8c\\x08builtins\\x94\\x8c\\x07getattr\\x94\\x93\\x94\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94\\x8c\\x10host_deserialize\\x94\\x86\\x94R\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94\\x8c\\x0ccolumn_names\\x94C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94\\x8c\\x07columns\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94\\x8c\\x05dtype\\x94CB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x18dtype-is-cudf-serialized\\x94\\x89\\x8c\\x04data\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94CI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94\\x8c\\x0bframe_count\\x94K\\x01u\\x8c\\x04mask\\x94}\\x94(hGCD\\x80\\x04\\x959\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x0fSpillableBuffer\\x94\\x93\\x94.\\x94hIK\\x01u\\x8c\\x04size\\x94K\\x00hIK\\x02u\\x85\\x94\\x8c\\x05index\\x94}\\x94(\\x8c\\x0cindex_column\\x94}\\x94(\\x8c\\x05start\\x94K\\x00\\x8c\\x04stop\\x94K\\x00\\x8c\\x04step\\x94K\\x01u\\x8c\\x04name\\x94C\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x0ftype-serialized\\x94C-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00u\\x8c\\x11index_frame_count\\x94K\\x00\\x8c\\x07is-cuda\\x94]\\x94(\\x88\\x88e\\x8c\\x07lengths\\x94]\\x94(K\\x00K\\x00e\\x8c\\twriteable\\x94NN\\x86\\x94u]\\x94(\\x8c\\x12numpy.core.numeric\\x94\\x8c\\x0b_frombuffer\\x94\\x93\\x94(C\\x00\\x94\\x8c\\x05numpy\\x94hB\\x93\\x94\\x8c\\x02u1\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01|\\x94NN
NJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94bK\\x00\\x85\\x94\\x8c\\x01C\\x94t\\x94R\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94e\\x86\\x94R\\x94ee\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__2\\x94eh\\x1b]\\x94(]\\x94(h*h\\x13\\x8c\\x06rename\\x94\\x85\\x94R\\x94e]\\x94(h/h2h5h6\\x86\\x94R\\x94}\\x94(h:C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94h}\\x94(h@C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hD\\x89hE}\\x94(hGCI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94hIK\\x01uhMK\\x00hIK\\x01u\\x85\\x94hO}\\x94(hQ}\\x94(hSK\\x00hTK\\x00hUK\\x01uhVC\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hYC-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00uh[K\\x00h\\\\]\\x94\\x88ah^]\\x94K\\x00ah`N\\x85\\x94u]\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94a\\x86\\x94R\\x94e]\\x94(h>h\\x1b]\\x94]\\x94(\\x8c\\x03src\\x94h\\x9eea\\x86\\x94ee\\x86\\x94t\\x94h\\x05(h\\x11h!\\x8c\\x10_reduction_chunk\\x94\\x93\\x94]\\x94h\\x0b(\\x8c\\x16dask.dataframe.methods\\x94\\x8c\\x06assign\\x94\\x93\\x94h\\x06h\\rh\\x08t\\x94h&\\x87\\x94ah\\x1b]\\x94]\\x94(\\x8c\\taca_chunk\\x94h0\\x8c\\x03len\\x94\\x93\\x94ea\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__0\\x94h\\x9e\\x8c\\x13__dask_blockwise__1\\x94\\x8c\\x03dst\\x94\\x8c\\x13__dask_blockwise__2\\x94N\\x8c\\x13__dask_blockwise__3\\x94\\x8c)to_frame-804980ae30b71d28f0a6bd3d5b7610f9\\x94\\x8c\\x13__dask_blockwise__4\\x94\\x8c(getitem-15414b72be12e28054238b44933937ab\\x94\\x8c\\x13__dask_blockwise__6\\x94\\x8c3cudf-aggregate-agg-c50c2d97de169ca4f41e43a92a042630\\x94uh\\x04\\x8c\\x13__dask_blockwise__5\\x94\\x85\\x94\\x8c6subgraph_callable-b4ca530e-8895-432e-b553-40a7b5892ab2\\x94t\\x94R\\x94.'\n", "\n", "During handling of the above exception, another exception occurred:\n", "\n", "Traceback (most recent call last):\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n", " function, args, kwargs = await self._maybe_deserialize_task(ts)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n", " function, args, kwargs = _deserialize(*ts.run_spec)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n", " return func(*args, **kwds)\n", - " File 
\"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n", " function = loads_function(function)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n", " result = pickle.loads(bytes_object)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n", " return pickle.loads(x)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n", " obj = cls.device_deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n", " return typ.deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n", " obj = super().deserialize(\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n", " columns = deserialize_columns(header[\"columns\"], frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n", " colobj = col_typ.deserialize(meta, frames[:col_frame_count])\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n", " data, frames = unpack(header[\"data\"], frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n", " obj = klass.deserialize(header, frames[:count])\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in 
deserialize\n", " return SpillableBuffer.deserialize(header, frames)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n", " return cls._from_device_memory(frame)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n", " ret._finalize_init(ptr_desc={\"type\": \"gpu\"}, exposed=exposed)\n", - " File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n", + " File \"/home/dacosta/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n", " raise ValueError(\n", "ValueError: cannot create without a global spill manager\n" ] @@ -475,34 +475,34 @@ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# Create a directed graph using the source (src) and destination (dst) vertex pairs from the Dataframe \u001b[39;00m\n\u001b[1;32m 2\u001b[0m G \u001b[39m=\u001b[39m cugraph\u001b[39m.\u001b[39mGraph(directed\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m----> 3\u001b[0m G\u001b[39m.\u001b[39;49mfrom_dask_cudf_edgelist(e_list, source\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39msrc\u001b[39;49m\u001b[39m'\u001b[39;49m, destination\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mdst\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 5\u001b[0m \u001b[39m# Print time\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mRead, load and renumber: \u001b[39m\u001b[39m\"\u001b[39m, time\u001b[39m.\u001b[39mtime()\u001b[39m-\u001b[39mt_start, \u001b[39m\"\u001b[39m\u001b[39ms\u001b[39m\u001b[39m\"\u001b[39m)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_classes.py:309\u001b[0m, in \u001b[0;36mGraph.from_dask_cudf_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_Impl\u001b[39m.\u001b[39medgelist \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 308\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mGraph already has values\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 309\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_Impl\u001b[39m.\u001b[39;49m_simpleDistributedGraphImpl__from_edgelist(\n\u001b[1;32m 310\u001b[0m input_ddf,\n\u001b[1;32m 311\u001b[0m source,\n\u001b[1;32m 312\u001b[0m destination,\n\u001b[1;32m 313\u001b[0m edge_attr,\n\u001b[1;32m 314\u001b[0m renumber,\n\u001b[1;32m 315\u001b[0m store_transposed,\n\u001b[1;32m 316\u001b[0m legacy_renum_only,\n\u001b[1;32m 317\u001b[0m )\n", - "File 
\u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py:272\u001b[0m, in \u001b[0;36msimpleDistributedGraphImpl.__from_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m 268\u001b[0m dst_col_name \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrenumber_map\u001b[39m.\u001b[39mrenumbered_dst_col_name\n\u001b[1;32m 270\u001b[0m ddf \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39medgelist\u001b[39m.\u001b[39medgelist_df\n\u001b[0;32m--> 272\u001b[0m num_edges \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39;49m(ddf)\n\u001b[1;32m 273\u001b[0m edge_data \u001b[39m=\u001b[39m get_distributed_data(ddf)\n\u001b[1;32m 275\u001b[0m graph_props \u001b[39m=\u001b[39m GraphProperties(\n\u001b[1;32m 276\u001b[0m is_multigraph\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mmulti_edge,\n\u001b[1;32m 277\u001b[0m is_symmetric\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mdirected,\n\u001b[1;32m 278\u001b[0m )\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:4775\u001b[0m, in \u001b[0;36mDataFrame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 4773\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__len__\u001b[39m()\n\u001b[1;32m 4774\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 4775\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mlen\u001b[39;49m(s)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:843\u001b[0m, in \u001b[0;36m_Frame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__len__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 841\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mreduction(\n\u001b[1;32m 842\u001b[0m \u001b[39mlen\u001b[39;49m, np\u001b[39m.\u001b[39;49msum, token\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mlen\u001b[39;49m\u001b[39m\"\u001b[39;49m, meta\u001b[39m=\u001b[39;49m\u001b[39mint\u001b[39;49m, split_every\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m\n\u001b[0;32m--> 843\u001b[0m )\u001b[39m.\u001b[39;49mcompute()\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:314\u001b[0m, in \u001b[0;36mDaskMethodsMixin.compute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcompute\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 291\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \n\u001b[1;32m 293\u001b[0m \u001b[39m This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[39m dask.base.compute\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 314\u001b[0m (result,) \u001b[39m=\u001b[39m compute(\u001b[39mself\u001b[39;49m, traverse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 315\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", - "File 
\u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:599\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 596\u001b[0m keys\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_keys__())\n\u001b[1;32m 597\u001b[0m postcomputes\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_postcompute__())\n\u001b[0;32m--> 599\u001b[0m results \u001b[39m=\u001b[39m schedule(dsk, keys, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 600\u001b[0m \u001b[39mreturn\u001b[39;00m repack([f(r, \u001b[39m*\u001b[39ma) \u001b[39mfor\u001b[39;00m r, (f, a) \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(results, postcomputes)])\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:3186\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m 3184\u001b[0m should_rejoin \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 3185\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 3186\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgather(packed, asynchronous\u001b[39m=\u001b[39;49masynchronous, direct\u001b[39m=\u001b[39;49mdirect)\n\u001b[1;32m 3187\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m 3188\u001b[0m \u001b[39mfor\u001b[39;00m f \u001b[39min\u001b[39;00m futures\u001b[39m.\u001b[39mvalues():\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2345\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m 2343\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 2344\u001b[0m local_worker \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 2345\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msync(\n\u001b[1;32m 2346\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_gather,\n\u001b[1;32m 2347\u001b[0m futures,\n\u001b[1;32m 2348\u001b[0m errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m 2349\u001b[0m direct\u001b[39m=\u001b[39;49mdirect,\n\u001b[1;32m 2350\u001b[0m local_worker\u001b[39m=\u001b[39;49mlocal_worker,\n\u001b[1;32m 2351\u001b[0m asynchronous\u001b[39m=\u001b[39;49masynchronous,\n\u001b[1;32m 2352\u001b[0m )\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:349\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[39mreturn\u001b[39;00m future\n\u001b[1;32m 348\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 349\u001b[0m \u001b[39mreturn\u001b[39;00m sync(\n\u001b[1;32m 350\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloop, func, \u001b[39m*\u001b[39;49margs, callback_timeout\u001b[39m=\u001b[39;49mcallback_timeout, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs\n\u001b[1;32m 351\u001b[0m )\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:416\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[39mif\u001b[39;00m error:\n\u001b[1;32m 415\u001b[0m typ, exc, tb \u001b[39m=\u001b[39m 
error\n\u001b[0;32m--> 416\u001b[0m \u001b[39mraise\u001b[39;00m exc\u001b[39m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m 417\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 418\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:389\u001b[0m, in \u001b[0;36msync..f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 387\u001b[0m future \u001b[39m=\u001b[39m wait_for(future, callback_timeout)\n\u001b[1;32m 388\u001b[0m future \u001b[39m=\u001b[39m asyncio\u001b[39m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 389\u001b[0m result \u001b[39m=\u001b[39m \u001b[39myield\u001b[39;00m future\n\u001b[1;32m 390\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 391\u001b[0m error \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/tornado/gen.py:769\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 766\u001b[0m exc_info \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 768\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 769\u001b[0m value \u001b[39m=\u001b[39m future\u001b[39m.\u001b[39;49mresult()\n\u001b[1;32m 770\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 771\u001b[0m exc_info \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2208\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 2206\u001b[0m exc \u001b[39m=\u001b[39m CancelledError(key)\n\u001b[1;32m 2207\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 2208\u001b[0m \u001b[39mraise\u001b[39;00m exception\u001b[39m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m 2209\u001b[0m \u001b[39mraise\u001b[39;00m exc\n\u001b[1;32m 2210\u001b[0m \u001b[39mif\u001b[39;00m errors \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mskip\u001b[39m\u001b[39m\"\u001b[39m:\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36minner\u001b[0;34m()\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[39m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39minner\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2937\u001b[0m, in \u001b[0;36m_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2934\u001b[0m \u001b[39m# Some objects require threadlocal state during deserialization, e.g. 
to\u001b[39;00m\n\u001b[1;32m 2935\u001b[0m \u001b[39m# detect the current worker\u001b[39;00m\n\u001b[1;32m 2936\u001b[0m \u001b[39mif\u001b[39;00m function \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 2937\u001b[0m function \u001b[39m=\u001b[39m loads_function(function)\n\u001b[1;32m 2938\u001b[0m \u001b[39mif\u001b[39;00m args \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(args, \u001b[39mbytes\u001b[39m):\n\u001b[1;32m 2939\u001b[0m args \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(args)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2925\u001b[0m, in \u001b[0;36mloads_function\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2923\u001b[0m result \u001b[39m=\u001b[39m cache_loads[bytes_object]\n\u001b[1;32m 2924\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m:\n\u001b[0;32m-> 2925\u001b[0m result \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(bytes_object)\n\u001b[1;32m 2926\u001b[0m cache_loads[bytes_object] \u001b[39m=\u001b[39m result\n\u001b[1;32m 2927\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py:96\u001b[0m, in \u001b[0;36mloads\u001b[0;34m()\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x, buffers\u001b[39m=\u001b[39mbuffers)\n\u001b[1;32m 95\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m---> 96\u001b[0m \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x)\n\u001b[1;32m 97\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 98\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39m\"\u001b[39m\u001b[39mFailed to deserialize \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m, x[:\u001b[39m10000\u001b[39m], exc_info\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:176\u001b[0m, in \u001b[0;36mhost_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Perform device-side deserialization tasks.\u001b[39;00m\n\u001b[1;32m 155\u001b[0m \n\u001b[1;32m 156\u001b[0m \u001b[39mParameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[39m:meta private:\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 172\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m 173\u001b[0m cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m f\n\u001b[1;32m 174\u001b[0m \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], \u001b[39mmap\u001b[39m(\u001b[39mmemoryview\u001b[39m, frames))\n\u001b[1;32m 175\u001b[0m ]\n\u001b[0;32m--> 176\u001b[0m obj \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mdevice_deserialize(header, frames)\n\u001b[1;32m 177\u001b[0m \u001b[39mreturn\u001b[39;00m obj\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:130\u001b[0m, in \u001b[0;36mdevice_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 125\u001b[0m typ \u001b[39m=\u001b[39m 
pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 126\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m 127\u001b[0m cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m \u001b[39mmemoryview\u001b[39m(f)\n\u001b[1;32m 128\u001b[0m \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 129\u001b[0m ]\n\u001b[0;32m--> 130\u001b[0m \u001b[39mreturn\u001b[39;00m typ\u001b[39m.\u001b[39mdeserialize(header, frames)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py:1019\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1016\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m 1017\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header, frames):\n\u001b[1;32m 1018\u001b[0m index_nframes \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m-> 1019\u001b[0m obj \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mdeserialize(\n\u001b[1;32m 1020\u001b[0m header, frames[header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m] :]\n\u001b[1;32m 1021\u001b[0m )\n\u001b[1;32m 1023\u001b[0m idx_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 1024\u001b[0m index \u001b[39m=\u001b[39m idx_typ\u001b[39m.\u001b[39mdeserialize(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m], frames[:index_nframes])\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py:106\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 104\u001b[0m cls_deserialize \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 105\u001b[0m column_names \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mcolumn_names\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m--> 106\u001b[0m columns \u001b[39m=\u001b[39m deserialize_columns(header[\u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 107\u001b[0m \u001b[39mreturn\u001b[39;00m cls_deserialize\u001b[39m.\u001b[39m_from_data(\u001b[39mdict\u001b[39m(\u001b[39mzip\u001b[39m(column_names, columns)))\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:2450\u001b[0m, in \u001b[0;36mdeserialize_columns\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2448\u001b[0m col_frame_count \u001b[39m=\u001b[39m meta[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 2449\u001b[0m col_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(meta[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 2450\u001b[0m colobj \u001b[39m=\u001b[39m col_typ\u001b[39m.\u001b[39mdeserialize(meta, frames[:col_frame_count])\n\u001b[1;32m 2451\u001b[0m 
columns\u001b[39m.\u001b[39mappend(colobj)\n\u001b[1;32m 2452\u001b[0m \u001b[39m# Advance frames\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1216\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1214\u001b[0m dtype \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 1215\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m header:\n\u001b[0;32m-> 1216\u001b[0m data, frames \u001b[39m=\u001b[39m unpack(header[\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 1217\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1218\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1204\u001b[0m, in \u001b[0;36munpack\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1202\u001b[0m count \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 1203\u001b[0m klass \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 1204\u001b[0m obj \u001b[39m=\u001b[39m klass\u001b[39m.\u001b[39mdeserialize(header, frames[:count])\n\u001b[1;32m 1205\u001b[0m \u001b[39mreturn\u001b[39;00m obj, frames[count:]\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:574\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m 568\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header: \u001b[39mdict\u001b[39m, frames: \u001b[39mlist\u001b[39m):\n\u001b[1;32m 569\u001b[0m \u001b[39m# TODO: because of the hack in `SpillableBuffer.serialize()` where\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 572\u001b[0m \u001b[39m# deserialize into `SpillableBufferSlice` when the frames hasn't been\u001b[39;00m\n\u001b[1;32m 573\u001b[0m \u001b[39m# copied.\u001b[39;00m\n\u001b[0;32m--> 574\u001b[0m \u001b[39mreturn\u001b[39;00m SpillableBuffer\u001b[39m.\u001b[39mdeserialize(header, frames)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py:335\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[39mreturn\u001b[39;00m frame \u001b[39m# The frame is already deserialized\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(frame, \u001b[39m\"\u001b[39m\u001b[39m__cuda_array_interface__\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 335\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_device_memory(frame)\n\u001b[1;32m 336\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_host_memory(frame)\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:235\u001b[0m, in \u001b[0;36m_from_device_memory\u001b[0;34m()\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Create a spillabe buffer from device memory.\u001b[39;00m\n\u001b[1;32m 219\u001b[0m 
\n\u001b[1;32m 220\u001b[0m \u001b[39mNo data is being copied.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[39m Buffer representing the same device memory as `data`\u001b[39;00m\n\u001b[1;32m 233\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 234\u001b[0m ret \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_from_device_memory(data)\n\u001b[0;32m--> 235\u001b[0m ret\u001b[39m.\u001b[39m_finalize_init(ptr_desc\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mgpu\u001b[39m\u001b[39m\"\u001b[39m}, exposed\u001b[39m=\u001b[39mexposed)\n\u001b[1;32m 236\u001b[0m \u001b[39mreturn\u001b[39;00m ret\n", - "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:206\u001b[0m, in \u001b[0;36m_finalize_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m 204\u001b[0m manager \u001b[39m=\u001b[39m get_global_manager()\n\u001b[1;32m 205\u001b[0m \u001b[39mif\u001b[39;00m manager \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 206\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 207\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcannot create \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m without \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 208\u001b[0m \u001b[39m\"\u001b[39m\u001b[39ma global spill manager\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 209\u001b[0m )\n\u001b[1;32m 211\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager \u001b[39m=\u001b[39m manager\n\u001b[1;32m 212\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager\u001b[39m.\u001b[39madd(\u001b[39mself\u001b[39m)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_classes.py:309\u001b[0m, in \u001b[0;36mGraph.from_dask_cudf_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_Impl\u001b[39m.\u001b[39medgelist \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 308\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mGraph already has values\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 309\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_Impl\u001b[39m.\u001b[39;49m_simpleDistributedGraphImpl__from_edgelist(\n\u001b[1;32m 310\u001b[0m input_ddf,\n\u001b[1;32m 311\u001b[0m source,\n\u001b[1;32m 312\u001b[0m destination,\n\u001b[1;32m 313\u001b[0m edge_attr,\n\u001b[1;32m 314\u001b[0m renumber,\n\u001b[1;32m 315\u001b[0m store_transposed,\n\u001b[1;32m 316\u001b[0m legacy_renum_only,\n\u001b[1;32m 317\u001b[0m )\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py:272\u001b[0m, in \u001b[0;36msimpleDistributedGraphImpl.__from_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m 268\u001b[0m dst_col_name \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrenumber_map\u001b[39m.\u001b[39mrenumbered_dst_col_name\n\u001b[1;32m 270\u001b[0m 
ddf \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39medgelist\u001b[39m.\u001b[39medgelist_df\n\u001b[0;32m--> 272\u001b[0m num_edges \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39;49m(ddf)\n\u001b[1;32m 273\u001b[0m edge_data \u001b[39m=\u001b[39m get_distributed_data(ddf)\n\u001b[1;32m 275\u001b[0m graph_props \u001b[39m=\u001b[39m GraphProperties(\n\u001b[1;32m 276\u001b[0m is_multigraph\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mmulti_edge,\n\u001b[1;32m 277\u001b[0m is_symmetric\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mdirected,\n\u001b[1;32m 278\u001b[0m )\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:4775\u001b[0m, in \u001b[0;36mDataFrame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 4773\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__len__\u001b[39m()\n\u001b[1;32m 4774\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 4775\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mlen\u001b[39;49m(s)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:843\u001b[0m, in \u001b[0;36m_Frame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__len__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 841\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mreduction(\n\u001b[1;32m 842\u001b[0m \u001b[39mlen\u001b[39;49m, np\u001b[39m.\u001b[39;49msum, token\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mlen\u001b[39;49m\u001b[39m\"\u001b[39;49m, meta\u001b[39m=\u001b[39;49m\u001b[39mint\u001b[39;49m, split_every\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m\n\u001b[0;32m--> 843\u001b[0m )\u001b[39m.\u001b[39;49mcompute()\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:314\u001b[0m, in \u001b[0;36mDaskMethodsMixin.compute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcompute\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 291\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \n\u001b[1;32m 293\u001b[0m \u001b[39m This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[39m dask.base.compute\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 314\u001b[0m (result,) \u001b[39m=\u001b[39m compute(\u001b[39mself\u001b[39;49m, traverse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 315\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:599\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 596\u001b[0m keys\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_keys__())\n\u001b[1;32m 597\u001b[0m postcomputes\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_postcompute__())\n\u001b[0;32m--> 599\u001b[0m results \u001b[39m=\u001b[39m schedule(dsk, keys, 
\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 600\u001b[0m \u001b[39mreturn\u001b[39;00m repack([f(r, \u001b[39m*\u001b[39ma) \u001b[39mfor\u001b[39;00m r, (f, a) \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(results, postcomputes)])\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:3186\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m 3184\u001b[0m should_rejoin \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 3185\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 3186\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgather(packed, asynchronous\u001b[39m=\u001b[39;49masynchronous, direct\u001b[39m=\u001b[39;49mdirect)\n\u001b[1;32m 3187\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m 3188\u001b[0m \u001b[39mfor\u001b[39;00m f \u001b[39min\u001b[39;00m futures\u001b[39m.\u001b[39mvalues():\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2345\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m 2343\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 2344\u001b[0m local_worker \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 2345\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msync(\n\u001b[1;32m 2346\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_gather,\n\u001b[1;32m 2347\u001b[0m futures,\n\u001b[1;32m 2348\u001b[0m errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m 2349\u001b[0m direct\u001b[39m=\u001b[39;49mdirect,\n\u001b[1;32m 2350\u001b[0m local_worker\u001b[39m=\u001b[39;49mlocal_worker,\n\u001b[1;32m 2351\u001b[0m asynchronous\u001b[39m=\u001b[39;49masynchronous,\n\u001b[1;32m 2352\u001b[0m )\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:349\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[39mreturn\u001b[39;00m future\n\u001b[1;32m 348\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 349\u001b[0m \u001b[39mreturn\u001b[39;00m sync(\n\u001b[1;32m 350\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloop, func, \u001b[39m*\u001b[39;49margs, callback_timeout\u001b[39m=\u001b[39;49mcallback_timeout, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs\n\u001b[1;32m 351\u001b[0m )\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:416\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[39mif\u001b[39;00m error:\n\u001b[1;32m 415\u001b[0m typ, exc, tb \u001b[39m=\u001b[39m error\n\u001b[0;32m--> 416\u001b[0m \u001b[39mraise\u001b[39;00m exc\u001b[39m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m 417\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 418\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:389\u001b[0m, in \u001b[0;36msync..f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 387\u001b[0m future \u001b[39m=\u001b[39m wait_for(future, callback_timeout)\n\u001b[1;32m 388\u001b[0m 
future \u001b[39m=\u001b[39m asyncio\u001b[39m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 389\u001b[0m result \u001b[39m=\u001b[39m \u001b[39myield\u001b[39;00m future\n\u001b[1;32m 390\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 391\u001b[0m error \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/tornado/gen.py:769\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 766\u001b[0m exc_info \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 768\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 769\u001b[0m value \u001b[39m=\u001b[39m future\u001b[39m.\u001b[39;49mresult()\n\u001b[1;32m 770\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 771\u001b[0m exc_info \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2208\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 2206\u001b[0m exc \u001b[39m=\u001b[39m CancelledError(key)\n\u001b[1;32m 2207\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 2208\u001b[0m \u001b[39mraise\u001b[39;00m exception\u001b[39m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m 2209\u001b[0m \u001b[39mraise\u001b[39;00m exc\n\u001b[1;32m 2210\u001b[0m \u001b[39mif\u001b[39;00m errors \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mskip\u001b[39m\u001b[39m\"\u001b[39m:\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36minner\u001b[0;34m()\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[39m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39minner\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2937\u001b[0m, in \u001b[0;36m_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2934\u001b[0m \u001b[39m# Some objects require threadlocal state during deserialization, e.g. 
to\u001b[39;00m\n\u001b[1;32m 2935\u001b[0m \u001b[39m# detect the current worker\u001b[39;00m\n\u001b[1;32m 2936\u001b[0m \u001b[39mif\u001b[39;00m function \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 2937\u001b[0m function \u001b[39m=\u001b[39m loads_function(function)\n\u001b[1;32m 2938\u001b[0m \u001b[39mif\u001b[39;00m args \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(args, \u001b[39mbytes\u001b[39m):\n\u001b[1;32m 2939\u001b[0m args \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(args)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2925\u001b[0m, in \u001b[0;36mloads_function\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2923\u001b[0m result \u001b[39m=\u001b[39m cache_loads[bytes_object]\n\u001b[1;32m 2924\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m:\n\u001b[0;32m-> 2925\u001b[0m result \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(bytes_object)\n\u001b[1;32m 2926\u001b[0m cache_loads[bytes_object] \u001b[39m=\u001b[39m result\n\u001b[1;32m 2927\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py:96\u001b[0m, in \u001b[0;36mloads\u001b[0;34m()\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x, buffers\u001b[39m=\u001b[39mbuffers)\n\u001b[1;32m 95\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m---> 96\u001b[0m \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x)\n\u001b[1;32m 97\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 98\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39m\"\u001b[39m\u001b[39mFailed to deserialize \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m, x[:\u001b[39m10000\u001b[39m], exc_info\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:176\u001b[0m, in \u001b[0;36mhost_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Perform device-side deserialization tasks.\u001b[39;00m\n\u001b[1;32m 155\u001b[0m \n\u001b[1;32m 156\u001b[0m \u001b[39mParameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[39m:meta private:\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 172\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m 173\u001b[0m cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m f\n\u001b[1;32m 174\u001b[0m \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], \u001b[39mmap\u001b[39m(\u001b[39mmemoryview\u001b[39m, frames))\n\u001b[1;32m 175\u001b[0m ]\n\u001b[0;32m--> 176\u001b[0m obj \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mdevice_deserialize(header, frames)\n\u001b[1;32m 177\u001b[0m \u001b[39mreturn\u001b[39;00m obj\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:130\u001b[0m, in \u001b[0;36mdevice_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 125\u001b[0m typ \u001b[39m=\u001b[39m 
pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 126\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m 127\u001b[0m cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m \u001b[39mmemoryview\u001b[39m(f)\n\u001b[1;32m 128\u001b[0m \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 129\u001b[0m ]\n\u001b[0;32m--> 130\u001b[0m \u001b[39mreturn\u001b[39;00m typ\u001b[39m.\u001b[39mdeserialize(header, frames)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py:1019\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1016\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m 1017\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header, frames):\n\u001b[1;32m 1018\u001b[0m index_nframes \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m-> 1019\u001b[0m obj \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mdeserialize(\n\u001b[1;32m 1020\u001b[0m header, frames[header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m] :]\n\u001b[1;32m 1021\u001b[0m )\n\u001b[1;32m 1023\u001b[0m idx_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 1024\u001b[0m index \u001b[39m=\u001b[39m idx_typ\u001b[39m.\u001b[39mdeserialize(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m], frames[:index_nframes])\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py:106\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 104\u001b[0m cls_deserialize \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 105\u001b[0m column_names \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mcolumn_names\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m--> 106\u001b[0m columns \u001b[39m=\u001b[39m deserialize_columns(header[\u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 107\u001b[0m \u001b[39mreturn\u001b[39;00m cls_deserialize\u001b[39m.\u001b[39m_from_data(\u001b[39mdict\u001b[39m(\u001b[39mzip\u001b[39m(column_names, columns)))\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:2450\u001b[0m, in \u001b[0;36mdeserialize_columns\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2448\u001b[0m col_frame_count \u001b[39m=\u001b[39m meta[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 2449\u001b[0m col_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(meta[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 2450\u001b[0m colobj \u001b[39m=\u001b[39m col_typ\u001b[39m.\u001b[39mdeserialize(meta, frames[:col_frame_count])\n\u001b[1;32m 2451\u001b[0m 
columns\u001b[39m.\u001b[39mappend(colobj)\n\u001b[1;32m 2452\u001b[0m \u001b[39m# Advance frames\u001b[39;00m\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1216\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1214\u001b[0m dtype \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 1215\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m header:\n\u001b[0;32m-> 1216\u001b[0m data, frames \u001b[39m=\u001b[39m unpack(header[\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m 1217\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1218\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1204\u001b[0m, in \u001b[0;36munpack\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1202\u001b[0m count \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 1203\u001b[0m klass \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 1204\u001b[0m obj \u001b[39m=\u001b[39m klass\u001b[39m.\u001b[39mdeserialize(header, frames[:count])\n\u001b[1;32m 1205\u001b[0m \u001b[39mreturn\u001b[39;00m obj, frames[count:]\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:574\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m 568\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header: \u001b[39mdict\u001b[39m, frames: \u001b[39mlist\u001b[39m):\n\u001b[1;32m 569\u001b[0m \u001b[39m# TODO: because of the hack in `SpillableBuffer.serialize()` where\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 572\u001b[0m \u001b[39m# deserialize into `SpillableBufferSlice` when the frames hasn't been\u001b[39;00m\n\u001b[1;32m 573\u001b[0m \u001b[39m# copied.\u001b[39;00m\n\u001b[0;32m--> 574\u001b[0m \u001b[39mreturn\u001b[39;00m SpillableBuffer\u001b[39m.\u001b[39mdeserialize(header, frames)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py:335\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[39mreturn\u001b[39;00m frame \u001b[39m# The frame is already deserialized\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(frame, \u001b[39m\"\u001b[39m\u001b[39m__cuda_array_interface__\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 335\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_device_memory(frame)\n\u001b[1;32m 336\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_host_memory(frame)\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:235\u001b[0m, in \u001b[0;36m_from_device_memory\u001b[0;34m()\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Create a spillabe buffer from device memory.\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \n\u001b[1;32m 
220\u001b[0m \u001b[39mNo data is being copied.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[39m Buffer representing the same device memory as `data`\u001b[39;00m\n\u001b[1;32m 233\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 234\u001b[0m ret \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_from_device_memory(data)\n\u001b[0;32m--> 235\u001b[0m ret\u001b[39m.\u001b[39m_finalize_init(ptr_desc\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mgpu\u001b[39m\u001b[39m\"\u001b[39m}, exposed\u001b[39m=\u001b[39mexposed)\n\u001b[1;32m 236\u001b[0m \u001b[39mreturn\u001b[39;00m ret\n", + "File \u001b[0;32m~/miniforge/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:206\u001b[0m, in \u001b[0;36m_finalize_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m 204\u001b[0m manager \u001b[39m=\u001b[39m get_global_manager()\n\u001b[1;32m 205\u001b[0m \u001b[39mif\u001b[39;00m manager \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 206\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 207\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcannot create \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m without \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 208\u001b[0m \u001b[39m\"\u001b[39m\u001b[39ma global spill manager\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 209\u001b[0m )\n\u001b[1;32m 211\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager \u001b[39m=\u001b[39m manager\n\u001b[1;32m 212\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager\u001b[39m.\u001b[39madd(\u001b[39mself\u001b[39m)\n", "\u001b[0;31mValueError\u001b[0m: cannot create without a global spill manager" ] } diff --git a/notebooks/demo/nx_cugraph_demo.ipynb b/notebooks/demo/nx_cugraph_demo.ipynb index 6e50370ed80..f1ce80aa188 100644 --- a/notebooks/demo/nx_cugraph_demo.ipynb +++ b/notebooks/demo/nx_cugraph_demo.ipynb @@ -20,7 +20,7 @@ "Using `nx-cugraph` with this notebook requires the following: \n", "- NVIDIA GPU, Pascal architecture or later\n", "- CUDA 11.2, 11.4, 11.5, 11.8, or 12.0\n", - "- Python versions 3.9, 3.10, or 3.11\n", + "- Python versions 3.10, 3.11, or 3.12\n", "- NetworkX >= version 3.2\n", " - _NetworkX 3.0 supports dispatching and is compatible with `nx-cugraph`, but this notebook will demonstrate features added in 3.2_\n", " - At the time of this writing, NetworkX 3.2 is only available from source and can be installed by following the [development version install instructions](https://github.com/networkx/networkx/blob/main/INSTALL.rst#install-the-development-version).\n", diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index ea30b652286..42cbcab5008 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -9,17 +9,17 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.10.*,>=0.0.0a0 +- cugraph==24.12.*,>=0.0.0a0 - dgl>=1.1.0.cu* - pandas - pre-commit -- pylibcugraphops==24.10.*,>=0.0.0a0 +- pylibcugraphops==24.12.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov - pytest-xdist - pytorch-cuda==11.8 -- pytorch>=2.0 +- pytorch>=2.3,<2.4.0a0 - scipy - tensordict>=0.1.2 name: cugraph_dgl_dev_cuda-118 diff 
--git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 21b70b05f3a..4f36353cb18 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -140,6 +140,10 @@ def __init__( self.__graph = graph self.__device = device + @property + def _batch_size(self): + return self.__batch_size + @property def dataset( self, diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index 1a35c3ea027..ecc51006995 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -18,7 +18,7 @@ from typing import Sequence, Optional, Union, List, Tuple, Iterator -from cugraph.gnn import UniformNeighborSampler, DistSampleWriter +from cugraph.gnn import UniformNeighborSampler, BiasedNeighborSampler, DistSampleWriter from cugraph.utilities.utils import import_optional import cugraph_dgl @@ -93,7 +93,6 @@ def __init__( If provided, the probability of each neighbor being sampled is proportional to the edge feature with the given name. Mutually exclusive with mask. - Currently unsupported. mask: str Optional. If provided, only neighbors where the edge mask @@ -133,10 +132,6 @@ def __init__( raise NotImplementedError( "Edge masking is currently unsupported by cuGraph-DGL" ) - if prob: - raise NotImplementedError( - "Edge masking is currently unsupported by cuGraph-DGL" - ) if prefetch_edge_feats: warnings.warn("'prefetch_edge_feats' is ignored by cuGraph-DGL") if prefetch_node_feats: @@ -146,6 +141,8 @@ def __init__( if fused: warnings.warn("'fused' is ignored by cuGraph-DGL") + self.__prob_attr = prob + self.fanouts = fanouts_per_layer reverse_fanouts = fanouts_per_layer.copy() reverse_fanouts.reverse() @@ -180,8 +177,14 @@ def sample( format=kwargs.pop("format", "parquet"), ) - ds = UniformNeighborSampler( - g._graph(self.edge_dir), + sampling_clx = ( + UniformNeighborSampler + if self.__prob_attr is None + else BiasedNeighborSampler + ) + + ds = sampling_clx( + g._graph(self.edge_dir, prob_attr=self.__prob_attr), writer, compression="CSR", fanout=self._reversed_fanout_vals, @@ -194,10 +197,8 @@ def sample( if g.is_homogeneous: indices = torch.concat(list(indices)) - ds.sample_from_nodes(indices, batch_size=batch_size) - return HomogeneousSampleReader( - ds.get_reader(), self.output_format, self.edge_dir - ) + reader = ds.sample_from_nodes(indices.long(), batch_size=batch_size) + return HomogeneousSampleReader(reader, self.output_format, self.edge_dir) raise ValueError( "Sampling heterogeneous graphs is currently" diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py index 731ec1b8d6f..7ea608e7e53 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -20,7 +20,6 @@ create_homogeneous_sampled_graphs_from_tensors_csc, ) -from cugraph.gnn import DistSampleReader from cugraph.utilities.utils import import_optional @@ -33,14 +32,18 @@ class SampleReader: """ Iterator that processes results from the cuGraph distributed sampler. 
""" - def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Block"): + def __init__( + self, + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]], + output_format: str = "dgl.Block", + ): """ Constructs a new SampleReader. Parameters ---------- - base_reader: DistSampleReader - The reader responsible for loading saved samples produced by + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] + The iterator responsible for loading saved samples produced by the cuGraph distributed sampler. """ self.__output_format = output_format @@ -83,7 +86,7 @@ class HomogeneousSampleReader(SampleReader): def __init__( self, - base_reader: DistSampleReader, + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]], output_format: str = "dgl.Block", edge_dir="in", ): @@ -92,7 +95,7 @@ def __init__( Parameters ---------- - base_reader: DistSampleReader + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] The reader responsible for loading saved samples produced by the cuGraph distributed sampler. output_format: str diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py index 2eba13c6958..88b93656fa8 100644 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -29,6 +29,7 @@ HeteroNodeDataView, HeteroEdgeView, HeteroEdgeDataView, + EmbeddingView, ) @@ -311,7 +312,7 @@ def add_edges( self.__graph = None self.__vertex_offsets = None - def num_nodes(self, ntype: str = None) -> int: + def num_nodes(self, ntype: Optional[str] = None) -> int: """ Returns the number of nodes of ntype, or if ntype is not provided, the total number of nodes in the graph. @@ -321,7 +322,7 @@ def num_nodes(self, ntype: str = None) -> int: return self.__num_nodes_dict[ntype] - def number_of_nodes(self, ntype: str = None) -> int: + def number_of_nodes(self, ntype: Optional[str] = None) -> int: """ Alias for num_nodes. """ @@ -380,7 +381,7 @@ def _vertex_offsets(self) -> Dict[str, int]: return dict(self.__vertex_offsets) - def __get_edgelist(self) -> Dict[str, "torch.Tensor"]: + def __get_edgelist(self, prob_attr=None) -> Dict[str, "torch.Tensor"]: """ This function always returns src/dst labels with respect to the out direction. 
@@ -430,63 +431,71 @@ def __get_edgelist(self) -> Dict[str, "torch.Tensor"]: ) ) + num_edges_t = torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" + ) + if self.is_multi_gpu: rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - num_edges_t = torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" - ) num_edges_all_t = torch.empty( world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" ) torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) - if rank > 0: - start_offsets = num_edges_all_t[:rank].T.sum(axis=1) - edge_id_array = torch.concat( + start_offsets = num_edges_all_t[:rank].T.sum(axis=1) + + else: + rank = 0 + start_offsets = torch.zeros( + (len(sorted_keys),), dtype=torch.int64, device="cuda" + ) + num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel())) + + # Use pinned memory here for fast access to CPU/WG storage + edge_id_array_per_type = [ + torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_all_t[rank][i], + dtype=torch.int64, + device="cpu", + ).pin_memory() + for i in range(len(sorted_keys)) + ] + + # Retrieve the weights from the appropriate feature(s) + # DGL implicitly requires all edge types use the same + # feature name. + if prob_attr is None: + weights = None + else: + if len(sorted_keys) > 1: + weights = torch.concat( [ - torch.arange( - start_offsets[i], - start_offsets[i] + num_edges_all_t[rank][i], - dtype=torch.int64, - device="cuda", - ) - for i in range(len(sorted_keys)) + self.edata[prob_attr][sorted_keys[i]][ix] + for i, ix in enumerate(edge_id_array_per_type) ] ) else: - edge_id_array = torch.concat( - [ - torch.arange( - self.__edge_indices[et].shape[1], - dtype=torch.int64, - device="cuda", - ) - for et in sorted_keys - ] - ) + weights = self.edata[prob_attr][edge_id_array_per_type[0]] - else: - # single GPU - edge_id_array = torch.concat( - [ - torch.arange( - self.__edge_indices[et].shape[1], - dtype=torch.int64, - device="cuda", - ) - for et in sorted_keys - ] - ) + # Safe to move this to cuda because the consumer will always + # move it to cuda if it isn't already there. 
+ edge_id_array = torch.concat(edge_id_array_per_type).cuda() - return { + edgelist_dict = { "src": edge_index[0], "dst": edge_index[1], "etp": edge_type_array, "eid": edge_id_array, } + if weights is not None: + edgelist_dict["wgt"] = weights + + return edgelist_dict + @property def is_homogeneous(self): return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <= 1 @@ -507,7 +516,9 @@ def _resource_handle(self): return self.__handle def _graph( - self, direction: str + self, + direction: str, + prob_attr: Optional[str] = None, ) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: """ Gets the pylibcugraph Graph object with edges pointing in the given direction @@ -521,12 +532,16 @@ def _graph( is_multigraph=True, is_symmetric=False ) - if self.__graph is not None and self.__graph[1] != direction: - self.__graph = None + if self.__graph is not None: + if ( + self.__graph["direction"] != direction + or self.__graph["prob_attr"] != prob_attr + ): + self.__graph = None if self.__graph is None: src_col, dst_col = ("src", "dst") if direction == "out" else ("dst", "src") - edgelist_dict = self.__get_edgelist() + edgelist_dict = self.__get_edgelist(prob_attr=prob_attr) if self.is_multi_gpu: rank = torch.distributed.get_rank() @@ -535,40 +550,42 @@ def _graph( vertices_array = cupy.arange(self.num_nodes(), dtype="int64") vertices_array = cupy.array_split(vertices_array, world_size)[rank] - self.__graph = ( - pylibcugraph.MGGraph( - self._resource_handle, - graph_properties, - [cupy.asarray(edgelist_dict[src_col]).astype("int64")], - [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], - vertices_array=[vertices_array], - edge_id_array=[cupy.asarray(edgelist_dict["eid"])], - edge_type_array=[cupy.asarray(edgelist_dict["etp"])], - ), - direction, + graph = pylibcugraph.MGGraph( + self._resource_handle, + graph_properties, + [cupy.asarray(edgelist_dict[src_col]).astype("int64")], + [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], + vertices_array=[vertices_array], + edge_id_array=[cupy.asarray(edgelist_dict["eid"])], + edge_type_array=[cupy.asarray(edgelist_dict["etp"])], + weight_array=[cupy.asarray(edgelist_dict["wgt"])] + if "wgt" in edgelist_dict + else None, ) else: - self.__graph = ( - pylibcugraph.SGGraph( - self._resource_handle, - graph_properties, - cupy.asarray(edgelist_dict[src_col]).astype("int64"), - cupy.asarray(edgelist_dict[dst_col]).astype("int64"), - vertices_array=cupy.arange(self.num_nodes(), dtype="int64"), - edge_id_array=cupy.asarray(edgelist_dict["eid"]), - edge_type_array=cupy.asarray(edgelist_dict["etp"]), - ), - direction, + graph = pylibcugraph.SGGraph( + self._resource_handle, + graph_properties, + cupy.asarray(edgelist_dict[src_col]).astype("int64"), + cupy.asarray(edgelist_dict[dst_col]).astype("int64"), + vertices_array=cupy.arange(self.num_nodes(), dtype="int64"), + edge_id_array=cupy.asarray(edgelist_dict["eid"]), + edge_type_array=cupy.asarray(edgelist_dict["etp"]), + weight_array=cupy.asarray(edgelist_dict["wgt"]) + if "wgt" in edgelist_dict + else None, ) - return self.__graph[0] + self.__graph = {"graph": graph, "direction": direction, "prob_attr": prob_attr} + + return self.__graph["graph"] def _has_n_emb(self, ntype: str, emb_name: str) -> bool: return (ntype, emb_name) in self.__ndata_storage def _get_n_emb( - self, ntype: str, emb_name: str, u: Union[str, TensorType] - ) -> "torch.Tensor": + self, ntype: Union[str, None], emb_name: str, u: Union[str, TensorType] + ) -> Union["torch.Tensor", "EmbeddingView"]: """ Gets the embedding of 
a single node type. Unlike DGL, this function takes the string node @@ -583,11 +600,11 @@ def _get_n_emb( u: Union[str, TensorType] Nodes to get the representation of, or ALL to get the representation of all nodes of - the given type. + the given type (returns embedding view). Returns ------- - torch.Tensor + Union[torch.Tensor, cugraph_dgl.view.EmbeddingView] The embedding of the given node type with the given embedding name. """ @@ -598,7 +615,9 @@ def _get_n_emb( raise ValueError("Must provide the node type for a heterogeneous graph") if dgl.base.is_all(u): - u = torch.arange(self.num_nodes(ntype), dtype=self.idtype, device="cpu") + return EmbeddingView( + self.__ndata_storage[ntype, emb_name], self.num_nodes(ntype) + ) try: return self.__ndata_storage[ntype, emb_name].fetch( @@ -644,7 +663,9 @@ def _get_e_emb( etype = self.to_canonical_etype(etype) if dgl.base.is_all(u): - u = torch.arange(self.num_edges(etype), dtype=self.idtype, device="cpu") + return EmbeddingView( + self.__edata_storage[etype, emb_name], self.num_edges(etype) + ) try: return self.__edata_storage[etype, emb_name].fetch( diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index d2460f814c9..fcd5a26aee6 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -129,7 +129,7 @@ def __init__( if csrc_ids is not None: if csrc_ids.numel() != self._num_src_nodes + 1: raise RuntimeError( - f"Size mismatch for 'csrc_ids': expected ({size[0]+1},), " + f"Size mismatch for 'csrc_ids': expected ({size[0] + 1},), " f"but got {tuple(csrc_ids.size())}" ) csrc_ids = csrc_ids.contiguous() @@ -137,7 +137,7 @@ def __init__( if cdst_ids is not None: if cdst_ids.numel() != self._num_dst_nodes + 1: raise RuntimeError( - f"Size mismatch for 'cdst_ids': expected ({size[1]+1},), " + f"Size mismatch for 'cdst_ids': expected ({size[1] + 1},), " f"but got {tuple(cdst_ids.size())}" ) cdst_ids = cdst_ids.contiguous() diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py index ef47875463d..419ec7790a9 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
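With the change above, asking for ALL rows via ndata/edata no longer materializes every embedding; it returns a lazy EmbeddingView over the feature storage. A minimal usage sketch, assuming `g` is an already-populated homogeneous cugraph_dgl.Graph with an embedding named "feat" (the names here are illustrative, not new API):

    import torch

    view = g.ndata["feat"]  # now an EmbeddingView; no rows are fetched yet
    print(view.shape)  # (num_nodes, dim); the leading dim is the node count

    rows = view[torch.tensor([0, 1, 2])]  # fetches only these rows, on GPU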
+ import cugraph_dgl.dataloading import pytest @@ -48,9 +49,12 @@ def test_dataloader_basic_homogeneous(): assert len(out_t) <= 2 -def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): +def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1, prob_attr=None): # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler(fanouts) + sampler = dgl.dataloading.NeighborSampler( + fanouts, + prob=prob_attr, + ) dataloader = dgl.dataloading.DataLoader( g, train_nid, @@ -71,8 +75,13 @@ def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): return dgl_output -def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) +def sample_cugraph_dgl_graphs( + cugraph_g, train_nid, fanouts, batch_size=1, prob_attr=None +): + sampler = cugraph_dgl.dataloading.NeighborSampler( + fanouts, + prob=prob_attr, + ) dataloader = cugraph_dgl.dataloading.FutureDataLoader( cugraph_g, @@ -126,3 +135,41 @@ def test_same_homogeneousgraph_results(ix, batch_size): dgl_output[0]["blocks"][0].num_edges() == cugraph_output[0]["blocks"][0].num_edges() ) + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +def test_dataloader_biased_homogeneous(): + src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + wgt = torch.tensor([1, 1, 2, 0, 0, 0, 2, 1], dtype=torch.float32) + + train_nid = torch.tensor([0, 1]) + # Create a simple homogeneous graph. + dgl_g = dgl.graph((src, dst)) + dgl_g.edata["wgt"] = wgt + + cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False) + cugraph_g.add_nodes(9) + cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt}) + + dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt") + cugraph_output = sample_cugraph_dgl_graphs( + cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt" + ) + + cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() + dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() + + np.testing.assert_array_equal( + np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) + ) + assert ( + dgl_output[0]["blocks"][0].num_dst_nodes() + == cugraph_output[0]["blocks"][0].num_dst_nodes() + ) + assert ( + dgl_output[0]["blocks"][0].num_edges() + == cugraph_output[0]["blocks"][0].num_edges() + ) + assert 5 == cugraph_output[0]["blocks"][0].num_edges() diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py index b32233f16a6..061f4fa2077 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py @@ -82,9 +82,18 @@ def test_dataloader_basic_homogeneous(): ) -def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): +def sample_dgl_graphs( + g, + train_nid, + fanouts, + batch_size=1, + prob_attr=None, +): # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler(fanouts) + sampler = dgl.dataloading.NeighborSampler( + fanouts, + prob=prob_attr, + ) dataloader = dgl.dataloading.DataLoader( g, train_nid, @@ -105,8 +114,17 @@ def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): return dgl_output -def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) +def 
sample_cugraph_dgl_graphs( + cugraph_g, + train_nid, + fanouts, + batch_size=1, + prob_attr=None, +): + sampler = cugraph_dgl.dataloading.NeighborSampler( + fanouts, + prob=prob_attr, + ) dataloader = cugraph_dgl.dataloading.FutureDataLoader( cugraph_g, @@ -179,3 +197,58 @@ def test_same_homogeneousgraph_results_mg(ix, batch_size): args=(world_size, uid, ix, batch_size), nprocs=world_size, ) + + +def run_test_dataloader_biased_homogeneous(rank, world_size, uid): + init_pytorch_worker(rank, world_size, uid, True) + + src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + (rank * 9) + dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + (rank * 9) + wgt = torch.tensor( + [0.1, 0.1, 0.2, 0, 0, 0, 0.2, 0.1] * world_size, dtype=torch.float32 + ) + + train_nid = torch.tensor([0, 1]) + (rank * 9) + # Create a simple homogeneous graph on each rank. + dgl_g = dgl.graph((src, dst)) + dgl_g.edata["wgt"] = wgt[:8] + + cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True) + cugraph_g.add_nodes(9 * world_size) + cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt}) + + dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt") + cugraph_output = sample_cugraph_dgl_graphs( + cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt" + ) + + cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() + dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() + + np.testing.assert_array_equal( + np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) + ) + assert ( + dgl_output[0]["blocks"][0].num_dst_nodes() + == cugraph_output[0]["blocks"][0].num_dst_nodes() + ) + assert ( + dgl_output[0]["blocks"][0].num_edges() + == cugraph_output[0]["blocks"][0].num_edges() + ) + + assert 5 == cugraph_output[0]["blocks"][0].num_edges() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +def test_dataloader_biased_homogeneous_mg(): + uid = cugraph_comms_create_unique_id() + # Run the test with all available GPUs + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_dataloader_biased_homogeneous, + args=(world_size, uid), + nprocs=world_size, + ) diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py index dbc53e73b6a..4de9406be07 100644 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -12,6 +12,8 @@ # limitations under the License.
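The two tests above exercise the new `prob_attr` plumbing end to end. A minimal single-GPU usage sketch, mirroring the SG test and using only API that this diff adds:

    import torch
    import cugraph_dgl
    import cugraph_dgl.dataloading

    g = cugraph_dgl.Graph(is_multi_gpu=False)
    g.add_nodes(9)
    g.add_edges(
        u=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]),
        v=torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]),
        data={"wgt": torch.tensor([1, 1, 2, 0, 0, 0, 2, 1], dtype=torch.float32)},
    )

    # Zero-weight edges are never drawn; the rest are sampled in proportion
    # to "wgt", matching dgl.dataloading.NeighborSampler(fanouts, prob=...).
    sampler = cugraph_dgl.dataloading.NeighborSampler([4], prob="wgt")
    loader = cugraph_dgl.dataloading.FutureDataLoader(
        g, torch.tensor([0, 1]), sampler, batch_size=2
    )
    for input_nodes, output_nodes, blocks in loader:
        pass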
+import warnings + from collections import defaultdict from collections.abc import MutableMapping from typing import Union, Dict, List, Tuple @@ -20,11 +22,45 @@ import cugraph_dgl from cugraph_dgl.typing import TensorType +from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor torch = import_optional("torch") dgl = import_optional("dgl") +class EmbeddingView: + def __init__(self, storage: "dgl.storages.base.FeatureStorage", ld: int): + self.__ld = ld + self.__storage = storage + + def __getitem__(self, u: TensorType) -> "torch.Tensor": + u = _cast_to_torch_tensor(u) + try: + return self.__storage.fetch( + u, + "cuda", + ) + except RuntimeError as ex: + warnings.warn( + "Got error accessing data, trying again with index on device: " + + str(ex) + ) + return self.__storage.fetch( + u.cuda(), + "cuda", + ) + + @property + def shape(self) -> "torch.Size": + try: + f = self.__storage.fetch(torch.tensor([0]), "cpu") + except RuntimeError: + f = self.__storage.fetch(torch.tensor([0], device="cuda"), "cuda") + sz = [s for s in f.shape] + sz[0] = self.__ld + return torch.Size(tuple(sz)) + + class HeteroEdgeDataView(MutableMapping): """ Duck-typed version of DGL's HeteroEdgeDataView. diff --git a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py new file mode 100644 index 00000000000..0481f9566bc --- /dev/null +++ b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
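EmbeddingView.shape above learns the row layout by fetching a single row and then swapping in the stored row count as the leading dimension. The same pattern in isolation, with a plain tensor standing in for the wrapped FeatureStorage (toy code, not part of the library):

    import torch

    class TensorBackedView:
        """Toy stand-in: a view over `table` that reports `num_rows` rows."""

        def __init__(self, table: torch.Tensor, num_rows: int):
            self._table = table
            self._num_rows = num_rows

        @property
        def shape(self) -> torch.Size:
            probe = self._table[torch.tensor([0])]  # fetch one row to learn dims
            sz = list(probe.shape)
            sz[0] = self._num_rows  # report the logical leading dimension
            return torch.Size(sz)

    view = TensorBackedView(torch.zeros(4, 16), num_rows=100)
    assert view.shape == torch.Size([100, 16])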
+ + +# Example modified from: +# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py + +# Ignore Warning +import warnings +import time +import cugraph_dgl +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchmetrics.functional as MF +import dgl +import dgl.nn as dglnn +from dgl.data import AsNodePredDataset +from dgl.dataloading import ( + DataLoader, + NeighborSampler, + MultiLayerFullNeighborSampler, +) +from ogb.nodeproppred import DglNodePropPredDataset +import tqdm +import argparse + +warnings.filterwarnings("ignore") + + +def set_allocators(): + import rmm + import cudf + import cupy + from rmm.allocators.torch import rmm_torch_allocator + from rmm.allocators.cupy import rmm_cupy_allocator + + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) + cudf.set_option("spill", True) + + +class SAGE(nn.Module): + def __init__(self, in_size, hid_size, out_size): + super().__init__() + self.layers = nn.ModuleList() + # three-layer GraphSAGE-mean + self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean")) + self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean")) + self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean")) + self.dropout = nn.Dropout(0.5) + self.hid_size = hid_size + self.out_size = out_size + + def forward(self, blocks, x): + h = x + for l_id, (layer, block) in enumerate(zip(self.layers, blocks)): + h = layer(block, h) + if l_id != len(self.layers) - 1: + h = F.relu(h) + h = self.dropout(h) + return h + + def inference(self, g, device, batch_size): + """Conduct layer-wise inference to get all the node embeddings.""" + all_node_ids = torch.arange(0, g.num_nodes()).to(device) + feat = g.get_node_storage(key="feat", ntype="_N").fetch( + all_node_ids, device=device + ) + + sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) + dataloader = DataLoader( + g, + torch.arange(g.num_nodes()).to(g.device), + sampler, + device=device, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=0, + ) + buffer_device = torch.device("cpu") + pin_memory = buffer_device != device + + for l_id, layer in enumerate(self.layers): + y = torch.empty( + g.num_nodes(), + self.hid_size if l_id != len(self.layers) - 1 else self.out_size, + device=buffer_device, + pin_memory=pin_memory, + ) + feat = feat.to(device) + for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): + x = feat[input_nodes] + h = layer(blocks[0], x) # len(blocks) = 1 + if l_id != len(self.layers) - 1: + h = F.relu(h) + h = self.dropout(h) + # by design, our output nodes are contiguous + y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device) + feat = y + return y + + +def evaluate(model, graph, dataloader): + model.eval() + ys = [] + y_hats = [] + for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + with torch.no_grad(): + if isinstance(graph.ndata["feat"], dict): + x = graph.ndata["feat"]["_N"][input_nodes] + label = graph.ndata["label"]["_N"][output_nodes] + else: + x = graph.ndata["feat"][input_nodes] + label = graph.ndata["label"][output_nodes] + ys.append(label) + y_hats.append(model(blocks, x)) + num_classes = y_hats[0].shape[1] + return MF.accuracy( + torch.cat(y_hats), + torch.cat(ys), + task="multiclass", + num_classes=num_classes, + ) + + +def layerwise_infer(device, graph, nid, model, batch_size): + model.eval() + with 
torch.no_grad(): + pred = model.inference(graph, device, batch_size) # pred in buffer_device + pred = pred[nid] + label = graph.ndata["label"] + if isinstance(label, dict): + label = label["_N"] + label = label[nid].to(device).to(pred.device) + num_classes = pred.shape[1] + return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) + + +def train(args, device, g, dataset, model): + # create sampler & dataloader + train_idx = dataset.train_idx.to(device) + val_idx = dataset.val_idx.to(device) + + use_uva = args.mode == "mixed" + batch_size = 1024 + fanouts = [5, 10, 15] + sampler = NeighborSampler(fanouts) + train_dataloader = DataLoader( + g, + train_idx, + sampler, + device=device, + batch_size=batch_size, + shuffle=True, + drop_last=False, + num_workers=0, + use_uva=use_uva, + ) + val_dataloader = DataLoader( + g, + val_idx, + sampler, + device=device, + batch_size=batch_size, + shuffle=True, + drop_last=False, + num_workers=0, + use_uva=use_uva, + ) + + opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4) + + for epoch in range(10): + model.train() + total_loss = 0 + st = time.time() + for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): + if isinstance(g.ndata["feat"], dict): + x = g.ndata["feat"]["_N"][input_nodes] + y = g.ndata["label"]["_N"][output_nodes] + else: + x = g.ndata["feat"][input_nodes] + y = g.ndata["label"][output_nodes] + + y_hat = model(blocks, x) + loss = F.cross_entropy(y_hat, y) + opt.zero_grad() + loss.backward() + opt.step() + total_loss += loss.item() + + et = time.time() + + print( + f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s" + ) + acc = evaluate(model, g, val_dataloader) + print( + "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format( + epoch, total_loss / (it + 1), acc.item() + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + default="gpu_cugraph_dgl", + choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"], + help="Training mode." 
+ " 'cpu' for CPU training," + " 'mixed' for CPU-GPU mixed training, " + " 'gpu_dgl' for pure-GPU training, " + " 'gpu_cugraph_dgl' for pure-GPU training.", + ) + args = parser.parse_args() + if not torch.cuda.is_available(): + args.mode = "cpu" + if args.mode == "gpu_cugraph_dgl": + set_allocators() + print(f"Training in {args.mode} mode.") + + # load and preprocess dataset + print("Loading data") + dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products")) + g = dataset[0] + g = dgl.add_self_loop(g) + if args.mode == "gpu_cugraph_dgl": + g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda")) + del dataset.g + + else: + g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu") + device = torch.device( + "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda" + ) + + # create GraphSAGE model + feat_shape = ( + g.get_node_storage(key="feat", ntype="_N") + .fetch(torch.LongTensor([0]).to(device), device=device) + .shape[1] + ) + print(feat_shape) + # no ndata in cugraph storage object + in_size = feat_shape + out_size = dataset.num_classes + model = SAGE(in_size, 256, out_size).to(device) + + # model training + print("Training...") + train(args, device, g, dataset, model) + + # test the model + print("Testing...") + acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096) + print("Test Accuracy {:.4f}".format(acc.item())) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py index 539fd86d136..56ac41c09b4 100644 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ b/python/cugraph-dgl/examples/graphsage/node-classification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -17,8 +17,10 @@ # Ignore Warning import warnings +import tempfile import time import cugraph_dgl +import cugraph_dgl.dataloading import torch import torch.nn as nn import torch.nn.functional as F @@ -76,14 +78,17 @@ def forward(self, blocks, x): def inference(self, g, device, batch_size): """Conduct layer-wise inference to get all the node embeddings.""" all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.get_node_storage(key="feat", ntype="_N").fetch( - all_node_ids, device=device - ) + feat = g.ndata["feat"][all_node_ids].to(device) - sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) - dataloader = DataLoader( + if isinstance(g, cugraph_dgl.Graph): + sampler = cugraph_dgl.dataloading.NeighborSampler([-1]) + loader_cls = cugraph_dgl.dataloading.FutureDataLoader + else: + sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) + loader_cls = DataLoader + dataloader = loader_cls( g, - torch.arange(g.num_nodes()).to(g.device), + torch.arange(g.num_nodes()).to(device), sampler, device=device, batch_size=batch_size, @@ -150,7 +155,7 @@ def layerwise_infer(device, graph, nid, model, batch_size): return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) -def train(args, device, g, dataset, model): +def train(args, device, g, dataset, model, directory): # create sampler & dataloader train_idx = dataset.train_idx.to(device) val_idx = dataset.val_idx.to(device) @@ -158,8 +163,13 @@ def train(args, device, g, dataset, model): use_uva = args.mode == "mixed" batch_size = 1024 fanouts = [5, 10, 15] - sampler = NeighborSampler(fanouts) - train_dataloader = DataLoader( + if isinstance(g, cugraph_dgl.Graph): + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts, directory=directory) + loader_cls = cugraph_dgl.dataloading.FutureDataLoader + else: + sampler = NeighborSampler(fanouts) + loader_cls = DataLoader + train_dataloader = loader_cls( g, train_idx, sampler, @@ -170,7 +180,7 @@ def train(args, device, g, dataset, model): num_workers=0, use_uva=use_uva, ) - val_dataloader = DataLoader( + val_dataloader = loader_cls( g, val_idx, sampler, @@ -195,6 +205,7 @@ def train(args, device, g, dataset, model): else: x = g.ndata["feat"][input_nodes] y = g.ndata["label"][output_nodes] + y_hat = model(blocks, x) loss = F.cross_entropy(y_hat, y) opt.zero_grad() @@ -204,7 +215,9 @@ def train(args, device, g, dataset, model): et = time.time() - print(f"Time taken for epoch {epoch} with batch_size {batch_size} = {et-st} s") + print( + f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s" + ) acc = evaluate(model, g, val_dataloader) print( "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format( @@ -225,6 +238,8 @@ def train(args, device, g, dataset, model): " 'gpu_dgl' for pure-GPU training, " " 'gpu_cugraph_dgl' for pure-GPU training.", ) + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--tempdir_root", type=str, default=None) args = parser.parse_args() if not torch.cuda.is_available(): args.mode = "cpu" @@ -234,11 +249,13 @@ def train(args, device, g, dataset, model): # load and preprocess dataset print("Loading data") - dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products")) + dataset = AsNodePredDataset( + DglNodePropPredDataset("ogbn-products", root=args.dataset_root) + ) g = dataset[0] g = dgl.add_self_loop(g) if args.mode == "gpu_cugraph_dgl": - g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda")) + g = 
cugraph_dgl.cugraph_dgl_graph_from_heterograph(g.to("cuda")) del dataset.g else: @@ -248,19 +265,17 @@ def train(args, device, g, dataset, model): ) # create GraphSAGE model - feat_shape = ( - g.get_node_storage(key="feat", ntype="_N") - .fetch(torch.LongTensor([0]).to(device), device=device) - .shape[1] - ) - # no ndata in cugraph storage object + feat_shape = g.ndata["feat"].shape[1] + print(feat_shape) + in_size = feat_shape out_size = dataset.num_classes model = SAGE(in_size, 256, out_size).to(device) # model training print("Training...") - train(args, device, g, dataset, model) + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: + train(args, device, g, dataset, model, directory) # test the model print("Testing...") diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py index a6f771e4b51..3e0c0454905 100644 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -58,9 +58,8 @@ def inference(self, g, batch_size, device): # The nodes on each layer are of course split in batches. all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.get_node_storage(key="feat", ntype="_N").fetch( - all_node_ids, device=device - ) + feat = g.ndata["feat"][all_node_ids].to(device) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler( 1, prefetch_node_feats=["feat"] ) @@ -114,15 +113,13 @@ def layerwise_infer(graph, nid, model, batch_size, device): def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid): - g.ndata["feat"]["_N"] = g.ndata["feat"]["_N"].to("cuda") - g.ndata["label"]["_N"] = g.ndata["label"]["_N"].to("cuda") st = time.time() model.train() for epoch in range(num_epochs): total_loss = 0 for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - x = g.ndata["feat"]["_N"][input_nodes] - y = g.ndata["label"]["_N"][output_nodes] + x = g.ndata["feat"][input_nodes].to(torch.float32) + y = g.ndata["label"][output_nodes].to(torch.int64) y_hat = model(blocks, x) y = y.squeeze(1) loss = F.cross_entropy(y_hat, y) @@ -137,7 +134,7 @@ def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid): et = time.time() print( f"Total time taken for num_epochs {num_epochs} " - f"with batch_size {train_dataloader._batch_size} = {et-st} s on rank ={rank}" + f"with batch_size {train_dataloader._batch_size} = {et - st} s on rank = {rank}" ) if rank == 0: val_acc = layerwise_infer(g, val_nid, model, 1024 * 5, "cuda") diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py deleted file mode 100644 index 474f17dc2bb..00000000000 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dgl -import torch -import time -from distributed import Client, Event as Dask_Event -import tempfile -from cugraph.dask.comms import comms as Comms - - -def enable_spilling(): - import cudf - - cudf.set_option("spill", True) - - -def setup_cluster(dask_worker_devices): - dask_worker_devices_str = ",".join([str(i) for i in dask_worker_devices]) - from dask_cuda import LocalCUDACluster - - cluster = LocalCUDACluster( - protocol="tcp", - CUDA_VISIBLE_DEVICES=dask_worker_devices_str, - rmm_pool_size="25GB", - ) - - client = Client(cluster) - client.wait_for_workers(n_workers=len(dask_worker_devices)) - client.run(enable_spilling) - print("Dask Cluster Setup Complete") - del client - return cluster - - -def create_dask_client(scheduler_address): - from cugraph.dask.comms import comms as Comms - - client = Client(scheduler_address) - Comms.initialize(p2p=True) - return client - - -def initalize_pytorch_worker(dev_id): - import cupy as cp - import rmm - from rmm.allocators.torch import rmm_torch_allocator - from rmm.allocators.cupy import rmm_cupy_allocator - - dev = cp.cuda.Device( - dev_id - ) # Create cuda context on the right gpu, defaults to gpu-0 - dev.use() - rmm.reinitialize( - pool_allocator=True, - initial_pool_size=10e9, - maximum_pool_size=15e9, - devices=[dev_id], - ) - - if dev_id == 0: - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - torch.cuda.set_device(dev_id) - cp.cuda.set_allocator(rmm_cupy_allocator) - enable_spilling() - print("device_id", dev_id, flush=True) - - -def load_dgl_dataset(dataset_name="ogbn-products"): - from ogb.nodeproppred import DglNodePropPredDataset - - dataset = DglNodePropPredDataset(name=dataset_name) - split_idx = dataset.get_idx_split() - train_idx, valid_idx, test_idx = ( - split_idx["train"], - split_idx["valid"], - split_idx["test"], - ) - g, label = dataset[0] - g.ndata["label"] = label - if len(g.etypes) <= 1: - g = dgl.add_self_loop(g) - else: - for etype in g.etypes: - if etype[0] == etype[2]: - # only add self loops for src->dst - g = dgl.add_self_loop(g, etype=etype) - - g = g.int() - train_idx = train_idx.int() - valid_idx = valid_idx.int() - test_idx = test_idx.int() - return g, train_idx, valid_idx, test_idx, dataset.num_classes - - -def create_cugraph_graphstore_from_dgl_dataset( - dataset_name="ogbn-products", single_gpu=False -): - from cugraph_dgl import cugraph_storage_from_heterograph - - dgl_g, train_idx, valid_idx, test_idx, num_classes = load_dgl_dataset(dataset_name) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - return cugraph_gs, train_idx, valid_idx, test_idx, num_classes - - -def create_dataloader(gs, train_idx, device): - import cugraph_dgl - - temp_dir = tempfile.TemporaryDirectory() - sampler = cugraph_dgl.dataloading.NeighborSampler([10, 20]) - dataloader = cugraph_dgl.dataloading.DataLoader( - gs, - train_idx, - sampler, - sampling_output_dir=temp_dir.name, - batches_per_partition=10, - device=device, # Put the sampled MFGs on CPU or GPU - use_ddp=True, # Make it work with distributed data parallel - batch_size=1024, - shuffle=False, # Whether to 
shuffle the nodes for every epoch - drop_last=False, - num_workers=0, - ) - return dataloader - - -def run_workflow(rank, devices, scheduler_address): - from model import Sage, train_model - - # Below sets gpu_number - dev_id = devices[rank] - initalize_pytorch_worker(dev_id) - device = torch.device(f"cuda:{dev_id}") - # cugraph dask client initialization - client = create_dask_client(scheduler_address) - - # Pytorch training worker initialization - dist_init_method = "tcp://{master_ip}:{master_port}".format( - master_ip="127.0.0.1", master_port="12346" - ) - - torch.distributed.init_process_group( - backend="nccl", - init_method=dist_init_method, - world_size=len(devices), - rank=rank, - ) - - print(f"rank {rank}.", flush=True) - print("Initalized across GPUs.") - - event = Dask_Event("cugraph_gs_creation_event") - if rank == 0: - ( - gs, - train_idx, - valid_idx, - test_idx, - num_classes, - ) = create_cugraph_graphstore_from_dgl_dataset( - "ogbn-products", single_gpu=False - ) - client.publish_dataset(cugraph_gs=gs) - client.publish_dataset(train_idx=train_idx) - client.publish_dataset(valid_idx=valid_idx) - client.publish_dataset(test_idx=test_idx) - client.publish_dataset(num_classes=num_classes) - event.set() - else: - if event.wait(timeout=1000): - gs = client.get_dataset("cugraph_gs") - train_idx = client.get_dataset("train_idx") - valid_idx = client.get_dataset("valid_idx") - test_idx = client.get_dataset("test_idx") - num_classes = client.get_dataset("num_classes") - else: - raise RuntimeError(f"Fetch cugraph_gs to worker_id {rank} failed") - - torch.distributed.barrier() - print(f"Loading cugraph_store to worker {rank} is complete", flush=True) - dataloader = create_dataloader(gs, train_idx, device) - print("Data Loading Complete", flush=True) - num_feats = gs.ndata["feat"]["_N"].shape[1] - hid_size = 256 - # Load Training example - model = Sage(num_feats, hid_size, num_classes).to(device) - model = torch.nn.parallel.DistributedDataParallel( - model, - device_ids=[device], - output_device=device, - ) - torch.distributed.barrier() - n_epochs = 10 - total_st = time.time() - opt = torch.optim.Adam(model.parameters(), lr=0.01) - train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx) - torch.distributed.barrier() - total_et = time.time() - print( - f"Total time taken on n_epochs {n_epochs} = {total_et-total_st} s", - f"measured by worker = {rank}", - ) - - # cleanup dask cluster - if rank == 0: - client.unpublish_dataset("cugraph_gs") - client.unpublish_dataset("train_idx") - client.unpublish_dataset("valid_idx") - client.unpublish_dataset("test_idx") - event.clear() - print("Workflow completed") - print("---" * 10) - Comms.destroy() - - -if __name__ == "__main__": - # Load dummy first - # because new environments - # require dataset download - load_dgl_dataset() - dask_worker_devices = [5, 6] - cluster = setup_cluster(dask_worker_devices) - - trainer_devices = [0, 1, 2] - import torch.multiprocessing as mp - - mp.spawn( - run_workflow, - args=(trainer_devices, cluster.scheduler_address), - nprocs=len(trainer_devices), - ) - Comms.destroy() - cluster.close() diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py new file mode 100644 index 00000000000..11afe466014 --- /dev/null +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py @@ -0,0 +1,311 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dgl +import torch +import time +import tempfile +import argparse +import json +import os +import warnings + +from datetime import timedelta + +import cugraph_dgl + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, +) + +from pylibwholegraph.torch.initialize import ( + init as wm_init, + finalize as wm_finalize, +) + +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory +os.environ["CUDF_SPILL"] = "1" + + +def init_ddp_worker(global_rank, local_rank, world_size, cugraph_id): + import rmm + + rmm.reinitialize( + devices=local_rank, + managed_memory=True, + pool_allocator=True, + ) + + import cupy + + cupy.cuda.Device(local_rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + torch.cuda.set_device(local_rank) + + cugraph_comms_init( + rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank + ) + + wm_init(global_rank, world_size, local_rank, torch.cuda.device_count()) + + +def load_dgl_dataset(dataset_root="dataset", dataset_name="ogbn-products"): + from ogb.nodeproppred import DglNodePropPredDataset + + dataset = DglNodePropPredDataset(root=dataset_root, name=dataset_name) + split_idx = dataset.get_idx_split() + train_idx, valid_idx, test_idx = ( + split_idx["train"], + split_idx["valid"], + split_idx["test"], + ) + g, label = dataset[0] + g.ndata["label"] = label + if len(g.etypes) <= 1: + g = dgl.add_self_loop(g) + else: + for etype in g.etypes: + if etype[0] == etype[2]: + # only add self loops for src->dst + g = dgl.add_self_loop(g, etype=etype) + + g = g.int() + idx = { + "train": train_idx.int(), + "valid": valid_idx.int(), + "test": test_idx.int(), + } + + return g, idx, dataset.num_classes + + +def partition_data( + g, split_idx, num_classes, edge_path, feature_path, label_path, meta_path +): + # Split and save edge index + os.makedirs( + edge_path, + exist_ok=True, + ) + src, dst = g.all_edges(form="uv", order="eid") + edge_index = torch.stack([src, dst]) + for (r, e) in enumerate(torch.tensor_split(edge_index, world_size, dim=1)): + rank_path = os.path.join(edge_path, f"rank={r}.pt") + torch.save( + e.clone(), + rank_path, + ) + + # Split and save features + os.makedirs( + feature_path, + exist_ok=True, + ) + + nix = torch.arange(g.num_nodes()) + for (r, f) in enumerate(torch.tensor_split(nix, world_size)): + feat_path = os.path.join(feature_path, f"rank={r}_feat.pt") + torch.save(g.ndata["feat"][f], feat_path) + + label_f_path = os.path.join(feature_path, f"rank={r}_label.pt") + torch.save(g.ndata["label"][f], label_f_path) + + # Split and save labels + os.makedirs( + label_path, + exist_ok=True, + ) + for (d, i) in split_idx.items(): + i_parts = torch.tensor_split(i, world_size) + for r, i_part in 
enumerate(i_parts): + rank_path = os.path.join(label_path, f"rank={r}") + os.makedirs(rank_path, exist_ok=True) + torch.save(i_part, os.path.join(rank_path, f"{d}.pt")) + + # Save metadata + meta = { + "num_classes": int(num_classes), + "num_nodes": int(g.num_nodes()), + } + with open(meta_path, "w") as f: + json.dump(meta, f) + + +def load_partitioned_data(rank, edge_path, feature_path, label_path, meta_path): + g = cugraph_dgl.Graph( + is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" + ) + + # Load metadata + with open(meta_path, "r") as f: + meta = json.load(f) + + # Load labels + split_idx = {} + for split in ["train", "test", "valid"]: + split_idx[split] = torch.load( + os.path.join(label_path, f"rank={rank}", f"{split}.pt") + ) + + # Load features + feat_t = torch.load(os.path.join(feature_path, f"rank={rank}_feat.pt")) + label_f_t = torch.load(os.path.join(feature_path, f"rank={rank}_label.pt")) + ndata = {"feat": feat_t, "label": label_f_t} + g.add_nodes(meta["num_nodes"], data=ndata) + + # Load edge index + src, dst = torch.load(os.path.join(edge_path, f"rank={rank}.pt")) + g.add_edges(src.cuda(), dst.cuda(), data=None) + + return g, split_idx, meta["num_classes"] + + +def create_dataloader(gs, train_idx, device, temp_dir, stage): + import cugraph_dgl + + temp_path = os.path.join(temp_dir, f"{stage}_{device}") + os.mkdir(temp_path) + + sampler = cugraph_dgl.dataloading.NeighborSampler( + [10, 20], + directory=temp_path, + batches_per_partition=10, + ) + + dataloader = cugraph_dgl.dataloading.FutureDataLoader( + gs, + train_idx, + sampler, + device=device, # Put the sampled MFGs on CPU or GPU + use_ddp=True, # Make it work with distributed data parallel + batch_size=1024, + shuffle=False, # Whether to shuffle the nodes for every epoch + drop_last=False, + num_workers=0, + ) + return dataloader + + +def run_workflow( + global_rank, local_rank, world_size, g, split_idx, num_classes, temp_dir +): + from model import Sage, train_model + + # Below sets gpu_number + dev_id = local_rank + device = torch.device(f"cuda:{dev_id}") + + dataloader = create_dataloader(g, split_idx["train"], device, temp_dir, "train") + print("Dataloader Creation Complete", flush=True) + num_feats = g.ndata["feat"].shape[1] + hid_size = 256 + # Load Training example + model = Sage(num_feats, hid_size, num_classes).to(device) + model = torch.nn.parallel.DistributedDataParallel( + model, + device_ids=[device], + output_device=device, + ) + torch.distributed.barrier() + n_epochs = 10 + total_st = time.time() + opt = torch.optim.Adam(model.parameters(), lr=0.01) + train_model(model, g, opt, dataloader, n_epochs, global_rank, split_idx["valid"]) + torch.distributed.barrier() + total_et = time.time() + print( + f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s", + f"measured by worker = {global_rank}", + ) + + wm_finalize() + cugraph_comms_shutdown() + + +if __name__ == "__main__": + if "LOCAL_RANK" in os.environ: + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--tempdir_root", type=str, default=None) + parser.add_argument("--dataset", type=str, default="ogbn-products") + parser.add_argument("--skip_partition", action="store_true") + args = parser.parse_args() + + torch.distributed.init_process_group( + "nccl", + timeout=timedelta(minutes=60), + ) + world_size = torch.distributed.get_world_size() + global_rank = torch.distributed.get_rank() + local_rank = int(os.environ["LOCAL_RANK"]) + device = 
torch.device(local_rank) + + # Create the uid needed for cuGraph comms + if global_rank == 0: + cugraph_id = [cugraph_comms_create_unique_id()] + else: + cugraph_id = [None] + torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device) + cugraph_id = cugraph_id[0] + + init_ddp_worker(global_rank, local_rank, world_size, cugraph_id) + + # Split the data + edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part") + feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part") + label_path = os.path.join(args.dataset_root, args.dataset + "_label_part") + meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") + + if not args.skip_partition and global_rank == 0: + partition_data( + *load_dgl_dataset(args.dataset_root, args.dataset), + edge_path, + feature_path, + label_path, + meta_path, + ) + torch.distributed.barrier() + + print("loading partitions...") + g, split_idx, num_classes = load_partitioned_data( + rank=global_rank, + edge_path=edge_path, + feature_path=feature_path, + label_path=label_path, + meta_path=meta_path, + ) + print(f"rank {global_rank} has loaded its partition") + torch.distributed.barrier() + + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: + run_workflow( + global_rank, + local_rank, + world_size, + g, + split_idx, + num_classes, + directory, + ) + else: + warnings.warn("This script should be run with 'torchrun'. Exiting.") diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py new file mode 100644 index 00000000000..001d7fb82dc --- /dev/null +++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py @@ -0,0 +1,242 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
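In workflow_mnmg.py above, partition_data shards the edge list, features, and labels with torch.tensor_split and writes one file per rank, which load_partitioned_data later reads back by file name. The save/load round trip in isolation (directory name, tensors, and world size are illustrative):

    import os
    import torch

    world_size = 2
    edge_path = "ogbn-products_eix_part"  # illustrative directory
    os.makedirs(edge_path, exist_ok=True)

    # Save: each rank gets a contiguous slice of the edge index.
    edge_index = torch.stack([torch.arange(8), torch.arange(8).flip(0)])
    for r, e in enumerate(torch.tensor_split(edge_index, world_size, dim=1)):
        torch.save(e.clone(), os.path.join(edge_path, f"rank={r}.pt"))

    # Load: a rank reads only its own shard.
    rank = 0
    src, dst = torch.load(os.path.join(edge_path, f"rank={rank}.pt"))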
+ +import dgl +import torch +import time +import tempfile +import argparse +import os + +import cugraph_dgl + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, +) + +from pylibwholegraph.torch.initialize import ( + init as wm_init, + finalize as wm_finalize, +) + +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory +os.environ["CUDF_SPILL"] = "1" + + +def initalize_pytorch_worker(dev_id): + import cupy as cp + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + dev = cp.cuda.Device( + dev_id + ) # Create cuda context on the right gpu, defaults to gpu-0 + dev.use() + rmm.reinitialize( + pool_allocator=True, + initial_pool_size=10e9, + maximum_pool_size=15e9, + devices=[dev_id], + ) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + torch.cuda.set_device(dev_id) + cp.cuda.set_allocator(rmm_cupy_allocator) + print("device_id", dev_id, flush=True) + + +def load_dgl_dataset( + dataset_name="ogbn-products", + dataset_root=None, +): + from ogb.nodeproppred import DglNodePropPredDataset + + dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root) + split_idx = dataset.get_idx_split() + train_idx, valid_idx, test_idx = ( + split_idx["train"], + split_idx["valid"], + split_idx["test"], + ) + g, label = dataset[0] + g.ndata["label"] = label + if len(g.etypes) <= 1: + g = dgl.add_self_loop(g) + else: + for etype in g.etypes: + if etype[0] == etype[2]: + # only add self loops for src->dst + g = dgl.add_self_loop(g, etype=etype) + + g = g.int() + train_idx = train_idx.int() + valid_idx = valid_idx.int() + test_idx = test_idx.int() + return g, train_idx, valid_idx, test_idx, dataset.num_classes + + +def create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size): + (g, train_idx, valid_idx, test_idx, num_classes) = dataset + # Partition the data + cg = cugraph_dgl.Graph( + is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" + ) + + nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank] + ndata = {k: g.ndata[k][nix].cuda() for k in g.ndata.keys()} + + eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank] + src, dst = g.all_edges(form="uv", order="eid") + edata = {k: g.edata[k][eix].cuda() for k in g.edata.keys()} + + cg.add_nodes(g.num_nodes(), data=ndata) + cg.add_edges( + torch.tensor_split(src, world_size)[rank].cuda(), + torch.tensor_split(dst, world_size)[rank].cuda(), + data=edata, + ) + + return ( + cg, + torch.tensor_split(train_idx, world_size)[rank].to(torch.int64), + torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64), + torch.tensor_split(test_idx, world_size)[rank].to(torch.int64), + num_classes, + ) + + +def create_dataloader(gs, train_idx, device, temp_dir, stage): + import cugraph_dgl + + temp_path = os.path.join(temp_dir, f"{stage}_{device}") + os.mkdir(temp_path) + + sampler = cugraph_dgl.dataloading.NeighborSampler( + [10, 20], + directory=temp_path, + batches_per_partition=10, + ) + dataloader = cugraph_dgl.dataloading.FutureDataLoader( + gs, + train_idx, + sampler, + device=device, # Put the sampled MFGs on CPU or GPU + use_ddp=True, # Make it work with distributed data parallel + batch_size=1024, + shuffle=False, # Whether to shuffle the nodes for every epoch + drop_last=False, + num_workers=0, + ) + return dataloader + + +def run_workflow(rank, world_size, cugraph_id, dataset, 
temp_dir): + from model import Sage, train_model + + # Below sets gpu_number + dev_id = rank + initalize_pytorch_worker(dev_id) + device = torch.device(f"cuda:{dev_id}") + + # Pytorch training worker initialization + dist_init_method = "tcp://{master_ip}:{master_port}".format( + master_ip="127.0.0.1", master_port="12346" + ) + + torch.distributed.init_process_group( + backend="nccl", + init_method=dist_init_method, + world_size=world_size, + rank=rank, + ) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) + wm_init(rank, world_size, rank, world_size) + + print(f"rank {rank}.", flush=True) + print("Initalized across GPUs.") + + ( + gs, + train_idx, + valid_idx, + test_idx, + num_classes, + ) = create_cugraph_graphstore_from_dgl_dataset( + dataset, + rank, + world_size, + ) + del dataset + + torch.distributed.barrier() + print(f"Loading graph to worker {rank} is complete", flush=True) + + dataloader = create_dataloader(gs, train_idx, device, temp_dir, "train") + print("Dataloader Creation Complete", flush=True) + num_feats = gs.ndata["feat"].shape[1] + hid_size = 256 + # Load Training example + model = Sage(num_feats, hid_size, num_classes).to(device) + model = torch.nn.parallel.DistributedDataParallel( + model, + device_ids=[device], + output_device=device, + ) + torch.distributed.barrier() + n_epochs = 10 + total_st = time.time() + opt = torch.optim.Adam(model.parameters(), lr=0.01) + train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx) + torch.distributed.barrier() + total_et = time.time() + print( + f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s", + f"measured by worker = {rank}", + ) + + torch.cuda.synchronize() + wm_finalize() + cugraph_comms_shutdown() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--tempdir_root", type=str, default=None) + parser.add_argument("--dataset", type=str, default="ogbn-products") + args = parser.parse_args() + + from rmm.allocators.torch import rmm_torch_allocator + + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + # Create the uid needed for cuGraph comms + cugraph_id = cugraph_comms_create_unique_id() + + ds = load_dgl_dataset(args.dataset, args.dataset_root) + + world_size = torch.cuda.device_count() + + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: + torch.multiprocessing.spawn( + run_workflow, + args=(world_size, cugraph_id, ds, directory), + nprocs=world_size, + ) diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index ba2bb4bc170..e3e12216ac7 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -18,29 +18,29 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", ] dependencies = [ - "cugraph==24.10.*,>=0.0.0a0", + "cugraph==24.12.*,>=0.0.0a0", "numba>=0.57", - "numpy>=1.23,<2.0a0", - "pylibcugraphops==24.10.*,>=0.0.0a0", + "numpy>=1.23,<3.0a0", + "pylibcugraphops==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.optional-dependencies] test = [ "pandas", - "pylibwholegraph==24.10.*,>=0.0.0a0", + "pylibwholegraph==24.12.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", "pytest-xdist", "scipy", "tensordict>=0.1.2", - "torch>=2.0,<2.2.0a0", + "torch>=2.3,<2.4.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini b/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml index e4a8d290d9e..7713e89ac20 100644 --- a/python/cugraph-equivariant/pyproject.toml +++ b/python/cugraph-equivariant/pyproject.toml @@ -28,16 +28,16 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ - "pylibcugraphops==24.10.*,>=0.0.0a0", + "pylibcugraphops==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index bd1ca33af70..39b1ab21edb 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -9,17 +9,17 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.10.*,>=0.0.0a0 +- cugraph==24.12.*,>=0.0.0a0 - pandas - pre-commit - pyg>=2.5,<2.6 -- pylibcugraphops==24.10.*,>=0.0.0a0 +- pylibcugraphops==24.12.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov - pytest-xdist - pytorch-cuda==11.8 -- pytorch>=2.0 +- pytorch>=2.3,<2.4.0a0 - scipy - tensordict>=0.1.2 name: cugraph_pyg_dev_cuda-118 diff --git a/python/cugraph-pyg/cugraph_pyg/__init__.py b/python/cugraph-pyg/cugraph_pyg/__init__.py index 719751c966a..e566e6e9fdd 100644 --- a/python/cugraph-pyg/cugraph_pyg/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,3 +12,8 @@ # limitations under the License. 
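The cugraph_pyg/__init__.py hunk just below imports the subpackages eagerly, so they are reachable as attributes after a single top-level import. A small sketch of what this enables (assumes cugraph_pyg is installed):

    import cugraph_pyg

    # Previously these required explicit `import cugraph_pyg.loader`, etc.
    loaders = cugraph_pyg.loader
    stores = cugraph_pyg.data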
from cugraph_pyg._version import __git_commit__, __version__ + +import cugraph_pyg.data +import cugraph_pyg.loader +import cugraph_pyg.sampler +import cugraph_pyg.nn diff --git a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py index c805cd496c8..6195f3118a4 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py @@ -150,7 +150,7 @@ def is_set(self, key): if key not in self.__dataclass_fields__: raise KeyError(key) attr = getattr(self, key) - return type(attr) != _field_status or attr != _field_status.UNSET + return type(attr) is not _field_status or attr != _field_status.UNSET def is_fully_specified(self): """ diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index e086bf07b1f..c47dda5eaa5 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -21,7 +21,7 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph.gnn.comms import cugraph_comms_get_raft_handle -from typing import Union, Optional, List, Dict +from typing import Union, Optional, List, Dict, Tuple # Have to use import_optional even though these are required @@ -58,13 +58,19 @@ def __init__(self, is_multi_gpu: bool = False): """ self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) self.__sizes = {} - self.__graph = None - self.__vertex_offsets = None + self.__handle = None self.__is_multi_gpu = is_multi_gpu + self.__clear_graph() + super().__init__() + def __clear_graph(self): + self.__graph = None + self.__vertex_offsets = None + self.__weight_attr = None + def _put_edge_index( self, edge_index: "torch_geometric.typing.EdgeTensorType", @@ -88,8 +94,7 @@ def _put_edge_index( self.__sizes[edge_attr.edge_type] = edge_attr.size # invalidate the graph - self.__graph = None - self.__vertex_offsets = None + self.__clear_graph() return True def _get_edge_index( @@ -108,7 +113,7 @@ def _remove_edge_index(self, edge_attr: "torch_geometric.data.EdgeAttr") -> bool del self.__edge_indices[edge_attr.edge_type] # invalidate the graph - self.__graph = None + self.__clear_graph() return True def get_all_edge_attrs(self) -> List["torch_geometric.data.EdgeAttr"]: @@ -163,6 +168,9 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: vertices_array=[vertices_array], edge_id_array=[cupy.asarray(edgelist_dict["eid"])], edge_type_array=[cupy.asarray(edgelist_dict["etp"])], + weight_array=[cupy.asarray(edgelist_dict["wgt"])] + if "wgt" in edgelist_dict + else None, ) else: self.__graph = pylibcugraph.SGGraph( @@ -175,6 +183,9 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: ), edge_id_array=cupy.asarray(edgelist_dict["eid"]), edge_type_array=cupy.asarray(edgelist_dict["etp"]), + weight_array=cupy.asarray(edgelist_dict["wgt"]) + if "wgt" in edgelist_dict + else None, ) return self.__graph @@ -194,13 +205,18 @@ def _num_vertices(self) -> Dict[str, int]: else edge_attr.size[1] ) else: - if edge_attr.edge_type[0] not in num_vertices: + if edge_attr.edge_type[0] != edge_attr.edge_type[2]: + if edge_attr.edge_type[0] not in num_vertices: + num_vertices[edge_attr.edge_type[0]] = int( + self.__edge_indices[edge_attr.edge_type][0].max() + 1 + ) + if edge_attr.edge_type[2] not in num_vertices: + num_vertices[edge_attr.edge_type[2]] = int( + self.__edge_indices[edge_attr.edge_type][1].max() + 1 + ) + elif
edge_attr.edge_type[0] not in num_vertices: num_vertices[edge_attr.edge_type[0]] = int( - self.__edge_indices[edge_attr.edge_type][0].max() + 1 - ) - if edge_attr.edge_type[2] not in num_vertices: - num_vertices[edge_attr.edge_type[1]] = int( - self.__edge_indices[edge_attr.edge_type][1].max() + 1 + self.__edge_indices[edge_attr.edge_type].max() + 1 ) if self.is_multi_gpu: @@ -228,6 +244,32 @@ def _vertex_offsets(self) -> Dict[str, int]: def is_homogeneous(self) -> bool: return len(self._vertex_offsets) == 1 + def _set_weight_attr(self, attr: Tuple["torch_geometric.data.FeatureStore", str]): + if attr != self.__weight_attr: + self.__clear_graph() + self.__weight_attr = attr + + def __get_weight_tensor( + self, + sorted_keys: List[Tuple[str, str, str]], + start_offsets: "torch.Tensor", + num_edges_t: "torch.Tensor", + ): + feature_store, attr_name = self.__weight_attr + + weights = [] + for i, et in enumerate(sorted_keys): + ix = torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_t[i], + dtype=torch.int64, + device="cpu", + ) + + weights.append(feature_store[et, attr_name][ix]) + + return torch.concat(weights) + def __get_edgelist(self): """ Returns @@ -275,59 +317,49 @@ def __get_edgelist(self): ) ) + num_edges_t = torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" + ) + if self.is_multi_gpu: rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - num_edges_t = torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" - ) num_edges_all_t = torch.empty( world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" ) torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) - if rank > 0: - start_offsets = num_edges_all_t[:rank].T.sum(axis=1) - edge_id_array = torch.concat( - [ - torch.arange( - start_offsets[i], - start_offsets[i] + num_edges_all_t[rank][i], - dtype=torch.int64, - device="cuda", - ) - for i in range(len(sorted_keys)) - ] - ) - else: - edge_id_array = torch.concat( - [ - torch.arange( - self.__edge_indices[et].shape[1], - dtype=torch.int64, - device="cuda", - ) - for et in sorted_keys - ] - ) - + start_offsets = num_edges_all_t[:rank].T.sum(axis=1) else: - # single GPU - edge_id_array = torch.concat( - [ - torch.arange( - self.__edge_indices[et].shape[1], - dtype=torch.int64, - device="cuda", - ) - for et in sorted_keys - ] + rank = 0 + start_offsets = torch.zeros( + (len(sorted_keys),), dtype=torch.int64, device="cuda" ) + num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel())) + + edge_id_array = torch.concat( + [ + torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_all_t[rank][i], + dtype=torch.int64, + device="cuda", + ) + for i in range(len(sorted_keys)) + ] + ) - return { + d = { "dst": edge_index[0], "src": edge_index[1], "etp": edge_type_array, "eid": edge_id_array, } + + if self.__weight_attr is not None: + d["wgt"] = self.__get_weight_tensor( + sorted_keys, start_offsets.cpu(), num_edges_t.cpu() + ).cuda() + + return d diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py index 7002d7ebded..127ca809d91 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py @@ -185,6 +185,8 @@ def run_train( wall_clock_start, tempdir=None, num_layers=3, + in_memory=False, + seeds_per_call=-1, ): optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) @@ -196,20 
+198,23 @@ def run_train( from cugraph_pyg.loader import NeighborLoader ix_train = split_idx["train"].cuda() - train_path = os.path.join(tempdir, f"train_{global_rank}") - os.mkdir(train_path) + train_path = None if in_memory else os.path.join(tempdir, f"train_{global_rank}") + if train_path: + os.mkdir(train_path) train_loader = NeighborLoader( data, input_nodes=ix_train, directory=train_path, shuffle=True, drop_last=True, + local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, **kwargs, ) ix_test = split_idx["test"].cuda() - test_path = os.path.join(tempdir, f"test_{global_rank}") - os.mkdir(test_path) + test_path = None if in_memory else os.path.join(tempdir, f"test_{global_rank}") + if test_path: + os.mkdir(test_path) test_loader = NeighborLoader( data, input_nodes=ix_test, @@ -221,14 +226,16 @@ def run_train( ) ix_valid = split_idx["valid"].cuda() - valid_path = os.path.join(tempdir, f"valid_{global_rank}") - os.mkdir(valid_path) + valid_path = None if in_memory else os.path.join(tempdir, f"valid_{global_rank}") + if valid_path: + os.mkdir(valid_path) valid_loader = NeighborLoader( data, input_nodes=ix_valid, directory=valid_path, shuffle=True, drop_last=True, + local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, **kwargs, ) @@ -347,6 +354,9 @@ def parse_args(): parser.add_argument("--skip_partition", action="store_true") parser.add_argument("--wg_mem_type", type=str, default="distributed") + parser.add_argument("--in_memory", action="store_true", default=False) + parser.add_argument("--seeds_per_call", type=int, default=-1) + return parser.parse_args() @@ -429,6 +439,8 @@ def parse_args(): wall_clock_start, tempdir, args.num_layers, + args.in_memory, + args.seeds_per_call, ) else: warnings.warn("This script should be run with 'torchrun'. 
Exiting.") diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py index b299fc2a1a1..0f9c39bf04d 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py @@ -66,7 +66,7 @@ def train(epoch: int): torch.cuda.synchronize() print( f"Average Training Iteration Time (s/iter): \ - {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}" + {(time.perf_counter() - start_avg_time) / (i - warmup_steps):.6f}" ) @@ -91,10 +91,20 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): def create_loader( - data, num_neighbors, input_nodes, replace, batch_size, samples_dir, stage_name + data, + num_neighbors, + input_nodes, + replace, + batch_size, + samples_dir, + stage_name, + local_seeds_per_call, ): - directory = os.path.join(samples_dir, stage_name) - os.mkdir(directory) + if samples_dir is not None: + directory = os.path.join(samples_dir, stage_name) + os.mkdir(directory) + else: + directory = None return NeighborLoader( data, num_neighbors=num_neighbors, @@ -102,6 +112,7 @@ def create_loader( replace=replace, batch_size=batch_size, directory=directory, + local_seeds_per_call=local_seeds_per_call, ) @@ -147,6 +158,8 @@ def parse_args(): parser.add_argument("--tempdir_root", type=str, default=None) parser.add_argument("--dataset_root", type=str, default="dataset") parser.add_argument("--dataset", type=str, default="ogbn-products") + parser.add_argument("--in_memory", action="store_true", default=False) + parser.add_argument("--seeds_per_call", type=int, default=-1) return parser.parse_args() @@ -170,7 +183,10 @@ def parse_args(): "num_neighbors": [args.fan_out] * args.num_layers, "replace": False, "batch_size": args.batch_size, - "samples_dir": samples_dir, + "samples_dir": None if args.in_memory else samples_dir, + "local_seeds_per_call": None + if args.seeds_per_call <= 0 + else args.seeds_per_call, } train_loader = create_loader( diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index b1bb0240e71..73efbc92a24 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -86,6 +86,8 @@ def run_train( wall_clock_start, tempdir=None, num_layers=3, + in_memory=False, + seeds_per_call=-1, ): init_pytorch_worker( @@ -119,20 +121,23 @@ def run_train( dist.barrier() ix_train = torch.tensor_split(split_idx["train"], world_size)[rank].cuda() - train_path = os.path.join(tempdir, f"train_{rank}") - os.mkdir(train_path) + train_path = None if in_memory else os.path.join(tempdir, f"train_{rank}") + if train_path: + os.mkdir(train_path) train_loader = NeighborLoader( (feature_store, graph_store), input_nodes=ix_train, directory=train_path, shuffle=True, drop_last=True, + local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, **kwargs, ) ix_test = torch.tensor_split(split_idx["test"], world_size)[rank].cuda() - test_path = os.path.join(tempdir, f"test_{rank}") - os.mkdir(test_path) + test_path = None if in_memory else os.path.join(tempdir, f"test_{rank}") + if test_path: + os.mkdir(test_path) test_loader = NeighborLoader( (feature_store, graph_store), input_nodes=ix_test, @@ -144,14 +149,16 @@ def run_train( ) ix_valid = torch.tensor_split(split_idx["valid"], world_size)[rank].cuda() - valid_path = os.path.join(tempdir, f"valid_{rank}") - os.mkdir(valid_path) + valid_path = None if 
in_memory else os.path.join(tempdir, f"valid_{rank}") + if valid_path: + os.mkdir(valid_path) valid_loader = NeighborLoader( (feature_store, graph_store), input_nodes=ix_valid, directory=valid_path, shuffle=True, drop_last=True, + local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, **kwargs, ) @@ -269,6 +276,8 @@ def run_train( parser.add_argument("--tempdir_root", type=str, default=None) parser.add_argument("--dataset_root", type=str, default="dataset") parser.add_argument("--dataset", type=str, default="ogbn-products") + parser.add_argument("--in_memory", action="store_true", default=False) + parser.add_argument("--seeds_per_call", type=int, default=-1) parser.add_argument( "--n_devices", @@ -322,6 +331,8 @@ def run_train( wall_clock_start, tempdir, args.num_layers, + args.in_memory, + args.seeds_per_call, ), nprocs=world_size, join=True, diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py new file mode 100644 index 00000000000..5c75e01e6f5 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py @@ -0,0 +1,418 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This example illustrates link classification using the ogbl-wikikg2 dataset. + +import os +import json +import argparse +import warnings + +import torch + +import torch.nn.functional as F +from torch.nn import Parameter +from torch_geometric.nn import FastRGCNConv, GAE +from torch.nn.parallel import DistributedDataParallel + +from ogb.linkproppred import PygLinkPropPredDataset + +import cugraph_pyg + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_create_unique_id, + cugraph_comms_shutdown, +) + +from pylibwholegraph.torch.initialize import ( + init as wm_init, + finalize as wm_finalize, +) + + +# Enable cudf spilling to save gpu memory +from cugraph.testing.mg_utils import enable_spilling + +# Ensures that a CUDA context is not created on import of rapids. 
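# A minimal illustration (assuming cudf is available) of why this assignment
# must come before any RAPIDS import -- importing cudf first would create the
# CUDA context before pytorch has a chance to:
#
#     os.environ["RAPIDS_NO_INITIALIZE"] = "1"  # set first
#     import cudf  # noqa: F401 -- CUDA context creation is now deferred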
+# Allows pytorch to create the context instead +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + + +def init_pytorch_worker(global_rank, local_rank, world_size, uid): + import rmm + + rmm.reinitialize(devices=[local_rank], pool_allocator=True, managed_memory=True) + + import cupy + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + cugraph_comms_init( + global_rank, + world_size, + uid, + local_rank, + ) + + wm_init(global_rank, world_size, local_rank, torch.cuda.device_count()) + + enable_spilling() + + +class RGCNEncoder(torch.nn.Module): + def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): + super().__init__() + self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) + self.conv1 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.conv2 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_uniform_(self.node_emb) + self.conv1.reset_parameters() + self.conv2.reset_parameters() + + def forward(self, edge_index, edge_type): + x = self.node_emb + x = self.conv1(x, edge_index, edge_type).relu_() + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv2(x, edge_index, edge_type) + return x + + +def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None): + model.train() + optimizer.zero_grad() + + for i, batch in enumerate(train_loader): + r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() + z = model.encode(batch.edge_index, r) + + loss = model.recon_loss(z, batch.edge_index) + loss.backward() + optimizer.step() + + if i % 10 == 0: + print( + f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True + ) + if num_steps and i == num_steps: + break + + +def test(stage, epoch, model, loader, num_steps=None): + # TODO support ROC-AUC metric + # Predict probabilities of future edges + model.eval() + + rr = 0.0 + for i, (h, h_neg, t, t_neg, r) in enumerate(loader): + if num_steps and i >= num_steps: + break + + ei = torch.concatenate( + [ + torch.stack([h, t]).cuda(), + torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), + ], + dim=-1, + ) + + r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() + + z = model.encode(ei, r) + q = model.decode(z, ei) + + _, ix = torch.sort(q, descending=True) + rr += 1.0 / (1.0 + ix[0]) + + print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hidden_channels", type=int, default=128) + parser.add_argument("--num_layers", type=int, default=1) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--epochs", type=int, default=4) + parser.add_argument("--batch_size", type=int, default=16384) + parser.add_argument("--num_neg", type=int, default=500) + parser.add_argument("--num_pos", type=int, default=-1) + parser.add_argument("--fan_out", type=int, default=10) + parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--seeds_per_call", type=int, default=-1) + parser.add_argument("--n_devices", type=int, default=-1) + parser.add_argument("--skip_partition", action="store_true") + + return parser.parse_args() + + +def run_train(rank, world_size, model, data, edge_feature_store, meta, splits, args): + model = 
model.to(rank) + model = GAE(DistributedDataParallel(model, device_ids=[rank])) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + + eli = torch.stack([splits["train"]["head"], splits["train"]["tail"]]) + + train_loader = cugraph_pyg.loader.LinkNeighborLoader( + data, + [args.fan_out] * args.num_layers, + edge_label_index=eli, + local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + ) + + def get_eval_loader(stage: str): + head = splits[stage]["head"] + tail = splits[stage]["tail"] + + head_neg = splits[stage]["head_neg"][:, : args.num_neg] + tail_neg = splits[stage]["tail_neg"][:, : args.num_neg] + + rel = splits[stage]["relation"] + + return torch.utils.data.DataLoader( + torch.utils.data.TensorDataset( + head.pin_memory(), + head_neg.pin_memory(), + tail.pin_memory(), + tail_neg.pin_memory(), + rel.pin_memory(), + ), + batch_size=1, + shuffle=False, + drop_last=True, + ) + + test_loader = get_eval_loader("test") + valid_loader = get_eval_loader("valid") + + num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100 + + for epoch in range(1, 1 + args.epochs): + train( + epoch, + model, + optimizer, + train_loader, + edge_feature_store, + num_steps=num_train_steps, + ) + test("validation", epoch, model, valid_loader, num_steps=1024) + + test("test", epoch, model, test_loader, num_steps=1024) + + wm_finalize() + cugraph_comms_shutdown() + + +def partition_data( + data, splits, meta, edge_path, rel_path, pos_path, neg_path, meta_path +): + # Split and save edge index + os.makedirs( + edge_path, + exist_ok=True, + ) + for (r, e) in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)): + rank_path = os.path.join(edge_path, f"rank={r}.pt") + torch.save( + e.clone(), + rank_path, + ) + + # Split and save edge reltypes + os.makedirs( + rel_path, + exist_ok=True, + ) + for (r, f) in enumerate(torch.tensor_split(data.edge_reltype, world_size)): + rank_path = os.path.join(rel_path, f"rank={r}.pt") + torch.save( + f.clone(), + rank_path, + ) + + # Split and save positive edges + os.makedirs( + pos_path, + exist_ok=True, + ) + for stage in ["train", "test", "valid"]: + for (r, n) in enumerate( + torch.tensor_split( + torch.stack([splits[stage]["head"], splits[stage]["tail"]]), + world_size, + dim=-1, + ) + ): + rank_path = os.path.join(pos_path, f"rank={r}_{stage}.pt") + torch.save( + n.clone(), + rank_path, + ) + + # Split and save negative edges + os.makedirs( + neg_path, + exist_ok=True, + ) + for stage in ["test", "valid"]: + for (r, n) in enumerate( + torch.tensor_split( + torch.stack([splits[stage]["head_neg"], splits[stage]["tail_neg"]]), + world_size, + dim=1, + ) + ): + rank_path = os.path.join(neg_path, f"rank={r}_{stage}.pt") + torch.save(n.clone(), rank_path) + for (r, n) in enumerate( + torch.tensor_split(splits[stage]["relation"], world_size, dim=-1) + ): + rank_path = os.path.join(neg_path, f"rank={r}_{stage}_relation.pt") + torch.save(n.clone(), rank_path) + + with open(meta_path, "w") as f: + json.dump(meta, f) + + +def load_partitioned_data(rank, edge_path, rel_path, pos_path, neg_path, meta_path): + from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore + + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() + edge_feature_store = WholeFeatureStore() + + # Load edge index + graph_store[("n", "e", "n"), "coo"] = torch.load( + os.path.join(edge_path, f"rank={rank}.pt") + ) + + # Load edge 
rel type + edge_feature_store[("n", "e", "n"), "rel"] = torch.load( + os.path.join(rel_path, f"rank={rank}.pt") + ) + + splits = {} + + # Load positive edges + for stage in ["train", "test", "valid"]: + head, tail = torch.load(os.path.join(pos_path, f"rank={rank}_{stage}.pt")) + splits[stage] = { + "head": head, + "tail": tail, + } + + # Load negative edges + for stage in ["test", "valid"]: + head_neg, tail_neg = torch.load( + os.path.join(neg_path, f"rank={rank}_{stage}.pt") + ) + relation = torch.load( + os.path.join(neg_path, f"rank={rank}_{stage}_relation.pt") + ) + splits[stage]["head_neg"] = head_neg + splits[stage]["tail_neg"] = tail_neg + splits[stage]["relation"] = relation + + with open(meta_path, "r") as f: + meta = json.load(f) + + return (feature_store, graph_store), edge_feature_store, splits, meta + + +if __name__ == "__main__": + args = parse_args() + + if "LOCAL_RANK" in os.environ: + torch.distributed.init_process_group("nccl") + world_size = torch.distributed.get_world_size() + global_rank = torch.distributed.get_rank() + local_rank = int(os.environ["LOCAL_RANK"]) + device = torch.device(local_rank) + + # Create the uid needed for cuGraph comms + if global_rank == 0: + cugraph_id = [cugraph_comms_create_unique_id()] + else: + cugraph_id = [None] + torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device) + cugraph_id = cugraph_id[0] + + init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id) + + # Split the data + edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part") + rel_path = os.path.join(args.dataset_root, args.dataset + "_rel_part") + pos_path = os.path.join(args.dataset_root, args.dataset + "_e_pos_part") + neg_path = os.path.join(args.dataset_root, args.dataset + "_e_neg_part") + meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") + + if not args.skip_partition and global_rank == 0: + data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root) + dataset = data[0] + + splits = data.get_edge_split() + + meta = {} + meta["num_nodes"] = int(dataset.num_nodes) + meta["num_rels"] = int(dataset.edge_reltype.max()) + 1 + + partition_data( + dataset, + splits, + meta, + edge_path=edge_path, + rel_path=rel_path, + pos_path=pos_path, + neg_path=neg_path, + meta_path=meta_path, + ) + del data + del dataset + del splits + torch.distributed.barrier() + + # Load partitions + data, edge_feature_store, splits, meta = load_partitioned_data( + rank=global_rank, + edge_path=edge_path, + rel_path=rel_path, + pos_path=pos_path, + neg_path=neg_path, + meta_path=meta_path, + ) + torch.distributed.barrier() + + model = RGCNEncoder( + meta["num_nodes"], + hidden_channels=args.hidden_channels, + num_relations=meta["num_rels"], + ) + + run_train( + global_rank, world_size, model, data, edge_feature_store, meta, splits, args + ) + else: + warnings.warn("This script should be run with 'torchrun'. Exiting.") diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py new file mode 100644 index 00000000000..67d7eecc7c2 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py @@ -0,0 +1,219 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This example illustrates link classification using the ogbl-wikikg2 dataset. + +import argparse + +from typing import Tuple, Dict, Any + +import torch +import cupy + +import rmm +from rmm.allocators.cupy import rmm_cupy_allocator +from rmm.allocators.torch import rmm_torch_allocator + +# Must change allocators immediately upon import +# or else other imports will cause memory to be +# allocated and prevent changing the allocator +rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) +cupy.cuda.set_allocator(rmm_cupy_allocator) +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + +import torch.nn.functional as F # noqa: E402 +from torch.nn import Parameter # noqa: E402 +from torch_geometric.nn import FastRGCNConv, GAE # noqa: E402 +import torch_geometric # noqa: E402 +import cugraph_pyg # noqa: E402 + +# Enable cudf spilling to save gpu memory +from cugraph.testing.mg_utils import enable_spilling # noqa: E402 + +enable_spilling() + + +class RGCNEncoder(torch.nn.Module): + def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): + super().__init__() + self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) + self.conv1 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.conv2 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_uniform_(self.node_emb) + self.conv1.reset_parameters() + self.conv2.reset_parameters() + + def forward(self, edge_index, edge_type): + x = self.node_emb + x = self.conv1(x, edge_index, edge_type).relu_() + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv2(x, edge_index, edge_type) + return x + + +def load_data( + dataset_str, dataset_root: str +) -> Tuple[ + Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"], + "torch_geometric.data.FeatureStore", + Dict[str, Dict[str, "torch.Tensor"]], + Dict[str, Any], +]: + from ogb.linkproppred import PygLinkPropPredDataset + + data = PygLinkPropPredDataset(dataset_str, root=dataset_root) + dataset = data[0] + + splits = data.get_edge_split() + + from cugraph_pyg.data import GraphStore, TensorDictFeatureStore + + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() + edge_feature_store = TensorDictFeatureStore() + meta = {} + + graph_store[("n", "e", "n"), "coo"] = dataset.edge_index + edge_feature_store[("n", "e", "n"), "rel"] = dataset.edge_reltype.pin_memory() + meta["num_nodes"] = dataset.num_nodes + meta["num_rels"] = dataset.edge_reltype.max() + 1 + + return (feature_store, graph_store), edge_feature_store, splits, meta + + +def train(epoch, model, optimizer, train_loader, edge_feature_store): + model.train() + optimizer.zero_grad() + + for i, batch in enumerate(train_loader): + r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() + z = model.encode(batch.edge_index, r) + + loss = model.recon_loss(z, batch.edge_index) + loss.backward() + optimizer.step() + + if i % 10 == 0: + print(f"Epoch: {epoch:02d}, 
Iteration: {i:02d}, Loss: {loss:.4f}") + if i == 100: + break + + +def test(stage, epoch, model, loader, num_steps=None): + # TODO support ROC-AUC metric + # Predict probabilities of future edges + model.eval() + + rr = 0.0 + for i, (h, h_neg, t, t_neg, r) in enumerate(loader): + if num_steps and i >= num_steps: + break + + ei = torch.concatenate( + [ + torch.stack([h, t]).cuda(), + torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), + ], + dim=-1, + ) + + r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() + + z = model.encode(ei, r) + q = model.decode(z, ei) + + _, ix = torch.sort(q, descending=True) + rr += 1.0 / (1.0 + ix[0]) + + print(f"epoch {epoch:02d} {stage} mrr:", rr / i) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hidden_channels", type=int, default=128) + parser.add_argument("--num_layers", type=int, default=1) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--epochs", type=int, default=4) + parser.add_argument("--batch_size", type=int, default=16384) + parser.add_argument("--num_neg", type=int, default=500) + parser.add_argument("--fan_out", type=int, default=10) + parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--seeds_per_call", type=int, default=-1) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + data, edge_feature_store, splits, meta = load_data(args.dataset, args.dataset_root) + + model = GAE( + RGCNEncoder( + meta["num_nodes"], + hidden_channels=args.hidden_channels, + num_relations=meta["num_rels"], + ) + ).cuda() + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + + train_loader = cugraph_pyg.loader.LinkNeighborLoader( + data, + [args.fan_out] * args.num_layers, + edge_label_index=torch.stack( + [splits["train"]["head"], splits["train"]["tail"]] + ), + local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + ) + + def get_eval_loader(stage: str): + head = splits[stage]["head"] + tail = splits[stage]["tail"] + + head_neg = splits[stage]["head_neg"][:, : args.num_neg] + tail_neg = splits[stage]["tail_neg"][:, : args.num_neg] + + rel = splits[stage]["relation"] + + return torch.utils.data.DataLoader( + torch.utils.data.TensorDataset( + head.pin_memory(), + head_neg.pin_memory(), + tail.pin_memory(), + tail_neg.pin_memory(), + rel.pin_memory(), + ), + batch_size=1, + shuffle=False, + drop_last=True, + ) + + test_loader = get_eval_loader("test") + valid_loader = get_eval_loader("valid") + + for epoch in range(1, 1 + args.epochs): + train(epoch, model, optimizer, train_loader, edge_feature_store) + test("validation", epoch, model, valid_loader, num_steps=1024) + + test("test", epoch, model, test_loader, num_steps=1024) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py new file mode 100644 index 00000000000..2c0ae53a08e --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py @@ -0,0 +1,320 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This example illustrates link classification using the ogbl-wikikg2 dataset. + +import os +import argparse +import warnings + +from typing import Tuple, Any + +import torch + +import torch.nn.functional as F +from torch.nn import Parameter +from torch_geometric.nn import FastRGCNConv, GAE +from torch.nn.parallel import DistributedDataParallel + +import torch_geometric +import cugraph_pyg + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_create_unique_id, + cugraph_comms_shutdown, +) + +from pylibwholegraph.torch.initialize import ( + init as wm_init, + finalize as wm_finalize, +) + + +# Enable cudf spilling to save gpu memory +from cugraph.testing.mg_utils import enable_spilling + +# Ensures that a CUDA context is not created on import of rapids. +# Allows pytorch to create the context instead +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + + +def init_pytorch_worker(rank, world_size, uid): + import rmm + + rmm.reinitialize(devices=[rank], pool_allocator=True, managed_memory=True) + + import cupy + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + cugraph_comms_init( + rank, + world_size, + uid, + rank, + ) + + wm_init(rank, world_size, rank, world_size) + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group( + "nccl", + rank=rank, + world_size=world_size, + ) + + enable_spilling() + + +class RGCNEncoder(torch.nn.Module): + def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): + super().__init__() + self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) + self.conv1 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.conv2 = FastRGCNConv( + hidden_channels, hidden_channels, num_relations, num_bases=num_bases + ) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_uniform_(self.node_emb) + self.conv1.reset_parameters() + self.conv2.reset_parameters() + + def forward(self, edge_index, edge_type): + x = self.node_emb + x = self.conv1(x, edge_index, edge_type).relu_() + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv2(x, edge_index, edge_type) + return x + + +def load_data( + rank: int, + world_size: int, + data: Any, +) -> Tuple[ + Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"], + "torch_geometric.data.FeatureStore", +]: + from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore + + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() # empty fs required by PyG + edge_feature_store = WholeFeatureStore() + + graph_store[("n", "e", "n"), "coo"] = torch.tensor_split( + data.edge_index.cuda(), world_size, dim=1 + )[rank] + + edge_feature_store[("n", "e", "n"), "rel"] = torch.tensor_split( + data.edge_reltype.cuda(), + world_size, + )[rank] + + return (feature_store, graph_store), edge_feature_store + + +def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None): + model.train() + optimizer.zero_grad() + + for i, batch in 
enumerate(train_loader): + r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() + z = model.encode(batch.edge_index, r) + + loss = model.recon_loss(z, batch.edge_index) + loss.backward() + optimizer.step() + + if i % 10 == 0: + print( + f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True + ) + if num_steps and i == num_steps: + break + + +def test(stage, epoch, model, loader, num_steps=None): + # TODO support ROC-AUC metric + # Predict probabilities of future edges + model.eval() + + rr = 0.0 + for i, (h, h_neg, t, t_neg, r) in enumerate(loader): + if num_steps and i >= num_steps: + break + + ei = torch.concatenate( + [ + torch.stack([h, t]).cuda(), + torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), + ], + dim=-1, + ) + + r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() + + z = model.encode(ei, r) + q = model.decode(z, ei) + + _, ix = torch.sort(q, descending=True) + rr += 1.0 / (1.0 + ix[0]) + + print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--hidden_channels", type=int, default=128) + parser.add_argument("--num_layers", type=int, default=1) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--epochs", type=int, default=4) + parser.add_argument("--batch_size", type=int, default=16384) + parser.add_argument("--num_neg", type=int, default=500) + parser.add_argument("--num_pos", type=int, default=-1) + parser.add_argument("--fan_out", type=int, default=10) + parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--seeds_per_call", type=int, default=-1) + parser.add_argument("--n_devices", type=int, default=-1) + + return parser.parse_args() + + +def run_train(rank, world_size, uid, model, data, meta, splits, args): + init_pytorch_worker( + rank, + world_size, + uid, + ) + + model = model.to(rank) + model = GAE(DistributedDataParallel(model, device_ids=[rank])) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + + data, edge_feature_store = load_data(rank, world_size, data) + + eli = torch.stack( + [ + torch.tensor_split(splits["train"]["head"], world_size)[rank], + torch.tensor_split(splits["train"]["tail"], world_size)[rank], + ] + ) + + train_loader = cugraph_pyg.loader.LinkNeighborLoader( + data, + [args.fan_out] * args.num_layers, + edge_label_index=eli, + local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + ) + + def get_eval_loader(stage: str): + head = torch.tensor_split(splits[stage]["head"], world_size)[rank] + tail = torch.tensor_split(splits[stage]["tail"], world_size)[rank] + + head_neg = torch.tensor_split( + splits[stage]["head_neg"][:, : args.num_neg], world_size + )[rank] + tail_neg = torch.tensor_split( + splits[stage]["tail_neg"][:, : args.num_neg], world_size + )[rank] + + rel = torch.tensor_split(splits[stage]["relation"], world_size)[rank] + + return torch.utils.data.DataLoader( + torch.utils.data.TensorDataset( + head.pin_memory(), + head_neg.pin_memory(), + tail.pin_memory(), + tail_neg.pin_memory(), + rel.pin_memory(), + ), + batch_size=1, + shuffle=False, + drop_last=True, + ) + + test_loader = get_eval_loader("test") + valid_loader = get_eval_loader("valid") + + num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100 + + for epoch in 
range(1, 1 + args.epochs): + train( + epoch, + model, + optimizer, + train_loader, + edge_feature_store, + num_steps=num_train_steps, + ) + test("validation", epoch, model, valid_loader, num_steps=1024) + + test("test", epoch, model, test_loader, num_steps=1024) + + wm_finalize() + cugraph_comms_shutdown() + + +if __name__ == "__main__": + if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1": + warnings.warn("Skipping SNMG example in CI due to memory limit") + else: + args = parse_args() + + # change the allocator before any allocations are made + from rmm.allocators.torch import rmm_torch_allocator + + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + # import ogb here to stop it from creating a context and breaking pytorch/rmm + from ogb.linkproppred import PygLinkPropPredDataset + + data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root) + dataset = data[0] + + splits = data.get_edge_split() + + meta = {} + meta["num_nodes"] = dataset.num_nodes + meta["num_rels"] = dataset.edge_reltype.max() + 1 + + model = RGCNEncoder( + meta["num_nodes"], + hidden_channels=args.hidden_channels, + num_relations=meta["num_rels"], + ) + + print("Data =", data) + if args.n_devices == -1: + world_size = torch.cuda.device_count() + else: + world_size = args.n_devices + print("Using", world_size, "GPUs...") + + uid = cugraph_comms_create_unique_id() + torch.multiprocessing.spawn( + run_train, + (world_size, uid, model, data, meta, splits, args), + nprocs=world_size, + join=True, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index cad66aaa183..c804b3d1f97 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -16,6 +16,9 @@ from cugraph_pyg.loader.node_loader import NodeLoader from cugraph_pyg.loader.neighbor_loader import NeighborLoader +from cugraph_pyg.loader.link_loader import LinkLoader +from cugraph_pyg.loader.link_neighbor_loader import LinkNeighborLoader + from cugraph_pyg.loader.dask_node_loader import DaskNeighborLoader from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py new file mode 100644 index 00000000000..77e2ac4f99d --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import cugraph_pyg +from typing import Union, Tuple, Callable, Optional + +from cugraph.utilities.utils import import_optional + +torch_geometric = import_optional("torch_geometric") +torch = import_optional("torch") + + +class LinkLoader: + """ + Duck-typed version of torch_geometric.loader.LinkLoader. + Loads samples from batches of input edges using a + `~cugraph_pyg.sampler.BaseSampler.sample_from_edges` + function. 
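    Example (a minimal usage sketch; assumes `feature_store` and `graph_store`
    are already-populated cuGraph-PyG stores and `sampler` is a
    cugraph_pyg.sampler.BaseSampler wrapping a cuGraph distributed sampler):

        loader = LinkLoader(
            (feature_store, graph_store),
            link_sampler=sampler,
            edge_label_index=edge_label_index,  # shape [2, num_seed_edges]
            batch_size=16384,
            shuffle=True,
            drop_last=True,
        )
        for batch in loader:
            ...  # one PyG Data object per minibatch of seed edges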
+ """ + + def __init__( + self, + data: Union[ + "torch_geometric.data.Data", + "torch_geometric.data.HeteroData", + Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + ], + link_sampler: "cugraph_pyg.sampler.BaseSampler", + edge_label_index: "torch_geometric.typing.InputEdges" = None, + edge_label: "torch_geometric.typing.OptTensor" = None, + edge_label_time: "torch_geometric.typing.OptTensor" = None, + neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None, + neg_sampling_ratio: Optional[Union[int, float]] = None, + transform: Optional[Callable] = None, + transform_sampler_output: Optional[Callable] = None, + filter_per_worker: Optional[bool] = None, + custom_cls: Optional["torch_geometric.data.HeteroData"] = None, + input_id: "torch_geometric.typing.OptTensor" = None, + batch_size: int = 1, # refers to number of edges in batch + shuffle: bool = False, + drop_last: bool = False, + **kwargs, + ): + """ + Parameters + ---------- + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.NodeLoader. + link_sampler: BaseSampler + See torch_geometric.loader.LinkLoader. + edge_label_index: InputEdges + See torch_geometric.loader.LinkLoader. + edge_label: OptTensor + See torch_geometric.loader.LinkLoader. + edge_label_time: OptTensor + See torch_geometric.loader.LinkLoader. + neg_sampling: Optional[NegativeSampling] + Type of negative sampling to perform, if desired. + See torch_geometric.loader.LinkLoader. + neg_sampling_ratio: Optional[Union[int, float]] + Negative sampling ratio. Affects how many negative + samples are generated. + See torch_geometric.loader.LinkLoader. + transform: Callable (optional, default=None) + This argument currently has no effect. + transform_sampler_output: Callable (optional, default=None) + This argument currently has no effect. + filter_per_worker: bool (optional, default=False) + This argument currently has no effect. + custom_cls: HeteroData + This argument currently has no effect. This loader will + always return a Data or HeteroData object. + input_id: OptTensor + See torch_geometric.loader.LinkLoader. + + """ + if not isinstance(data, (list, tuple)) or not isinstance( + data[1], cugraph_pyg.data.GraphStore + ): + # Will eventually automatically convert these objects to cuGraph objects. + raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if not isinstance(link_sampler, cugraph_pyg.sampler.BaseSampler): + raise NotImplementedError("Must provide a cuGraph sampler") + + if edge_label_time is not None: + raise ValueError("Temporal sampling is currently unsupported") + + if filter_per_worker: + warnings.warn("filter_per_worker is currently ignored") + + if custom_cls is not None: + warnings.warn("custom_cls is currently ignored") + + if transform is not None: + warnings.warn("transform is currently ignored.") + + if transform_sampler_output is not None: + warnings.warn("transform_sampler_output is currently ignored.") + + if neg_sampling_ratio is not None: + warnings.warn( + "The 'neg_sampling_ratio' argument is deprecated in PyG" + " and is not supported in cuGraph-PyG." 
+ ) + + neg_sampling = torch_geometric.sampler.NegativeSampling.cast(neg_sampling) + + ( + input_type, + edge_label_index, + ) = torch_geometric.loader.utils.get_edge_label_index( + data, + (None, edge_label_index), + ) + + self.__input_data = torch_geometric.sampler.EdgeSamplerInput( + input_id=torch.arange( + edge_label_index[0].numel(), dtype=torch.int64, device="cuda" + ) + if input_id is None + else input_id, + row=edge_label_index[0], + col=edge_label_index[1], + label=edge_label, + time=edge_label_time, + input_type=input_type, + ) + + # Edge label check from torch_geometric.loader.LinkLoader + if ( + neg_sampling is not None + and neg_sampling.is_binary() + and edge_label is not None + and edge_label.min() == 0 + ): + edge_label = edge_label + 1 + + if ( + neg_sampling is not None + and neg_sampling.is_triplet() + and edge_label is not None + ): + raise ValueError( + "'edge_label' needs to be undefined for " + "'triplet'-based negative sampling. Please use " + "`src_index`, `dst_pos_index` and " + "`neg_pos_index` of the returned mini-batch " + "instead to differentiate between positive and " + "negative samples." + ) + + self.__data = data + + self.__link_sampler = link_sampler + self.__neg_sampling = neg_sampling + + self.__batch_size = batch_size + self.__shuffle = shuffle + self.__drop_last = drop_last + + def __iter__(self): + if self.__shuffle: + perm = torch.randperm(self.__input_data.row.numel()) + else: + perm = torch.arange(self.__input_data.row.numel()) + + if self.__drop_last: + d = perm.numel() % self.__batch_size + perm = perm[:-d] if d > 0 else perm + + input_data = torch_geometric.sampler.EdgeSamplerInput( + input_id=self.__input_data.input_id[perm], + row=self.__input_data.row[perm], + col=self.__input_data.col[perm], + label=None + if self.__input_data.label is None + else self.__input_data.label[perm], + time=None + if self.__input_data.time is None + else self.__input_data.time[perm], + input_type=self.__input_data.input_type, + ) + + return cugraph_pyg.sampler.SampleIterator( + self.__data, + self.__link_sampler.sample_from_edges( + input_data, + neg_sampling=self.__neg_sampling, + ), + ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py new file mode 100644 index 00000000000..080565368c4 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py @@ -0,0 +1,243 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
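The `__iter__` above shuffles the flat list of seed edges and trims any trailing partial batch before handing indices to the sampler. A standalone sketch of that bookkeeping, with hypothetical sizes:

    import torch

    batch_size = 4
    perm = torch.randperm(10)            # 10 seed edges, shuffled
    d = perm.numel() % batch_size        # 2 seeds left over
    perm = perm[:-d] if d > 0 else perm  # drop_last: discard the partial batch
    batches = perm.view(-1, batch_size)  # two full batches of four seeds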
+ +import warnings + +from typing import Union, Tuple, Optional, Callable, List, Dict + +import cugraph_pyg +from cugraph_pyg.loader import LinkLoader +from cugraph_pyg.sampler import BaseSampler + +from cugraph.gnn import NeighborSampler, DistSampleWriter +from cugraph.utilities.utils import import_optional + +torch_geometric = import_optional("torch_geometric") + + +class LinkNeighborLoader(LinkLoader): + """ + Duck-typed version of torch_geometric.loader.LinkNeighborLoader + + Link loader that implements the neighbor sampling + algorithm used in GraphSAGE. + """ + + def __init__( + self, + data: Union[ + "torch_geometric.data.Data", + "torch_geometric.data.HeteroData", + Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + ], + num_neighbors: Union[ + List[int], Dict["torch_geometric.typing.EdgeType", List[int]] + ], + edge_label_index: "torch_geometric.typing.InputEdges" = None, + edge_label: "torch_geometric.typing.OptTensor" = None, + edge_label_time: "torch_geometric.typing.OptTensor" = None, + replace: bool = False, + subgraph_type: Union[ + "torch_geometric.typing.SubgraphType", str + ] = "directional", + disjoint: bool = False, + temporal_strategy: str = "uniform", + neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None, + neg_sampling_ratio: Optional[Union[int, float]] = None, + time_attr: Optional[str] = None, + weight_attr: Optional[str] = None, + transform: Optional[Callable] = None, + transform_sampler_output: Optional[Callable] = None, + is_sorted: bool = False, + filter_per_worker: Optional[bool] = None, + neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None, + directed: bool = True, # Deprecated. + batch_size: int = 16, # Refers to number of edges per batch. + directory: Optional[str] = None, + batches_per_partition=256, + format: str = "parquet", + compression: Optional[str] = None, + local_seeds_per_call: Optional[int] = None, + **kwargs, + ): + """ + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.LinkNeighborLoader. + num_neighbors: List[int] or Dict[EdgeType, List[int]] + Fanout values. + See torch_geometric.loader.LinkNeighborLoader. + edge_label_index: InputEdges + Input edges for sampling. + See torch_geometric.loader.LinkNeighborLoader. + edge_label: OptTensor + Labels for input edges. + See torch_geometric.loader.LinkNeighborLoader. + edge_label_time: OptTensor + Time attribute for input edges. + See torch_geometric.loader.LinkNeighborLoader. + replace: bool (optional, default=False) + Whether to sample with replacement. + See torch_geometric.loader.LinkNeighborLoader. + subgraph_type: Union[SubgraphType, str] (optional, default='directional') + The type of subgraph to return. + Currently only 'directional' is supported. + See torch_geometric.loader.LinkNeighborLoader. + disjoint: bool (optional, default=False) + Whether to perform disjoint sampling. + Currently unsupported. + See torch_geometric.loader.LinkNeighborLoader. + temporal_strategy: str (optional, default='uniform') + Currently only 'uniform' is supported. + See torch_geometric.loader.LinkNeighborLoader. + time_attr: str (optional, default=None) + Used for temporal sampling. + See torch_geometric.loader.LinkNeighborLoader. + weight_attr: str (optional, default=None) + Used for biased sampling. + See torch_geometric.loader.LinkNeighborLoader. + transform: Callable (optional, default=None) + See torch_geometric.loader.LinkNeighborLoader. 
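A usage sketch for the `weight_attr` parameter described above (assumes per-edge weights were registered on the feature store under the hypothetical name "weight", for the same edge type held by the graph store):

    feature_store[("n", "e", "n"), "weight"] = edge_weights  # one float per edge

    loader = LinkNeighborLoader(
        (feature_store, graph_store),
        num_neighbors=[10, 10],
        edge_label_index=edge_label_index,
        weight_attr="weight",  # neighbors are drawn proportionally to weight
        batch_size=16384,
    )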
+ transform_sampler_output: Callable (optional, default=None) + See torch_geometric.loader.LinkNeighborLoader. + is_sorted: bool (optional, default=False) + Ignored by cuGraph. + See torch_geometric.loader.LinkNeighborLoader. + filter_per_worker: bool (optional, default=False) + Currently ignored by cuGraph, but this may + change once in-memory sampling is implemented. + See torch_geometric.loader.LinkNeighborLoader. + neighbor_sampler: torch_geometric.sampler.NeighborSampler + (optional, default=None) + Not supported by cuGraph. + See torch_geometric.loader.LinkNeighborLoader. + directed: bool (optional, default=True) + Deprecated. + See torch_geometric.loader.LinkNeighborLoader. + batch_size: int (optional, default=16) + The number of input edges per output minibatch. + See torch.utils.dataloader. + directory: str (optional, default=None) + The directory where samples will be temporarily stored, + if spilling samples to disk. If None, this loader + will perform buffered in-memory sampling. + If writing to disk, setting this argument + to a tempfile.TemporaryDirectory with a context + manager is a good option but depending on the filesystem, + you may want to choose an alternative location with fast I/O + instead. + See cugraph.gnn.DistSampleWriter. + batches_per_partition: int (optional, default=256) + The number of batches per partition if writing samples to + disk. Manually tuning this parameter is not recommended + but reducing it may help conserve GPU memory. + See cugraph.gnn.DistSampleWriter. + format: str (optional, default='parquet') + If writing samples to disk, they will be written in this + file format. + See cugraph.gnn.DistSampleWriter. + compression: str (optional, default=None) + The compression type to use if writing samples to disk. + If not provided, it is automatically chosen. + local_seeds_per_call: int (optional, default=None) + The number of seeds to process within a single sampling call. + Manually tuning this parameter is not recommended but reducing + it may conserve GPU memory. The total number of seeds processed + per sampling call is equal to the sum of this parameter across + all workers. If not provided, it will be automatically + calculated. + See cugraph.gnn.DistSampler. + **kwargs + Other keyword arguments passed to the superclass. + """ + + subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) + + if not directed: + subgraph_type = torch_geometric.sampler.base.SubgraphType.induced + warnings.warn( + "The 'directed' argument is deprecated. " + "Use subgraph_type='induced' instead." + ) + if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: + raise ValueError("Only directional subgraphs are currently supported") + if disjoint: + raise ValueError("Disjoint sampling is currently unsupported") + if temporal_strategy != "uniform": + warnings.warn("Only the uniform temporal strategy is currently supported") + if neighbor_sampler is not None: + raise ValueError("Passing a neighbor sampler is currently unsupported") + if time_attr is not None: + raise ValueError("Temporal sampling is currently unsupported") + if is_sorted: + warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") + if not isinstance(data, (list, tuple)) or not isinstance( + data[1], cugraph_pyg.data.GraphStore + ): + # Will eventually automatically convert these objects to cuGraph objects. 
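# For reference, the only `data` form currently accepted is a tuple of
# cuGraph-PyG stores, as in the examples above (a sketch):
#
#     from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
#     data = (TensorDictFeatureStore(), GraphStore())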
+ raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if compression is None: + compression = "CSR" + elif compression not in ["CSR", "COO"]: + raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") + + writer = ( + None + if directory is None + else DistSampleWriter( + directory=directory, + batches_per_partition=batches_per_partition, + format=format, + ) + ) + + feature_store, graph_store = data + + if weight_attr is not None: + graph_store._set_weight_attr((feature_store, weight_attr)) + + sampler = BaseSampler( + NeighborSampler( + graph_store._graph, + writer, + retain_original_seeds=True, + fanout=num_neighbors, + prior_sources_behavior="exclude", + deduplicate_sources=True, + compression=compression, + compress_per_hop=False, + with_replacement=replace, + local_seeds_per_call=local_seeds_per_call, + biased=(weight_attr is not None), + ), + (feature_store, graph_store), + batch_size=batch_size, + ) + # TODO add heterogeneous support and pass graph_store._vertex_offsets + + super().__init__( + (feature_store, graph_store), + sampler, + edge_label_index=edge_label_index, + edge_label=edge_label, + edge_label_time=edge_label_time, + neg_sampling=neg_sampling, + neg_sampling_ratio=neg_sampling_ratio, + transform=transform, + transform_sampler_output=transform_sampler_output, + filter_per_worker=filter_per_worker, + batch_size=batch_size, + **kwargs, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index 1199895e99d..1da2c6dc381 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -12,7 +12,6 @@ # limitations under the License. import warnings -import tempfile from typing import Union, Tuple, Optional, Callable, List, Dict @@ -20,7 +19,7 @@ from cugraph_pyg.loader import NodeLoader from cugraph_pyg.sampler import BaseSampler -from cugraph.gnn import UniformNeighborSampler, DistSampleWriter +from cugraph.gnn import NeighborSampler, DistSampleWriter from cugraph.utilities.utils import import_optional torch_geometric = import_optional("torch_geometric") @@ -63,7 +62,7 @@ def __init__( neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None, directed: bool = True, # Deprecated. batch_size: int = 16, - directory: str = None, + directory: Optional[str] = None, batches_per_partition=256, format: str = "parquet", compression: Optional[str] = None, @@ -123,14 +122,14 @@ def __init__( The number of input nodes per output minibatch. See torch.utils.dataloader. directory: str (optional, default=None) - The directory where samples will be temporarily stored. - It is recommend that this be set by the user, usually - setting it to a tempfile.TemporaryDirectory with a context + The directory where samples will be temporarily stored, + if spilling samples to disk. If None, this loader + will perform buffered in-memory sampling. + If writing to disk, setting this argument + to a tempfile.TemporaryDirectory with a context manager is a good option but depending on the filesystem, you may want to choose an alternative location with fast I/O instead. - If not set, this will create a TemporaryDirectory that will - persist until this object is garbage collected. See cugraph.gnn.DistSampleWriter. 
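A sketch of the two modes the `directory` argument selects (assumes `data` and `ix` are defined as in the GCN examples above):

    # Buffered in-memory sampling: the default when directory=None.
    loader = NeighborLoader(data, num_neighbors=[10, 10], input_nodes=ix)

    # Spill samples to disk as parquet partitions instead.
    import tempfile

    with tempfile.TemporaryDirectory() as d:
        loader = NeighborLoader(
            data, num_neighbors=[10, 10], input_nodes=ix, directory=d
        )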
batches_per_partition: int (optional, default=256) The number of batches per partition if writing samples to @@ -174,8 +173,6 @@ def __init__( raise ValueError("Passing a neighbor sampler is currently unsupported") if time_attr is not None: raise ValueError("Temporal sampling is currently unsupported") - if weight_attr is not None: - raise ValueError("Biased sampling is currently unsupported") if is_sorted: warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") if not isinstance(data, (list, tuple)) or not isinstance( @@ -184,25 +181,28 @@ def __init__( # Will eventually automatically convert these objects to cuGraph objects. raise NotImplementedError("Currently can't accept non-cugraph graphs") - if directory is None: - warnings.warn("Setting a directory to store samples is recommended.") - self._tempdir = tempfile.TemporaryDirectory() - directory = self._tempdir.name - if compression is None: compression = "CSR" elif compression not in ["CSR", "COO"]: raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") - writer = DistSampleWriter( - directory=directory, - batches_per_partition=batches_per_partition, - format=format, + writer = ( + None + if directory is None + else DistSampleWriter( + directory=directory, + batches_per_partition=batches_per_partition, + format=format, + ) ) feature_store, graph_store = data + + if weight_attr is not None: + graph_store._set_weight_attr((feature_store, weight_attr)) + sampler = BaseSampler( - UniformNeighborSampler( + NeighborSampler( graph_store._graph, writer, retain_original_seeds=True, @@ -213,6 +213,7 @@ def __init__( compress_per_hop=False, with_replacement=replace, local_seeds_per_call=local_seeds_per_call, + biased=(weight_attr is not None), ), (feature_store, graph_store), batch_size=batch_size, diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index 49923783d6b..4b236f75885 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -110,8 +110,10 @@ def __init__( input_id, ) - self.__input_data = torch_geometric.loader.node_loader.NodeSamplerInput( - input_id=input_id, + self.__input_data = torch_geometric.sampler.NodeSamplerInput( + input_id=torch.arange(len(input_nodes), dtype=torch.int64, device="cuda") + if input_id is None + else input_id, node=input_nodes, time=None, input_type=input_type, @@ -135,10 +137,8 @@ def __iter__(self): d = perm.numel() % self.__batch_size perm = perm[:-d] - input_data = torch_geometric.loader.node_loader.NodeSamplerInput( - input_id=None - if self.__input_data.input_id is None - else self.__input_data.input_id[perm], + input_data = torch_geometric.sampler.NodeSamplerInput( + input_id=self.__input_data.input_id[perm], node=self.__input_data.node[perm], time=None if self.__input_data.time is None diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index d877b856ad6..bc3d4fd8d3c 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -14,7 +14,9 @@ from typing import Optional, Iterator, Union, Dict, Tuple from cugraph.utilities.utils import import_optional -from cugraph.gnn import DistSampler, DistSampleReader +from cugraph.gnn import DistSampler + +from .sampler_utils import filter_cugraph_pyg_store, neg_sample, neg_cat torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -58,13 
+60,31 @@ def __next__(self): next_sample = next(self.__output_iter) if isinstance(next_sample, torch_geometric.sampler.SamplerOutput): sz = next_sample.edge.numel() - if sz == next_sample.col.numel(): + if sz == next_sample.col.numel() and ( + next_sample.node.numel() > next_sample.col[-1] + ): + # This will only trigger on very small batches and will have minimal + # performance impact. If COO output is removed, then this condition + # can be avoided. col = next_sample.col else: col = torch_geometric.edge_index.ptr2index( next_sample.col, next_sample.edge.numel() ) + data = filter_cugraph_pyg_store( + self.__feature_store, + self.__graph_store, + next_sample.node, + next_sample.row, + col, + next_sample.edge, + None, + ) + + """ + # TODO Re-enable this once PyG resolves + # the issue with edge features (9566) data = torch_geometric.loader.utils.filter_custom_store( self.__feature_store, self.__graph_store, @@ -74,6 +94,7 @@ def __next__(self): next_sample.edge, None, ) + """ if "n_id" not in data: data.n_id = next_sample.node @@ -85,10 +106,20 @@ def __next__(self): data.num_sampled_nodes = next_sample.num_sampled_nodes data.num_sampled_edges = next_sample.num_sampled_edges - data.input_id = data.batch - data.seed_time = None + data.input_id = next_sample.metadata[0] data.batch_size = data.input_id.size(0) + if len(next_sample.metadata) == 2: + data.seed_time = next_sample.metadata[1] + elif len(next_sample.metadata) == 4: + ( + data.edge_label_index, + data.edge_label, + data.seed_time, + ) = next_sample.metadata[1:] + else: + raise ValueError("Invalid metadata") + elif isinstance(next_sample, torch_geometric.sampler.HeteroSamplerOutput): col = {} for edge_type, col_idx in next_sample.col: @@ -136,13 +167,15 @@ class SampleReader: Iterator that processes results from the cuGraph distributed sampler. """ - def __init__(self, base_reader: DistSampleReader): + def __init__( + self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] + ): """ Constructs a new SampleReader. Parameters ---------- - base_reader: DistSampleReader + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] The reader responsible for loading saved samples produced by the cuGraph distributed sampler. """ @@ -157,6 +190,9 @@ def __next__(self): self.__base_reader ) + self.__raw_sample_data["input_offsets"] -= self.__raw_sample_data[ + "input_offsets" + ][0].clone() self.__raw_sample_data["label_hop_offsets"] -= self.__raw_sample_data[ "label_hop_offsets" ][0].clone() @@ -186,14 +222,16 @@ class HomogeneousSampleReader(SampleReader): produced by the cuGraph distributed sampler. """ - def __init__(self, base_reader: DistSampleReader): + def __init__( + self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] + ): """ Constructs a new HomogeneousSampleReader Parameters ---------- - base_reader: DistSampleReader - The reader responsible for loading saved samples produced by + base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] + The iterator responsible for loading saved samples produced by the cuGraph distributed sampler. 
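A toy view of the offset rebasing performed as each chunk of saved samples is read (hypothetical values): offsets stored on disk are global to the chunk, so they are shifted to start at zero before per-batch slices are taken:

    import torch

    input_offsets = torch.tensor([100, 116, 132])  # as read from a chunk
    input_offsets -= input_offsets[0].clone()      # -> tensor([0, 16, 32])
    batch_0 = slice(int(input_offsets[0]), int(input_offsets[1]))  # first 16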
""" super().__init__(base_reader) @@ -246,14 +284,61 @@ def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] ) + input_index = raw_sample_data["input_index"][ + raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][ + index + 1 + ] + ] + + num_seeds = input_index.numel() + input_index = input_index[input_index >= 0] + + num_pos = input_index.numel() + num_neg = num_seeds - num_pos + if num_neg > 0: + edge_label = torch.concat( + [ + torch.full((num_pos,), 1.0), + torch.full((num_neg,), 0.0), + ] + ) + else: + edge_label = None + + edge_inverse = ( + ( + raw_sample_data["edge_inverse"][ + (raw_sample_data["input_offsets"][index] * 2) : ( + raw_sample_data["input_offsets"][index + 1] * 2 + ) + ] + ) + if "edge_inverse" in raw_sample_data + else None + ) + + if edge_inverse is None: + metadata = ( + input_index, + None, # TODO this will eventually include time + ) + else: + metadata = ( + input_index, + edge_inverse.view(2, -1), + edge_label, + None, # TODO this will eventually include time + ) + return torch_geometric.sampler.SamplerOutput( node=renumber_map.cpu(), row=minors, col=major_offsets, - edge=edge_id, + edge=edge_id.cpu(), batch=renumber_map[:num_seeds], num_sampled_nodes=num_sampled_nodes.cpu(), num_sampled_edges=num_sampled_edges.cpu(), + metadata=metadata, ) def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): @@ -299,6 +384,37 @@ def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] ) + input_index = raw_sample_data["input_index"][ + raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][ + index + 1 + ] + ] + + edge_inverse = ( + ( + raw_sample_data["edge_inverse"][ + (raw_sample_data["input_offsets"][index] * 2) : ( + raw_sample_data["input_offsets"][index + 1] * 2 + ) + ] + ) + if "edge_inverse" in raw_sample_data + else None + ) + + if edge_inverse is None: + metadata = ( + input_index, + None, # TODO this will eventually include time + ) + else: + metadata = ( + input_index, + edge_inverse.view(2, -1), + None, + None, # TODO this will eventually include time + ) + return torch_geometric.sampler.SamplerOutput( node=renumber_map.cpu(), row=minors, @@ -307,6 +423,7 @@ def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): batch=renumber_map[:num_seeds], num_sampled_nodes=num_sampled_nodes, num_sampled_edges=num_sampled_edges, + metadata=metadata, ) def _decode(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): @@ -337,8 +454,8 @@ def sample_from_nodes( "torch_geometric.sampler.SamplerOutput", ] ]: - self.__sampler.sample_from_nodes( - index.node, batch_size=self.__batch_size, **kwargs + reader = self.__sampler.sample_from_nodes( + index.node, batch_size=self.__batch_size, input_id=index.input_id, **kwargs ) edge_attrs = self.__graph_store.get_all_edge_attrs() @@ -346,7 +463,7 @@ def sample_from_nodes( len(edge_attrs) == 1 and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2] ): - return HomogeneousSampleReader(self.__sampler.get_reader()) + return HomogeneousSampleReader(reader) else: # TODO implement heterogeneous sampling raise NotImplementedError( @@ -365,4 +482,59 @@ def sample_from_edges( "torch_geometric.sampler.SamplerOutput", ] ]: - raise NotImplementedError("Edge sampling is currently unimplemented.") + src = index.row + dst = index.col + input_id = index.input_id + neg_batch_size = 0 + 
if neg_sampling: + # Sample every negative subset at once. + # TODO handle temporal sampling (node_time) + src_neg, dst_neg = neg_sample( + self.__graph_store, + index.row, + index.col, + self.__batch_size, + neg_sampling, + None, # src_time, + None, # src_node_time, + ) + if neg_sampling.is_binary(): + src, _ = neg_cat(src.cuda(), src_neg, self.__batch_size) + else: + # triplet, cat dst to src so length is the same; will + # result in the same set of unique vertices + src, _ = neg_cat(src.cuda(), dst_neg, self.__batch_size) + dst, neg_batch_size = neg_cat(dst.cuda(), dst_neg, self.__batch_size) + + # Concatenate -1s so the input id tensor lines up and can + # be processed by the dist sampler. + # When loading the output batch, '-1' will be dropped. + input_id, _ = neg_cat( + input_id, + torch.full( + (dst_neg.numel(),), -1, dtype=torch.int64, device=input_id.device + ), + self.__batch_size, + ) + + # TODO for temporal sampling, node times have to be + # adjusted here. + reader = self.__sampler.sample_from_edges( + torch.stack([src, dst]), # reverse of usual convention + input_id=input_id, + batch_size=self.__batch_size + neg_batch_size, + **kwargs, + ) + + edge_attrs = self.__graph_store.get_all_edge_attrs() + if ( + len(edge_attrs) == 1 + and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2] + ): + return HomogeneousSampleReader(reader) + else: + # TODO implement heterogeneous sampling + raise NotImplementedError( + "Sampling heterogeneous graphs is currently" + " unsupported in the non-dask API" + ) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py index c3e19393970..b3d56ef9992 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py @@ -14,10 +14,14 @@ from typing import Sequence, Dict, Tuple -from cugraph_pyg.data import DaskGraphStore +from math import ceil + +from cugraph_pyg.data import GraphStore, DaskGraphStore from cugraph.utilities.utils import import_optional import cudf +import cupy +import pylibcugraph dask_cudf = import_optional("dask_cudf") torch_geometric = import_optional("torch_geometric") @@ -403,3 +407,125 @@ def _sampler_output_from_sampling_results_heterogeneous( num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, metadata=metadata, ) + + +def filter_cugraph_pyg_store( + feature_store, + graph_store, + node, + row, + col, + edge, + clx, +) -> "torch_geometric.data.Data": + data = torch_geometric.data.Data() + + data.edge_index = torch.stack([row, col], dim=0) + + required_attrs = [] + for attr in feature_store.get_all_tensor_attrs(): + attr.index = edge if isinstance(attr.group_name, tuple) else node + required_attrs.append(attr) + data.num_nodes = attr.index.size(0) + + tensors = feature_store.multi_get_tensor(required_attrs) + for i, attr in enumerate(required_attrs): + data[attr.attr_name] = tensors[i] + + return data + + +def neg_sample( + graph_store: GraphStore, + seed_src: "torch.Tensor", + seed_dst: "torch.Tensor", + batch_size: int, + neg_sampling: "torch_geometric.sampler.NegativeSampling", + time: "torch.Tensor", + node_time: "torch.Tensor", +) -> Tuple["torch.Tensor", "torch.Tensor"]: + try: + # Compatibility for PyG 2.5 + src_weight = neg_sampling.src_weight + dst_weight = neg_sampling.dst_weight + except AttributeError: + src_weight = neg_sampling.weight + dst_weight = neg_sampling.weight + unweighted = src_weight is None and dst_weight is None + + # Require at least one 
negative edge per batch + num_neg = max( + int(ceil(neg_sampling.amount * seed_src.numel())), + int(ceil(seed_src.numel() / batch_size)), + ) + + if graph_store.is_multi_gpu: + num_neg_global = torch.tensor([num_neg], device="cuda") + torch.distributed.all_reduce(num_neg_global, op=torch.distributed.ReduceOp.SUM) + num_neg = int(num_neg_global) + else: + num_neg_global = num_neg + + if node_time is None: + result_dict = pylibcugraph.negative_sampling( + graph_store._resource_handle, + graph_store._graph, + num_neg_global, + vertices=None + if unweighted + else cupy.arange(src_weight.numel(), dtype="int64"), + src_bias=None if src_weight is None else cupy.asarray(src_weight), + dst_bias=None if dst_weight is None else cupy.asarray(dst_weight), + remove_duplicates=False, + remove_false_negatives=False, + exact_number_of_samples=True, + do_expensive_check=False, + ) + + src_neg = torch.as_tensor(result_dict["sources"], device="cuda")[:num_neg] + dst_neg = torch.as_tensor(result_dict["destinations"], device="cuda")[:num_neg] + + # TODO modify the C API so this condition is impossible + if src_neg.numel() < num_neg: + num_gen = num_neg - src_neg.numel() + src_neg = torch.concat( + [ + src_neg, + torch.randint( + 0, src_neg.max(), (num_gen,), device="cuda", dtype=torch.int64 + ), + ] + ) + dst_neg = torch.concat( + [ + dst_neg, + torch.randint( + 0, dst_neg.max(), (num_gen,), device="cuda", dtype=torch.int64 + ), + ] + ) + return src_neg, dst_neg + raise NotImplementedError( + "Temporal negative sampling is currently unimplemented in cuGraph-PyG" + ) + + +def neg_cat( + seed_pos: "torch.Tensor", seed_neg: "torch.Tensor", pos_batch_size: int +) -> Tuple["torch.Tensor", int]: + num_seeds = seed_pos.numel() + num_batches = int(ceil(num_seeds / pos_batch_size)) + neg_batch_size = int(ceil(seed_neg.numel() / num_batches)) + + batch_pos_offsets = torch.full((num_batches,), pos_batch_size).cumsum(-1)[:-1] + seed_pos_splits = torch.tensor_split(seed_pos, batch_pos_offsets) + + batch_neg_offsets = torch.full((num_batches,), neg_batch_size).cumsum(-1)[:-1] + seed_neg_splits = torch.tensor_split(seed_neg, batch_neg_offsets) + + return ( + torch.concatenate( + [torch.concatenate(s) for s in zip(seed_pos_splits, seed_neg_splits)] + ), + neg_batch_size, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py index 8edb5276953..8ee18a826f7 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py @@ -16,6 +16,7 @@ from cugraph.datasets import karate from cugraph.utilities.utils import import_optional, MissingModule +import cugraph_pyg from cugraph_pyg.data import TensorDictFeatureStore, GraphStore from cugraph_pyg.loader import NeighborLoader @@ -46,9 +47,150 @@ def test_neighbor_loader(): (feature_store, graph_store), [5, 5], input_nodes=torch.arange(34), - directory=".", ) for batch in loader: assert isinstance(batch, torch_geometric.data.Data) assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +def test_neighbor_loader_biased(): + eix = torch.tensor( + [ + [3, 4, 5], + [0, 1, 2], + ] + ) + + graph_store = GraphStore() + graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo") + + feature_store = TensorDictFeatureStore() + feature_store["person", "feat"] = torch.randint(128, 
(6, 12)) + feature_store[("person", "knows", "person"), "bias"] = torch.tensor( + [0, 12, 14], dtype=torch.float32 + ) + + loader = NeighborLoader( + (feature_store, graph_store), + [1], + input_nodes=torch.tensor([0, 1, 2], dtype=torch.int64), + batch_size=3, + weight_attr="bias", + ) + + out = list(iter(loader)) + assert len(out) == 1 + out = out[0] + + assert out.edge_index.shape[1] == 2 + assert (out.edge_index.cpu() == torch.tensor([[3, 4], [1, 2]])).all() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +@pytest.mark.parametrize("num_nodes", [10, 25]) +@pytest.mark.parametrize("num_edges", [64, 128]) +@pytest.mark.parametrize("batch_size", [2, 4]) +@pytest.mark.parametrize("select_edges", [16, 32]) +@pytest.mark.parametrize("depth", [1, 3]) +@pytest.mark.parametrize("num_neighbors", [1, 4]) +def test_link_neighbor_loader_basic( + num_nodes, num_edges, batch_size, select_edges, num_neighbors, depth +): + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() + + eix = torch.randperm(num_edges)[:select_edges] + graph_store[("n", "e", "n"), "coo"] = torch.stack( + [ + torch.randint(0, num_nodes, (num_edges,)), + torch.randint(0, num_nodes, (num_edges,)), + ] + ) + + elx = graph_store[("n", "e", "n"), "coo"][:, eix] + loader = cugraph_pyg.loader.LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[num_neighbors] * depth, + edge_label_index=elx, + batch_size=batch_size, + shuffle=False, + ) + + elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) + for i, batch in enumerate(loader): + assert ( + batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) + ).all() + assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_link_neighbor_loader_negative_sampling_basic(batch_size): + num_edges = 62 + num_nodes = 19 + select_edges = 17 + + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() + + eix = torch.randperm(num_edges)[:select_edges] + graph_store[("n", "e", "n"), "coo"] = torch.stack( + [ + torch.randint(0, num_nodes, (num_edges,)), + torch.randint(0, num_nodes, (num_edges,)), + ] + ) + + elx = graph_store[("n", "e", "n"), "coo"][:, eix] + loader = cugraph_pyg.loader.LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[3, 3, 3], + edge_label_index=elx, + batch_size=batch_size, + neg_sampling="binary", + shuffle=False, + ) + + elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) + for i, batch in enumerate(loader): + assert batch.edge_label[0] == 1.0 + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_link_neighbor_loader_negative_sampling_uneven(batch_size): + num_edges = 62 + num_nodes = 19 + select_edges = 17 + + graph_store = GraphStore() + feature_store = TensorDictFeatureStore() + + eix = torch.randperm(num_edges)[:select_edges] + graph_store[("n", "e", "n"), "coo"] = torch.stack( + [ + torch.randint(0, num_nodes, (num_edges,)), + torch.randint(0, num_nodes, (num_edges,)), + ] + ) + + elx = graph_store[("n", "e", "n"), "coo"][:, eix] + loader = cugraph_pyg.loader.LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[3, 3, 3], + edge_label_index=elx, + batch_size=batch_size, + 
neg_sampling=torch_geometric.sampler.NegativeSampling("binary", amount=0.1), + shuffle=False, + ) + + elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) + for i, batch in enumerate(loader): + assert batch.edge_label[0] == 1.0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py index 6a5f46b0940..d1dee01a508 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py @@ -19,7 +19,7 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph_pyg.data import TensorDictFeatureStore, GraphStore -from cugraph_pyg.loader import NeighborLoader +from cugraph_pyg.loader import NeighborLoader, LinkNeighborLoader from cugraph.gnn import ( cugraph_comms_init, @@ -27,6 +27,8 @@ cugraph_comms_create_unique_id, ) +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -36,6 +38,7 @@ def init_pytorch_worker(rank, world_size, cugraph_id): rmm.reinitialize( devices=rank, + pool_allocator=False, ) import cupy @@ -93,6 +96,7 @@ def run_test_neighbor_loader_mg(rank, uid, world_size, specify_size): cugraph_comms_shutdown() +@pytest.mark.skip(reason="deleteme") @pytest.mark.parametrize("specify_size", [True, False]) @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.mg @@ -109,3 +113,252 @@ def test_neighbor_loader_mg(specify_size): ), nprocs=world_size, ) + + +def run_test_neighbor_loader_biased_mg(rank, uid, world_size): + init_pytorch_worker(rank, world_size, uid) + + eix = torch.stack( + [ + torch.arange( + 3 * (world_size + rank), + 3 * (world_size + rank + 1), + dtype=torch.int64, + device="cuda", + ), + torch.arange(3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda"), + ] + ) + + graph_store = GraphStore(is_multi_gpu=True) + graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo") + + feature_store = TensorDictFeatureStore() + feature_store["person", "feat"] = torch.randint(128, (6 * world_size, 12)) + feature_store[("person", "knows", "person"), "bias"] = torch.concat( + [torch.tensor([0, 1, 1], dtype=torch.float32) for _ in range(world_size)] + ) + + loader = NeighborLoader( + (feature_store, graph_store), + [1], + input_nodes=torch.arange( + 3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda" + ), + batch_size=3, + weight_attr="bias", + ) + + out = list(iter(loader)) + assert len(out) == 1 + out = out[0] + + assert ( + out.edge_index.cpu() + == torch.tensor( + [ + [3, 4], + [1, 2], + ] + ) + ).all() + + cugraph_comms_shutdown() + + +@pytest.mark.skip(reason="deleteme") +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +def test_neighbor_loader_biased_mg(): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_neighbor_loader_biased_mg, + args=( + uid, + world_size, + ), + nprocs=world_size, + ) + + +def run_test_link_neighbor_loader_basic_mg( + rank, + uid, + world_size, + num_nodes: int, + num_edges: int, + select_edges: int, + batch_size: int, + num_neighbors: int, + depth: int, +): + init_pytorch_worker(rank, world_size, uid) + + graph_store = GraphStore(is_multi_gpu=True) + feature_store = TensorDictFeatureStore() + + eix = torch.randperm(num_edges)[:select_edges] + graph_store[("n", "e", 
"n"), "coo"] = torch.stack( + [ + torch.randint(0, num_nodes, (num_edges,)), + torch.randint(0, num_nodes, (num_edges,)), + ] + ) + + elx = graph_store[("n", "e", "n"), "coo"][:, eix] + loader = LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[num_neighbors] * depth, + edge_label_index=elx, + batch_size=batch_size, + shuffle=False, + ) + + elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) + for i, batch in enumerate(loader): + assert ( + batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) + ).all() + assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all() + + cugraph_comms_shutdown() + + +@pytest.mark.skip(reason="deleteme") +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +@pytest.mark.parametrize("select_edges", [64, 128]) +@pytest.mark.parametrize("batch_size", [2, 4]) +@pytest.mark.parametrize("depth", [1, 3]) +def test_link_neighbor_loader_basic_mg(select_edges, batch_size, depth): + num_nodes = 25 + num_edges = 128 + num_neighbors = 2 + + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_link_neighbor_loader_basic_mg, + args=( + uid, + world_size, + num_nodes, + num_edges, + select_edges, + batch_size, + num_neighbors, + depth, + ), + nprocs=world_size, + ) + + +def run_test_link_neighbor_loader_uneven_mg(rank, uid, world_size, edge_index): + init_pytorch_worker(rank, world_size, uid) + + graph_store = GraphStore(is_multi_gpu=True) + feature_store = TensorDictFeatureStore() + + batch_size = 1 + graph_store[("n", "e", "n"), "coo"] = torch.tensor_split( + edge_index, world_size, dim=-1 + )[rank] + + elx = graph_store[("n", "e", "n"), "coo"] # select all edges on each worker + loader = LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[2, 2, 2], + edge_label_index=elx, + batch_size=batch_size, + shuffle=False, + ) + + for i, batch in enumerate(loader): + assert ( + batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) + ).all() + + assert (elx[:, [i]] == batch.n_id[batch.edge_label_index.cpu()]).all() + + cugraph_comms_shutdown() + + +@pytest.mark.skip(reason="deleteme") +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +def test_link_neighbor_loader_uneven_mg(): + edge_index = torch.tensor( + [ + [0, 1, 3, 4, 7], + [1, 0, 8, 9, 12], + ] + ) + + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_link_neighbor_loader_uneven_mg, + args=( + uid, + world_size, + edge_index, + ), + nprocs=world_size, + ) + + +def run_test_link_neighbor_loader_negative_sampling_basic_mg( + rank, world_size, uid, batch_size +): + num_edges = 62 * world_size + num_nodes = 19 * world_size + select_edges = 17 + + init_pytorch_worker(rank, world_size, uid) + + graph_store = GraphStore(is_multi_gpu=True) + feature_store = TensorDictFeatureStore() + + eix = torch.randperm(num_edges)[:select_edges] + graph_store[("n", "e", "n"), "coo"] = torch.stack( + [ + torch.randint(0, num_nodes, (num_edges,)), + torch.randint(0, num_nodes, (num_edges,)), + ] + ) + + elx = graph_store[("n", "e", "n"), "coo"][:, eix] + loader = LinkNeighborLoader( + (feature_store, graph_store), + num_neighbors=[3, 3, 3], + edge_label_index=elx, + batch_size=batch_size, + neg_sampling="binary", + shuffle=False, + ) + + elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) + for i, batch 
in enumerate(loader): + assert batch.edge_label[0] == 1.0 + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_link_neighbor_loader_negative_sampling_basic_mg(batch_size): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_link_neighbor_loader_negative_sampling_basic_mg, + args=( + world_size, + uid, + batch_size, + ), + nprocs=world_size, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini b/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index b29c108e3f4..e157f36f8f6 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -20,19 +20,19 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ - "cugraph==24.10.*,>=0.0.0a0", + "cugraph==24.12.*,>=0.0.0a0", "numba>=0.57", - "numpy>=1.23,<2.0a0", - "pylibcugraphops==24.10.*,>=0.0.0a0", + "numpy>=1.23,<3.0a0", + "pylibcugraphops==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -42,14 +42,14 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/" [project.optional-dependencies] test = [ "pandas", - "pylibwholegraph==24.10.*,>=0.0.0a0", + "pylibwholegraph==24.12.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", "pytest-xdist", "scipy", "tensordict>=0.1.2", - "torch>=2.0,<2.2.0a0", + "torch>=2.3,<2.4.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.setuptools] diff --git a/python/cugraph-pyg/pytest.ini b/python/cugraph-pyg/pytest.ini index db99a54ae49..07c4ffa0958 100644 --- a/python/cugraph-pyg/pytest.ini +++ b/python/cugraph-pyg/pytest.ini @@ -17,6 +17,7 @@ addopts = --benchmark-max-time=0 --benchmark-min-rounds=1 --benchmark-columns="mean, rounds" + --tb=native ## do not run slow tests/benchmarks by default -m "not slow" diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml index 75deea88e2e..ac5e6bad0d5 100644 --- a/python/cugraph-service/client/pyproject.toml +++ b/python/cugraph-service/client/pyproject.toml @@ -18,16 +18,16 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "thriftpy2!=0.5.0,!=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. 
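# The new multi-GPU tests earlier in this diff all share one harness: the
# parent process creates a cugraph-comms unique id, and
# torch.multiprocessing.spawn launches one worker per GPU. spawn() always
# prepends the worker index, so a function defined as fn(rank, *args) receives
# its rank automatically -- which is why the spawn calls above pass only the
# remaining arguments. A minimal, self-contained sketch of the pattern follows;
# the worker body and the stand-in uid are illustrative only, not the tests'
# actual logic, and at least one visible GPU is assumed.
import torch
import torch.multiprocessing as mp


def _worker(rank: int, uid: bytes, world_size: int) -> None:
    # A real worker would call cugraph_comms_init(rank, world_size, uid, ...)
    # here, run its shard of the test, then call cugraph_comms_shutdown().
    print(f"worker {rank}/{world_size} received a {len(uid)}-byte uid")


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    uid = b"\x00" * 16  # hypothetical stand-in for cugraph_comms_create_unique_id()
    mp.spawn(_worker, args=(uid, world_size), nprocs=world_size)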
classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.urls] diff --git a/python/cugraph-service/pytest.ini b/python/cugraph-service/pytest.ini index 6a0dd36ecec..f2ba9175f82 100644 --- a/python/cugraph-service/pytest.ini +++ b/python/cugraph-service/pytest.ini @@ -16,6 +16,7 @@ addopts = --benchmark-warmup=off --benchmark-max-time=0 --benchmark-min-rounds=1 --benchmark-columns="min, max, mean, rounds" + --tb=native ## for use with rapids-pytest-benchmark plugin #--benchmark-gpu-disable ## for use with pytest-cov plugin diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 2ae40911821..f388fd4c126 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -18,27 +18,27 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*,>=0.0.0a0", - "cugraph-service-client==24.10.*,>=0.0.0a0", - "cugraph==24.10.*,>=0.0.0a0", + "cudf==24.12.*,>=0.0.0a0", + "cugraph-service-client==24.12.*,>=0.0.0a0", + "cugraph==24.12.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.10.*,>=0.0.0a0", - "dask-cudf==24.10.*,>=0.0.0a0", + "dask-cuda==24.12.*,>=0.0.0a0", + "dask-cudf==24.12.*,>=0.0.0a0", "numba>=0.57", - "numpy>=1.23,<2.0a0", - "rapids-dask-dependency==24.10.*,>=0.0.0a0", - "rmm==24.10.*,>=0.0.0a0", + "numpy>=1.23,<3.0a0", + "rapids-dask-dependency==24.12.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", "thriftpy2!=0.5.0,!=0.5.1", - "ucx-py==0.40.*,>=0.0.0a0", + "ucx-py==0.41.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.scripts] @@ -47,7 +47,7 @@ cugraph-service-server = "cugraph_service_server.__main__:main" [project.optional-dependencies] test = [ "networkx>=2.5.1", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "pandas", "pytest", "pytest-benchmark", diff --git a/python/cugraph-service/tests/pytest.ini b/python/cugraph-service/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cugraph-service/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cugraph-service/tests/test_e2e.py b/python/cugraph-service/tests/test_e2e.py index c9b3d24f20e..3079a2423c7 100644 --- a/python/cugraph-service/tests/test_e2e.py +++ b/python/cugraph-service/tests/test_e2e.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
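# The test_e2e.py and test_mg_e2e.py hunks that follow replace type equality
# checks with identity checks, the form flake8 E721 asks for. A class object
# is a singleton, so ``is`` compares against the one true type object and
# cannot be affected by an overridden __eq__ on a metaclass; isinstance() is
# the right tool when subclasses should also pass. A short, runnable
# illustration of the distinction:
assert type(1) is int  # identity against the single built-in int type object
assert type(True) is not int  # exact-type check: bool is a subclass of int
assert isinstance(True, int)  # isinstance() accepts subclasses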
@@ -307,8 +307,8 @@ def test_load_call_unload_extension(client, extension1): assert len(results) == 2 assert len(results[0]) == 33 assert len(results[1]) == 21 - assert type(results[0][0]) == int - assert type(results[1][0]) == float + assert type(results[0][0]) is int + assert type(results[1][0]) is float assert results[0][0] == 9 assert results[1][0] == 9.0 diff --git a/python/cugraph-service/tests/test_mg_e2e.py b/python/cugraph-service/tests/test_mg_e2e.py index 39c1195151d..5526593aee0 100644 --- a/python/cugraph-service/tests/test_mg_e2e.py +++ b/python/cugraph-service/tests/test_mg_e2e.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -413,8 +413,8 @@ def test_call_extension_result_on_device( assert len(results) == 2 assert len(results[0]) == array1_len assert len(results[1]) == array2_len - assert type(results[0][0]) == int - assert type(results[1][0]) == float + assert type(results[0][0]) is int + assert type(results[1][0]) is float assert results[0][0] == 9 assert results[1][0] == 9.0 else: diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index dfccf02d042..ca38b5551c9 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de OFF ) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) +option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF) if(NOT USE_CUGRAPH_OPS) message(STATUS "Disabling libcugraph functions that reference cugraph-ops") @@ -49,18 +50,39 @@ endif() include(rapids-cython-core) if(NOT cugraph_FOUND) + find_package(CUDAToolkit REQUIRED) + set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) set(USE_RAFT_STATIC ON) set(CUGRAPH_COMPILE_RAFT_LIB ON) set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) set(ALLOW_CLONE_CUGRAPH_OPS ON) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0) + set(CUDA_STATIC_MATH_LIBRARIES OFF) + elseif(USE_CUDA_MATH_WHEELS) + message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0") + endif() + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) + if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS) + set(rpaths + "$ORIGIN/../nvidia/cublas/lib" + "$ORIGIN/../nvidia/cufft/lib" + "$ORIGIN/../nvidia/curand/lib" + "$ORIGIN/../nvidia/cusolver/lib" + "$ORIGIN/../nvidia/cusparse/lib" + "$ORIGIN/../nvidia/nvjitlink/lib" + ) + set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND) + endif() + set(cython_lib_dir cugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) endif() diff --git a/python/cugraph/cugraph/gnn/__init__.py b/python/cugraph/cugraph/gnn/__init__.py index b6c8e1981d0..5845f70ef7c 100644 --- a/python/cugraph/cugraph/gnn/__init__.py +++ b/python/cugraph/cugraph/gnn/__init__.py @@ -13,11 +13,13 @@ from .feature_storage.feat_storage import FeatureStore from .data_loading.bulk_sampler import BulkSampler -from .data_loading.dist_sampler import ( +from .data_loading import ( DistSampler, DistSampleWriter, DistSampleReader, + NeighborSampler, UniformNeighborSampler, + BiasedNeighborSampler, ) from .comms.cugraph_nccl_comms import ( cugraph_comms_init, diff --git 
a/python/cugraph/cugraph/gnn/data_loading/__init__.py b/python/cugraph/cugraph/gnn/data_loading/__init__.py index 98c547a0083..25f58be88aa 100644 --- a/python/cugraph/cugraph/gnn/data_loading/__init__.py +++ b/python/cugraph/cugraph/gnn/data_loading/__init__.py @@ -14,7 +14,26 @@ from cugraph.gnn.data_loading.bulk_sampler import BulkSampler from cugraph.gnn.data_loading.dist_sampler import ( DistSampler, + NeighborSampler, +) +from cugraph.gnn.data_loading.dist_io import ( DistSampleWriter, DistSampleReader, - UniformNeighborSampler, + BufferedSampleReader, ) + + +def UniformNeighborSampler(*args, **kwargs): + return NeighborSampler( + *args, + **kwargs, + biased=False, + ) + + +def BiasedNeighborSampler(*args, **kwargs): + return NeighborSampler( + *args, + **kwargs, + biased=True, + ) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 6abbd82647b..222fb49a836 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -33,10 +33,12 @@ def create_df_from_disjoint_series(series_list: List[cudf.Series]): def create_df_from_disjoint_arrays(array_dict: Dict[str, cupy.array]): + series_dict = {} for k in list(array_dict.keys()): - array_dict[k] = cudf.Series(array_dict[k], name=k) + if array_dict[k] is not None: + series_dict[k] = cudf.Series(array_dict[k], name=k) - return create_df_from_disjoint_series(list(array_dict.values())) + return create_df_from_disjoint_series(list(series_dict.values())) def _write_samples_to_parquet_csr( diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_io/__init__.py b/python/cugraph/cugraph/gnn/data_loading/dist_io/__init__.py new file mode 100644 index 00000000000..29bb5489be2 --- /dev/null +++ b/python/cugraph/cugraph/gnn/data_loading/dist_io/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .reader import BufferedSampleReader, DistSampleReader +from .writer import DistSampleWriter diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_io/reader.py b/python/cugraph/cugraph/gnn/data_loading/dist_io/reader.py new file mode 100644 index 00000000000..69f909e7a8d --- /dev/null +++ b/python/cugraph/cugraph/gnn/data_loading/dist_io/reader.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
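# The new dist_io module splits sample I/O out of dist_sampler.py. The read
# path it exposes is sketched below: DistSampleReader (defined in this file)
# walks a directory of parquet partitions written by DistSampleWriter and
# yields one (tensors, start_batch, end_batch) tuple per partition, with each
# non-empty column materialized as a CUDA tensor. A minimal sketch, assuming
# a hypothetical "/tmp/samples" directory already populated by a single-rank
# DistSampleWriter:
from cugraph.gnn.data_loading.dist_io import DistSampleReader

for tensors, start_batch, end_batch in DistSampleReader(
    "/tmp/samples", format="parquet", rank=None
):
    # Typical keys: "majors"/"minors" (COO) or "major_offsets" (CSR), plus
    # "map", "label_hop_offsets", "renumber_map_offsets", and so on.
    print(start_batch, end_batch, sorted(tensors.keys()))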
+ + +import os +import re + +import cudf + +from typing import Callable, Iterator, Tuple, Dict, Optional + +from cugraph.utilities.utils import import_optional, MissingModule + +# Prevent PyTorch from being imported and causing an OOM error +torch = MissingModule("torch") + + +class DistSampleReader: + def __init__( + self, + directory: str, + *, + format: str = "parquet", + rank: Optional[int] = None, + filelist=None, + ): + torch = import_optional("torch") + + self.__format = format + self.__directory = directory + + if format != "parquet": + raise ValueError("Invalid format (currently supported: 'parquet')") + + if filelist is None: + files = os.listdir(directory) + ex = re.compile(r"batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet") + filematch = [ex.match(f) for f in files] + filematch = [f for f in filematch if f] + + if rank is not None: + filematch = [f for f in filematch if int(f[1]) == rank] + + batch_count = sum([int(f[4]) - int(f[2]) + 1 for f in filematch]) + filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) + + self.__files = filematch + else: + self.__files = list(filelist) + + if rank is None: + self.__batch_count = batch_count + else: + # TODO maybe remove this in favor of warning users that they are + # probably going to cause a hang, instead of attempting to resolve + # the hang for them by dropping batches. + batch_count = torch.tensor([batch_count], device="cuda") + torch.distributed.all_reduce(batch_count, torch.distributed.ReduceOp.MIN) + self.__batch_count = int(batch_count) + + def __iter__(self): + return self + + def __next__(self) -> Tuple[Dict[str, "torch.Tensor"], int, int]: + torch = import_optional("torch") + + if len(self.__files) > 0: + f = self.__files.pop() + fname = f[0] + start_inclusive = int(f[2]) + end_inclusive = int(f[4]) + + if (end_inclusive - start_inclusive + 1) > self.__batch_count: + end_inclusive = start_inclusive + self.__batch_count - 1 + self.__batch_count = 0 + else: + self.__batch_count -= end_inclusive - start_inclusive + 1 + + df = cudf.read_parquet(os.path.join(self.__directory, fname)) + tensors = {} + for col in list(df.columns): + s = df[col].dropna() + if len(s) > 0: + tensors[col] = torch.as_tensor(s, device="cuda") + df.drop(col, axis=1, inplace=True) + + return tensors, start_inclusive, end_inclusive + + raise StopIteration + + +class BufferedSampleReader: + def __init__( + self, + nodes_call_groups: list["torch.Tensor"], + sample_fn: Callable[..., Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]], + *args, + **kwargs, + ): + self.__sample_args = args + self.__sample_kwargs = kwargs + + self.__nodes_call_groups = iter(nodes_call_groups) + self.__sample_fn = sample_fn + self.__current_call_id = 0 + self.__current_reader = None + + def __next__(self) -> Tuple[Dict[str, "torch.Tensor"], int, int]: + new_reader = False + + if self.__current_reader is None: + new_reader = True + else: + try: + out = next(self.__current_reader) + except StopIteration: + new_reader = True + + if new_reader: + # Will trigger StopIteration if there are no more call groups + self.__current_reader = self.__sample_fn( + self.__current_call_id, + next(self.__nodes_call_groups), + *self.__sample_args, + **self.__sample_kwargs, + ) + + self.__current_call_id += 1 + out = next(self.__current_reader) + + return out + + def __iter__(self) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: + return self diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_io/writer.py 
b/python/cugraph/cugraph/gnn/data_loading/dist_io/writer.py new file mode 100644 index 00000000000..f8ad4719a76 --- /dev/null +++ b/python/cugraph/cugraph/gnn/data_loading/dist_io/writer.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from math import ceil + + +import cupy + +from cugraph.utilities.utils import MissingModule +from cugraph.gnn.data_loading.dist_io import DistSampleReader + +from cugraph.gnn.data_loading.bulk_sampler_io import create_df_from_disjoint_arrays + +from typing import Iterator, Tuple, Dict + +torch = MissingModule("torch") + + +class DistSampleWriter: + def __init__( + self, + directory: str, + *, + batches_per_partition: int = 256, + format: str = "parquet", + ): + """ + Parameters + ---------- + directory: str (required) + The directory where samples will be written. This + writer can only write to disk. + batches_per_partition: int (optional, default=256) + The number of batches to write in a single file. + format: str (optional, default='parquet') + The file format of the output files containing the + sampled minibatches. Currently, only parquet format + is supported. + """ + if format != "parquet": + raise ValueError("Invalid format (currently supported: 'parquet')") + + self.__format = format + self.__directory = directory + self.__batches_per_partition = batches_per_partition + + @property + def _format(self): + return self.__format + + @property + def _directory(self): + return self.__directory + + @property + def _batches_per_partition(self): + return self.__batches_per_partition + + def get_reader( + self, rank: int + ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: + """ + Returns an iterator over sampled data. + """ + + # currently only disk reading is supported + return DistSampleReader(self._directory, format=self._format, rank=rank) + + def __write_minibatches_coo(self, minibatch_dict): + has_edge_ids = minibatch_dict["edge_id"] is not None + has_edge_types = minibatch_dict["edge_type"] is not None + has_weights = minibatch_dict["weight"] is not None + + if minibatch_dict["renumber_map"] is None: + raise ValueError( + "Distributed sampling without renumbering is not supported" + ) + + # Quit if there are no batches to write. 
+ if len(minibatch_dict["batch_id"]) == 0: + return + + fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( + minibatch_dict["batch_id"] + ) + + for p in range( + 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) + ): + partition_start = p * (self.__batches_per_partition) + partition_end = (p + 1) * (self.__batches_per_partition) + + label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][ + partition_start * fanout_length : partition_end * fanout_length + 1 + ] + + batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] + start_batch_id = batch_id_array_p[0] + + input_offsets_p = minibatch_dict["input_offsets"][ + partition_start : (partition_end + 1) + ] + input_index_p = minibatch_dict["input_index"][ + input_offsets_p[0] : input_offsets_p[-1] + ] + edge_inverse_p = ( + minibatch_dict["edge_inverse"][ + (input_offsets_p[0] * 2) : (input_offsets_p[-1] * 2) + ] + if "edge_inverse" in minibatch_dict + else None + ) + + start_ix, end_ix = label_hop_offsets_array_p[[0, -1]] + majors_array_p = minibatch_dict["majors"][start_ix:end_ix] + minors_array_p = minibatch_dict["minors"][start_ix:end_ix] + edge_id_array_p = ( + minibatch_dict["edge_id"][start_ix:end_ix] + if has_edge_ids + else cupy.array([], dtype="int64") + ) + edge_type_array_p = ( + minibatch_dict["edge_type"][start_ix:end_ix] + if has_edge_types + else cupy.array([], dtype="int32") + ) + weight_array_p = ( + minibatch_dict["weight"][start_ix:end_ix] + if has_weights + else cupy.array([], dtype="float32") + ) + + # create the renumber map offsets + renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][ + partition_start : partition_end + 1 + ] + + renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[ + [0, -1] + ] + + renumber_map_array_p = minibatch_dict["renumber_map"][ + renumber_map_start_ix:renumber_map_end_ix + ] + + results_dataframe_p = create_df_from_disjoint_arrays( + { + "majors": majors_array_p, + "minors": minors_array_p, + "map": renumber_map_array_p, + "label_hop_offsets": label_hop_offsets_array_p, + "weight": weight_array_p, + "edge_id": edge_id_array_p, + "edge_type": edge_type_array_p, + "renumber_map_offsets": renumber_map_offsets_array_p, + "input_index": input_index_p, + "input_offsets": input_offsets_p, + "edge_inverse": edge_inverse_p, + } + ) + + end_batch_id = start_batch_id + len(batch_id_array_p) - 1 + rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0 + + full_output_path = os.path.join( + self.__directory, + f"batch={rank:05d}.{start_batch_id:08d}-" + f"{rank:05d}.{end_batch_id:08d}.parquet", + ) + + results_dataframe_p.to_parquet( + full_output_path, + compression=None, + index=False, + force_nullable_schema=True, + ) + + def __write_minibatches_csr(self, minibatch_dict): + has_edge_ids = minibatch_dict["edge_id"] is not None + has_edge_types = minibatch_dict["edge_type"] is not None + has_weights = minibatch_dict["weight"] is not None + + if minibatch_dict["renumber_map"] is None: + raise ValueError( + "Distributed sampling without renumbering is not supported" + ) + + # Quit if there are no batches to write. 
+ if len(minibatch_dict["batch_id"]) == 0: + return + + fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( + minibatch_dict["batch_id"] + ) + + for p in range( + 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) + ): + partition_start = p * (self.__batches_per_partition) + partition_end = (p + 1) * (self.__batches_per_partition) + + label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][ + partition_start * fanout_length : partition_end * fanout_length + 1 + ] + + batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] + start_batch_id = batch_id_array_p[0] + + input_offsets_p = minibatch_dict["input_offsets"][ + partition_start : (partition_end + 1) + ] + input_index_p = minibatch_dict["input_index"][ + input_offsets_p[0] : input_offsets_p[-1] + ] + edge_inverse_p = ( + minibatch_dict["edge_inverse"][ + (input_offsets_p[0] * 2) : (input_offsets_p[-1] * 2) + ] + if "edge_inverse" in minibatch_dict + else None + ) + + # major offsets and minors + ( + major_offsets_start_incl, + major_offsets_end_incl, + ) = label_hop_offsets_array_p[[0, -1]] + + start_ix, end_ix = minibatch_dict["major_offsets"][ + [major_offsets_start_incl, major_offsets_end_incl] + ] + + major_offsets_array_p = minibatch_dict["major_offsets"][ + major_offsets_start_incl : major_offsets_end_incl + 1 + ] + + minors_array_p = minibatch_dict["minors"][start_ix:end_ix] + edge_id_array_p = ( + minibatch_dict["edge_id"][start_ix:end_ix] + if has_edge_ids + else cupy.array([], dtype="int64") + ) + edge_type_array_p = ( + minibatch_dict["edge_type"][start_ix:end_ix] + if has_edge_types + else cupy.array([], dtype="int32") + ) + weight_array_p = ( + minibatch_dict["weight"][start_ix:end_ix] + if has_weights + else cupy.array([], dtype="float32") + ) + + # create the renumber map offsets + renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][ + partition_start : partition_end + 1 + ] + + renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[ + [0, -1] + ] + + renumber_map_array_p = minibatch_dict["renumber_map"][ + renumber_map_start_ix:renumber_map_end_ix + ] + + results_dataframe_p = create_df_from_disjoint_arrays( + { + "major_offsets": major_offsets_array_p, + "minors": minors_array_p, + "map": renumber_map_array_p, + "label_hop_offsets": label_hop_offsets_array_p, + "weight": weight_array_p, + "edge_id": edge_id_array_p, + "edge_type": edge_type_array_p, + "renumber_map_offsets": renumber_map_offsets_array_p, + "input_index": input_index_p, + "input_offsets": input_offsets_p, + "edge_inverse": edge_inverse_p, + } + ) + + end_batch_id = start_batch_id + len(batch_id_array_p) - 1 + rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0 + + full_output_path = os.path.join( + self.__directory, + f"batch={rank:05d}.{start_batch_id:08d}-" + f"{rank:05d}.{end_batch_id:08d}.parquet", + ) + + results_dataframe_p.to_parquet( + full_output_path, + compression=None, + index=False, + force_nullable_schema=True, + ) + + def write_minibatches(self, minibatch_dict): + if (minibatch_dict["majors"] is not None) and ( + minibatch_dict["minors"] is not None + ): + self.__write_minibatches_coo(minibatch_dict) + elif (minibatch_dict["major_offsets"] is not None) and ( + minibatch_dict["minors"] is not None + ): + self.__write_minibatches_csr(minibatch_dict) + else: + raise ValueError("invalid columns") diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py 
index a5a84362a07..0ff38741e1a 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -11,8 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import re import warnings from math import ceil from functools import reduce @@ -27,348 +25,19 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph.gnn.comms import cugraph_comms_get_raft_handle -from cugraph.gnn.data_loading.bulk_sampler_io import create_df_from_disjoint_arrays + +from cugraph.gnn.data_loading.dist_io import BufferedSampleReader +from cugraph.gnn.data_loading.dist_io import DistSampleWriter torch = MissingModule("torch") TensorType = Union["torch.Tensor", cupy.ndarray, cudf.Series] -class DistSampleReader: - def __init__( - self, - directory: str, - *, - format: str = "parquet", - rank: Optional[int] = None, - filelist=None, - ): - torch = import_optional("torch") - - self.__format = format - self.__directory = directory - - if format != "parquet": - raise ValueError("Invalid format (currently supported: 'parquet')") - - if filelist is None: - files = os.listdir(directory) - ex = re.compile(r"batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet") - filematch = [ex.match(f) for f in files] - filematch = [f for f in filematch if f] - - if rank is not None: - filematch = [f for f in filematch if int(f[1]) == rank] - - batch_count = sum([int(f[4]) - int(f[2]) + 1 for f in filematch]) - filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) - - self.__files = filematch - else: - self.__files = list(filelist) - - if rank is None: - self.__batch_count = batch_count - else: - batch_count = torch.tensor([batch_count], device="cuda") - torch.distributed.all_reduce(batch_count, torch.distributed.ReduceOp.MIN) - self.__batch_count = int(batch_count) - - def __iter__(self): - return self - - def __next__(self): - torch = import_optional("torch") - - if len(self.__files) > 0: - f = self.__files.pop() - fname = f[0] - start_inclusive = int(f[2]) - end_inclusive = int(f[4]) - - if (end_inclusive - start_inclusive + 1) > self.__batch_count: - end_inclusive = start_inclusive + self.__batch_count - 1 - self.__batch_count = 0 - else: - self.__batch_count -= end_inclusive - start_inclusive + 1 - - df = cudf.read_parquet(os.path.join(self.__directory, fname)) - tensors = {} - for col in list(df.columns): - s = df[col].dropna() - if len(s) > 0: - tensors[col] = torch.as_tensor(s, device="cuda") - df.drop(col, axis=1, inplace=True) - - return tensors, start_inclusive, end_inclusive - - raise StopIteration - - -class DistSampleWriter: - def __init__( - self, - directory: str, - *, - batches_per_partition: int = 256, - format: str = "parquet", - ): - """ - Parameters - ---------- - directory: str (required) - The directory where samples will be written. This - writer can only write to disk. - batches_per_partition: int (optional, default=256) - The number of batches to write in a single file. - format: str (optional, default='parquet') - The file format of the output files containing the - sampled minibatches. Currently, only parquet format - is supported. 
- """ - if format != "parquet": - raise ValueError("Invalid format (currently supported: 'parquet')") - - self.__format = format - self.__directory = directory - self.__batches_per_partition = batches_per_partition - - @property - def _format(self): - return self.__format - - @property - def _directory(self): - return self.__directory - - @property - def _batches_per_partition(self): - return self.__batches_per_partition - - def get_reader( - self, rank: int - ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: - """ - Returns an iterator over sampled data. - """ - - # currently only disk reading is supported - return DistSampleReader(self._directory, format=self._format, rank=rank) - - def __write_minibatches_coo(self, minibatch_dict): - has_edge_ids = minibatch_dict["edge_id"] is not None - has_edge_types = minibatch_dict["edge_type"] is not None - has_weights = minibatch_dict["weight"] is not None - - if minibatch_dict["renumber_map"] is None: - raise ValueError( - "Distributed sampling without renumbering is not supported" - ) - - # Quit if there are no batches to write. - if len(minibatch_dict["batch_id"]) == 0: - return - - fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( - minibatch_dict["batch_id"] - ) - rank_batch_offset = minibatch_dict["batch_id"][0] - - for p in range( - 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) - ): - partition_start = p * (self.__batches_per_partition) - partition_end = (p + 1) * (self.__batches_per_partition) - - label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][ - partition_start * fanout_length : partition_end * fanout_length + 1 - ] - - batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] - start_batch_id = batch_id_array_p[0] - rank_batch_offset - - start_ix, end_ix = label_hop_offsets_array_p[[0, -1]] - majors_array_p = minibatch_dict["majors"][start_ix:end_ix] - minors_array_p = minibatch_dict["minors"][start_ix:end_ix] - edge_id_array_p = ( - minibatch_dict["edge_id"][start_ix:end_ix] - if has_edge_ids - else cupy.array([], dtype="int64") - ) - edge_type_array_p = ( - minibatch_dict["edge_type"][start_ix:end_ix] - if has_edge_types - else cupy.array([], dtype="int32") - ) - weight_array_p = ( - minibatch_dict["weight"][start_ix:end_ix] - if has_weights - else cupy.array([], dtype="float32") - ) - - # create the renumber map offsets - renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][ - partition_start : partition_end + 1 - ] - - renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[ - [0, -1] - ] - - renumber_map_array_p = minibatch_dict["renumber_map"][ - renumber_map_start_ix:renumber_map_end_ix - ] - - results_dataframe_p = create_df_from_disjoint_arrays( - { - "majors": majors_array_p, - "minors": minors_array_p, - "map": renumber_map_array_p, - "label_hop_offsets": label_hop_offsets_array_p, - "weight": weight_array_p, - "edge_id": edge_id_array_p, - "edge_type": edge_type_array_p, - "renumber_map_offsets": renumber_map_offsets_array_p, - } - ) - - end_batch_id = start_batch_id + len(batch_id_array_p) - 1 - rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0 - - full_output_path = os.path.join( - self.__directory, - f"batch={rank:05d}.{start_batch_id:08d}-" - f"{rank:05d}.{end_batch_id:08d}.parquet", - ) - - results_dataframe_p.to_parquet( - full_output_path, - compression=None, - index=False, - force_nullable_schema=True, - ) - - def __write_minibatches_csr(self, minibatch_dict): - 
has_edge_ids = minibatch_dict["edge_id"] is not None - has_edge_types = minibatch_dict["edge_type"] is not None - has_weights = minibatch_dict["weight"] is not None - - if minibatch_dict["renumber_map"] is None: - raise ValueError( - "Distributed sampling without renumbering is not supported" - ) - - # Quit if there are no batches to write. - if len(minibatch_dict["batch_id"]) == 0: - return - - fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( - minibatch_dict["batch_id"] - ) - - for p in range( - 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) - ): - partition_start = p * (self.__batches_per_partition) - partition_end = (p + 1) * (self.__batches_per_partition) - - label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][ - partition_start * fanout_length : partition_end * fanout_length + 1 - ] - - batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] - start_batch_id = batch_id_array_p[0] - - # major offsets and minors - ( - major_offsets_start_incl, - major_offsets_end_incl, - ) = label_hop_offsets_array_p[[0, -1]] - - start_ix, end_ix = minibatch_dict["major_offsets"][ - [major_offsets_start_incl, major_offsets_end_incl] - ] - - major_offsets_array_p = minibatch_dict["major_offsets"][ - major_offsets_start_incl : major_offsets_end_incl + 1 - ] - - minors_array_p = minibatch_dict["minors"][start_ix:end_ix] - edge_id_array_p = ( - minibatch_dict["edge_id"][start_ix:end_ix] - if has_edge_ids - else cupy.array([], dtype="int64") - ) - edge_type_array_p = ( - minibatch_dict["edge_type"][start_ix:end_ix] - if has_edge_types - else cupy.array([], dtype="int32") - ) - weight_array_p = ( - minibatch_dict["weight"][start_ix:end_ix] - if has_weights - else cupy.array([], dtype="float32") - ) - - # create the renumber map offsets - renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][ - partition_start : partition_end + 1 - ] - - renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[ - [0, -1] - ] - - renumber_map_array_p = minibatch_dict["renumber_map"][ - renumber_map_start_ix:renumber_map_end_ix - ] - - results_dataframe_p = create_df_from_disjoint_arrays( - { - "major_offsets": major_offsets_array_p, - "minors": minors_array_p, - "map": renumber_map_array_p, - "label_hop_offsets": label_hop_offsets_array_p, - "weight": weight_array_p, - "edge_id": edge_id_array_p, - "edge_type": edge_type_array_p, - "renumber_map_offsets": renumber_map_offsets_array_p, - } - ) - - end_batch_id = start_batch_id + len(batch_id_array_p) - 1 - rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0 - - full_output_path = os.path.join( - self.__directory, - f"batch={rank:05d}.{start_batch_id:08d}-" - f"{rank:05d}.{end_batch_id:08d}.parquet", - ) - - results_dataframe_p.to_parquet( - full_output_path, - compression=None, - index=False, - force_nullable_schema=True, - ) - - def write_minibatches(self, minibatch_dict): - if (minibatch_dict["majors"] is not None) and ( - minibatch_dict["minors"] is not None - ): - self.__write_minibatches_coo(minibatch_dict) - elif (minibatch_dict["major_offsets"] is not None) and ( - minibatch_dict["minors"] is not None - ): - self.__write_minibatches_csr(minibatch_dict) - else: - raise ValueError("invalid columns") - - class DistSampler: def __init__( self, graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph], - writer: DistSampleWriter, + writer: Optional[DistSampleWriter], local_seeds_per_call: int, retain_original_seeds: bool = False, ): @@ -379,7 +48,8 
@@ def __init__( The pylibcugraph graph object that will be sampled. writer: DistSampleWriter (required) The writer responsible for writing samples to disk - or, in the future, device or host memory. + or, if None, samples will be written to memory + instead. local_seeds_per_call: int The number of seeds on this rank this sampler will process in a single sampling call. Batches will @@ -402,14 +72,6 @@ def __init__( self.__handle = None self.__retain_original_seeds = retain_original_seeds - def get_reader(self) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: - """ - Returns an iterator over sampled data. - """ - torch = import_optional("torch") - rank = torch.distributed.get_rank() if self.is_multi_gpu else None - return self.__writer.get_reader(rank) - def sample_batches( self, seeds: TensorType, @@ -564,6 +226,108 @@ def get_start_batch_offset( else: return 0, input_size_is_equal + def __sample_from_nodes_func( + self, + call_id: int, + current_seeds_and_ix: Tuple["torch.Tensor", "torch.Tensor"], + batch_id_start: int, + batch_size: int, + batches_per_call: int, + random_state: int, + assume_equal_input_size: bool, + ) -> Union[None, Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]]: + torch = import_optional("torch") + + current_seeds, current_ix = current_seeds_and_ix + + current_batches = torch.arange( + batch_id_start + call_id * batches_per_call, + batch_id_start + + call_id * batches_per_call + + int(ceil(len(current_seeds) / batch_size)) + + 1, + device="cuda", + dtype=torch.int32, + ) + + current_batches = current_batches.repeat_interleave(batch_size)[ + : len(current_seeds) + ] + + # do qr division to get the number of batch_size batches and the + # size of the last batch + num_full, last_count = divmod(len(current_seeds), batch_size) + input_offsets = torch.concatenate( + [ + torch.tensor([0], device="cuda", dtype=torch.int64), + torch.full((num_full,), batch_size, device="cuda", dtype=torch.int64), + torch.tensor([last_count], device="cuda", dtype=torch.int64) + if last_count > 0 + else torch.tensor([], device="cuda", dtype=torch.int64), + ] + ).cumsum(-1) + + minibatch_dict = self.sample_batches( + seeds=current_seeds, + batch_ids=current_batches, + random_state=random_state, + assume_equal_input_size=assume_equal_input_size, + ) + minibatch_dict["input_index"] = current_ix.cuda() + minibatch_dict["input_offsets"] = input_offsets + + if self.__writer is None: + # rename renumber_map -> map to match unbuffered format + minibatch_dict["map"] = minibatch_dict["renumber_map"] + del minibatch_dict["renumber_map"] + minibatch_dict = { + k: torch.as_tensor(v, device="cuda") + for k, v in minibatch_dict.items() + if v is not None + } + + return iter([(minibatch_dict, current_batches[0], current_batches[-1])]) + else: + self.__writer.write_minibatches(minibatch_dict) + return None + + def __get_call_groups( + self, + seeds: TensorType, + input_id: TensorType, + seeds_per_call: int, + assume_equal_input_size: bool = False, + ): + torch = import_optional("torch") + + # Split the input seeds into call groups. Each call group + # corresponds to one sampling call. A call group contains + # many batches. + seeds_call_groups = torch.split(seeds, seeds_per_call, dim=-1) + index_call_groups = torch.split(input_id, seeds_per_call, dim=-1) + + # Need to add empties to the list of call groups to handle the case + # where not all ranks have the same number of call groups. This + # prevents a hang since we need all ranks to make the same number + # of calls.
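# (Editor's illustration, not part of this change.) Assuming two ranks where
# rank 0 splits its seeds into 3 call groups and rank 1 into only 2, the MAX
# all-reduce below yields 3, so rank 1 pads its lists with one empty tensor:
#   rank 0: [g0, g1, g2]    -> 3 sampling calls
#   rank 1: [g0, g1, empty] -> 3 sampling calls
# Every rank then enters the same number of collective sampling calls, which
# is what prevents the hang described above.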
+ if not assume_equal_input_size: + num_call_groups = torch.tensor( + [len(seeds_call_groups)], device="cuda", dtype=torch.int32 + ) + torch.distributed.all_reduce( + num_call_groups, op=torch.distributed.ReduceOp.MAX + ) + seeds_call_groups = list(seeds_call_groups) + ( + [torch.tensor([], dtype=seeds.dtype, device="cuda")] + * (int(num_call_groups) - len(seeds_call_groups)) + ) + index_call_groups = list(index_call_groups) + ( + [torch.tensor([], dtype=torch.int64, device=input_id.device)] + * (int(num_call_groups) - len(index_call_groups)) + ) + + return seeds_call_groups, index_call_groups + def sample_from_nodes( self, nodes: TensorType, @@ -571,7 +335,8 @@ def sample_from_nodes( batch_size: int = 16, random_state: int = 62, assume_equal_input_size: bool = False, - ): + input_id: Optional[TensorType] = None, + ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: """ Performs node-based sampling. Accepts a list of seed nodes, and batch size. Splits the seed list into batches, then divides the batches into call groups @@ -587,61 +352,301 @@ def sample_from_nodes( The size of each batch. random_state: int The random seed to use for sampling. + assume_equal_input_size: bool + Whether the inputs across workers should be assumed to be equal in + dimension. Skips some checks if True. + input_id: Optional[TensorType] + Input ids corresponding to the original batch tensor, if it + was permuted prior to calling this function. If present, + will be saved with the samples. """ torch = import_optional("torch") nodes = torch.as_tensor(nodes, device="cuda") + num_seeds = nodes.numel() batches_per_call = self._local_seeds_per_call // batch_size actual_seeds_per_call = batches_per_call * batch_size - # Split the input seeds into call groups. Each call group - # corresponds to one sampling call. A call group contains - # many batches. - num_seeds = len(nodes) - nodes_call_groups = torch.split(nodes, actual_seeds_per_call) + if input_id is None: + input_id = torch.arange(num_seeds, dtype=torch.int64, device="cpu") local_num_batches = int(ceil(num_seeds / batch_size)) batch_id_start, input_size_is_equal = self.get_start_batch_offset( local_num_batches, assume_equal_input_size=assume_equal_input_size ) - # Need to add empties to the list of call groups to handle the case - # where not all nodes have the same number of call groups. This - # prevents a hang since we need all ranks to make the same number - # of calls. 
- if not input_size_is_equal: - num_call_groups = torch.tensor( - [len(nodes_call_groups)], device="cuda", dtype=torch.int32 - ) - torch.distributed.all_reduce( - num_call_groups, op=torch.distributed.ReduceOp.MAX + nodes_call_groups, index_call_groups = self.__get_call_groups( + nodes, + input_id, + actual_seeds_per_call, + assume_equal_input_size=input_size_is_equal, + ) + + sample_args = ( + batch_id_start, + batch_size, + batches_per_call, + random_state, + input_size_is_equal, + ) + + if self.__writer is None: + # Buffered sampling + return BufferedSampleReader( + zip(nodes_call_groups, index_call_groups), + self.__sample_from_nodes_func, + *sample_args, ) - nodes_call_groups = list(nodes_call_groups) + ( - [torch.tensor([], dtype=nodes.dtype, device="cuda")] - * (int(num_call_groups) - len(nodes_call_groups)) + else: + # Unbuffered sampling + for i, current_seeds_and_ix in enumerate( + zip(nodes_call_groups, index_call_groups) + ): + self.__sample_from_nodes_func( + i, + current_seeds_and_ix, + *sample_args, + ) + + # Return a reader that points to the stored samples + rank = torch.distributed.get_rank() if self.is_multi_gpu else None + return self.__writer.get_reader(rank) + + def __sample_from_edges_func( + self, + call_id: int, + current_seeds_and_ix: Tuple["torch.Tensor", "torch.Tensor"], + batch_id_start: int, + batch_size: int, + batches_per_call: int, + random_state: int, + assume_equal_input_size: bool, + ) -> Union[None, Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]]: + torch = import_optional("torch") + + current_seeds, current_ix = current_seeds_and_ix + num_seed_edges = current_ix.numel() + + # The index gets stored as-is regardless of what makes it into + # the final batch and in what order. + # do qr division to get the number of batch_size batches and the + # size of the last batch + num_whole_batches, last_count = divmod(num_seed_edges, batch_size) + input_offsets = torch.concatenate( + [ + torch.tensor([0], device="cuda", dtype=torch.int64), + torch.full( + (num_whole_batches,), batch_size, device="cuda", dtype=torch.int64 + ), + torch.tensor([last_count], device="cuda", dtype=torch.int64) + if last_count > 0 + else torch.tensor([], device="cuda", dtype=torch.int64), + ] + ).cumsum(-1) + + current_seeds, leftover_seeds = ( + current_seeds[:, : (batch_size * num_whole_batches)], + current_seeds[:, (batch_size * num_whole_batches) :], + ) + + # For input edges, we need to translate this into unique vertices + # for each batch. + # We start by reorganizing the seed and index tensors so we can + # determine the unique vertices. This results in the expected + # src-to-dst concatenation for each batch + current_seeds = torch.concat( + [ + current_seeds[0].reshape((-1, batch_size)), + current_seeds[1].reshape((-1, batch_size)), + ], + axis=-1, + ) + + # The returned unique values must be sorted or else the inverse won't line up + # In the future this may be a good target for a C++ function + # Each element is a tuple of (unique, index, inverse) + # The seeds must be presorted with a stable sort prior to calling + # unique_consecutive in order to support negative sampling. This is + # because if we put positive edges after negative ones, then we may + # inadvertently turn a true positive into a false negative. 
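# (Editor's worked example, not part of this change.) For one batch of seeds
# t = [5, 3, 5, 7]: the stable sort gives values [3, 5, 5, 7] with argsort
# i = [1, 0, 2, 3]; unique_consecutive then yields uniques [3, 5, 7] with
# inverse [0, 1, 1, 2] over the *sorted* order. Indexing that inverse with
# torch.sort(i)[1] undoes the permutation, mapping the original positions
# [5, 3, 5, 7] -> [1, 0, 1, 2], i.e. indices into the uniques [3, 5, 7].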
+ y = ( + torch.sort( + t, + stable=True, ) + for t in current_seeds + ) + z = ((v, torch.sort(i)[1]) for v, i in y) - # Make a call to sample_batches for each call group - for i, current_seeds in enumerate(nodes_call_groups): - current_batches = torch.arange( - batch_id_start + i * batches_per_call, - batch_id_start + (i + 1) * batches_per_call, - device="cuda", - dtype=torch.int32, + u = [ + ( + torch.unique_consecutive( + t, + return_inverse=True, + ), + i, ) + for t, i in z + ] - current_batches = current_batches.repeat_interleave(batch_size)[ - : len(current_seeds) + if len(u) > 0: + current_seeds = torch.concat([a[0] for a, _ in u]) + current_inv = torch.concat([a[1][i] for a, i in u]) + current_batches = torch.concat( + [ + torch.full( + (a[0].numel(),), + i + batch_id_start + (call_id * batches_per_call), + device="cuda", + dtype=torch.int32, + ) + for i, (a, _) in enumerate(u) + ] + ) + else: + current_seeds = torch.tensor([], device="cuda", dtype=torch.int64) + current_inv = torch.tensor([], device="cuda", dtype=torch.int64) + current_batches = torch.tensor([], device="cuda", dtype=torch.int32) + del u + + # Join with the leftovers + leftover_seeds, lyi = torch.sort( + leftover_seeds.flatten(), + stable=True, + ) + lz = torch.sort(lyi)[1] + leftover_seeds, lui = leftover_seeds.unique_consecutive(return_inverse=True) + leftover_inv = lui[lz] + + current_seeds = torch.concat([current_seeds, leftover_seeds]) + current_inv = torch.concat([current_inv, leftover_inv]) + current_batches = torch.concat( + [ + current_batches, + torch.full( + (leftover_seeds.numel(),), + (current_batches[-1] + 1) if current_batches.numel() > 0 else 0, + device="cuda", + dtype=torch.int32, + ), ] + ) + del leftover_seeds + del lz + del lui + + minibatch_dict = self.sample_batches( + seeds=current_seeds, + batch_ids=current_batches, + random_state=random_state, + assume_equal_input_size=assume_equal_input_size, + ) + minibatch_dict["input_index"] = current_ix.cuda() + minibatch_dict["input_offsets"] = input_offsets + minibatch_dict[ + "edge_inverse" + ] = current_inv # (2 * batch_size) entries per batch + + if self.__writer is None: + # rename renumber_map -> map to match unbuffered format + minibatch_dict["map"] = minibatch_dict["renumber_map"] + del minibatch_dict["renumber_map"] + minibatch_dict = { + k: torch.as_tensor(v, device="cuda") + for k, v in minibatch_dict.items() + if v is not None + } + + return iter([(minibatch_dict, current_batches[0], current_batches[-1])]) + else: + self.__writer.write_minibatches(minibatch_dict) + return None - minibatch_dict = self.sample_batches( - seeds=current_seeds, - batch_ids=current_batches, - random_state=random_state, - assume_equal_input_size=input_size_is_equal, + def sample_from_edges( + self, + edges: TensorType, + *, + batch_size: int = 16, + random_state: int = 62, + assume_equal_input_size: bool = False, + input_id: Optional[TensorType] = None, + ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: + """ + Performs sampling starting from seed edges. + + Parameters + ---------- + edges: TensorType + 2 x (# edges) tensor of edges to sample from. + Standard src/dst format. This will be converted + to a list of seed nodes. + batch_size: int + The size of each batch. + random_state: int + The random seed to use for sampling. + assume_equal_input_size: bool + Whether this function should assume that inputs + are equal across ranks. Skips some potentially + slow steps if True. 
+ input_id: Optional[TensorType] + Input ids corresponding to the original batch tensor, if it + was permuted prior to calling this function. If present, + will be saved with the samples. + """ + + torch = import_optional("torch") + + edges = torch.as_tensor(edges, device="cuda") + num_seed_edges = edges.shape[-1] + + batches_per_call = self._local_seeds_per_call // batch_size + actual_seed_edges_per_call = batches_per_call * batch_size + + if input_id is None: + input_id = torch.arange(num_seed_edges, dtype=torch.int64, device="cpu") + + local_num_batches = int(ceil(num_seed_edges / batch_size)) + batch_id_start, input_size_is_equal = self.get_start_batch_offset( + local_num_batches, assume_equal_input_size=assume_equal_input_size + ) + + edges_call_groups, index_call_groups = self.__get_call_groups( + edges, + input_id, + actual_seed_edges_per_call, + assume_equal_input_size=input_size_is_equal, + ) + + sample_args = ( + batch_id_start, + batch_size, + batches_per_call, + random_state, + input_size_is_equal, + ) + + if self.__writer is None: + # Buffered sampling + return BufferedSampleReader( + zip(edges_call_groups, index_call_groups), + self.__sample_from_edges_func, + *sample_args, ) - self.__writer.write_minibatches(minibatch_dict) + else: + # Unbuffered sampling + for i, current_seeds_and_ix in enumerate( + zip(edges_call_groups, index_call_groups) + ): + self.__sample_from_edges_func( + i, + current_seeds_and_ix, + *sample_args, + ) + + # Return a reader that points to the stored samples + rank = torch.distributed.get_rank() if self.is_multi_gpu else None + return self.__writer.get_reader(rank) @property def is_multi_gpu(self): @@ -671,7 +676,7 @@ def _retain_original_seeds(self): return self.__retain_original_seeds -class UniformNeighborSampler(DistSampler): +class NeighborSampler(DistSampler): # Number of vertices in the output minibatch, based # on benchmarking. BASE_VERTICES_PER_BYTE = 0.1107662486009992 @@ -693,6 +698,7 @@ def __init__( compression: str = "COO", compress_per_hop: bool = False, with_replacement: bool = False, + biased: bool = False, ): self.__fanout = fanout self.__prior_sources_behavior = prior_sources_behavior @@ -701,6 +707,18 @@ self.__compression = compression self.__with_replacement = with_replacement + + # It is currently required that graphs are weighted for biased + # sampling. So setting the function here is safe. In the future, + # if libcugraph allows setting a new attribute, this API might + # change. + # TODO allow func to be a call to a future remote sampling API + # if the provided graph is in another process (rapidsai/cugraph#4623).
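# (Editor's note, illustrative only.) The assignment below is what lets one
# class front both pylibcugraph primitives: constructing the sampler with
# biased=True dispatches to pylibcugraph.biased_neighbor_sample, which draws
# neighbors with probability proportional to edge weight (hence the
# weighted-graph requirement stated above), while biased=False keeps the
# previous uniform behavior.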
+ self.__func = ( + pylibcugraph.biased_neighbor_sample + if biased + else pylibcugraph.uniform_neighbor_sample + ) + super().__init__( graph, writer, @@ -713,14 +731,12 @@ def __calc_local_seeds_per_call(self, local_seeds_per_call: Optional[int] = None if local_seeds_per_call is None: if len([x for x in self.__fanout if x <= 0]) > 0: - return UniformNeighborSampler.UNKNOWN_VERTICES_DEFAULT + return NeighborSampler.UNKNOWN_VERTICES_DEFAULT total_memory = torch.cuda.get_device_properties(0).total_memory fanout_prod = reduce(lambda x, y: x * y, self.__fanout) return int( - UniformNeighborSampler.BASE_VERTICES_PER_BYTE - * total_memory - / fanout_prod + NeighborSampler.BASE_VERTICES_PER_BYTE * total_memory / fanout_prod ) return local_seeds_per_call @@ -755,7 +771,7 @@ def sample_batches( else: label_offsets = None - sampling_results_dict = pylibcugraph.uniform_neighbor_sample( + sampling_results_dict = self.__func( self._resource_handle, self._graph, start_list=cupy.asarray(seeds), @@ -764,7 +780,7 @@ def sample_batches( label_to_output_comm_rank=cupy.asarray(label_to_output_comm_rank), h_fan_out=np.array(self.__fanout, dtype="int32"), with_replacement=self.__with_replacement, - do_expensive_check=True, + do_expensive_check=False, with_edge_properties=True, random_state=random_state + rank, prior_sources_behavior=self.__prior_sources_behavior, @@ -795,7 +811,7 @@ def sample_batches( else: label_offsets = None - sampling_results_dict = pylibcugraph.uniform_neighbor_sample( + sampling_results_dict = self.__func( self._resource_handle, self._graph, start_list=cupy.asarray(seeds), diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index d83f88c0c96..bc5cca67c2e 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -255,6 +255,13 @@ def __from_edgelist( elif elist[source].dtype not in [np.int32, np.int64] or elist[ destination ].dtype not in [np.int32, np.int64]: + if elist[destination].dtype in [np.uint32, np.uint64] or elist[ + source + ].dtype in [np.uint32, np.uint64]: + raise ValueError( + "Unsigned integers are not supported as vertex ids." 
+ " Either convert to signed integers or set renumber=True" + ) raise ValueError("set renumber to True for non integer columns ids") # The dataframe will be symmetrized iff the graph is undirected diff --git a/python/cugraph/cugraph/structure/hypergraph.py b/python/cugraph/cugraph/structure/hypergraph.py index add68cb6dac..bdc98333da0 100644 --- a/python/cugraph/cugraph/structure/hypergraph.py +++ b/python/cugraph/cugraph/structure/hypergraph.py @@ -440,6 +440,7 @@ def _create_hyper_edges( for key, col in events[columns].items(): cat = categories.get(key, key) fs = [EVENTID] + ([key] if drop_edge_attrs else edge_attrs) + fs = list(set(fs)) df = events[fs].dropna(subset=[key]) if dropna else events[fs] if len(df) == 0: continue @@ -464,8 +465,7 @@ def _create_hyper_edges( if not drop_edge_attrs: columns += edge_attrs - edges = cudf.concat(edges)[columns] - edges.reset_index(drop=True, inplace=True) + edges = cudf.concat(edges, ignore_index=True)[list(set(columns))] return edges @@ -546,6 +546,7 @@ def _create_direct_edges( for key2, col2 in events[sorted(edge_shape[key1])].items(): cat2 = categories.get(key2, key2) fs = [EVENTID] + ([key1, key2] if drop_edge_attrs else edge_attrs) + fs = list(set(fs)) df = events[fs].dropna(subset=[key1, key2]) if dropna else events[fs] if len(df) == 0: continue @@ -573,20 +574,22 @@ def _create_direct_edges( if not drop_edge_attrs: columns += edge_attrs - edges = cudf.concat(edges)[columns] + edges = cudf.concat(edges)[list(set(columns))] edges.reset_index(drop=True, inplace=True) return edges def _str_scalar_to_category(size, val): - return cudf.core.column.build_categorical_column( - categories=cudf.core.column.as_column([val], dtype="str"), - codes=cudf.core.column.as_column(0, length=size, dtype=np.int32), - mask=None, + return cudf.core.column.CategoricalColumn( + data=None, size=size, + dtype=cudf.CategoricalDtype( + categories=cudf.core.column.as_column([val], dtype="str"), ordered=False + ), + mask=None, offset=0, null_count=0, - ordered=False, + children=(cudf.core.column.as_column(0, length=size, dtype=np.int32),), ) diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py index 9570d7f3e04..f557ad13089 100644 --- a/python/cugraph/cugraph/testing/resultset.py +++ b/python/cugraph/cugraph/testing/resultset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings import tarfile import urllib.request @@ -108,7 +109,11 @@ def load_resultset(resultset_name, resultset_download_url): if not compressed_file_path.exists(): urllib.request.urlretrieve(resultset_download_url, compressed_file_path) tar = tarfile.open(str(compressed_file_path), "r:gz") - tar.extractall(str(curr_resultset_download_dir)) + # TODO: pass filter="fully_trusted" when minimum supported Python version >=3.12 + # ref: https://docs.python.org/3/library/tarfile.html#tarfile-extraction-filter + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=DeprecationWarning) + tar.extractall(str(curr_resultset_download_dir)) tar.close() # FIXME: This assumes separator is " ", but should this be configurable? 
diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph.py b/python/cugraph/cugraph/tests/data_store/test_property_graph.py index da5608e0193..50f08cdf3d0 100644 --- a/python/cugraph/cugraph/tests/data_store/test_property_graph.py +++ b/python/cugraph/cugraph/tests/data_store/test_property_graph.py @@ -2576,9 +2576,10 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): scn = PropertyGraph.src_col_name dcn = PropertyGraph.dst_col_name - verts = [] - for i in range(0, 10000, 10): - verts.append(generated_df["src"].iloc[i]) + # Build a query string to extract a graph with only specific edges based on + # the integer vertex IDs. Other edge and/or vertex properties can be + # included in the query as well. + verts = [int(generated_df["src"].iloc[i]) for i in range(0, 10000, 10)] selected_edges = pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") gpubenchmark( @@ -2618,9 +2619,10 @@ def bench_extract_subgraph_for_rmat_detect_duplicate_edges( scn = PropertyGraph.src_col_name dcn = PropertyGraph.dst_col_name - verts = [] - for i in range(0, 10000, 10): - verts.append(generated_df["src"].iloc[i]) + # Build a query string to extract a graph with only specific edges based on + # the integer vertex IDs. Other edge and/or vertex properties can be + # included in the query as well. + verts = [int(generated_df["src"].iloc[i]) for i in range(0, 10000, 10)] selected_edges = pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") diff --git a/python/cugraph/cugraph/tests/generators/test_rmat.py b/python/cugraph/cugraph/tests/generators/test_rmat.py index 1cee0461686..87cbe636fdc 100644 --- a/python/cugraph/cugraph/tests/generators/test_rmat.py +++ b/python/cugraph/cugraph/tests/generators/test_rmat.py @@ -27,7 +27,9 @@ _scale_values = [2, 4, 16] _scale_test_ids = [f"scale={x}" for x in _scale_values] _graph_types = [cugraph.Graph, None, int] -_graph_test_ids = [f"create_using={getattr(x,'__name__',str(x))}" for x in _graph_types] +_graph_test_ids = [ + f"create_using={getattr(x, '__name__', str(x))}" for x in _graph_types +] _clip_and_flip = [False, True] _clip_and_flip_test_ids = [f"clip_and_flip={x}" for x in _clip_and_flip] _scramble_vertex_ids = [False, True] diff --git a/python/cugraph/cugraph/tests/generators/test_rmat_mg.py b/python/cugraph/cugraph/tests/generators/test_rmat_mg.py index 0e1808d2f80..44a6b3a2fc1 100644 --- a/python/cugraph/cugraph/tests/generators/test_rmat_mg.py +++ b/python/cugraph/cugraph/tests/generators/test_rmat_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -34,7 +34,9 @@ _scale_values = [2, 4, 16] _scale_test_ids = [f"scale={x}" for x in _scale_values] _graph_types = [cugraph.Graph, None, int] -_graph_test_ids = [f"create_using={getattr(x,'__name__',str(x))}" for x in _graph_types] +_graph_test_ids = [ + f"create_using={getattr(x, '__name__', str(x))}" for x in _graph_types +] def _call_rmat(scale, num_edges, create_using, mg=True): diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index 65bcce78771..3c5d6428001 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -119,7 +119,7 @@ def test_bulk_sampler_remainder(scratch_dir): assert b in recovered_samples["batch_id"].values_host.tolist() for x in range(0, 6, 2): - subdir = f"{x}-{x+1}" + subdir = f"{x}-{x + 1}" df = cudf.read_parquet(os.path.join(samples_path, f"batch={subdir}.parquet")) assert ((df.batch_id == x) | (df.batch_id == (x + 1))).all() diff --git a/python/cugraph/cugraph/tests/sampling/test_dist_sampler.py b/python/cugraph/cugraph/tests/sampling/test_dist_sampler.py index 965f731d328..64db0232fb1 100644 --- a/python/cugraph/cugraph/tests/sampling/test_dist_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_dist_sampler.py @@ -20,6 +20,7 @@ from cugraph.datasets import karate from cugraph.gnn import UniformNeighborSampler, DistSampleWriter +from cugraph.gnn.data_loading.bulk_sampler_io import create_df_from_disjoint_arrays from pylibcugraph import SGGraph, ResourceHandle, GraphProperties @@ -41,7 +42,7 @@ @pytest.fixture -def karate_graph(): +def karate_graph() -> SGGraph: el = karate.get_edgelist().reset_index().rename(columns={"index": "eid"}) G = SGGraph( ResourceHandle(), @@ -78,11 +79,13 @@ def test_dist_sampler_simple( ) recovered_samples = cudf.read_parquet(samples_path) + print(recovered_samples) original_el = karate.get_edgelist() for b in range(len(seeds) // batch_size): el_start = recovered_samples.label_hop_offsets.iloc[b * len(fanout)] el_end = recovered_samples.label_hop_offsets.iloc[(b + 1) * len(fanout)] + print(el_start, el_end) src = recovered_samples.majors.iloc[el_start:el_end] dst = recovered_samples.minors.iloc[el_start:el_end] edge_id = recovered_samples.edge_id.iloc[el_start:el_end] @@ -99,3 +102,60 @@ def test_dist_sampler_simple( assert original_el.dst.iloc[edge_id.iloc[i]] == dst.iloc[i] shutil.rmtree(samples_path) + + +@pytest.mark.sg +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("seeds_per_call", [4, 5, 10]) +@pytest.mark.parametrize("compression", ["COO", "CSR"]) +def test_dist_sampler_buffered_in_memory( + scratch_dir: str, karate_graph: SGGraph, seeds_per_call: int, compression: str +): + G = karate_graph + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_buffered_in_memory") + create_directory_with_overwrite(samples_path) + + seeds = cupy.arange(10, dtype="int64") + + unbuffered_sampler = UniformNeighborSampler( + G, + writer=DistSampleWriter(samples_path), + local_seeds_per_call=seeds_per_call, + compression=compression, + ) + + buffered_sampler = UniformNeighborSampler( + G, + writer=None, + local_seeds_per_call=seeds_per_call, + compression=compression, + ) + + unbuffered_results = unbuffered_sampler.sample_from_nodes( + seeds, + batch_size=4, + ) + + unbuffered_results = [ + (create_df_from_disjoint_arrays(r[0]), r[1], r[2]) for r in 
unbuffered_results + ] + + buffered_results = buffered_sampler.sample_from_nodes(seeds, batch_size=4) + buffered_results = [ + (create_df_from_disjoint_arrays(r[0]), r[1], r[2]) for r in buffered_results + ] + + assert len(buffered_results) == len(unbuffered_results) + + for k in range(len(buffered_results)): + br, bs, be = buffered_results[k] + ur, us, ue = unbuffered_results[k] + + assert bs == us + assert be == ue + + for col in ur.columns: + assert (br[col].dropna() == ur[col].dropna()).all() + + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_dist_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_dist_sampler_mg.py index a1c32938994..5bb541d6cf3 100644 --- a/python/cugraph/cugraph/tests/sampling/test_dist_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_dist_sampler_mg.py @@ -18,6 +18,8 @@ import cupy import cudf +from typing import Any + from cugraph.datasets import karate from cugraph.gnn import ( UniformNeighborSampler, @@ -27,6 +29,7 @@ cugraph_comms_init, cugraph_comms_shutdown, ) +from cugraph.gnn.data_loading.bulk_sampler_io import create_df_from_disjoint_arrays from pylibcugraph import MGGraph, ResourceHandle, GraphProperties from cugraph.utilities.utils import ( @@ -235,3 +238,80 @@ def test_dist_sampler_uneven(scratch_dir, batch_size, fanout, seeds_per_call): assert original_el.dst.iloc[edge_id.iloc[i]] == dst.iloc[i] shutil.rmtree(samples_path) + + +def run_test_dist_sampler_buffered_in_memory( + rank: int, + world_size: int, + uid: Any, + samples_path: str, + seeds_per_call: int, + compression: str, +): + init_pytorch(rank, world_size) + cugraph_comms_init(rank, world_size, uid, device=rank) + + G = karate_mg_graph(rank, world_size) + + num_seeds = 8 + seeds = cupy.random.randint(0, 34, num_seeds, dtype="int64") + + unbuffered_sampler = UniformNeighborSampler( + G, + writer=DistSampleWriter(samples_path), + local_seeds_per_call=seeds_per_call, + compression=compression, + ) + + buffered_sampler = UniformNeighborSampler( + G, + writer=None, + local_seeds_per_call=seeds_per_call, + compression=compression, + ) + + unbuffered_results = unbuffered_sampler.sample_from_nodes( + seeds, + batch_size=4, + ) + + unbuffered_results = [ + (create_df_from_disjoint_arrays(r[0]), r[1], r[2]) for r in unbuffered_results + ] + + buffered_results = buffered_sampler.sample_from_nodes(seeds, batch_size=4) + buffered_results = [ + (create_df_from_disjoint_arrays(r[0]), r[1], r[2]) for r in buffered_results + ] + + assert len(buffered_results) == len(unbuffered_results) + + for k in range(len(buffered_results)): + br, bs, be = buffered_results[k] + ur, us, ue = unbuffered_results[k] + + assert bs == us + assert be == ue + + for col in ur.columns: + assert (br[col].dropna() == ur[col].dropna()).all() + + +@pytest.mark.mg +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("seeds_per_call", [4, 5, 10]) +@pytest.mark.parametrize("compression", ["COO", "CSR"]) +def test_dist_sampler_buffered_in_memory(scratch_dir, seeds_per_call, compression): + uid = cugraph_comms_create_unique_id() + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_buffered_in_memory_mg") + create_directory_with_overwrite(samples_path) + + world_size = torch.cuda.device_count() + torch.multiprocessing.spawn( + run_test_dist_sampler_buffered_in_memory, + args=(world_size, uid, samples_path, seeds_per_call, compression), + nprocs=world_size, + ) + + shutil.rmtree(samples_path) diff --git 
a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index cba61731e9a..f2cc1583f93 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -303,7 +303,7 @@ def test_mg_graph_serializable(dask_client, input_combo): G = input_combo["MGGraph"] dask_client.publish_dataset(shared_g=G) shared_g = dask_client.get_dataset("shared_g") - assert type(shared_g) == type(G) + assert type(shared_g) is type(G) assert G.number_of_vertices() == shared_g.number_of_vertices() assert G.number_of_edges() == shared_g.number_of_edges() # cleanup @@ -314,7 +314,7 @@ def test_mg_graph_serializable(dask_client, input_combo): def test_mg_graph_copy(): G = cugraph.MultiGraph(directed=True) G_c = copy.deepcopy(G) - assert type(G) == type(G_c) + assert type(G) is type(G_c) @pytest.mark.mg diff --git a/python/cugraph/cugraph/tests/structure/test_hypergraph.py b/python/cugraph/cugraph/tests/structure/test_hypergraph.py index 848f31b940f..f1dfc17a509 100644 --- a/python/cugraph/cugraph/tests/structure/test_hypergraph.py +++ b/python/cugraph/cugraph/tests/structure/test_hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -171,7 +171,8 @@ def test_hyperedges(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 12), ("nodes", 15), ("edges", 12), ("events", 3)]: assert len(h[k]) == v @@ -266,7 +267,8 @@ def test_drop_edge_attrs(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 9), ("nodes", 12), ("edges", 9), ("events", 3)]: assert len(h[k]) == v @@ -308,7 +310,8 @@ def test_drop_edge_attrs_direct(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 9), ("nodes", 9), ("edges", 6), ("events", 0)]: assert len(h[k]) == v diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp.py b/python/cugraph/cugraph/tests/traversal/test_sssp.py index 58288e022e8..ceb6040275d 100644 --- a/python/cugraph/cugraph/tests/traversal/test_sssp.py +++ b/python/cugraph/cugraph/tests/traversal/test_sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -486,7 +486,7 @@ def test_scipy_api_compat(): distances = cugraph.shortest_path( input_coo_matrix, source=0, return_predecessors=False ) - assert type(distances) != tuple + assert type(distances) is not tuple with pytest.raises(ValueError): cugraph.shortest_path(input_coo_matrix, source=0, unweighted=False) diff --git a/python/cugraph/cugraph/traversal/sssp.py b/python/cugraph/cugraph/traversal/sssp.py index 5ab97e60390..bb98b5a9a29 100644 --- a/python/cugraph/cugraph/traversal/sssp.py +++ b/python/cugraph/cugraph/traversal/sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -36,7 +36,7 @@ def _ensure_args( # checks common to all input types if (method is not None) and (method != "auto"): raise ValueError("only 'auto' is currently accepted for method") - if (indices is not None) and (type(indices) == list): + if (indices is not None) and (type(indices) is list): raise ValueError("indices currently cannot be a list-like type") if (indices is not None) and (source is not None): raise TypeError("cannot specify both 'source' and 'indices'") @@ -70,9 +70,11 @@ def _ensure_args( # Check for non-Graph-type inputs else: - if (directed is not None) and (type(directed) != bool): + if (directed is not None) and (type(directed) is not bool): raise ValueError("'directed' must be a bool") - if (return_predecessors is not None) and (type(return_predecessors) != bool): + if (return_predecessors is not None) and ( + type(return_predecessors) is not bool + ): raise ValueError("'return_predecessors' must be a bool") if (unweighted is not None) and (unweighted is not True): raise ValueError("'unweighted' currently must be True if " "specified") diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py index 7a54a0bf2cf..69616f26857 100644 --- a/python/cugraph/cugraph/utilities/utils.py +++ b/python/cugraph/cugraph/utilities/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -523,6 +523,7 @@ def create_list_series_from_2d_ar(ar, index): mask_col = cp.full(shape=n_rows, fill_value=True) mask = cudf._lib.transform.bools_to_mask(as_column(mask_col)) lc = cudf.core.column.ListColumn( + data=None, size=n_rows, dtype=cudf.ListDtype(data.dtype), mask=mask, @@ -530,7 +531,7 @@ def create_list_series_from_2d_ar(ar, index): null_count=0, children=(offset_col, data), ) - return cudf.Series(lc, index=index) + return cudf.Series._from_column(lc, index=index) def create_directory_with_overwrite(directory): diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index bbb89b03697..8185a8d915d 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=3.0.0", "rapids-build-backend>=0.3.1,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
build-backend = "rapids_build_backend.build" @@ -21,35 +21,35 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ - "cudf==24.10.*,>=0.0.0a0", + "cudf==24.12.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.10.*,>=0.0.0a0", - "dask-cudf==24.10.*,>=0.0.0a0", + "dask-cuda==24.12.*,>=0.0.0a0", + "dask-cudf==24.12.*,>=0.0.0a0", "fsspec[http]>=0.6.0", "numba>=0.57", - "numpy>=1.23,<2.0a0", - "pylibcugraph==24.10.*,>=0.0.0a0", - "raft-dask==24.10.*,>=0.0.0a0", - "rapids-dask-dependency==24.10.*,>=0.0.0a0", - "rmm==24.10.*,>=0.0.0a0", - "ucx-py==0.40.*,>=0.0.0a0", + "numpy>=1.23,<3.0a0", + "pylibcugraph==24.12.*,>=0.0.0a0", + "raft-dask==24.12.*,>=0.0.0a0", + "rapids-dask-dependency==24.12.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", + "ucx-py==0.41.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.optional-dependencies] test = [ "networkx>=2.5.1", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "pandas", - "pylibwholegraph==24.10.*,>=0.0.0a0", + "pylibwholegraph==24.12.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", @@ -66,7 +66,8 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/" [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.reproducible = true wheel.packages = ["cugraph"] @@ -81,9 +82,9 @@ build-backend = "scikit_build_core.build" requires = [ "cmake>=3.26.4,!=3.30.0", "ninja", - "pylibcugraph==24.10.*,>=0.0.0a0", - "pylibraft==24.10.*,>=0.0.0a0", - "rmm==24.10.*,>=0.0.0a0", + "pylibcugraph==24.12.*,>=0.0.0a0", + "pylibraft==24.12.*,>=0.0.0a0", + "rmm==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index 675a6cf8fde..bca148538d9 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -17,6 +17,7 @@ addopts = --benchmark-max-time=0 --benchmark-min-rounds=1 --benchmark-columns="mean, rounds" + --tb=native ## do not run the slow tests/benchmarks by default -m "not slow" ## for use with rapids-pytest-benchmark plugin diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 458421e2b6e..c3ca0b880a9 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -8,8 +8,8 @@ to run supported algorithms with GPU acceleration. 
nx-cugraph requires the following: * NVIDIA GPU, Volta architecture or later, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+ - * CUDA 11.2, 11.4, 11.5, 11.8, or 12.0 - * Python version 3.9, 3.10, or 3.11 + * CUDA 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5 + * Python version 3.10, 3.11, or 3.12 * NetworkX >= version 3.0 (version 3.2 or higher recommended) More details about system requirements can be found in the [RAPIDS System Requirements documentation](https://docs.rapids.ai/install#system-req). diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index f58a6e2293b..a5e45979fe2 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -22,6 +22,7 @@ $ python _nx_cugraph/__init__.py """ +import os from _nx_cugraph._version import __version__ @@ -35,7 +36,7 @@ "backend_name": "cugraph", "project": "nx-cugraph", "package": "nx_cugraph", - "url": f"https://github.com/rapidsai/cugraph/tree/branch-{_version_major:0>2}.{_version_minor:0>2}/python/nx-cugraph", + "url": f"https://rapids.ai/nx-cugraph", "short_summary": "GPU-accelerated backend.", # "description": "TODO", "functions": { @@ -293,10 +294,19 @@ def get_info(): for key in info_keys: del d[key] + + d["default_config"] = { + "use_compat_graphs": os.environ.get("NX_CUGRAPH_USE_COMPAT_GRAPHS", "true") + .strip() + .lower() + == "true", + } return d -def _check_networkx_version(): +def _check_networkx_version() -> tuple[int, int]: + """Check the version of networkx and return ``(major, minor)`` version tuple.""" + import re import warnings import networkx as nx @@ -310,12 +320,20 @@ def _check_networkx_version(): UserWarning, stacklevel=2, ) - if len(version_minor) > 1: + + # Allow single-digit minor versions, e.g. 3.4 and release candidates, e.g. 3.4rc0 + pattern = r"^\d(rc\d+)?$" + + if not re.match(pattern, version_minor): raise RuntimeWarning( f"nx-cugraph version {__version__} does not work with networkx version " f"{nx.__version__}. Please upgrade (or fix) your Python environment." ) + nxver_major = int(version_major) + nxver_minor = int(re.match(r"^\d+", version_minor).group()) + return (nxver_major, nxver_minor) + if __name__ == "__main__": from pathlib import Path diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index ce46360e234..dab2ea70ef1 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -26,7 +26,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.18 + rev: v0.19 hooks: - id: validate-pyproject name: Validate pyproject.toml @@ -40,29 +40,29 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.16.0 + rev: v3.17.0 hooks: - id: pyupgrade - args: [--py39-plus] + args: [--py310-plus] - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.6.7 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] - repo: https://github.com/PyCQA/flake8 - rev: 7.1.0 + rev: 7.1.1 hooks: - id: flake8 args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501', '--extend-ignore=B020,SIM105'] # Why is this necessary? 
additional_dependencies: &flake8_dependencies # These versions need updated manually - - flake8==7.1.0 - - flake8-bugbear==24.4.26 + - flake8==7.1.1 + - flake8-bugbear==24.8.19 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.6.7 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/__init__.py b/python/nx-cugraph/nx_cugraph/__init__.py index 542256fa781..4404e57f645 100644 --- a/python/nx-cugraph/nx_cugraph/__init__.py +++ b/python/nx-cugraph/nx_cugraph/__init__.py @@ -12,6 +12,11 @@ # limitations under the License. from networkx.exception import * +from _nx_cugraph._version import __git_commit__, __version__ +from _nx_cugraph import _check_networkx_version + +_nxver: tuple[int, int] = _check_networkx_version() + from . import utils from . import classes @@ -32,7 +37,10 @@ from . import algorithms from .algorithms import * -from _nx_cugraph._version import __git_commit__, __version__ -from _nx_cugraph import _check_networkx_version +from .interface import BackendInterface -_check_networkx_version() +BackendInterface.Graph = classes.Graph +BackendInterface.DiGraph = classes.DiGraph +BackendInterface.MultiGraph = classes.MultiGraph +BackendInterface.MultiDiGraph = classes.MultiDiGraph +del BackendInterface diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py index 60276b7d41b..214970235c6 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py @@ -16,6 +16,7 @@ import networkx as nx import numpy as np +from nx_cugraph import _nxver from nx_cugraph.generators._utils import _create_using_class, _number_and_nodes from nx_cugraph.utils import index_dtype, networkx_algorithm @@ -48,7 +49,7 @@ def complete_bipartite_graph(n1, n2, create_using=None): nodes.extend(range(n2) if nodes2 is None else nodes2) if len(set(nodes)) != len(nodes): raise nx.NetworkXError("Inputs n1 and n2 must contain distinct nodes") - if nx.__version__[:3] <= "3.3": + if _nxver <= (3, 3): name = f"complete_bipartite_graph({orig_n1}, {orig_n2})" else: name = f"complete_bipartite_graph({n1}, {n2})" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index ea1318060e0..52c512c454d 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -12,9 +12,9 @@ # limitations under the License. import warnings -import networkx as nx import pylibcugraph as plc +from nx_cugraph import _nxver from nx_cugraph.convert import _to_undirected_graph from nx_cugraph.utils import ( _dtype_param, @@ -27,7 +27,7 @@ __all__ = ["louvain_communities"] # max_level argument was added to NetworkX 3.3 -if nx.__version__[:3] <= "3.2": +if _nxver <= (3, 2): _max_level_param = { "max_level : int, optional": ( "Upper limit of the number of macro-iterations (max: 500)." 
@@ -81,7 +81,7 @@ def _louvain_communities( node_ids, clusters, modularity = plc.louvain( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(weight, 1, dtype), - max_level=max_level, # TODO: add this parameter to NetworkX + max_level=max_level, threshold=threshold, resolution=resolution, do_expensive_check=False, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index 8eb9a9946e7..e69ee88a17c 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -15,6 +15,7 @@ import pylibcugraph as plc import nx_cugraph as nxcg +from nx_cugraph import _nxver from nx_cugraph.convert import _to_undirected_graph from nx_cugraph.utils import ( _get_int_dtype, @@ -58,9 +59,12 @@ def _(G): @networkx_algorithm(is_incomplete=True, version_added="23.12", _plc="k_truss_subgraph") def k_truss(G, k): if is_nx := isinstance(G, nx.Graph): + is_compat_graph = isinstance(G, nxcg.Graph) G = nxcg.from_networkx(G, preserve_all_attrs=True) + else: + is_compat_graph = False if nxcg.number_of_selfloops(G) > 0: - if nx.__version__[:3] <= "3.2": + if _nxver <= (3, 2): exc_class = nx.NetworkXError else: exc_class = nx.NetworkXNotImplemented @@ -128,6 +132,7 @@ def k_truss(G, k): node_values, node_masks, key_to_id=key_to_id, + use_compat_graph=is_compat_graph, ) new_graph.graph.update(G.graph) return new_graph diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py index e529b83ab1a..cc59fd5eb64 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py @@ -15,6 +15,7 @@ import numpy as np import pylibcugraph as plc +from nx_cugraph import _nxver from nx_cugraph.convert import _to_graph from nx_cugraph.utils import ( _dtype_param, @@ -53,7 +54,7 @@ def hits( if nstart is not None: nstart = G._dict_to_nodearray(nstart, 0, dtype) if max_iter <= 0: - if nx.__version__[:3] <= "3.2": + if _nxver <= (3, 2): raise ValueError("`maxiter` must be a positive integer.") raise nx.PowerIterationFailedConvergence(max_iter) try: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py index f53b3458949..75dc5fbc706 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py @@ -23,6 +23,7 @@ @networkx_algorithm(version_added="24.02") def complement(G): + is_compat_graph = isinstance(G, nxcg.Graph) G = _to_graph(G) N = G._N # Upcast to int64 so indices don't overflow. 
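A side note on the version-check rewrite threaded through these nx-cugraph hunks: replacing `nx.__version__[:3] <= "3.3"` with `_nxver <= (3, 3)` matters because slicing and comparing version strings is lexicographic and breaks at two-digit components. A quick self-contained illustration (editor's example, not part of the diff):

```python
# String slicing/comparison misbehaves once a version component hits two digits:
assert "3.10"[:3] == "3.1"     # the slice silently truncates "3.10"
assert not ("3.4" <= "3.10")   # lexicographically, "3.10" sorts before "3.4"

# Tuple comparison is numeric per component, so it stays correct:
assert (3, 4) <= (3, 10)
assert (3, 2) < (3, 3) < (3, 4) < (3, 10)
```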
@@ -43,6 +44,7 @@ def complement(G): src_indices.astype(index_dtype), dst_indices.astype(index_dtype), key_to_id=G.key_to_id, + use_compat_graph=is_compat_graph, ) @@ -51,10 +53,16 @@ def reverse(G, copy=True): if not G.is_directed(): raise nx.NetworkXError("Cannot reverse an undirected graph.") if isinstance(G, nx.Graph): - if not copy: + is_compat_graph = isinstance(G, nxcg.Graph) + if not copy and not is_compat_graph: raise RuntimeError( "Using `copy=False` is invalid when using a NetworkX graph " "as input to `nx_cugraph.reverse`" ) G = nxcg.from_networkx(G, preserve_all_attrs=True) - return G.reverse(copy=copy) + else: + is_compat_graph = False + rv = G.reverse(copy=copy) + if is_compat_graph: + return rv._to_compat_graph() + return rv diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py index 7d6d77f34a4..ab3c7214303 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py @@ -14,6 +14,7 @@ import numpy as np import nx_cugraph as nxcg +from nx_cugraph import _nxver from nx_cugraph.convert import _to_graph from nx_cugraph.utils import _dtype_param, _get_float_dtype, networkx_algorithm @@ -57,7 +58,7 @@ def shortest_path( paths = nxcg.all_pairs_dijkstra_path(G, weight=weight, dtype=dtype) else: # method == 'bellman-ford': paths = nxcg.all_pairs_bellman_ford_path(G, weight=weight, dtype=dtype) - if nx.__version__[:3] <= "3.4": + if _nxver <= (3, 4): paths = dict(paths) # To target elif method == "unweighted": @@ -129,7 +130,7 @@ def shortest_path_length( # To target elif method == "unweighted": lengths = nxcg.single_target_shortest_path_length(G, target) - if nx.__version__[:3] <= "3.4": + if _nxver <= (3, 4): lengths = dict(lengths) elif method == "dijkstra": lengths = nxcg.single_source_dijkstra_path_length( diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index 0e98c366e4a..e9c515632ca 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -17,6 +17,7 @@ import numpy as np import pylibcugraph as plc +from nx_cugraph import _nxver from nx_cugraph.convert import _to_graph from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm @@ -43,7 +44,7 @@ def single_source_shortest_path_length(G, source, cutoff=None): def single_target_shortest_path_length(G, target, cutoff=None): G = _to_graph(G) rv = _bfs(G, target, cutoff, "Target", return_type="length") - if nx.__version__[:3] <= "3.4": + if _nxver <= (3, 4): return iter(rv.items()) return rv @@ -61,7 +62,7 @@ def bidirectional_shortest_path(G, source, target): # TODO PERF: do bidirectional traversal in core G = _to_graph(G) if source not in G or target not in G: - if nx.__version__[:3] <= "3.3": + if _nxver <= (3, 3): raise nx.NodeNotFound( f"Either source {source} or target {target} is not in G" ) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py index 5e4466d7d33..72d0079cf0c 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py @@ -18,6 +18,7 @@ import pylibcugraph as plc import nx_cugraph as nxcg +from 
nx_cugraph import _nxver from nx_cugraph.convert import _to_graph from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm @@ -57,7 +58,7 @@ def _bfs(G, source, *, depth_limit=None, reverse=False): return distances[mask], predecessors[mask], node_ids[mask] -if nx.__version__[:3] <= "3.3": +if _nxver <= (3, 3): @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def generic_bfs_edges( @@ -132,13 +133,15 @@ def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): raise NotImplementedError( "sort_neighbors argument in bfs_tree is not currently supported" ) + is_compat_graph = isinstance(G, nxcg.Graph) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: - return nxcg.DiGraph.from_coo( + return nxcg.CudaDiGraph.from_coo( 1, cp.array([], dtype=index_dtype), cp.array([], dtype=index_dtype), id_to_key=[source], + use_compat_graph=is_compat_graph, ) distances, predecessors, node_ids = _bfs( @@ -148,11 +151,12 @@ def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): reverse=reverse, ) if predecessors.size == 0: - return nxcg.DiGraph.from_coo( + return nxcg.CudaDiGraph.from_coo( 1, cp.array([], dtype=index_dtype), cp.array([], dtype=index_dtype), id_to_key=[source], + use_compat_graph=is_compat_graph, ) # TODO: create renumbering helper function(s) unique_node_ids = cp.unique(cp.hstack((predecessors, node_ids))) @@ -170,11 +174,12 @@ def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): old_index: new_index for new_index, old_index in enumerate(unique_node_ids.tolist()) } - return nxcg.DiGraph.from_coo( + return nxcg.CudaDiGraph.from_coo( unique_node_ids.size, src_indices, dst_indices, key_to_id=key_to_id, + use_compat_graph=is_compat_graph, ) diff --git a/python/nx-cugraph/nx_cugraph/classes/__init__.py b/python/nx-cugraph/nx_cugraph/classes/__init__.py index 19a5357da55..71168e5364f 100644 --- a/python/nx-cugraph/nx_cugraph/classes/__init__.py +++ b/python/nx-cugraph/nx_cugraph/classes/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,9 +10,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .graph import Graph -from .digraph import DiGraph -from .multigraph import MultiGraph -from .multidigraph import MultiDiGraph +from .graph import CudaGraph, Graph +from .digraph import CudaDiGraph, DiGraph +from .multigraph import CudaMultiGraph, MultiGraph +from .multidigraph import CudaMultiDiGraph, MultiDiGraph from .function import * diff --git a/python/nx-cugraph/nx_cugraph/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py index e5cfb8f6815..178bf44f16e 100644 --- a/python/nx-cugraph/nx_cugraph/classes/digraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/digraph.py @@ -18,34 +18,108 @@ import cupy as cp import networkx as nx import numpy as np +from networkx.classes.digraph import ( + _CachedPropertyResetterAdjAndSucc, + _CachedPropertyResetterPred, +) import nx_cugraph as nxcg from ..utils import index_dtype -from .graph import Graph +from .graph import CudaGraph, Graph if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import AttrKey -__all__ = ["DiGraph"] +__all__ = ["CudaDiGraph", "DiGraph"] networkx_api = nxcg.utils.decorators.networkx_class(nx.DiGraph) -class DiGraph(Graph): - ################# - # Class methods # - ################# +class DiGraph(nx.DiGraph, Graph): + _nx_attrs = ("_node", "_adj", "_succ", "_pred") + + name = Graph.name + _node = Graph._node + + @property + @networkx_api + def _adj(self): + if (adj := self.__dict__["_adj"]) is None: + self._reify_networkx() + adj = self.__dict__["_adj"] + return adj + + @_adj.setter + def _adj(self, val): + self._prepare_setter() + _CachedPropertyResetterAdjAndSucc.__set__(None, self, val) + if cache := getattr(self, "__networkx_cache__", None): + cache.clear() + + @property + @networkx_api + def _succ(self): + if (succ := self.__dict__["_succ"]) is None: + self._reify_networkx() + succ = self.__dict__["_succ"] + return succ + + @_succ.setter + def _succ(self, val): + self._prepare_setter() + _CachedPropertyResetterAdjAndSucc.__set__(None, self, val) + if cache := getattr(self, "__networkx_cache__", None): + cache.clear() + + @property + @networkx_api + def _pred(self): + if (pred := self.__dict__["_pred"]) is None: + self._reify_networkx() + pred = self.__dict__["_pred"] + return pred + + @_pred.setter + def _pred(self, val): + self._prepare_setter() + _CachedPropertyResetterPred.__set__(None, self, val) + if cache := getattr(self, "__networkx_cache__", None): + cache.clear() @classmethod @networkx_api def is_directed(cls) -> bool: return True + @classmethod + @networkx_api + def is_multigraph(cls) -> bool: + return False + + @classmethod + def to_cudagraph_class(cls) -> type[CudaDiGraph]: + return CudaDiGraph + @classmethod def to_networkx_class(cls) -> type[nx.DiGraph]: return nx.DiGraph + +class CudaDiGraph(CudaGraph): + ################# + # Class methods # + ################# + + is_directed = classmethod(DiGraph.is_directed.__func__) + is_multigraph = classmethod(DiGraph.is_multigraph.__func__) + to_cudagraph_class = classmethod(DiGraph.to_cudagraph_class.__func__) + to_networkx_class = classmethod(DiGraph.to_networkx_class.__func__) + + @classmethod + def _to_compat_graph_class(cls) -> type[DiGraph]: + return DiGraph + @networkx_api def size(self, weight: AttrKey | None = None) -> int: if weight is not None: @@ -57,7 +131,7 @@ def size(self, weight: AttrKey | None = None) -> int: ########################## @networkx_api - def reverse(self, copy: bool = True) -> DiGraph: + def reverse(self, copy: bool = True) -> CudaDiGraph: return self._copy(not copy, self.__class__, reverse=True) 
@networkx_api @@ -162,6 +236,7 @@ def to_undirected(self, reciprocal=False, as_view=False): node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=False, ) if as_view: rv.graph = self.graph diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index 7425eacb2b4..cfe1e1c87e9 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -20,8 +20,13 @@ import networkx as nx import numpy as np import pylibcugraph as plc +from networkx.classes.graph import ( + _CachedPropertyResetterAdj, + _CachedPropertyResetterNode, +) import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import index_dtype @@ -40,57 +45,246 @@ any_ndarray, ) -__all__ = ["Graph"] +__all__ = ["CudaGraph", "Graph"] networkx_api = nxcg.utils.decorators.networkx_class(nx.Graph) +# The "everything" cache key is an internal implementation detail of NetworkX +# that may change between releases. +if _nxver < (3, 4): + _CACHE_KEY = ( + True, # Include all edge values + True, # Include all node values + True, # Include `.graph` attributes + ) +else: + _CACHE_KEY = ( + True, # Include all edge values + True, # Include all node values + # `.graph` attributes are always included now + ) + +# Used to indicate when a full conversion to GPU failed so we don't try again. +_CANT_CONVERT_TO_GPU = "_CANT_CONVERT_TO_GPU" + + +# `collections.UserDict` was the preferred way to subclass dict, but now +# subclassing dict directly is much better supported and should work here. +# This class should only be necessary if the user clears the cache manually. +class _GraphCache(dict): + """Cache that ensures Graph will reify into a NetworkX graph when cleared.""" + + _graph: Graph -class Graph: + def __init__(self, graph: Graph): + self._graph = graph + + def clear(self) -> None: + self._graph._reify_networkx() + super().clear() + + +class Graph(nx.Graph): # Tell networkx to dispatch calls with this object to nx-cugraph __networkx_backend__: ClassVar[str] = "cugraph" # nx >=3.2 __networkx_plugin__: ClassVar[str] = "cugraph" # nx <3.2 + # Core attributes of NetworkX graphs that will be copied and cleared as appropriate. + # These attributes comprise the edge and node data model for NetworkX graphs. + _nx_attrs = ("_node", "_adj") + # Allow networkx dispatch machinery to cache conversions. # This means we should clear the cache if we ever mutate the object! - __networkx_cache__: dict | None + __networkx_cache__: _GraphCache | None # networkx properties graph: dict - graph_attr_dict_factory: ClassVar[type] = dict + # Should we declare type annotations for the rest? + + # Properties that trigger copying to the CPU + def _prepare_setter(self): + """Be careful when setting private attributes which may be used during init.""" + if ( + # If not present, then this must be in init + any(attr not in self.__dict__ for attr in self._nx_attrs) + # Already on the CPU + or not any(self.__dict__[attr] is None for attr in self._nx_attrs) + ): + return + if self._is_on_gpu: + # Copy from GPU to CPU + self._reify_networkx() + return + # Default values + for attr in self._nx_attrs: + if self.__dict__[attr] is None: + if attr == "_succ": + self.__dict__[attr] = self.__dict__["_adj"] + else: + self.__dict__[attr] = {}
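A short, hedged sketch of the contract `_GraphCache` enforces: clearing the dispatch cache reifies the host-side dicts first, so a graph whose data lives only on device is not silently lost.

```python
import nx_cugraph as nxcg

G = nxcg.Graph()
G.add_edge(0, 1)              # ordinary NetworkX mutation on the host side
# clear() calls G._reify_networkx() before dropping cached conversions.
G.__networkx_cache__.clear()
assert G._is_on_cpu           # the host dicts are (still) present
```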
- src_indices: cp.ndarray[IndexValue] - dst_indices: cp.ndarray[IndexValue] - edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] - edge_masks: dict[AttrKey, cp.ndarray[bool]] - node_values: dict[AttrKey, any_ndarray[NodeValue]] - node_masks: dict[AttrKey, any_ndarray[bool]] - key_to_id: dict[NodeKey, IndexValue] | None - _id_to_key: list[NodeKey] | None - _N: int - _node_ids: cp.ndarray[IndexValue] | None # holds plc.SGGraph.vertices_array data + @property + @networkx_api + def _node(self): + if (node := self.__dict__["_node"]) is None: + self._reify_networkx() + node = self.__dict__["_node"] + return node + + @_node.setter + def _node(self, val): + self._prepare_setter() + _CachedPropertyResetterNode.__set__(None, self, val) + if cache := getattr(self, "__networkx_cache__", None): + cache.clear() - # Used by graph._get_plc_graph - _plc_type_map: ClassVar[dict[np.dtype, np.dtype]] = { - # signed int - np.dtype(np.int8): np.dtype(np.float32), - np.dtype(np.int16): np.dtype(np.float32), - np.dtype(np.int32): np.dtype(np.float64), - np.dtype(np.int64): np.dtype(np.float64), # raise if abs(x) > 2**53 - # unsigned int - np.dtype(np.uint8): np.dtype(np.float32), - np.dtype(np.uint16): np.dtype(np.float32), - np.dtype(np.uint32): np.dtype(np.float64), - np.dtype(np.uint64): np.dtype(np.float64), # raise if x > 2**53 - # other - np.dtype(np.bool_): np.dtype(np.float32), - np.dtype(np.float16): np.dtype(np.float32), - } - _plc_allowed_edge_types: ClassVar[set[np.dtype]] = { - np.dtype(np.float32), - np.dtype(np.float64), - } + @property + @networkx_api + def _adj(self): + if (adj := self.__dict__["_adj"]) is None: + self._reify_networkx() + adj = self.__dict__["_adj"] + return adj + + @_adj.setter + def _adj(self, val): + self._prepare_setter() + _CachedPropertyResetterAdj.__set__(None, self, val) + if cache := getattr(self, "__networkx_cache__", None): + cache.clear() + + @property + def _is_on_gpu(self) -> bool: + """Whether the full graph is on device (in the cache). + + This returns False when only a subset of the graph (such as only + edge indices and edge attribute) is on device. + + The graph may be on host (CPU) and device (GPU) at the same time. + """ + cache = getattr(self, "__networkx_cache__", None) + if not cache: + return False + return _CACHE_KEY in cache.get("backends", {}).get("cugraph", {}) + + @property + def _is_on_cpu(self) -> bool: + """Whether the graph is on host as a NetworkX graph. + + This means the core data structures that comprise a NetworkX graph + (such as ``G._node`` and ``G._adj``) are present. + + The graph may be on host (CPU) and device (GPU) at the same time. + """ + return self.__dict__["_node"] is not None + + @property + def _cudagraph(self): + """Return the full ``CudaGraph`` on device, computing if necessary, or None.""" + nx_cache = getattr(self, "__networkx_cache__", None) + if nx_cache is None: + nx_cache = {} + elif _CANT_CONVERT_TO_GPU in nx_cache: + return None + cache = nx_cache.setdefault("backends", {}).setdefault("cugraph", {}) + if (Gcg := cache.get(_CACHE_KEY)) is not None: + if isinstance(Gcg, Graph): + # This shouldn't happen during normal use, but be extra-careful anyway + return Gcg._cudagraph + return Gcg + if self.__dict__["_node"] is None: + raise RuntimeError( + f"{type(self).__name__} cannot be converted to the GPU, because it is " + "not on the CPU! This is not supposed to be possible. 
If you believe " + "you have found a bug, please report a minimum reproducible example to " + "https://github.com/rapidsai/cugraph/issues/new/choose" + ) + try: + Gcg = nxcg.from_networkx( + self, preserve_edge_attrs=True, preserve_node_attrs=True + ) + except Exception: + # Should we warn that the full graph can't be on GPU? + nx_cache[_CANT_CONVERT_TO_GPU] = True + return None + Gcg.graph = self.graph + cache[_CACHE_KEY] = Gcg + return Gcg + + @_cudagraph.setter + def _cudagraph(self, val, *, clear_cpu=True): + """Set the full ``CudaGraph`` for this graph, or remove from device if None.""" + if (cache := getattr(self, "__networkx_cache__", None)) is None: + # Should we warn? + return + # TODO: pay close attention to when we should clear the cache, since + # this may or may not be a mutation. + cache = cache.setdefault("backends", {}).setdefault("cugraph", {}) + if val is None: + cache.pop(_CACHE_KEY, None) + else: + self.graph = val.graph + cache[_CACHE_KEY] = val + if clear_cpu: + for key in self._nx_attrs: + self.__dict__[key] = None + + @nx.Graph.name.setter + def name(self, s): + # Don't clear the cache when setting the name, since `.graph` is shared. + # There is a very small risk here for the cache to become (slightly) + # inconsistent if graphs from other backends are cached. + self.graph["name"] = s + + @classmethod + @networkx_api + def is_directed(cls) -> bool: + return False + + @classmethod + @networkx_api + def is_multigraph(cls) -> bool: + return False + + @classmethod + def to_cudagraph_class(cls) -> type[CudaGraph]: + return CudaGraph + + @classmethod + @networkx_api + def to_directed_class(cls) -> type[nxcg.DiGraph]: + return nxcg.DiGraph + + @classmethod + def to_networkx_class(cls) -> type[nx.Graph]: + return nx.Graph + + @classmethod + @networkx_api + def to_undirected_class(cls) -> type[Graph]: + return Graph + + def __init__(self, incoming_graph_data=None, **attr): + super().__init__(incoming_graph_data, **attr) + self.__networkx_cache__ = _GraphCache(self) + + def _reify_networkx(self) -> None: + """Copy graph to host (CPU) if necessary.""" + if self.__dict__["_node"] is None: + # After we make this into an nx graph, we rely on the cache being correct + Gcg = self._cudagraph + G = nxcg.to_networkx(Gcg) + for key in self._nx_attrs: + self.__dict__[key] = G.__dict__[key] + + def _become(self, other: Graph): + if self.__class__ is not other.__class__: + raise TypeError( + "Attempting to update graph inplace with graph of different type!" + ) + # Begin with the simplest implementation; do we need to do more?
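Before the remainder of `_become` below, a hedged sketch of the CPU/GPU life cycle that `_cudagraph`, `_is_on_gpu`, and `_reify_networkx` imply (it exercises only the code above):

```python
import nx_cugraph as nxcg

G = nxcg.Graph([(0, 1), (1, 2)])   # starts life as a host-side graph
assert G._is_on_cpu and not G._is_on_gpu
Gcg = G._cudagraph                 # converts once and caches on device
assert Gcg is not None and G._is_on_gpu
assert G._cudagraph is Gcg         # a second access hits the cache
G._reify_networkx()                # no-op: the host dicts are still present
```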
+ self.__dict__.update(other.__dict__) + return self #################### # Creation methods # @@ -109,9 +303,10 @@ def from_coo( *, key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: - new_graph = object.__new__(cls) + ) -> Graph | CudaGraph: + new_graph = object.__new__(cls.to_cudagraph_class()) new_graph.__networkx_cache__ = {} new_graph.src_indices = src_indices new_graph.dst_indices = dst_indices @@ -173,7 +368,8 @@ def from_coo( isolates = nxcg.algorithms.isolate._isolates(new_graph) if len(isolates) > 0: new_graph._node_ids = cp.arange(new_graph._N, dtype=index_dtype) - + if use_compat_graph or use_compat_graph is None and issubclass(cls, Graph): + new_graph = new_graph._to_compat_graph() return new_graph @classmethod @@ -188,8 +384,9 @@ def from_csr( *, key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: + ) -> Graph | CudaGraph: N = indptr.size - 1 src_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead @@ -205,6 +402,7 @@ def from_csr( node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=use_compat_graph, **attr, ) @@ -220,8 +418,9 @@ def from_csc( *, key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: + ) -> Graph | CudaGraph: N = indptr.size - 1 dst_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead @@ -237,6 +436,7 @@ def from_csc( node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=use_compat_graph, **attr, ) @@ -254,8 +454,9 @@ def from_dcsr( *, key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: + ) -> Graph | CudaGraph: src_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead np.repeat(compressed_srcs.get(), cp.diff(indptr).get()) @@ -270,6 +471,7 @@ def from_dcsr( node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=use_compat_graph, **attr, ) @@ -287,8 +489,9 @@ def from_dcsc( *, key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: + ) -> Graph | CudaGraph: dst_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead np.repeat(compressed_dsts.get(), cp.diff(indptr).get()) @@ -303,13 +506,75 @@ def from_dcsc( node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=use_compat_graph, **attr, ) - def __new__(cls, incoming_graph_data=None, **attr) -> Graph: + +class CudaGraph: + # Tell networkx to dispatch calls with this object to nx-cugraph + __networkx_backend__: ClassVar[str] = "cugraph" # nx >=3.2 + __networkx_plugin__: ClassVar[str] = "cugraph" # nx <3.2 + + # Allow networkx dispatch machinery to cache conversions. + # This means we should clear the cache if we ever mutate the object! + __networkx_cache__: dict | None + + # networkx properties + graph: dict + graph_attr_dict_factory: ClassVar[type] = dict + + # Not networkx properties + # We store edge data in COO format with {src,dst}_indices and edge_values. 
+ src_indices: cp.ndarray[IndexValue] + dst_indices: cp.ndarray[IndexValue] + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] + edge_masks: dict[AttrKey, cp.ndarray[bool]] + node_values: dict[AttrKey, any_ndarray[NodeValue]] + node_masks: dict[AttrKey, any_ndarray[bool]] + key_to_id: dict[NodeKey, IndexValue] | None + _id_to_key: list[NodeKey] | None + _N: int + _node_ids: cp.ndarray[IndexValue] | None # holds plc.SGGraph.vertices_array data + + # Used by graph._get_plc_graph + _plc_type_map: ClassVar[dict[np.dtype, np.dtype]] = { + # signed int + np.dtype(np.int8): np.dtype(np.float32), + np.dtype(np.int16): np.dtype(np.float32), + np.dtype(np.int32): np.dtype(np.float64), + np.dtype(np.int64): np.dtype(np.float64), # raise if abs(x) > 2**53 + # unsigned int + np.dtype(np.uint8): np.dtype(np.float32), + np.dtype(np.uint16): np.dtype(np.float32), + np.dtype(np.uint32): np.dtype(np.float64), + np.dtype(np.uint64): np.dtype(np.float64), # raise if x > 2**53 + # other + np.dtype(np.bool_): np.dtype(np.float32), + np.dtype(np.float16): np.dtype(np.float32), + } + _plc_allowed_edge_types: ClassVar[set[np.dtype]] = { + np.dtype(np.float32), + np.dtype(np.float64), + } + + #################### + # Creation methods # + #################### + + from_coo = classmethod(Graph.from_coo.__func__) + from_csr = classmethod(Graph.from_csr.__func__) + from_csc = classmethod(Graph.from_csc.__func__) + from_dcsr = classmethod(Graph.from_dcsr.__func__) + from_dcsc = classmethod(Graph.from_dcsc.__func__) + + def __new__(cls, incoming_graph_data=None, **attr) -> CudaGraph: if incoming_graph_data is None: new_graph = cls.from_coo( - 0, cp.empty(0, index_dtype), cp.empty(0, index_dtype) + 0, + cp.empty(0, index_dtype), + cp.empty(0, index_dtype), + use_compat_graph=False, ) elif incoming_graph_data.__class__ is cls: new_graph = incoming_graph_data.copy() @@ -318,34 +583,30 @@ def __new__(cls, incoming_graph_data=None, **attr) -> Graph: else: raise NotImplementedError new_graph.graph.update(attr) + # We could return Graph here (if configured), but let's not for now return new_graph ################# # Class methods # ################# - @classmethod - @networkx_api - def is_directed(cls) -> bool: - return False + is_directed = classmethod(Graph.is_directed.__func__) + is_multigraph = classmethod(Graph.is_multigraph.__func__) + to_cudagraph_class = classmethod(Graph.to_cudagraph_class.__func__) + to_networkx_class = classmethod(Graph.to_networkx_class.__func__) @classmethod @networkx_api - def is_multigraph(cls) -> bool: - return False + def to_directed_class(cls) -> type[nxcg.CudaDiGraph]: + return nxcg.CudaDiGraph @classmethod @networkx_api - def to_directed_class(cls) -> type[nxcg.DiGraph]: - return nxcg.DiGraph - - @classmethod - def to_networkx_class(cls) -> type[nx.Graph]: - return nx.Graph + def to_undirected_class(cls) -> type[CudaGraph]: + return CudaGraph @classmethod - @networkx_api - def to_undirected_class(cls) -> type[Graph]: + def _to_compat_graph_class(cls) -> type[Graph]: return Graph ############## @@ -438,7 +699,7 @@ def clear_edges(self) -> None: cache.clear() @networkx_api - def copy(self, as_view: bool = False) -> Graph: + def copy(self, as_view: bool = False) -> CudaGraph: # Does shallow copy in networkx return self._copy(as_view, self.__class__) @@ -534,14 +795,19 @@ def size(self, weight: AttrKey | None = None) -> int: return int(cp.count_nonzero(self.src_indices <= self.dst_indices)) @networkx_api - def to_directed(self, as_view: bool = False) -> nxcg.DiGraph: + def to_directed(self, 
as_view: bool = False) -> nxcg.CudaDiGraph: return self._copy(as_view, self.to_directed_class()) @networkx_api - def to_undirected(self, as_view: bool = False) -> Graph: + def to_undirected(self, as_view: bool = False) -> CudaGraph: # Does deep copy in networkx return self._copy(as_view, self.to_undirected_class()) + def _to_compat_graph(self) -> Graph: + rv = self._to_compat_graph_class()() + rv._cudagraph = self + return rv + # Not implemented... # adj, adjacency, add_edge, add_edges_from, add_node, # add_nodes_from, add_weighted_edges_from, degree, @@ -552,8 +818,8 @@ def to_undirected(self, as_view: bool = False) -> Graph: # Private methods # ################### - def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): - # DRY warning: see also MultiGraph._copy + def _copy(self, as_view: bool, cls: type[CudaGraph], reverse: bool = False): + # DRY warning: see also CudaMultiGraph._copy src_indices = self.src_indices dst_indices = self.dst_indices edge_values = self.edge_values @@ -593,6 +859,7 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=False, ) if as_view: rv.graph = self.graph @@ -689,6 +956,14 @@ def _get_plc_graph( src_indices = src_indices.astype(index_dtype) dst_indices = dst_indices.astype(index_dtype) + # This sets drop_multi_edges=True for non-multigraph input, which means + # the data in self.src_indices and self.dst_indices may not be + # identical to that contained in the returned plc.SGGraph (the returned + # SGGraph may have fewer edges since duplicates are dropped). Ideally + # self.src_indices and self.dst_indices would be updated to have + # duplicate edges removed for non-multigraph instances, but that + # requires additional code which would be redundant and likely not as + # performant as the code in PLC. return plc.SGGraph( resource_handle=plc.ResourceHandle(), graph_properties=plc.GraphProperties( @@ -702,10 +977,11 @@ def _get_plc_graph( renumber=False, do_expensive_check=False, vertices_array=self._node_ids, + drop_multi_edges=not self.is_multigraph(), ) def _sort_edge_indices(self, primary="src"): - # DRY warning: see also MultiGraph._sort_edge_indices + # DRY warning: see also CudaMultiGraph._sort_edge_indices if primary == "src": stacked = cp.vstack((self.dst_indices, self.src_indices)) elif primary == "dst": @@ -727,7 +1003,7 @@ def _sort_edge_indices(self, primary="src"): {key: val[indices] for key, val in self.edge_masks.items()} ) - def _become(self, other: Graph): + def _become(self, other: CudaGraph): if self.__class__ is not other.__class__: raise TypeError( "Attempting to update graph inplace with graph of different type!"
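The `drop_multi_edges` comment above has an observable consequence that is easy to miss: the COO arrays and the PLC graph may disagree on edge count. A hedged sketch, using only constructors from this diff (the deduplication itself happens inside PLC when `_get_plc_graph` runs):

```python
import cupy as cp
import nx_cugraph as nxcg
from nx_cugraph.utils import index_dtype

# A non-multigraph whose 0-1 edge is stored twice (in both directions,
# as undirected graphs conventionally are here).
G = nxcg.CudaGraph.from_coo(
    2,
    cp.array([0, 0, 1, 1], dtype=index_dtype),
    cp.array([1, 1, 0, 0], dtype=index_dtype),
    use_compat_graph=False,
)
assert not G.is_multigraph()
# The duplicates stay in the COO arrays; only the plc.SGGraph built by
# G._get_plc_graph() is expected to drop them (drop_multi_edges=True).
assert int(G.src_indices.size) == 4
```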
diff --git a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py b/python/nx-cugraph/nx_cugraph/classes/multidigraph.py index 2e7a55a9eb1..5a6595567d2 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multidigraph.py @@ -16,24 +16,51 @@ import nx_cugraph as nxcg -from .digraph import DiGraph -from .multigraph import MultiGraph +from .digraph import CudaDiGraph, DiGraph +from .graph import Graph +from .multigraph import CudaMultiGraph, MultiGraph -__all__ = ["MultiDiGraph"] +__all__ = ["CudaMultiDiGraph", "MultiDiGraph"] networkx_api = nxcg.utils.decorators.networkx_class(nx.MultiDiGraph) -class MultiDiGraph(MultiGraph, DiGraph): +class MultiDiGraph(nx.MultiDiGraph, MultiGraph, DiGraph): + name = Graph.name + _node = Graph._node + _adj = DiGraph._adj + _succ = DiGraph._succ + _pred = DiGraph._pred + @classmethod @networkx_api def is_directed(cls) -> bool: return True + @classmethod + @networkx_api + def is_multigraph(cls) -> bool: + return True + + @classmethod + def to_cudagraph_class(cls) -> type[CudaMultiDiGraph]: + return CudaMultiDiGraph + @classmethod def to_networkx_class(cls) -> type[nx.MultiDiGraph]: return nx.MultiDiGraph + +class CudaMultiDiGraph(CudaMultiGraph, CudaDiGraph): + is_directed = classmethod(MultiDiGraph.is_directed.__func__) + is_multigraph = classmethod(MultiDiGraph.is_multigraph.__func__) + to_cudagraph_class = classmethod(MultiDiGraph.to_cudagraph_class.__func__) + to_networkx_class = classmethod(MultiDiGraph.to_networkx_class.__func__) + + @classmethod + def _to_compat_graph_class(cls) -> type[MultiDiGraph]: + return MultiDiGraph + ########################## # NetworkX graph methods # ########################## diff --git a/python/nx-cugraph/nx_cugraph/classes/multigraph.py b/python/nx-cugraph/nx_cugraph/classes/multigraph.py index 23d9faa8734..c8c8f1dfb00 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multigraph.py @@ -22,7 +22,7 @@ import nx_cugraph as nxcg from ..utils import index_dtype -from .graph import Graph +from .graph import CudaGraph, Graph, _GraphCache if TYPE_CHECKING: from nx_cugraph.typing import ( @@ -34,32 +34,47 @@ NodeValue, any_ndarray, ) -__all__ = ["MultiGraph"] +__all__ = ["MultiGraph", "CudaMultiGraph"] networkx_api = nxcg.utils.decorators.networkx_class(nx.MultiGraph) -class MultiGraph(Graph): - # networkx properties - edge_key_dict_factory: ClassVar[type] = dict +class MultiGraph(nx.MultiGraph, Graph): + name = Graph.name + _node = Graph._node + _adj = Graph._adj - # Not networkx properties + @classmethod + @networkx_api + def is_directed(cls) -> bool: + return False - # In a MultiGraph, each edge has a unique `(src, dst, key)` key. - # By default, `key` is 0 if possible, else 1, else 2, etc. - # This key can be any hashable Python object in NetworkX. - # We don't use a dict for our data structure here, because - # that would require a `(src, dst, key)` key. - # Instead, we keep `edge_keys` and/or `edge_indices`. - # `edge_keys` is the list of Python objects for each edge. - # `edge_indices` is for the common case of default multiedge keys, - # in which case we can store it as a cupy array. - # `edge_indices` is generally preferred. It is possible to provide - # both where edge_indices is the default and edge_keys is anything. - # It is also possible for them both to be None, which means the - # default edge indices has not yet been calculated. 
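The multi-edge representation described in the comment block above can be exercised directly with `from_coo`; a hedged sketch using only names from this diff:

```python
import cupy as cp
import nx_cugraph as nxcg
from nx_cugraph.utils import index_dtype

# Two parallel 0-1 edges (stored in both directions), distinguished by
# the default integer keys 0 and 1 in the compact edge_indices array.
G = nxcg.CudaMultiGraph.from_coo(
    2,
    cp.array([0, 0, 1, 1], dtype=index_dtype),
    cp.array([1, 1, 0, 0], dtype=index_dtype),
    edge_indices=cp.array([0, 1, 0, 1], dtype=index_dtype),
    use_compat_graph=False,
)
assert G.edge_indices is not None  # compact cupy representation
assert G.edge_keys is None         # no arbitrary hashable keys needed
```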
- edge_indices: cp.ndarray[IndexValue] | None - edge_keys: list[EdgeKey] | None + @classmethod + @networkx_api + def is_multigraph(cls) -> bool: + return True + + @classmethod + def to_cudagraph_class(cls) -> type[CudaMultiGraph]: + return CudaMultiGraph + + @classmethod + @networkx_api + def to_directed_class(cls) -> type[nxcg.MultiDiGraph]: + return nxcg.MultiDiGraph + + @classmethod + def to_networkx_class(cls) -> type[nx.MultiGraph]: + return nx.MultiGraph + + @classmethod + @networkx_api + def to_undirected_class(cls) -> type[MultiGraph]: + return MultiGraph + + def __init__(self, incoming_graph_data=None, multigraph_input=None, **attr): + super().__init__(incoming_graph_data, multigraph_input, **attr) + self.__networkx_cache__ = _GraphCache(self) #################### # Creation methods # @@ -80,9 +95,10 @@ def from_coo( key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, edge_keys: list[EdgeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> MultiGraph: - new_graph = super().from_coo( + ) -> MultiGraph | CudaMultiGraph: + new_graph = super(cls.to_undirected_class(), cls).from_coo( N, src_indices, dst_indices, @@ -92,6 +108,7 @@ def from_coo( node_masks, key_to_id=key_to_id, id_to_key=id_to_key, + use_compat_graph=False, **attr, ) new_graph.edge_indices = edge_indices @@ -102,6 +119,8 @@ def from_coo( and len(new_graph.edge_keys) != src_indices.size ): raise ValueError + if use_compat_graph or use_compat_graph is None and issubclass(cls, Graph): + new_graph = new_graph._to_compat_graph() return new_graph @classmethod @@ -118,8 +137,9 @@ def from_csr( key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, edge_keys: list[EdgeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> MultiGraph: + ) -> MultiGraph | CudaMultiGraph: N = indptr.size - 1 src_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead @@ -137,6 +157,7 @@ def from_csr( key_to_id=key_to_id, id_to_key=id_to_key, edge_keys=edge_keys, + use_compat_graph=use_compat_graph, **attr, ) @@ -154,8 +175,9 @@ def from_csc( key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, edge_keys: list[EdgeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> MultiGraph: + ) -> MultiGraph | CudaMultiGraph: N = indptr.size - 1 dst_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead @@ -173,6 +195,7 @@ def from_csc( key_to_id=key_to_id, id_to_key=id_to_key, edge_keys=edge_keys, + use_compat_graph=use_compat_graph, **attr, ) @@ -192,8 +215,9 @@ def from_dcsr( key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, edge_keys: list[EdgeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> MultiGraph: + ) -> MultiGraph | CudaMultiGraph: src_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead np.repeat(compressed_srcs.get(), cp.diff(indptr).get()) @@ -210,6 +234,7 @@ def from_dcsr( key_to_id=key_to_id, id_to_key=id_to_key, edge_keys=edge_keys, + use_compat_graph=use_compat_graph, **attr, ) @@ -229,8 +254,9 @@ def from_dcsc( key_to_id: dict[NodeKey, IndexValue] | None = None, id_to_key: list[NodeKey] | None = None, edge_keys: list[EdgeKey] | None = None, + use_compat_graph: bool | None = None, **attr, - ) -> Graph: + ) -> MultiGraph | CudaGraph: dst_indices = cp.array( # cp.repeat is slow to use here, so use numpy instead np.repeat(compressed_dsts.get(), 
cp.diff(indptr).get()) @@ -247,12 +273,46 @@ def from_dcsc( key_to_id=key_to_id, id_to_key=id_to_key, edge_keys=edge_keys, + use_compat_graph=use_compat_graph, **attr, ) + +class CudaMultiGraph(CudaGraph): + # networkx properties + edge_key_dict_factory: ClassVar[type] = dict + + # Not networkx properties + + # In a MultiGraph, each edge has a unique `(src, dst, key)` key. + # By default, `key` is 0 if possible, else 1, else 2, etc. + # This key can be any hashable Python object in NetworkX. + # We don't use a dict for our data structure here, because + # that would require a `(src, dst, key)` key. + # Instead, we keep `edge_keys` and/or `edge_indices`. + # `edge_keys` is the list of Python objects for each edge. + # `edge_indices` is for the common case of default multiedge keys, + # in which case we can store it as a cupy array. + # `edge_indices` is generally preferred. It is possible to provide + # both where edge_indices is the default and edge_keys is anything. + # It is also possible for them both to be None, which means the + # default edge indices has not yet been calculated. + edge_indices: cp.ndarray[IndexValue] | None + edge_keys: list[EdgeKey] | None + + #################### + # Creation methods # + #################### + + from_coo = classmethod(MultiGraph.from_coo.__func__) + from_csr = classmethod(MultiGraph.from_csr.__func__) + from_csc = classmethod(MultiGraph.from_csc.__func__) + from_dcsr = classmethod(MultiGraph.from_dcsr.__func__) + from_dcsc = classmethod(MultiGraph.from_dcsc.__func__) + def __new__( cls, incoming_graph_data=None, multigraph_input=None, **attr - ) -> MultiGraph: + ) -> CudaMultiGraph: if isinstance(incoming_graph_data, dict) and multigraph_input is not False: new_graph = nxcg.from_networkx( nx.MultiGraph(incoming_graph_data, multigraph_input=multigraph_input), @@ -267,28 +327,23 @@ def __new__( # Class methods # ################# - @classmethod - @networkx_api - def is_directed(cls) -> bool: - return False + is_directed = classmethod(MultiGraph.is_directed.__func__) + is_multigraph = classmethod(MultiGraph.is_multigraph.__func__) + to_cudagraph_class = classmethod(MultiGraph.to_cudagraph_class.__func__) + to_networkx_class = classmethod(MultiGraph.to_networkx_class.__func__) @classmethod @networkx_api - def is_multigraph(cls) -> bool: - return True + def to_directed_class(cls) -> type[nxcg.CudaMultiDiGraph]: + return nxcg.CudaMultiDiGraph @classmethod @networkx_api - def to_directed_class(cls) -> type[nxcg.MultiDiGraph]: - return nxcg.MultiDiGraph - - @classmethod - def to_networkx_class(cls) -> type[nx.MultiGraph]: - return nx.MultiGraph + def to_undirected_class(cls) -> type[CudaMultiGraph]: + return CudaMultiGraph @classmethod - @networkx_api - def to_undirected_class(cls) -> type[MultiGraph]: + def _to_compat_graph_class(cls) -> type[MultiGraph]: return MultiGraph ########################## @@ -308,7 +363,7 @@ def clear_edges(self) -> None: self.edge_keys = None @networkx_api - def copy(self, as_view: bool = False) -> MultiGraph: + def copy(self, as_view: bool = False) -> CudaMultiGraph: # Does shallow copy in networkx return self._copy(as_view, self.__class__) @@ -391,11 +446,11 @@ def has_edge(self, u: NodeKey, v: NodeKey, key: EdgeKey | None = None) -> bool: return any(edge_keys[i] == key for i in indices.tolist()) @networkx_api - def to_directed(self, as_view: bool = False) -> nxcg.MultiDiGraph: + def to_directed(self, as_view: bool = False) -> nxcg.CudaMultiDiGraph: return self._copy(as_view, self.to_directed_class()) @networkx_api - def 
to_undirected(self, as_view: bool = False) -> MultiGraph: + def to_undirected(self, as_view: bool = False) -> CudaMultiGraph: # Does deep copy in networkx return self._copy(as_view, self.to_undirected_class()) @@ -403,8 +458,8 @@ def to_undirected(self, as_view: bool = False) -> MultiGraph: # Private methods # ################### - def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): - # DRY warning: see also Graph._copy + def _copy(self, as_view: bool, cls: type[CudaGraph], reverse: bool = False): + # DRY warning: see also CudaGraph._copy src_indices = self.src_indices dst_indices = self.dst_indices edge_indices = self.edge_indices @@ -451,6 +506,7 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): key_to_id=key_to_id, id_to_key=id_to_key, edge_keys=edge_keys, + use_compat_graph=False, ) if as_view: rv.graph = self.graph @@ -460,7 +516,7 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): return rv def _sort_edge_indices(self, primary="src"): - # DRY warning: see also Graph._sort_edge_indices + # DRY warning: see also CudaGraph._sort_edge_indices if self.edge_indices is None and self.edge_keys is None: return super()._sort_edge_indices(primary=primary) if primary == "src": diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index 56d16d837d7..a872f13ac70 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -12,6 +12,7 @@ # limitations under the License. from __future__ import annotations +import functools import itertools import operator as op from collections import Counter, defaultdict @@ -23,9 +24,13 @@ import numpy as np import nx_cugraph as nxcg +from nx_cugraph import _nxver from .utils import index_dtype, networkx_algorithm -from .utils.misc import pairwise +from .utils.misc import _And_NotImplementedError, pairwise + +if _nxver >= (3, 4): + from networkx.utils.backends import _get_cache_key, _get_from_cache, _set_to_cache if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue, any_ndarray @@ -60,6 +65,27 @@ def _iterate_values(graph, adj, is_dicts, func): return func(it), False +# Consider adding this to `utils` if it is useful elsewhere +def _fallback_decorator(func): + """Catch and convert exceptions to ``NotImplementedError``; use as a decorator. + + ``nx.NetworkXError`` is raised without being converted. This allows + falling back to other backends if, for example, conversion to GPU fails. + """ + + @functools.wraps(func) + def inner(*args, **kwargs): + try: + return func(*args, **kwargs) + except nx.NetworkXError: + raise + except Exception as exc: + raise _And_NotImplementedError(exc) from exc + + return inner + + +@_fallback_decorator def from_networkx( graph: nx.Graph, edge_attrs: AttrKey | dict[AttrKey, EdgeValue | None] | None = None, @@ -74,7 +100,8 @@ def from_networkx( as_directed: bool = False, name: str | None = None, graph_name: str | None = None, -) -> nxcg.Graph: + use_compat_graph: bool | None = False, +) -> nxcg.Graph | nxcg.CudaGraph: """Convert a networkx graph to nx_cugraph graph; can convert all attributes. Parameters ---------- @@ -114,10 +141,16 @@ The name of the algorithm when dispatched from networkx. graph_name : str, optional The name of the graph argument being converted when dispatched from networkx. + use_compat_graph : bool or None, default False + Indicate whether to return a graph that is compatible with a NetworkX graph.
+ For example, ``nx_cugraph.Graph`` can be used as a NetworkX graph and can + reside in host (CPU) or device (GPU) memory. The default is False, which + will return e.g. ``nx_cugraph.CudaGraph`` that only resides on device (GPU) + and is not fully compatible as a NetworkX graph. Returns ------- - nx_cugraph.Graph + nx_cugraph.Graph or nx_cugraph.CudaGraph Notes ----- @@ -145,6 +178,41 @@ def from_networkx( graph = G else: raise TypeError(f"Expected networkx.Graph; got {type(graph)}") + elif isinstance(graph, nxcg.Graph): + if ( + use_compat_graph + # Use compat graphs by default + or use_compat_graph is None + and (_nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs) + ): + return graph + if graph._is_on_gpu: + return graph._cudagraph + if not graph._is_on_cpu: + raise RuntimeError( + f"{type(graph).__name__} cannot be converted to the GPU, because it is " + "not on the CPU! This is not supposed to be possible. If you believe " + "you have found a bug, please report a minimum reproducible example to " + "https://github.com/rapidsai/cugraph/issues/new/choose" + ) + if _nxver >= (3, 4): + cache_key = _get_cache_key( + edge_attrs=edge_attrs, + node_attrs=node_attrs, + preserve_edge_attrs=preserve_edge_attrs, + preserve_node_attrs=preserve_node_attrs, + preserve_graph_attrs=preserve_graph_attrs, + ) + cache = getattr(graph, "__networkx_cache__", None) + if cache is not None: + cache = cache.setdefault("backends", {}).setdefault("cugraph", {}) + compat_key, rv = _get_from_cache(cache, cache_key) + if rv is not None: + if isinstance(rv, nxcg.Graph): + # This shouldn't happen during normal use, but be extra-careful + rv = rv._cudagraph + if rv is not None: + return rv if preserve_all_attrs: preserve_edge_attrs = True @@ -165,7 +233,12 @@ def from_networkx( else: node_attrs = {node_attrs: None} - if graph.__class__ in {nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph}: + if graph.__class__ in { + nx.Graph, + nx.DiGraph, + nx.MultiGraph, + nx.MultiDiGraph, + } or isinstance(graph, nxcg.Graph): # This is a NetworkX private attribute, but is much faster to use adj = graph._adj else: @@ -455,9 +528,9 @@ def func(it, edge_attr=edge_attr, dtype=dtype): # if vals.ndim > 1: ... if graph.is_multigraph(): if graph.is_directed() or as_directed: - klass = nxcg.MultiDiGraph + klass = nxcg.CudaMultiDiGraph else: - klass = nxcg.MultiGraph + klass = nxcg.CudaMultiGraph rv = klass.from_coo( N, src_indices, @@ -469,12 +542,13 @@ def func(it, edge_attr=edge_attr, dtype=dtype): node_masks, key_to_id=key_to_id, edge_keys=edge_keys, + use_compat_graph=False, ) else: if graph.is_directed() or as_directed: - klass = nxcg.DiGraph + klass = nxcg.CudaDiGraph else: - klass = nxcg.Graph + klass = nxcg.CudaGraph rv = klass.from_coo( N, src_indices, @@ -484,9 +558,22 @@ def func(it, edge_attr=edge_attr, dtype=dtype): node_values, node_masks, key_to_id=key_to_id, + use_compat_graph=False, ) if preserve_graph_attrs: rv.graph.update(graph.graph) # deepcopy? + if _nxver >= (3, 4) and isinstance(graph, nxcg.Graph) and cache is not None: + # Make sure this conversion is added to the cache, and make all of + # our graphs share the same `.graph` attribute for consistency. 
+ rv.graph = graph.graph + _set_to_cache(cache, cache_key, rv) + if ( + use_compat_graph + # Use compat graphs by default + or use_compat_graph is None + and (_nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs) + ): + return rv._to_compat_graph() return rv @@ -535,14 +622,16 @@ def _iter_attr_dicts( return full_dicts -def to_networkx(G: nxcg.Graph, *, sort_edges: bool = False) -> nx.Graph: +def to_networkx( + G: nxcg.Graph | nxcg.CudaGraph, *, sort_edges: bool = False +) -> nx.Graph: """Convert a nx_cugraph graph to networkx graph. All edge and node attributes and ``G.graph`` properties are converted. Parameters ---------- - G : nx_cugraph.Graph + G : nx_cugraph.Graph or nx_cugraph.CudaGraph sort_edges : bool, default False Whether to sort the edge data of the input graph by (src, dst) indices before converting. This can be useful to convert to networkx graphs @@ -557,6 +646,9 @@ def to_networkx(G: nxcg.Graph, *, sort_edges: bool = False) -> nx.Graph: -------- from_networkx : The opposite; convert networkx graph to nx_cugraph graph """ + if isinstance(G, nxcg.Graph): + # These graphs are already NetworkX graphs :) + return G rv = G.to_networkx_class()() id_to_key = G.id_to_key if sort_edges: @@ -623,13 +715,13 @@ def _to_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> nxcg.Graph | nxcg.DiGraph: +) -> nxcg.CudaGraph | nxcg.CudaDiGraph: """Ensure that input type is a nx_cugraph graph, and convert if necessary. Directed and undirected graphs are both allowed. This is an internal utility function and may change or be removed. """ - if isinstance(G, nxcg.Graph): + if isinstance(G, nxcg.CudaGraph): return G if isinstance(G, nx.Graph): return from_networkx( @@ -644,15 +736,15 @@ def _to_directed_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> nxcg.DiGraph: - """Ensure that input type is a nx_cugraph DiGraph, and convert if necessary. +) -> nxcg.CudaDiGraph: + """Ensure that input type is a nx_cugraph CudaDiGraph, and convert if necessary. Undirected graphs will be converted to directed. This is an internal utility function and may change or be removed. """ - if isinstance(G, nxcg.DiGraph): + if isinstance(G, nxcg.CudaDiGraph): return G - if isinstance(G, nxcg.Graph): + if isinstance(G, nxcg.CudaGraph): return G.to_directed() if isinstance(G, nx.Graph): return from_networkx( @@ -670,13 +762,13 @@ def _to_undirected_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> nxcg.Graph: - """Ensure that input type is a nx_cugraph Graph, and convert if necessary. +) -> nxcg.CudaGraph: + """Ensure that input type is a nx_cugraph CudaGraph, and convert if necessary. Only undirected graphs are allowed. Directed graphs will raise ValueError. This is an internal utility function and may change or be removed. 
""" - if isinstance(G, nxcg.Graph): + if isinstance(G, nxcg.CudaGraph): if G.is_directed(): raise ValueError("Only undirected graphs supported; got a directed graph") return G @@ -688,7 +780,7 @@ def _to_undirected_graph( raise TypeError -@networkx_algorithm(version_added="24.08") +@networkx_algorithm(version_added="24.08", fallback=True) def from_dict_of_lists(d, create_using=None): from .generators._utils import _create_using_class diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py index 38139b913cf..54975902861 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -14,6 +14,8 @@ import networkx as nx import numpy as np +from nx_cugraph import _nxver + from .generators._utils import _create_using_class from .utils import _cp_iscopied_asarray, index_dtype, networkx_algorithm @@ -24,7 +26,7 @@ # Value columns with string dtype is not supported -@networkx_algorithm(is_incomplete=True, version_added="23.12") +@networkx_algorithm(is_incomplete=True, version_added="23.12", fallback=True) def from_pandas_edgelist( df, source="source", @@ -138,7 +140,7 @@ def from_pandas_edgelist( and ( # In nx <= 3.3, `edge_key` was ignored if `edge_attr` is None edge_attr is not None - or nx.__version__[:3] > "3.3" + or _nxver > (3, 3) ) ): try: @@ -161,7 +163,7 @@ def from_pandas_edgelist( return G -@networkx_algorithm(version_added="23.12") +@networkx_algorithm(version_added="23.12", fallback=True) def from_scipy_sparse_array( A, parallel_edges=False, create_using=None, edge_attribute="weight" ): diff --git a/python/nx-cugraph/nx_cugraph/generators/_utils.py b/python/nx-cugraph/nx_cugraph/generators/_utils.py index e38ace5b28d..bc9ab84bdad 100644 --- a/python/nx-cugraph/nx_cugraph/generators/_utils.py +++ b/python/nx-cugraph/nx_cugraph/generators/_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,6 +16,7 @@ import networkx as nx import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import index_dtype @@ -74,7 +75,7 @@ def _common_small_graph(n, nodes, create_using, *, allow_directed=True): return G -def _create_using_class(create_using, *, default=nxcg.Graph): +def _create_using_class(create_using, *, default=nx.Graph): """Handle ``create_using`` argument and return a Graph type from nx_cugraph.""" inplace = False if create_using is None: @@ -85,16 +86,17 @@ def _create_using_class(create_using, *, default=nxcg.Graph): create_using, "is_multigraph" ): raise TypeError("create_using is not a valid graph type or instance") - elif not isinstance(create_using, nxcg.Graph): + elif not isinstance(create_using, (nxcg.Graph, nxcg.CudaGraph)): raise NotImplementedError( f"create_using with object of type {type(create_using)} is not supported " - "by the cugraph backend; only nx_cugraph.Graph objects are allowed." + "by the cugraph backend; only nx_cugraph.Graph or nx_cugraph.CudaGraph " + "objects are allowed." 
) else: inplace = True G = create_using G.clear() - if not isinstance(G, nxcg.Graph): + if not isinstance(G, (nxcg.Graph, nxcg.CudaGraph)): if G.is_multigraph(): if G.is_directed(): graph_class = nxcg.MultiDiGraph @@ -104,10 +106,12 @@ def _create_using_class(create_using, *, default=nxcg.Graph): graph_class = nxcg.DiGraph else: graph_class = nxcg.Graph + if _nxver >= (3, 3) and not nx.config.backends.cugraph.use_compat_graphs: + graph_class = graph_class.to_cudagraph_class() if G.__class__ not in {nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph}: raise NotImplementedError( f"create_using with type {type(G)} is not supported by the cugraph " - "backend; only standard networkx or nx_cugraph Graph objects are " + "backend; only standard networkx or nx_cugraph graph objects are " "allowed (but not customized subclasses derived from them)." ) else: diff --git a/python/nx-cugraph/nx_cugraph/generators/classic.py b/python/nx-cugraph/nx_cugraph/generators/classic.py index a548beea34f..cfcb2a3afec 100644 --- a/python/nx-cugraph/nx_cugraph/generators/classic.py +++ b/python/nx-cugraph/nx_cugraph/generators/classic.py @@ -18,6 +18,7 @@ import numpy as np import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import _get_int_dtype, index_dtype, networkx_algorithm from ._utils import ( @@ -102,7 +103,9 @@ def complete_graph(n, create_using=None): @networkx_algorithm(version_added="23.12") def complete_multipartite_graph(*subset_sizes): if not subset_sizes: - return nxcg.Graph() + if _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs: + return nxcg.Graph() + return nxcg.CudaGraph() try: subset_sizes = [_ensure_int(size) for size in subset_sizes] except TypeError: @@ -139,6 +142,8 @@ def complete_multipartite_graph(*subset_sizes): dst_indices, node_values={"subset": subsets_array}, id_to_key=nodes, + use_compat_graph=_nxver < (3, 3) + or nx.config.backends.cugraph.use_compat_graphs, ) diff --git a/python/nx-cugraph/nx_cugraph/generators/community.py b/python/nx-cugraph/nx_cugraph/generators/community.py index 9b0e0848de9..4e5063cc345 100644 --- a/python/nx-cugraph/nx_cugraph/generators/community.py +++ b/python/nx-cugraph/nx_cugraph/generators/community.py @@ -11,8 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import cupy as cp +import networkx as nx import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import networkx_algorithm from ._utils import ( @@ -42,4 +44,7 @@ def caveman_graph(l, k): # noqa: E741 dst_cliques.extend(dst_clique + i * k for i in range(1, l)) src_indices = cp.hstack(src_cliques) dst_indices = cp.hstack(dst_cliques) - return nxcg.Graph.from_coo(l * k, src_indices, dst_indices) + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( + l * k, src_indices, dst_indices, use_compat_graph=use_compat_graph + ) diff --git a/python/nx-cugraph/nx_cugraph/generators/ego.py b/python/nx-cugraph/nx_cugraph/generators/ego.py index 66c9c8b95ee..9a91fa0b6c3 100644 --- a/python/nx-cugraph/nx_cugraph/generators/ego.py +++ b/python/nx-cugraph/nx_cugraph/generators/ego.py @@ -32,7 +32,10 @@ def ego_graph( ): """Weighted ego_graph with negative cycles is not yet supported. 
`NotImplementedError` will be raised if there are negative `distance` edge weights.""" # noqa: E501 if isinstance(G, nx.Graph): + is_compat_graph = isinstance(G, nxcg.Graph) G = nxcg.from_networkx(G, preserve_all_attrs=True) + else: + is_compat_graph = False if n not in G: if distance is None: raise nx.NodeNotFound(f"Source {n} is not in G") @@ -100,7 +103,10 @@ def ego_graph( node_mask &= node_ids != src_index node_ids = node_ids[node_mask] if node_ids.size == G._N: - return G.copy() + rv = G.copy() + if is_compat_graph: + return rv._to_compat_graph() + return rv # TODO: create renumbering helper function(s) node_ids.sort() # TODO: is this ever necessary? Keep for safety node_values = {key: val[node_ids] for key, val in G.node_values.items()} @@ -137,6 +143,7 @@ def ego_graph( "node_values": node_values, "node_masks": node_masks, "key_to_id": key_to_id, + "use_compat_graph": False, } if G.is_multigraph(): if G.edge_keys is not None: @@ -147,6 +154,8 @@ def ego_graph( kwargs["edge_indices"] = G.edge_indices[edge_mask] rv = G.__class__.from_coo(**kwargs) rv.graph.update(G.graph) + if is_compat_graph: + return rv._to_compat_graph() return rv diff --git a/python/nx-cugraph/nx_cugraph/generators/small.py b/python/nx-cugraph/nx_cugraph/generators/small.py index 45487571cda..d0c03cb7dd4 100644 --- a/python/nx-cugraph/nx_cugraph/generators/small.py +++ b/python/nx-cugraph/nx_cugraph/generators/small.py @@ -14,6 +14,7 @@ import networkx as nx import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import index_dtype, networkx_algorithm from ._utils import _IS_NX32_OR_LESS, _create_using_class @@ -449,7 +450,14 @@ def pappus_graph(): index_dtype, ) # fmt: on - return nxcg.Graph.from_coo(18, src_indices, dst_indices, name="Pappus Graph") + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( + 18, + src_indices, + dst_indices, + name="Pappus Graph", + use_compat_graph=use_compat_graph, + ) @networkx_algorithm(version_added="23.12") diff --git a/python/nx-cugraph/nx_cugraph/generators/social.py b/python/nx-cugraph/nx_cugraph/generators/social.py index 07e82c63fbf..09d405e7561 100644 --- a/python/nx-cugraph/nx_cugraph/generators/social.py +++ b/python/nx-cugraph/nx_cugraph/generators/social.py @@ -11,9 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
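Each generator touched in this diff repeats the same gating expression before choosing its return type. A hedged sketch of that pattern applied to a made-up generator (`_tiny_path_graph` is hypothetical; everything else comes from this diff):

```python
import cupy as cp
import networkx as nx
import nx_cugraph as nxcg
from nx_cugraph import _nxver
from nx_cugraph.utils import index_dtype


def _tiny_path_graph():
    """Hypothetical generator following the pattern used in this diff."""
    # NetworkX < 3.3 has no backend config, so compat graphs are always
    # used there; otherwise honor the user's use_compat_graphs setting.
    use_compat_graph = (
        _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
    )
    return nxcg.CudaGraph.from_coo(
        3,
        cp.array([0, 1, 1, 2], dtype=index_dtype),  # both directions
        cp.array([1, 0, 2, 1], dtype=index_dtype),
        use_compat_graph=use_compat_graph,
    )
```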
import cupy as cp +import networkx as nx import numpy as np import nx_cugraph as nxcg +from nx_cugraph import _nxver from ..utils import index_dtype, networkx_algorithm @@ -77,7 +79,8 @@ def davis_southern_women_graph(): "E13", "E14", ] # fmt: on - return nxcg.Graph.from_coo( + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( 32, src_indices, dst_indices, @@ -85,6 +88,7 @@ def davis_southern_women_graph(): id_to_key=women + events, top=women, bottom=events, + use_compat_graph=use_compat_graph, ) @@ -111,7 +115,14 @@ def florentine_families_graph(): "Salviati", "Strozzi", "Tornabuoni" ] # fmt: on - return nxcg.Graph.from_coo(15, src_indices, dst_indices, id_to_key=nodes) + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( + 15, + src_indices, + dst_indices, + id_to_key=nodes, + use_compat_graph=use_compat_graph, + ) @networkx_algorithm(version_added="23.12") @@ -165,13 +176,15 @@ def karate_club_graph(): "Officer", "Officer", "Officer", "Officer", "Officer", "Officer", ]) # fmt: on - return nxcg.Graph.from_coo( + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( 34, src_indices, dst_indices, edge_values={"weight": weights}, node_values={"club": clubs}, name="Zachary's Karate Club", + use_compat_graph=use_compat_graph, ) @@ -289,6 +302,12 @@ def les_miserables_graph(): "Zephine", ] # fmt: on - return nxcg.Graph.from_coo( - 77, src_indices, dst_indices, edge_values={"weight": weights}, id_to_key=nodes + use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + return nxcg.CudaGraph.from_coo( + 77, + src_indices, + dst_indices, + edge_values={"weight": weights}, + id_to_key=nodes, + use_compat_graph=use_compat_graph, ) diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index 4007230efa9..1a3d08409a2 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -18,6 +18,7 @@ import networkx as nx import nx_cugraph as nxcg +from nx_cugraph import _nxver class BackendInterface: @@ -32,11 +33,19 @@ def convert_from_nx(graph, *args, edge_attrs=None, weight=None, **kwargs): "edge_attrs and weight arguments should not both be given" ) edge_attrs = {weight: 1} - return nxcg.from_networkx(graph, *args, edge_attrs=edge_attrs, **kwargs) + return nxcg.from_networkx( + graph, + *args, + edge_attrs=edge_attrs, + use_compat_graph=_nxver < (3, 3) + or nx.config.backends.cugraph.use_compat_graphs, + **kwargs, + ) @staticmethod def convert_to_nx(obj, *, name: str | None = None): - if isinstance(obj, nxcg.Graph): + if isinstance(obj, nxcg.CudaGraph): + # Observe that this does not try to convert Graph! 
return nxcg.to_networkx(obj) return obj @@ -62,19 +71,32 @@ def key(testpath): return (testname, frozenset({classname, filename})) return (testname, frozenset({filename})) + use_compat_graph = ( + _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + ) + fallback = use_compat_graph or nx.utils.backends._dispatchable._fallback_to_nx + # Reasons for xfailing + # For nx version <= 3.1 no_weights = "weighted implementation not currently supported" no_multigraph = "multigraphs not currently supported" + # For nx version <= 3.2 + nx_cugraph_in_test_setup = ( + "nx-cugraph Graph is incompatible in test setup in nx versions < 3.3" + ) + # For all versions louvain_different = "Louvain may be different due to RNG" - no_string_dtype = "string edge values not currently supported" sssp_path_different = "sssp may choose a different valid path" + tuple_elements_preferred = "elements are tuples instead of lists" + no_mixed_dtypes_for_nodes = ( + # This one is tricky b/c we don't raise; all dtypes are treated as str + "mixed dtypes (str, int, float) for single node property not supported" + ) + # These shouldn't fail if using Graph or falling back to networkx + no_string_dtype = "string edge values not currently supported" no_object_dtype_for_edges = ( "Edges don't support object dtype (lists, strings, etc.)" ) - tuple_elements_preferred = "elements are tuples instead of lists" - nx_cugraph_in_test_setup = ( - "nx-cugraph Graph is incompatible in test setup in nx versions < 3.3" - ) xfail = { # This is removed while strongly_connected_components() is not @@ -98,38 +120,6 @@ def key(testpath): "test_cycles.py:TestMinimumCycleBasis." "test_gh6787_and_edge_attribute_names" ): sssp_path_different, - key( - "test_graph_hashing.py:test_isomorphic_edge_attr" - ): no_object_dtype_for_edges, - key( - "test_graph_hashing.py:test_isomorphic_edge_attr_and_node_attr" - ): no_object_dtype_for_edges, - key( - "test_graph_hashing.py:test_isomorphic_edge_attr_subgraph_hash" - ): no_object_dtype_for_edges, - key( - "test_graph_hashing.py:" - "test_isomorphic_edge_attr_and_node_attr_subgraph_hash" - ): no_object_dtype_for_edges, - key( - "test_summarization.py:TestSNAPNoEdgeTypes.test_summary_graph" - ): no_object_dtype_for_edges, - key( - "test_summarization.py:TestSNAPUndirected.test_summary_graph" - ): no_object_dtype_for_edges, - key( - "test_summarization.py:TestSNAPDirected.test_summary_graph" - ): no_object_dtype_for_edges, - key("test_gexf.py:TestGEXF.test_relabel"): no_object_dtype_for_edges, - key( - "test_gml.py:TestGraph.test_parse_gml_cytoscape_bug" - ): no_object_dtype_for_edges, - key("test_gml.py:TestGraph.test_parse_gml"): no_object_dtype_for_edges, - key("test_gml.py:TestGraph.test_read_gml"): no_object_dtype_for_edges, - key("test_gml.py:TestGraph.test_data_types"): no_object_dtype_for_edges, - key( - "test_gml.py:TestPropertyLists.test_reading_graph_with_list_property" - ): no_object_dtype_for_edges, key( "test_relabel.py:" "test_relabel_preserve_node_order_partial_mapping_with_copy_false" @@ -138,48 +128,107 @@ def key(testpath): "test_gml.py:" "TestPropertyLists.test_reading_graph_with_single_element_list_property" ): tuple_elements_preferred, - key( - "test_relabel.py:" - "TestRelabel.test_relabel_multidigraph_inout_merge_nodes" - ): no_string_dtype, - key( - "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_inplace" - ): no_string_dtype, - key( - "test_relabel.py:TestRelabel.test_relabel_multidigraph_merge_inplace" - ): no_string_dtype, - key( - 
"test_relabel.py:TestRelabel.test_relabel_multidigraph_inout_copy" - ): no_string_dtype, - key( - "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_copy" - ): no_string_dtype, - key( - "test_relabel.py:TestRelabel.test_relabel_multidigraph_merge_copy" - ): no_string_dtype, - key( - "test_relabel.py:TestRelabel.test_relabel_multigraph_nonnumeric_key" - ): no_string_dtype, - key("test_contraction.py:test_multigraph_path"): no_object_dtype_for_edges, - key( - "test_contraction.py:test_directed_multigraph_path" - ): no_object_dtype_for_edges, - key( - "test_contraction.py:test_multigraph_blockmodel" - ): no_object_dtype_for_edges, - key( - "test_summarization.py:TestSNAPUndirectedMulti.test_summary_graph" - ): no_string_dtype, - key( - "test_summarization.py:TestSNAPDirectedMulti.test_summary_graph" - ): no_string_dtype, } + if not fallback: + xfail.update( + { + key( + "test_graph_hashing.py:test_isomorphic_edge_attr" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:test_isomorphic_edge_attr_and_node_attr" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:test_isomorphic_edge_attr_subgraph_hash" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:" + "test_isomorphic_edge_attr_and_node_attr_subgraph_hash" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPNoEdgeTypes.test_summary_graph" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPUndirected.test_summary_graph" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPDirected.test_summary_graph" + ): no_object_dtype_for_edges, + key( + "test_gexf.py:TestGEXF.test_relabel" + ): no_object_dtype_for_edges, + key( + "test_gml.py:TestGraph.test_parse_gml_cytoscape_bug" + ): no_object_dtype_for_edges, + key( + "test_gml.py:TestGraph.test_parse_gml" + ): no_object_dtype_for_edges, + key( + "test_gml.py:TestGraph.test_read_gml" + ): no_object_dtype_for_edges, + key( + "test_gml.py:TestGraph.test_data_types" + ): no_object_dtype_for_edges, + key( + "test_gml.py:" + "TestPropertyLists.test_reading_graph_with_list_property" + ): no_object_dtype_for_edges, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multidigraph_inout_merge_nodes" + ): no_string_dtype, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multigraph_merge_inplace" + ): no_string_dtype, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multidigraph_merge_inplace" + ): no_string_dtype, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multidigraph_inout_copy" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_copy" + ): no_string_dtype, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multidigraph_merge_copy" + ): no_string_dtype, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multigraph_nonnumeric_key" + ): no_string_dtype, + key( + "test_contraction.py:test_multigraph_path" + ): no_object_dtype_for_edges, + key( + "test_contraction.py:test_directed_multigraph_path" + ): no_object_dtype_for_edges, + key( + "test_contraction.py:test_multigraph_blockmodel" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:" + "TestSNAPUndirectedMulti.test_summary_graph" + ): no_string_dtype, + key( + "test_summarization.py:TestSNAPDirectedMulti.test_summary_graph" + ): no_string_dtype, + } + ) + else: + xfail.update( + { + key( + "test_gml.py:" + "TestPropertyLists.test_reading_graph_with_list_property" + ): no_mixed_dtypes_for_nodes, + } + ) - from packaging.version import parse - - nxver = 
parse(nx.__version__) - - if nxver.major == 3 and nxver.minor <= 2: + if _nxver <= (3, 2): xfail.update( { # NetworkX versions prior to 3.2.1 have tests written to @@ -216,7 +265,7 @@ def key(testpath): } ) - if nxver.major == 3 and nxver.minor <= 1: + if _nxver <= (3, 1): # MAINT: networkx 3.0, 3.1 # NetworkX 3.2 added the ability to "fallback to nx" if backend algorithms # raise NotImplementedError or `can_run` returns False. The tests below @@ -332,24 +381,25 @@ def key(testpath): xfail[key("test_louvain.py:test_threshold")] = ( "Louvain does not support seed parameter" ) - if nxver.major == 3 and nxver.minor >= 2: - xfail.update( - { - key( - "test_convert_pandas.py:TestConvertPandas." - "test_from_edgelist_multi_attr_incl_target" - ): no_string_dtype, - key( - "test_convert_pandas.py:TestConvertPandas." - "test_from_edgelist_multidigraph_and_edge_attr" - ): no_string_dtype, - key( - "test_convert_pandas.py:TestConvertPandas." - "test_from_edgelist_int_attr_name" - ): no_string_dtype, - } - ) - if nxver.minor == 2: + if _nxver >= (3, 2): + if not fallback: + xfail.update( + { + key( + "test_convert_pandas.py:TestConvertPandas." + "test_from_edgelist_multi_attr_incl_target" + ): no_string_dtype, + key( + "test_convert_pandas.py:TestConvertPandas." + "test_from_edgelist_multidigraph_and_edge_attr" + ): no_string_dtype, + key( + "test_convert_pandas.py:TestConvertPandas." + "test_from_edgelist_int_attr_name" + ): no_string_dtype, + } + ) + if _nxver[1] == 2: different_iteration_order = "Different graph data iteration order" xfail.update( { @@ -366,7 +416,7 @@ def key(testpath): ): different_iteration_order, } ) - elif nxver.minor >= 3: + elif _nxver[1] >= 3: xfail.update( { key("test_louvain.py:test_max_level"): louvain_different, diff --git a/python/nx-cugraph/nx_cugraph/relabel.py b/python/nx-cugraph/nx_cugraph/relabel.py index 20d1337a99c..e38e18c779e 100644 --- a/python/nx-cugraph/nx_cugraph/relabel.py +++ b/python/nx-cugraph/nx_cugraph/relabel.py @@ -29,13 +29,18 @@ @networkx_algorithm(version_added="24.08") def relabel_nodes(G, mapping, copy=True): + G_orig = G if isinstance(G, nx.Graph): - if not copy: + is_compat_graph = isinstance(G, nxcg.Graph) + if not copy and not is_compat_graph: raise RuntimeError( "Using `copy=False` is invalid when using a NetworkX graph " "as input to `nx_cugraph.relabel_nodes`" ) G = nxcg.from_networkx(G, preserve_all_attrs=True) + else: + is_compat_graph = False + it = range(G._N) if G.key_to_id is None else G.id_to_key if callable(mapping): previd_to_key = [mapping(node) for node in it] @@ -225,12 +230,13 @@ def relabel_nodes(G, mapping, copy=True): node_masks=node_masks, id_to_key=newid_to_key, key_to_id=key_to_newid, + use_compat_graph=is_compat_graph, **extra_kwargs, ) rv.graph.update(G.graph) if not copy: - G._become(rv) - return G + G_orig._become(rv) + return G_orig return rv @@ -241,7 +247,10 @@ def convert_node_labels_to_integers( if ordering not in {"default", "sorted", "increasing degree", "decreasing degree"}: raise nx.NetworkXError(f"Unknown node ordering: {ordering}") if isinstance(G, nx.Graph): + is_compat_graph = isinstance(G, nxcg.Graph) G = nxcg.from_networkx(G, preserve_all_attrs=True) + else: + is_compat_graph = False G = G.copy() if label_attribute is not None: prev_vals = G.id_to_key @@ -279,4 +288,6 @@ def convert_node_labels_to_integers( key_to_id = G.key_to_id G.key_to_id = {i: key_to_id[n] for i, (d, n) in enumerate(pairs, first_label)} G._id_to_key = id_to_key + if is_compat_graph: + return G._to_compat_graph() return G diff 
--git a/python/nx-cugraph/nx_cugraph/tests/pytest.ini b/python/nx-cugraph/nx_cugraph/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/nx-cugraph/nx_cugraph/tests/test_bfs.py b/python/nx-cugraph/nx_cugraph/tests/test_bfs.py index c2b22e98949..ad2c62c1fb9 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_bfs.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_bfs.py @@ -12,11 +12,10 @@ # limitations under the License. import networkx as nx import pytest -from packaging.version import parse -nxver = parse(nx.__version__) +from nx_cugraph import _nxver -if nxver.major == 3 and nxver.minor < 2: +if _nxver < (3, 2): pytest.skip("Need NetworkX >=3.2 to test BFS", allow_module_level=True) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_classes.py b/python/nx-cugraph/nx_cugraph/tests/test_classes.py new file mode 100644 index 00000000000..0ac238b3558 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_classes.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import nx_cugraph as nxcg + + +def test_class_to_class(): + """Basic sanity checks to ensure metadata relating the graph classes is accurate.""" + for prefix in ["", "Cuda"]: + for suffix in ["Graph", "DiGraph", "MultiGraph", "MultiDiGraph"]: + cls_name = f"{prefix}{suffix}" + cls = getattr(nxcg, cls_name) + assert cls.__name__ == cls_name + G = cls() + assert cls is G.__class__ + # cudagraph + val = cls.to_cudagraph_class() + val2 = G.to_cudagraph_class() + assert val is val2 + assert val.__name__ == f"Cuda{suffix}" + assert val.__module__.startswith("nx_cugraph") + assert cls.is_directed() == G.is_directed() == val.is_directed() + assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph() + # networkx + val = cls.to_networkx_class() + val2 = G.to_networkx_class() + assert val is val2 + assert val.__name__ == suffix + assert val.__module__.startswith("networkx") + val = val() + assert cls.is_directed() == G.is_directed() == val.is_directed() + assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph() + # directed + val = cls.to_directed_class() + val2 = G.to_directed_class() + assert val is val2 + assert val.__module__.startswith("nx_cugraph") + assert val.is_directed() + assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph() + if "Di" in suffix: + assert val is cls + else: + assert "Di" in val.__name__ + assert prefix in val.__name__ + assert cls.to_undirected_class() is cls + # undirected + val = cls.to_undirected_class() + val2 = G.to_undirected_class() + assert val is val2 + assert val.__module__.startswith("nx_cugraph") + assert not val.is_directed() + assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph() + if "Di" not in suffix: + assert val is cls + else: + assert "Di" not in val.__name__ + assert prefix in
val.__name__ + assert cls.to_directed_class() is cls + # "zero" + if prefix == "Cuda": + val = cls._to_compat_graph_class() + val2 = G._to_compat_graph_class() + assert val is val2 + assert val.__name__ == suffix + assert val.__module__.startswith("nx_cugraph") + assert val.to_cudagraph_class() is cls + assert cls.is_directed() == G.is_directed() == val.is_directed() + assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph() diff --git a/python/nx-cugraph/nx_cugraph/tests/test_cluster.py b/python/nx-cugraph/nx_cugraph/tests/test_cluster.py index ad4770f1ab8..fd8e1b3cf13 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_cluster.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_cluster.py @@ -12,11 +12,10 @@ # limitations under the License. import networkx as nx import pytest -from packaging.version import parse -nxver = parse(nx.__version__) +from nx_cugraph import _nxver -if nxver.major == 3 and nxver.minor < 2: +if _nxver < (3, 2): pytest.skip("Need NetworkX >=3.2 to test clustering", allow_module_level=True) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert.py b/python/nx-cugraph/nx_cugraph/tests/test_convert.py index 634b28e961c..3d109af8a74 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_convert.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert.py @@ -13,13 +13,10 @@ import cupy as cp import networkx as nx import pytest -from packaging.version import parse import nx_cugraph as nxcg from nx_cugraph import interface -nxver = parse(nx.__version__) - @pytest.mark.parametrize( "graph_class", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py index 5474f9d79e3..0697a744e85 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py @@ -12,16 +12,13 @@ # limitations under the License. 
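The test modules in this patch repeatedly replace `packaging.version.parse` with the shared `_nxver` tuple, as in the hunks above and below. A hedged sketch of that idiom follows; it assumes `_nxver` is a plain integer tuple, and the construction shown is illustrative only (the real definition lives in `nx_cugraph/__init__.py`, which is not part of this diff):

    # Sketch of the version-tuple idiom adopted across these test modules.
    # Assumption: nx_cugraph._nxver is an int tuple such as (3, 2); the
    # construction below is illustrative, not the project's actual code.
    import re

    import networkx as nx

    match = re.match(r"(\d+)\.(\d+)", nx.__version__)
    _nxver = tuple(int(part) for part in match.groups())

    if _nxver < (3, 2):
        print("skip: these tests need NetworkX >= 3.2")
    elif _nxver[:2] == (3, 2):
        print("apply nx 3.2.x-specific expectations")

Plain tuple comparisons replace the `nxver.major`/`nxver.minor` checks and drop the `packaging` test dependency, which is also removed from pyproject.toml later in this patch.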
import networkx as nx import pytest -from packaging.version import parse import nx_cugraph as nxcg +from nx_cugraph import _nxver from .testing_utils import assert_graphs_equal -nxver = parse(nx.__version__) - - -if nxver.major == 3 and nxver.minor < 2: +if _nxver < (3, 2): pytest.skip("Need NetworkX >=3.2 to test ego_graph", allow_module_level=True) @@ -49,7 +46,12 @@ def test_ego_graph_cycle_graph( kwargs = {"radius": radius, "center": center, "undirected": undirected} Hnx = nx.ego_graph(Gnx, n, **kwargs) Hcg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") + use_compat_graphs = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs + assert_graphs_equal(Hnx, Hcg._cudagraph if use_compat_graphs else Hcg) + Hcg = nx.ego_graph(Gcg, n, **kwargs) assert_graphs_equal(Hnx, Hcg) + Hcg = nx.ego_graph(Gcg._to_compat_graph(), n, **kwargs) + assert_graphs_equal(Hnx, Hcg._cudagraph) with pytest.raises(nx.NodeNotFound, match="not in G"): nx.ego_graph(Gnx, -1, **kwargs) with pytest.raises(nx.NodeNotFound, match="not in G"): @@ -61,20 +63,36 @@ def test_ego_graph_cycle_graph( kwargs["distance"] = "weight" H2nx = nx.ego_graph(Gnx, n, **kwargs) - is_nx32 = nxver.major == 3 and nxver.minor == 2 + is_nx32 = _nxver[:2] == (3, 2) if undirected and Gnx.is_directed() and Gnx.is_multigraph(): if is_nx32: # `should_run` was added in nx 3.3 match = "Weighted ego_graph with undirected=True not implemented" + elif _nxver >= (3, 4): + match = "not implemented by 'cugraph'" else: match = "not implemented by cugraph" - with pytest.raises(RuntimeError, match=match): + with pytest.raises( + RuntimeError if _nxver < (3, 4) else NotImplementedError, match=match + ): nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") with pytest.raises(NotImplementedError, match="ego_graph"): - nx.ego_graph(Gcg, n, **kwargs) + nx.ego_graph(Gcg, n, **kwargs, backend="cugraph") + if _nxver < (3, 4): + with pytest.raises(NotImplementedError, match="ego_graph"): + nx.ego_graph(Gcg, n, **kwargs) + else: + # This is an interesting case. `nxcg.ego_graph` is not implemented for + # these arguments, so it falls back to networkx. Hence, as it is currently + # implemented, the input graph is `nxcg.CudaGraph`, but the output graph + # is `nx.Graph`. Should networkx convert back to "cugraph" backend? + # TODO: make fallback to networkx configurable. 
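The comment above describes dispatch falling back to NetworkX when nx-cugraph cannot run with the given arguments. A hedged, GPU-requiring illustration mirroring the test's arguments (NetworkX >= 3.4 assumed; exact fallback behavior is version dependent):

    # Illustration of the fallback described above; requires a GPU and
    # NetworkX >= 3.4. Weighted ego_graph with undirected=True on a directed
    # multigraph is not implemented by the cugraph backend, so dispatch
    # falls back to NetworkX: a CudaGraph goes in, a plain nx.Graph comes out.
    import networkx as nx

    import nx_cugraph as nxcg

    Gnx = nx.cycle_graph(7, create_using=nx.MultiDiGraph)
    nx.set_edge_attributes(Gnx, 1.0, "weight")
    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)

    H = nx.ego_graph(Gcg, 0, radius=2, undirected=True, distance="weight")
    print(type(H))  # a NetworkX graph class under fallback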
+ H2cg = nx.ego_graph(Gcg, n, **kwargs) + assert type(H2nx) is type(H2cg) + assert_graphs_equal(H2nx, nxcg.from_networkx(H2cg, preserve_all_attrs=True)) else: H2cg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph") - assert_graphs_equal(H2nx, H2cg) + assert_graphs_equal(H2nx, H2cg._cudagraph if use_compat_graphs else H2cg) with pytest.raises(nx.NodeNotFound, match="not found in graph"): nx.ego_graph(Gnx, -1, **kwargs) with pytest.raises(nx.NodeNotFound, match="not found in graph"): diff --git a/python/nx-cugraph/nx_cugraph/tests/test_generators.py b/python/nx-cugraph/nx_cugraph/tests/test_generators.py index c751b0fe2b3..5c405f1c93b 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_generators.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_generators.py @@ -13,25 +13,24 @@ import networkx as nx import numpy as np import pytest -from packaging.version import parse import nx_cugraph as nxcg +from nx_cugraph import _nxver from .testing_utils import assert_graphs_equal -nxver = parse(nx.__version__) - - -if nxver.major == 3 and nxver.minor < 2: +if _nxver < (3, 2): pytest.skip("Need NetworkX >=3.2 to test generators", allow_module_level=True) def compare(name, create_using, *args, is_vanilla=False): exc1 = exc2 = None func = getattr(nx, name) - if isinstance(create_using, nxcg.Graph): + if isinstance(create_using, nxcg.CudaGraph): nx_create_using = nxcg.to_networkx(create_using) - elif isinstance(create_using, type) and issubclass(create_using, nxcg.Graph): + elif isinstance(create_using, type) and issubclass( + create_using, (nxcg.Graph, nxcg.CudaGraph) + ): nx_create_using = create_using.to_networkx_class() elif isinstance(create_using, nx.Graph): nx_create_using = create_using.copy() @@ -61,8 +60,27 @@ def compare(name, create_using, *args, is_vanilla=False): exc2 = exc if exc1 is not None or exc2 is not None: assert type(exc1) is type(exc2) + return + if isinstance(Gcg, nxcg.Graph): + # If the graph is empty, it may be on host, otherwise it should be on device + if len(G): + assert Gcg._is_on_gpu + assert not Gcg._is_on_cpu + assert_graphs_equal(G, Gcg._cudagraph) else: assert_graphs_equal(G, Gcg) + # Ensure the output type is correct + if is_vanilla: + if _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs: + assert isinstance(Gcg, nxcg.Graph) + else: + assert isinstance(Gcg, nxcg.CudaGraph) + elif isinstance(create_using, type) and issubclass( + create_using, (nxcg.Graph, nxcg.CudaGraph) + ): + assert type(Gcg) is create_using + elif isinstance(create_using, (nxcg.Graph, nxcg.CudaGraph)): + assert type(Gcg) is type(create_using) N = list(range(-1, 5)) @@ -76,6 +94,10 @@ def compare(name, create_using, *args, is_vanilla=False): nxcg.DiGraph, nxcg.MultiGraph, nxcg.MultiDiGraph, + nxcg.CudaGraph, + nxcg.CudaDiGraph, + nxcg.CudaMultiGraph, + nxcg.CudaMultiDiGraph, # These raise NotImplementedError # nx.Graph(), # nx.DiGraph(), @@ -85,6 +107,10 @@ def compare(name, create_using, *args, is_vanilla=False): nxcg.DiGraph(), nxcg.MultiGraph(), nxcg.MultiDiGraph(), + nxcg.CudaGraph(), + nxcg.CudaDiGraph(), + nxcg.CudaMultiGraph(), + nxcg.CudaMultiDiGraph(), None, object, # Bad input 7, # Bad input @@ -158,7 +184,7 @@ def compare(name, create_using, *args, is_vanilla=False): @pytest.mark.parametrize("create_using", COMPLETE_CREATE_USING) def test_generator_noarg(name, create_using): print(name, create_using, type(create_using)) - if isinstance(create_using, nxcg.Graph) and name in { + if isinstance(create_using, nxcg.CudaGraph) and name in { # fmt: off "bull_graph", "chvatal_graph", 
"cubical_graph", "diamond_graph", "house_graph", "house_x_graph", "icosahedral_graph", "krackhardt_kite_graph", diff --git a/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py b/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py index 3120995a2b2..40a361b1084 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py @@ -47,7 +47,7 @@ def _create_Gs(): @pytest.mark.parametrize("Gnx", _create_Gs()) @pytest.mark.parametrize("reciprocal", [False, True]) def test_to_undirected_directed(Gnx, reciprocal): - Gcg = nxcg.DiGraph(Gnx) + Gcg = nxcg.CudaDiGraph(Gnx) assert_graphs_equal(Gnx, Gcg) Hnx1 = Gnx.to_undirected(reciprocal=reciprocal) Hcg1 = Gcg.to_undirected(reciprocal=reciprocal) @@ -62,6 +62,6 @@ def test_multidigraph_to_undirected(): Gnx.add_edge(0, 1) Gnx.add_edge(0, 1) Gnx.add_edge(1, 0) - Gcg = nxcg.MultiDiGraph(Gnx) + Gcg = nxcg.CudaMultiDiGraph(Gnx) with pytest.raises(NotImplementedError): Gcg.to_undirected() diff --git a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py index 176b531a6e7..1a61c69b3e7 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py @@ -14,13 +14,10 @@ import inspect import networkx as nx -from packaging.version import parse import nx_cugraph as nxcg from nx_cugraph.utils import networkx_algorithm -nxver = parse(nx.__version__) - def test_match_signature_and_names(): """Simple test to ensure our signatures and basic module layout match networkx.""" diff --git a/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py b/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py index a8f189a4745..9208eea09f2 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -26,7 +26,7 @@ def test_get_edge_data(test_nxcugraph): G.add_edge(0, 3) G.add_edge(0, 3) if test_nxcugraph: - G = nxcg.MultiGraph(G) + G = nxcg.CudaMultiGraph(G) default = object() assert G.get_edge_data(0, 0, default=default) is default assert G.get_edge_data("a", "b", default=default) is default @@ -60,7 +60,7 @@ def test_get_edge_data(test_nxcugraph): G = nx.MultiGraph() G.add_edge(0, 1) if test_nxcugraph: - G = nxcg.MultiGraph(G) + G = nxcg.CudaMultiGraph(G) assert G.get_edge_data(0, 1, default=default) == {0: {}} assert G.get_edge_data(0, 1, 0, default=default) == {} assert G.get_edge_data(0, 1, 1, default=default) is default diff --git a/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py b/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py new file mode 100644 index 00000000000..252f9e6bbb8 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pandas as pd +import pytest + + +def test_pagerank_multigraph(): + """ + Ensures correct PageRank for Graphs and MultiGraphs when using from_pandas_edgelist. + + PageRank for a MultiGraph should give a different result than for a Graph; when + using a Graph, the duplicate edges should be dropped. + """ + df = pd.DataFrame( + {"source": [0, 1, 1, 1, 1, 1, 1, 2], "target": [1, 2, 2, 2, 2, 2, 2, 3]} + ) + expected_pr_for_G = nx.pagerank(nx.from_pandas_edgelist(df)) + expected_pr_for_MultiG = nx.pagerank( + nx.from_pandas_edgelist(df, create_using=nx.MultiGraph) + ) + + G = nx.from_pandas_edgelist(df, backend="cugraph") + actual_pr_for_G = nx.pagerank(G, backend="cugraph") + + MultiG = nx.from_pandas_edgelist(df, create_using=nx.MultiGraph, backend="cugraph") + actual_pr_for_MultiG = nx.pagerank(MultiG, backend="cugraph") + + assert actual_pr_for_G == pytest.approx(expected_pr_for_G) + assert actual_pr_for_MultiG == pytest.approx(expected_pr_for_MultiG) diff --git a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py index 529a96efd81..50836acf55f 100644 --- a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py @@ -17,7 +17,7 @@ def assert_graphs_equal(Gnx, Gcg): assert isinstance(Gnx, nx.Graph) - assert isinstance(Gcg, nxcg.Graph) + assert isinstance(Gcg, nxcg.CudaGraph) assert (a := Gnx.number_of_nodes()) == (b := Gcg.number_of_nodes()), (a, b) assert (a := Gnx.number_of_edges()) == (b := Gcg.number_of_edges()), (a, b) assert (a := Gnx.is_directed()) == (b := Gcg.is_directed()), (a, b) diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index 3c5de4f2936..16486996ba0 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -16,10 +16,14 @@ from textwrap import dedent import networkx as nx +from networkx import NetworkXError from networkx.utils.decorators import nodes_or_number, not_implemented_for +from nx_cugraph import _nxver from nx_cugraph.interface import BackendInterface +from .misc import _And_NotImplementedError + try: from networkx.utils.backends import _registered_algorithms except ModuleNotFoundError: @@ -44,6 +48,7 @@ class networkx_algorithm: version_added: str is_incomplete: bool is_different: bool + _fallback: bool _plc_names: set[str] | None def __new__( @@ -59,6 +64,7 @@ def __new__( version_added: str, # Required is_incomplete: bool = False, # See self.extra_doc for details if True is_different: bool = False, # See self.extra_doc for details if True + fallback: bool = False, # Change non-nx exceptions to NotImplementedError _plc: str | set[str] | None = None, # Hidden from user, may be removed someday ): if func is None: @@ -70,10 +76,11 @@ def __new__( version_added=version_added, is_incomplete=is_incomplete, is_different=is_different, + fallback=fallback, _plc=_plc, ) instance = object.__new__(cls) - if nodes_or_number is not None and nx.__version__[:3] > "3.2": + if nodes_or_number is not None and _nxver
> (3, 2): func = nx.utils.decorators.nodes_or_number(nodes_or_number)(func) # update_wrapper sets __wrapped__, which will be used for the signature update_wrapper(instance, func) @@ -100,6 +107,7 @@ def __new__( instance.version_added = version_added instance.is_incomplete = is_incomplete instance.is_different = is_different + instance.fallback = fallback # The docstring on our function is added to the NetworkX docstring. instance.extra_doc = ( dedent(func.__doc__.lstrip("\n").rstrip()) if func.__doc__ else None @@ -113,7 +121,7 @@ def __new__( # Set methods so they are in __dict__ instance._can_run = instance._can_run instance._should_run = instance._should_run - if nodes_or_number is not None and nx.__version__[:3] <= "3.2": + if nodes_or_number is not None and _nxver <= (3, 2): instance = nx.utils.decorators.nodes_or_number(nodes_or_number)(instance) return instance @@ -136,7 +144,14 @@ def _should_run(self, func): self.should_run = func def __call__(self, /, *args, **kwargs): - return self.__wrapped__(*args, **kwargs) + if not self.fallback: + return self.__wrapped__(*args, **kwargs) + try: + return self.__wrapped__(*args, **kwargs) + except NetworkXError: + raise + except Exception as exc: + raise _And_NotImplementedError(exc) from exc def __reduce__(self): return _restore_networkx_dispatched, (self.name,) diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index 8526524f1de..01c25dd5983 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -194,7 +194,7 @@ def _get_int_dtype( def _get_float_dtype( - dtype: Dtype, *, graph: nxcg.Graph | None = None, weight: EdgeKey | None = None + dtype: Dtype, *, graph: nxcg.CudaGraph | None = None, weight: EdgeKey | None = None ): """Promote dtype to float32 or float64 as appropriate.""" if dtype is None: @@ -238,3 +238,37 @@ def _cp_iscopied_asarray(a, *args, orig_object=None, **kwargs): ): return False, arr return True, arr + + +class _And_NotImplementedError(NotImplementedError): + """Additionally make an exception a ``NotImplementedError``. + + For example: + + >>> try: + ... raise _And_NotImplementedError(KeyError("missing")) + ... except KeyError: + ... pass + + or + + >>> try: + ... raise _And_NotImplementedError(KeyError("missing")) + ... except NotImplementedError: + ... 
pass + + """ + + def __new__(cls, exc): + exc_type = type(exc) + if issubclass(exc_type, NotImplementedError): + new_type = exc_type + else: + new_type = type( + f"{exc_type.__name__}{cls.__name__}", + (exc_type, NotImplementedError), + {}, + ) + instance = NotImplementedError.__new__(new_type) + instance.__init__(*exc.args) + return instance diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 847444f9dd1..d145aa549da 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -18,15 +18,15 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Intended Audience :: Developers", "Topic :: Software Development :: Libraries :: Python Modules", @@ -34,13 +34,12 @@ classifiers = [ dependencies = [ "cupy-cuda11x>=12.0.0", "networkx>=3.0", - "numpy>=1.23,<2.0a0", - "pylibcugraph==24.10.*,>=0.0.0a0", + "numpy>=1.23,<3.0a0", + "pylibcugraph==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] test = [ - "packaging>=21", "pandas", "pytest", "pytest-benchmark", @@ -90,7 +89,7 @@ matrix-entry = "cuda_suffixed=true" [tool.black] line-length = 88 -target-version = ["py39", "py310", "py311"] +target-version = ["py310", "py311", "py312"] [tool.isort] sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] @@ -156,7 +155,7 @@ exclude_lines = [ [tool.ruff] # https://github.com/charliermarsh/ruff/ line-length = 88 -target-version = "py39" +target-version = "py310" [tool.ruff.lint] unfixable = [ "F841", # unused-variable (Note: can leave useless expression) @@ -170,6 +169,7 @@ external = [ ] ignore = [ # Would be nice to fix these + "B905", # `zip()` without an explicit `strict=` parameter (Note: possible since py39 was dropped; we should do this!) "D100", # Missing docstring in public module "D101", # Missing docstring in public class "D102", # Missing docstring in public method @@ -215,6 +215,7 @@ ignore = [ "SIM105", # Use contextlib.suppress(...) instead of try-except-pass (Note: try-except-pass is much faster) "SIM108", # Use ternary operator ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer) "TRY003", # Avoid specifying long messages outside the exception class (Note: why?) 
+ "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` (Note: tuple is faster for now) # Ignored categories "C90", # mccabe (Too strict, but maybe we should make things less complex) @@ -241,6 +242,7 @@ ignore = [ # Allow assert, print, RNG, and no docstring "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] "_nx_cugraph/__init__.py" = ["E501"] +"nx_cugraph/__init__.py" = ["E402"] # Allow module level import not at top of file "nx_cugraph/algorithms/**/*py" = ["D205", "D401"] # Allow flexible docstrings for algorithms "nx_cugraph/generators/**/*py" = ["D205", "D401"] # Allow flexible docstrings for generators "nx_cugraph/interface.py" = ["D401"] # Flexible docstrings diff --git a/python/nx-cugraph/run_nx_tests.sh b/python/nx-cugraph/run_nx_tests.sh index bceec53b7d5..5fb173cf939 100755 --- a/python/nx-cugraph/run_nx_tests.sh +++ b/python/nx-cugraph/run_nx_tests.sh @@ -18,6 +18,10 @@ # testing takes longer. Without it, tests will xfail when encountering a # function that we don't implement. # +# NX_CUGRAPH_USE_COMPAT_GRAPHS, {"True", "False"}, default is "True" +# Whether to use `nxcg.Graph` as the nx_cugraph backend graph. +# A Graph should be a compatible NetworkX graph, so fewer tests should fail. +# # Coverage of `nx_cugraph.algorithms` is reported and is a good sanity check # that algorithms run. diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index f43b7db1279..045628e9c0d 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -33,6 +33,7 @@ option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before de OFF ) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) +option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF) if(NOT USE_CUGRAPH_OPS) message(STATUS "Disabling libcugraph functions that reference cugraph-ops") @@ -49,18 +50,38 @@ endif() include(rapids-cython-core) if (NOT cugraph_FOUND) + find_package(CUDAToolkit REQUIRED) + set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) set(USE_RAFT_STATIC ON) set(CUGRAPH_COMPILE_RAFT_LIB ON) set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) set(ALLOW_CLONE_CUGRAPH_OPS ON) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0) + set(CUDA_STATIC_MATH_LIBRARIES OFF) + elseif(USE_CUDA_MATH_WHEELS) + message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0") + endif() + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) + if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS) + set(rpaths + "$ORIGIN/../nvidia/cublas/lib" + "$ORIGIN/../nvidia/curand/lib" + "$ORIGIN/../nvidia/cusolver/lib" + "$ORIGIN/../nvidia/cusparse/lib" + "$ORIGIN/../nvidia/nvjitlink/lib" + ) + set_property(TARGET cugraph PROPERTY INSTALL_RPATH ${rpaths} APPEND) + endif() + set(cython_lib_dir pylibcugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) install(TARGETS cugraph_c DESTINATION ${cython_lib_dir}) diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 90fce23282e..9f1b9924336 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -54,6 +54,8 @@ set(cython_sources triangle_count.pyx two_hop_neighbors.pyx uniform_neighbor_sample.pyx + biased_neighbor_sample.pyx + negative_sampling.pyx uniform_random_walks.pyx 
utils.pyx weakly_connected_components.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index b67acc8bbfc..26fa3f64ddd 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -39,6 +39,10 @@ from pylibcugraph.uniform_neighbor_sample import uniform_neighbor_sample +from pylibcugraph.biased_neighbor_sample import biased_neighbor_sample + +from pylibcugraph.negative_sampling import negative_sampling + from pylibcugraph.core_number import core_number from pylibcugraph.k_core import k_core diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 4da7c4328fd..aa19ce60908 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -352,7 +352,7 @@ cdef extern from "cugraph_c/algorithms.h": # biased random walks cdef cugraph_error_code_t \ - cugraph_based_random_walks( + cugraph_biased_random_walks( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd new file mode 100644 index 00000000000..e466e6ee5a0 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/coo.pxd @@ -0,0 +1,71 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
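A quick smoke test for the two sampling entry points newly exported from the package root in `__init__.py` above (a sketch; assumes pylibcugraph is installed with this patch applied):

    # Smoke test: the package-level exports added in __init__.py above.
    # Assumes a post-patch pylibcugraph install.
    import pylibcugraph

    assert hasattr(pylibcugraph, "biased_neighbor_sample")
    assert hasattr(pylibcugraph, "negative_sampling")
    print("new sampling APIs are importable")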
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) + +cdef extern from "cugraph_c/coo.h": + ctypedef struct cugraph_coo_t: + pass + + ctypedef struct cugraph_coo_list_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_coo_get_sources( + cugraph_coo_t* coo + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_coo_get_destinations( + cugraph_coo_t* coo + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_coo_get_edge_weights( + cugraph_coo_t* coo + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_coo_get_edge_id( + cugraph_coo_t* coo + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_coo_get_edge_type( + cugraph_coo_t* coo + ) + + cdef size_t \ + cugraph_coo_list_size( + const cugraph_coo_list_t* coo_list + ) + + cdef cugraph_coo_t* \ + cugraph_coo_list_element( + cugraph_coo_list_t* coo_list, + size_t index) + + cdef void \ + cugraph_coo_free( + cugraph_coo_t* coo + ) + + cdef void \ + cugraph_coo_list_free( + cugraph_coo_list_t* coo_list + ) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd index f6d62377443..cda47e55f77 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_generators.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -31,62 +31,16 @@ from pylibcugraph._cugraph_c.random cimport ( cugraph_rng_state_t, ) +from pylibcugraph._cugraph_c.coo cimport ( + cugraph_coo_t, + cugraph_coo_list_t, +) + cdef extern from "cugraph_c/graph_generators.h": ctypedef enum cugraph_generator_distribution_t: POWER_LAW UNIFORM - ctypedef struct cugraph_coo_t: - pass - - ctypedef struct cugraph_coo_list_t: - pass - - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_coo_get_sources( - cugraph_coo_t* coo - ) - - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_coo_get_destinations( - cugraph_coo_t* coo - ) - - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_coo_get_edge_weights( - cugraph_coo_t* coo - ) - - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_coo_get_edge_id( - cugraph_coo_t* coo - ) - - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_coo_get_edge_type( - cugraph_coo_t* coo - ) - - cdef size_t \ - cugraph_coo_list_size( - const cugraph_coo_list_t* coo_list - ) - - cdef cugraph_coo_t* \ - cugraph_coo_list_element( - cugraph_coo_list_t* coo_list, - size_t index) - - cdef void \ - cugraph_coo_free( - cugraph_coo_t* coo - ) - - cdef void \ - cugraph_coo_list_free( - cugraph_coo_list_t* coo_list - ) - cdef cugraph_error_code_t \ cugraph_generate_rmat_edgelist( const cugraph_resource_handle_t* handle, diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd index 0f852d9cecd..c982b12665a 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd @@ -39,7 +39,9 @@ from pylibcugraph._cugraph_c.random cimport ( from pylibcugraph._cugraph_c.array cimport ( 
cugraph_type_erased_device_array_t, ) - +from pylibcugraph._cugraph_c.coo cimport ( + cugraph_coo_t, +) from pylibcugraph._cugraph_c.properties cimport ( cugraph_edge_property_view_t, ) @@ -103,3 +105,21 @@ cdef extern from "cugraph_c/sampling_algorithms.h": cugraph_type_erased_device_array_t** vertices, cugraph_error_t** error ) + + # negative sampling + cdef cugraph_error_code_t \ + cugraph_negative_sampling( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + const cugraph_type_erased_device_array_view_t* src_bias, + const cugraph_type_erased_device_array_view_t* dst_bias, + size_t num_samples, + bool_t remove_duplicates, + bool_t remove_false_negatives, + bool_t exact_number_of_samples, + bool_t do_expensive_check, + cugraph_coo_t **result, + cugraph_error_t **error + ) diff --git a/python/pylibcugraph/pylibcugraph/biased_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/biased_neighbor_sample.pyx new file mode 100644 index 00000000000..2dd138d5d06 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/biased_neighbor_sample.pyx @@ -0,0 +1,448 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_view_free, + cugraph_type_erased_host_array_view_t, + cugraph_type_erased_host_array_view_create, + cugraph_type_erased_host_array_view_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_sample_result_t, + cugraph_prior_sources_behavior_t, + cugraph_compression_type_t, + cugraph_sampling_options_t, + cugraph_sampling_options_create, + cugraph_sampling_options_free, + cugraph_sampling_set_with_replacement, + cugraph_sampling_set_return_hops, + cugraph_sampling_set_prior_sources_behavior, + cugraph_sampling_set_dedupe_sources, + cugraph_sampling_set_renumber_results, + cugraph_sampling_set_compress_per_hop, + cugraph_sampling_set_compression_type, + cugraph_sampling_set_retain_seeds, +) +from pylibcugraph._cugraph_c.sampling_algorithms cimport ( + cugraph_biased_neighbor_sample, + +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + assert_CAI_type, + assert_AI_type, + get_c_type_from_numpy_type, +) +from pylibcugraph.internal_types.sampling_result cimport ( + SamplingResult, +) +from pylibcugraph._cugraph_c.random cimport ( + cugraph_rng_state_t +) +from pylibcugraph.random cimport ( 
+ CuGraphRandomState +) +import warnings + +# TODO accept cupy/numpy random state in addition to raw seed. +def biased_neighbor_sample(ResourceHandle resource_handle, + _GPUGraph input_graph, + start_list, + h_fan_out, + *, + bool_t with_replacement, + bool_t do_expensive_check, + with_edge_properties=True, + batch_id_list=None, + label_list=None, + label_to_output_comm_rank=None, + label_offsets=None, + biases=None, + prior_sources_behavior=None, + deduplicate_sources=False, + return_hops=False, + renumber=False, + retain_seeds=False, + compression='COO', + compress_per_hop=False, + random_state=None, + return_dict=False,): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. + + Parameters + ---------- + resource_handle: ResourceHandle + Handle to the underlying device and host resources needed for + referencing data and running algorithms. + + input_graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + start_list: device array type + Device array containing the list of starting vertices for sampling. + + h_fan_out: numpy array type + Host array containing the branching out (fan-out) degrees per + starting vertex for each hop level. + + with_replacement: bool + If true, sampling procedure is done with replacement (the same vertex + can be selected multiple times in the same step). + + do_expensive_check: bool + If True, performs more extensive tests on the inputs to ensure + validity, at the expense of increased run time. + + with_edge_properties: bool + This argument is present for compatibility with + uniform_neighbor_sample. Only the 'True' option is accepted. + All edge properties in the graph are returned. + + batch_id_list: list[int32] (Optional) + List of int32 batch ids that are returned with each edge. Optional + argument, defaults to NULL, returning nothing. + + label_list: list[int32] (Optional) + List of unique int32 batch ids. Required if also passing the + label_to_output_comm_rank flag. Defaults to NULL (does nothing) + + label_to_output_comm_rank: list[int32] (Optional) + Maps the unique batch ids in label_list to the rank of the + worker that should hold results for that batch id. + Defaults to NULL (does nothing) + + label_offsets: list[int] (Optional) + Offsets of each label within the start vertex list. + + biases: list[float32/64] (Optional) + Edge biases. If not provided, uses the weight property. + Currently unsupported. + + prior_sources_behavior: str (Optional) + Options are "carryover" and "exclude". + Default will leave the source list as-is. + Carryover will carry over sources from previous hops to the + current hop. + Exclude will exclude sources from previous hops from reappearing + as sources in future hops. + + deduplicate_sources: bool (Optional) + If True, will deduplicate the source list before sampling. + Defaults to False. + + renumber: bool (Optional) + If True, will renumber the sources and destinations on a + per-batch basis and return the renumber map and batch offsets + in addition to the standard returns. + + retain_seeds: bool (Optional) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + Defaults to False. + + compression: str (Optional) + Options: COO (default), CSR, CSC, DCSR, DCSC + Sets the compression format for the returned samples.
+ + compress_per_hop: bool (Optional) + If False (default), will create a compressed edgelist for the + entire batch. + If True, will create a separate compressed edgelist per hop within + a batch. + + random_state: int (Optional) + Random state to use when generating samples. Optional argument, + defaults to a hash of process id, time, and hostname. + (See pylibcugraph.random.CuGraphRandomState) + + return_dict: bool (Optional) + Whether to return a dictionary instead of a tuple. + Optional argument, defaults to False, returning a tuple. + This argument will eventually be deprecated in favor + of always returning a dictionary. + + Returns + ------- + A tuple of device arrays containing, in order, the major (source) and + minor (destination) vertices of the sampled edges, followed by the edge + weights, edge ids, edge types, batch ids, and label-hop offsets. + + If renumber was set to False, the tuple ends with the hop ids. If renumber + was set to True, the hop id entry is None and the tuple additionally + contains a device array with the renumber map and a device array with the + renumber map offsets (which delineate where the renumber map for each + batch starts). If return_dict is True, the same arrays are returned in a + dictionary keyed by name. + + """ + if biases is not None: + raise ValueError("The biases parameter is currently unsupported") + + if not with_edge_properties: + raise ValueError("with_edge_properties=False is not supported by biased_neighbor_sample") + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = ( + resource_handle.c_resource_handle_ptr + ) + + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr + + cdef bool_t c_deduplicate_sources = deduplicate_sources + cdef bool_t c_return_hops = return_hops + cdef bool_t c_renumber = renumber + cdef bool_t c_compress_per_hop = compress_per_hop + + assert_CAI_type(start_list, "start_list") + assert_CAI_type(batch_id_list, "batch_id_list", True) + assert_CAI_type(label_list, "label_list", True) + assert_CAI_type(label_to_output_comm_rank, "label_to_output_comm_rank", True) + assert_CAI_type(label_offsets, "label_offsets", True) + assert_AI_type(h_fan_out, "h_fan_out") + + cdef cugraph_sample_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_start_ptr = \ + start_list.__cuda_array_interface__["data"][0] + + cdef uintptr_t cai_batch_id_ptr + if batch_id_list is not None: + cai_batch_id_ptr = \ + batch_id_list.__cuda_array_interface__['data'][0] + + cdef uintptr_t cai_label_list_ptr + if label_list is not None: + cai_label_list_ptr = \ + label_list.__cuda_array_interface__['data'][0] + + cdef uintptr_t cai_label_to_output_comm_rank_ptr + if label_to_output_comm_rank is not None: + cai_label_to_output_comm_rank_ptr = \ + label_to_output_comm_rank.__cuda_array_interface__['data'][0] + + cdef uintptr_t cai_label_offsets_ptr + if label_offsets is not None: + cai_label_offsets_ptr = \ + label_offsets.__cuda_array_interface__['data'][0] + + cdef uintptr_t ai_fan_out_ptr = \ + h_fan_out.__array_interface__["data"][0] + + cdef cugraph_type_erased_device_array_view_t* start_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_start_ptr, + len(start_list), + get_c_type_from_numpy_type(start_list.dtype)) + + cdef cugraph_type_erased_device_array_view_t* batch_id_ptr = NULL + if batch_id_list is not None: + batch_id_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_batch_id_ptr, + len(batch_id_list),
get_c_type_from_numpy_type(batch_id_list.dtype) + ) + + cdef cugraph_type_erased_device_array_view_t* label_list_ptr = NULL + if label_list is not None: + label_list_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_label_list_ptr, + len(label_list), + get_c_type_from_numpy_type(label_list.dtype) + ) + + cdef cugraph_type_erased_device_array_view_t* label_to_output_comm_rank_ptr = NULL + if label_to_output_comm_rank is not None: + label_to_output_comm_rank_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_label_to_output_comm_rank_ptr, + len(label_to_output_comm_rank), + get_c_type_from_numpy_type(label_to_output_comm_rank.dtype) + ) + + cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = NULL + if retain_seeds: + if label_offsets is None: + raise ValueError("Must provide label offsets if retain_seeds is True") + label_offsets_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_label_offsets_ptr, + len(label_offsets), + get_c_type_from_numpy_type(label_offsets.dtype) + ) + + cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ + cugraph_type_erased_host_array_view_create( + ai_fan_out_ptr, + len(h_fan_out), + get_c_type_from_numpy_type(h_fan_out.dtype)) + + + cg_rng_state = CuGraphRandomState(resource_handle, random_state) + + cdef cugraph_rng_state_t* rng_state_ptr = \ + cg_rng_state.rng_state_ptr + + cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e + if prior_sources_behavior is None: + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT + elif prior_sources_behavior == 'carryover': + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER + elif prior_sources_behavior == 'exclude': + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE + else: + raise ValueError( + f'Invalid option {prior_sources_behavior}' + ' for prior sources behavior' + ) + + cdef cugraph_compression_type_t compression_behavior_e + if compression is None or compression == 'COO': + compression_behavior_e = cugraph_compression_type_t.COO + elif compression == 'CSR': + compression_behavior_e = cugraph_compression_type_t.CSR + elif compression == 'CSC': + compression_behavior_e = cugraph_compression_type_t.CSC + elif compression == 'DCSR': + compression_behavior_e = cugraph_compression_type_t.DCSR + elif compression == 'DCSC': + compression_behavior_e = cugraph_compression_type_t.DCSC + else: + raise ValueError( + f'Invalid option {compression}' + ' for compression type' + ) + + cdef cugraph_sampling_options_t* sampling_options + error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr) + assert_success(error_code, error_ptr, "cugraph_sampling_options_create") + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement) + cugraph_sampling_set_return_hops(sampling_options, c_return_hops) + cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources) + cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e) + cugraph_sampling_set_renumber_results(sampling_options, c_renumber) + cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e) + cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop) + cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds) + + error_code = cugraph_biased_neighbor_sample( + c_resource_handle_ptr, + c_graph_ptr, + NULL, + start_ptr, + batch_id_ptr, + label_list_ptr, + label_to_output_comm_rank_ptr, + label_offsets_ptr, + fan_out_ptr, + rng_state_ptr, + 
sampling_options, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_biased_neighbor_sample") + + # Free the sampling options + cugraph_sampling_options_free(sampling_options) + + # Free the two input arrays that are no longer needed. + cugraph_type_erased_device_array_view_free(start_ptr) + cugraph_type_erased_host_array_view_free(fan_out_ptr) + if batch_id_list is not None: + cugraph_type_erased_device_array_view_free(batch_id_ptr) + if label_offsets is not None: + cugraph_type_erased_device_array_view_free(label_offsets_ptr) + + # Have the SamplingResult instance assume ownership of the result data. + result = SamplingResult() + result.set_ptr(result_ptr) + + # Get cupy "views" of the individual arrays to return. These each increment + # the refcount on the SamplingResult instance which will keep the data alive + # until all references are removed and the GC runs. + cupy_majors = result.get_majors() + cupy_major_offsets = result.get_major_offsets() + cupy_minors = result.get_minors() + cupy_edge_weights = result.get_edge_weights() + cupy_edge_ids = result.get_edge_ids() + cupy_edge_types = result.get_edge_types() + cupy_batch_ids = result.get_batch_ids() + cupy_label_hop_offsets = result.get_label_hop_offsets() + + if renumber: + cupy_renumber_map = result.get_renumber_map() + cupy_renumber_map_offsets = result.get_renumber_map_offsets() + + if return_dict: + return { + 'major_offsets': cupy_major_offsets, + 'majors': cupy_majors, + 'minors': cupy_minors, + 'weight': cupy_edge_weights, + 'edge_id': cupy_edge_ids, + 'edge_type': cupy_edge_types, + 'batch_id': cupy_batch_ids, + 'label_hop_offsets': cupy_label_hop_offsets, + 'hop_id': None, + 'renumber_map': cupy_renumber_map, + 'renumber_map_offsets': cupy_renumber_map_offsets + } + else: + cupy_majors = cupy_major_offsets if cupy_majors is None else cupy_majors + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, None, cupy_renumber_map, cupy_renumber_map_offsets) + else: + cupy_hop_ids = result.get_hop_ids() + if return_dict: + return { + 'major_offsets': cupy_major_offsets, + 'majors': cupy_majors, + 'minors': cupy_minors, + 'weight': cupy_edge_weights, + 'edge_id': cupy_edge_ids, + 'edge_type': cupy_edge_types, + 'batch_id': cupy_batch_ids, + 'label_hop_offsets': cupy_label_hop_offsets, + 'hop_id': cupy_hop_ids, + } + else: + cupy_majors = cupy_major_offsets if cupy_majors is None else cupy_majors + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, cupy_hop_ids) diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx index f38ad21d3b0..4ea96920e61 100644 --- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx +++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelist.pyx @@ -26,11 +26,7 @@ from pylibcugraph._cugraph_c.error cimport ( from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, ) -from pylibcugraph._cugraph_c.graph_generators cimport ( - cugraph_generate_rmat_edgelist, - cugraph_generate_edge_weights, - cugraph_generate_edge_ids, - cugraph_generate_edge_types, +from pylibcugraph._cugraph_c.coo cimport ( cugraph_coo_t, cugraph_coo_get_sources, cugraph_coo_get_destinations, @@ -39,6 +35,12 @@ from pylibcugraph._cugraph_c.graph_generators cimport ( cugraph_coo_get_edge_type, cugraph_coo_free, ) +from 
pylibcugraph._cugraph_c.graph_generators cimport ( + cugraph_generate_rmat_edgelist, + cugraph_generate_edge_weights, + cugraph_generate_edge_ids, + cugraph_generate_edge_types, +) from pylibcugraph.resource_handle cimport ( ResourceHandle, ) diff --git a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx index 32af0c13fc0..7de48708f80 100644 --- a/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx +++ b/python/pylibcugraph/pylibcugraph/generate_rmat_edgelists.pyx @@ -26,14 +26,9 @@ from pylibcugraph._cugraph_c.error cimport ( from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, ) -from pylibcugraph._cugraph_c.graph_generators cimport ( - cugraph_generate_rmat_edgelists, - cugraph_generate_edge_weights, - cugraph_generate_edge_ids, - cugraph_generate_edge_types, +from pylibcugraph._cugraph_c.coo cimport ( cugraph_coo_t, cugraph_coo_list_t, - cugraph_generator_distribution_t, cugraph_coo_get_sources, cugraph_coo_get_destinations, cugraph_coo_get_edge_weights, @@ -44,6 +39,13 @@ from pylibcugraph._cugraph_c.graph_generators cimport ( cugraph_coo_free, cugraph_coo_list_free, ) +from pylibcugraph._cugraph_c.graph_generators cimport ( + cugraph_generate_rmat_edgelists, + cugraph_generate_edge_weights, + cugraph_generate_edge_ids, + cugraph_generate_edge_types, + cugraph_generator_distribution_t, +) from pylibcugraph.resource_handle cimport ( ResourceHandle, ) diff --git a/python/pylibcugraph/pylibcugraph/internal_types/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/internal_types/CMakeLists.txt index 1ca169c5869..22f07939db0 100644 --- a/python/pylibcugraph/pylibcugraph/internal_types/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/internal_types/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -14,6 +14,7 @@ set(cython_sources sampling_result.pyx + coo.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/internal_types/coo.pxd b/python/pylibcugraph/pylibcugraph/internal_types/coo.pxd new file mode 100644 index 00000000000..129b0be4dbe --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/internal_types/coo.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
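The option handling in `biased_neighbor_sample` above maps user-facing strings onto C enum values and rejects anything else. A plain-Python rendering of that validation (dictionary and function names here are illustrative, not pylibcugraph API):

    # Plain-Python rendering of the option validation performed in
    # biased_neighbor_sample; names are illustrative, not pylibcugraph API.
    PRIOR_SOURCES_BEHAVIORS = {
        None: "DEFAULT",
        "carryover": "CARRY_OVER",
        "exclude": "EXCLUDE",
    }
    COMPRESSION_TYPES = {
        None: "COO", "COO": "COO", "CSR": "CSR",
        "CSC": "CSC", "DCSR": "DCSR", "DCSC": "DCSC",
    }

    def validate_sampling_options(prior_sources_behavior, compression):
        if prior_sources_behavior not in PRIOR_SOURCES_BEHAVIORS:
            raise ValueError(
                f"Invalid option {prior_sources_behavior}"
                " for prior sources behavior"
            )
        if compression not in COMPRESSION_TYPES:
            raise ValueError(f"Invalid option {compression} for compression type")
        return (
            PRIOR_SOURCES_BEHAVIORS[prior_sources_behavior],
            COMPRESSION_TYPES[compression],
        )

    print(validate_sampling_options("carryover", "CSR"))  # ('CARRY_OVER', 'CSR')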
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + + +from pylibcugraph._cugraph_c.coo cimport ( + cugraph_coo_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) + +cdef class COO: + cdef cugraph_coo_t* c_coo_ptr + cdef set_ptr(self, cugraph_coo_t* ptr) + cdef get_array(self, cugraph_type_erased_device_array_view_t* ptr) diff --git a/python/pylibcugraph/pylibcugraph/internal_types/coo.pyx b/python/pylibcugraph/pylibcugraph/internal_types/coo.pyx new file mode 100644 index 00000000000..64d10c22eaf --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/internal_types/coo.pyx @@ -0,0 +1,96 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from pylibcugraph._cugraph_c.coo cimport ( + cugraph_coo_t, + cugraph_coo_free, + cugraph_coo_get_sources, + cugraph_coo_get_destinations, + cugraph_coo_get_edge_weights, + cugraph_coo_get_edge_id, + cugraph_coo_get_edge_type, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) +from pylibcugraph.utils cimport create_cupy_array_view_for_device_ptr + +cdef class COO: + """ + Cython interface to a cugraph_coo_t pointer. Instances of this + call will take ownership of the pointer and free it under standard python + GC rules (ie. when all references to it are no longer present). + + This class provides methods to return non-owning cupy ndarrays for the + corresponding array members. Returning these cupy arrays increments the ref + count on the COO instances from which the cupy arrays are + referencing. + """ + def __cinit__(self): + # This COO instance owns sample_result_ptr now. 
It will be + # freed when this instance is deleted (see __dealloc__()) + self.c_coo_ptr = NULL + + def __dealloc__(self): + if self.c_coo_ptr is not NULL: + cugraph_coo_free(self.c_coo_ptr) + + cdef set_ptr(self, cugraph_coo_t* ptr): + self.c_coo_ptr = ptr + + cdef get_array(self, cugraph_type_erased_device_array_view_t* ptr): + if ptr is NULL: + return None + + return create_cupy_array_view_for_device_ptr( + ptr, + self, + ) + + def get_sources(self): + if self.c_coo_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* ptr = cugraph_coo_get_sources(self.c_coo_ptr) + return self.get_array(ptr) + + def get_destinations(self): + if self.c_coo_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* ptr = cugraph_coo_get_destinations(self.c_coo_ptr) + return self.get_array(ptr) + + def get_edge_ids(self): + if self.c_coo_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* ptr = cugraph_coo_get_edge_id(self.c_coo_ptr) + return self.get_array(ptr) + + def get_edge_types(self): + if self.c_coo_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* ptr = cugraph_coo_get_edge_type(self.c_coo_ptr) + return self.get_array(ptr) + + def get_edge_weights(self): + if self.c_coo_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* ptr = cugraph_coo_get_edge_weights(self.c_coo_ptr) + return self.get_array(ptr) diff --git a/python/pylibcugraph/pylibcugraph/negative_sampling.pyx b/python/pylibcugraph/pylibcugraph/negative_sampling.pyx new file mode 100644 index 00000000000..610cfa90ccf --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/negative_sampling.pyx @@ -0,0 +1,184 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
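For context on the keep-alive contract described in COO's docstring: the library routes this through cupy's UnownedMemory owner mechanism. The helper below is a minimal, hypothetical sketch of what pylibcugraph.utils.create_cupy_array_view_for_device_ptr does conceptually, not the library's actual implementation; view_from_device_ptr and the dtype/shape choices are illustrative assumptions.

    # Hypothetical sketch of the keep-alive pattern behind COO.get_array():
    # wrap a raw device pointer in cupy UnownedMemory, naming the owning
    # object as `owner` so the GC cannot collect the COO (and free the
    # C-owned buffer) while any view is still alive.
    import cupy as cp

    def view_from_device_ptr(dev_ptr, n_elems, dtype, owner):
        # UnownedMemory holds a reference to `owner`; the returned ndarray
        # holds the UnownedMemory, so `owner` outlives every view.
        nbytes = n_elems * cp.dtype(dtype).itemsize
        mem = cp.cuda.UnownedMemory(dev_ptr, nbytes, owner)
        return cp.ndarray((n_elems,), dtype=dtype,
                          memptr=cp.cuda.MemoryPointer(mem, 0))

    # Demo with a cupy array standing in for cugraph_coo_t-owned memory:
    backing = cp.arange(6, dtype=cp.int32)
    view = view_from_device_ptr(backing.data.ptr, 6, cp.int32, owner=backing)
    assert int(view.sum()) == 15
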
diff --git a/python/pylibcugraph/pylibcugraph/negative_sampling.pyx b/python/pylibcugraph/pylibcugraph/negative_sampling.pyx
new file mode 100644
index 00000000000..610cfa90ccf
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/negative_sampling.pyx
@@ -0,0 +1,184 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+    bool_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_negative_sampling,
+)
+from pylibcugraph._cugraph_c.coo cimport (
+    cugraph_coo_t,
+)
+from pylibcugraph.internal_types.coo cimport (
+    COO,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+
+def negative_sampling(ResourceHandle resource_handle,
+                      _GPUGraph graph,
+                      size_t num_samples,
+                      random_state=None,
+                      vertices=None,
+                      src_bias=None,
+                      dst_bias=None,
+                      remove_duplicates=False,
+                      remove_false_negatives=False,
+                      exact_number_of_samples=False,
+                      do_expensive_check=False):
+    """
+    Performs negative sampling, generating an edgelist of random edges for
+    use as negative samples; this is essentially a form of graph generation.
+
+    By setting vertices, src_bias, and dst_bias, this function can perform
+    biased negative sampling.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+    graph: SGGraph or MGGraph
+        The stored cuGraph graph to create negative samples for.
+    num_samples: int
+        The number of negative edges to generate.
+    random_state: int (Optional)
+        Random state to use when generating samples. Defaults to a hash of
+        process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+    vertices: device array type (Optional)
+        Vertex ids corresponding to the src/dst biases, if provided.
+        Ignored if src/dst biases are not provided.
+    src_bias: device array type (Optional)
+        Probability per vertex that the vertex is selected as a source
+        vertex. Does not have to be normalized. Uses a uniform distribution
+        if not provided.
+    dst_bias: device array type (Optional)
+        Probability per vertex that the vertex is selected as a destination
+        vertex. Does not have to be normalized. Uses a uniform distribution
+        if not provided.
+    remove_duplicates: bool (Optional)
+        Whether to remove duplicate edges from the generated edgelist.
+        Defaults to False (does not remove duplicates).
+    remove_false_negatives: bool (Optional)
+        Whether to remove false negatives (generated edges that actually
+        exist in the graph) from the generated edgelist.
+        Defaults to False (does not check for and remove false negatives).
+    exact_number_of_samples: bool (Optional)
+        Whether to keep regenerating samples until the desired number, as
+        specified by num_samples, has been generated.
+        Defaults to False (does not regenerate if enough samples are not
+        produced in the initial round).
+    do_expensive_check: bool (Optional)
+        Whether to perform an expensive error check at the C++ level.
+        Defaults to False (no error check).
+
+    Returns
+    -------
+    dict[str, cupy.ndarray]
+        Generated edges in COO format.
+    """
+
+    assert_CAI_type(vertices, "vertices", True)
+    assert_CAI_type(src_bias, "src_bias", True)
+    assert_CAI_type(dst_bias, "dst_bias", True)
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef bool_t c_remove_duplicates = remove_duplicates
+    cdef bool_t c_remove_false_negatives = remove_false_negatives
+    cdef bool_t c_exact_number_of_samples = exact_number_of_samples
+    cdef bool_t c_do_expensive_check = do_expensive_check
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        create_cugraph_type_erased_device_array_view_from_py_obj(vertices)
+    cdef cugraph_type_erased_device_array_view_t* src_bias_ptr = \
+        create_cugraph_type_erased_device_array_view_from_py_obj(src_bias)
+    cdef cugraph_type_erased_device_array_view_t* dst_bias_ptr = \
+        create_cugraph_type_erased_device_array_view_from_py_obj(dst_bias)
+
+    cdef cugraph_coo_t* result_ptr
+    cdef cugraph_error_t* err_ptr
+    cdef cugraph_error_code_t error_code
+
+    error_code = cugraph_negative_sampling(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        vertices_ptr,
+        src_bias_ptr,
+        dst_bias_ptr,
+        num_samples,
+        c_remove_duplicates,
+        c_remove_false_negatives,
+        c_exact_number_of_samples,
+        c_do_expensive_check,
+        &result_ptr,
+        &err_ptr,
+    )
+    assert_success(error_code, err_ptr, "cugraph_negative_sampling")
+
+    coo = COO()
+    coo.set_ptr(result_ptr)
+
+    return {
+        'sources': coo.get_sources(),
+        'destinations': coo.get_destinations(),
+        'edge_id': coo.get_edge_ids(),
+        'edge_type': coo.get_edge_types(),
+        'weight': coo.get_edge_weights(),
+    }
diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
index f002622f497..f3e2336d8f6 100644
--- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
+++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
@@ -117,7 +117,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle,
         Device array containing the list of starting vertices for sampling.
 
     h_fan_out: numpy array type
-        Device array containing the brancing out (fan-out) degrees per
+        Host array containing the branching out (fan-out) degrees per
         starting vertex for each hop level.
 
     with_replacement: bool
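For orientation, here is a minimal sketch of how the new negative_sampling function is intended to be called on a tiny single-GPU graph. It assumes the function is re-exported from the pylibcugraph package root (as the other sampling routines are), follows the existing SGGraph construction pattern, and uses toy dtypes/values that are illustrative only.

    # Hedged usage sketch for the new negative_sampling API.
    import cupy as cp
    from pylibcugraph import (ResourceHandle, GraphProperties, SGGraph,
                              negative_sampling)

    handle = ResourceHandle()
    graph = SGGraph(
        handle,
        GraphProperties(is_symmetric=False, is_multigraph=False),
        cp.asarray([0, 1, 2, 3], dtype=cp.int32),   # sources
        cp.asarray([1, 2, 3, 0], dtype=cp.int32),   # destinations
        weight_array=cp.ones(4, dtype=cp.float32),
        store_transposed=False,
        renumber=False,
        do_expensive_check=False,
    )

    # Request 8 candidate negative edges; strip duplicates and any
    # accidental hits on real edges.
    result = negative_sampling(handle, graph, num_samples=8,
                               remove_duplicates=True,
                               remove_false_negatives=True)
    print(result["sources"], result["destinations"])
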
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 4dd513a4902..c12280473b5 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -5,7 +5,7 @@
 requires = [
     "cython>=3.0.0",
     "rapids-build-backend>=0.3.1,<0.4.0.dev0",
-    "scikit-build-core[pyproject]>=0.7.0",
+    "scikit-build-core[pyproject]>=0.10.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "rapids_build_backend.build"
@@ -21,23 +21,27 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
-    "pylibraft==24.10.*,>=0.0.0a0",
-    "rmm==24.10.*,>=0.0.0a0",
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
+    "pylibraft==24.12.*,>=0.0.0a0",
+    "rmm==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]
 test = [
-    "cudf==24.10.*,>=0.0.0a0",
-    "numpy>=1.23,<2.0a0",
+    "cudf==24.12.*,>=0.0.0a0",
+    "numpy>=1.23,<3.0a0",
     "pandas",
     "pytest",
     "pytest-benchmark",
@@ -53,7 +57,8 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
-cmake.minimum-version = "3.26.4"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
 ninja.make-fallback = true
 sdist.reproducible = true
 wheel.packages = ["pylibcugraph"]
@@ -69,7 +74,7 @@ dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "ninja",
-    "pylibraft==24.10.*,>=0.0.0a0",
-    "rmm==24.10.*,>=0.0.0a0",
+    "pylibraft==24.12.*,>=0.0.0a0",
+    "rmm==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-matrix-entry = "cuda_suffixed=true"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
diff --git a/python/pylibcugraph/pytest.ini b/python/pylibcugraph/pytest.ini
index 573628de680..d5ade9f4836 100644
--- a/python/pylibcugraph/pytest.ini
+++ b/python/pylibcugraph/pytest.ini
@@ -14,3 +14,5 @@
 [pytest]
 markers =
     cugraph_ops: Tests requiring cugraph-ops
+
+addopts = --tb=native
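Since the test extras above pull in cudf, one last illustrative note on consuming the new return type: the dict of cupy device arrays returned by negative_sampling maps directly onto a cuDF edgelist. This is a standalone sketch with stand-in data, not part of this diff.

    # Standalone sketch: materializing data shaped like
    # negative_sampling()'s return value as a cuDF DataFrame.
    import cupy as cp
    import cudf

    result = {  # stand-in for negative_sampling(...) output
        "sources": cp.asarray([0, 2, 3], dtype=cp.int32),
        "destinations": cp.asarray([2, 0, 1], dtype=cp.int32),
    }
    neg_edges = cudf.DataFrame({"src": result["sources"],
                                "dst": result["destinations"]})
    print(neg_edges)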