diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index a0528e4011..fc4fcd458b 100755
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -18,3 +18,4 @@ conda/             @rapidsai/ops-codeowners
 **/Dockerfile      @rapidsai/ops-codeowners
 **/.dockerignore   @rapidsai/ops-codeowners
 docker/            @rapidsai/ops-codeowners
+dependencies.yaml  @rapidsai/ops-codeowners
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 9809e2cc2e..56f77e69c0 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,5 +12,5 @@ CMake:
  - '**/CMakeLists.txt'
  - '**/cmake/**'
 
-gpuCI:
+ci:
   - 'ci/**'
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 236696d948..2d1444c595 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -6,3 +6,4 @@ branch_checker: true
 label_checker: true
 release_drafter: true
 copy_prs: true
+recently_updated: true
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
new file mode 100644
index 0000000000..7a780e3de1
--- /dev/null
+++ b/.github/workflows/build.yaml
@@ -0,0 +1,97 @@
+name: build
+
+on:
+  push:
+    branches:
+      - "branch-*"
+    tags:
+      - v[0-9][0-9].[0-9][0-9].[0-9][0-9]
+  workflow_dispatch:
+    inputs:
+      branch:
+        required: true
+        type: string
+      date:
+        required: true
+        type: string
+      sha:
+        required: true
+        type: string
+      build_type:
+        type: string
+        default: nightly
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  cpp-build:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+  python-build:
+    needs: [cpp-build]
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+  upload-conda:
+    needs: [cpp-build, python-build]
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+  wheel-build-pylibraft:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: pylibraft
+      package-dir: python/pylibraft
+      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+  wheel-publish-pylibraft:
+    needs: wheel-build-pylibraft
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: pylibraft
+  wheel-build-raft-dask:
+    needs: wheel-publish-pylibraft
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: raft_dask
+      package-dir: python/raft-dask
+      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+  wheel-publish-raft-dask:
+    needs: wheel-build-raft-dask
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: raft_dask
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
new file mode 100644
index 0000000000..03b66debb8
--- /dev/null
+++ b/.github/workflows/pr.yaml
@@ -0,0 +1,96 @@
+name: pr
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pr-builder:
+    needs:
+      - checks
+      - conda-cpp-build
+      - conda-cpp-tests
+      - conda-python-build
+      - conda-python-tests
+      - wheel-build-pylibraft
+      - wheel-tests-pylibraft
+      - wheel-build-raft-dask
+      - wheel-tests-raft-dask
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02
+  checks:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02
+  conda-cpp-build:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.02
+    with:
+      build_type: pull-request
+      node_type: cpu16
+  conda-cpp-tests:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02
+    with:
+      build_type: pull-request
+  conda-python-build:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02
+    with:
+      build_type: pull-request
+  conda-python-tests:
+    needs: conda-python-build
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    with:
+      build_type: pull-request
+  wheel-build-pylibraft:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    with:
+      build_type: pull-request
+      package-name: pylibraft
+      package-dir: python/pylibraft
+      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+  wheel-tests-pylibraft:
+    needs: wheel-build-pylibraft
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    with:
+      build_type: pull-request
+      package-name: pylibraft
+      test-before-amd64: "pip install cupy-cuda11x"
+      # On arm64, cupy must be installed from the aarch64-specific package index.
+      test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64"
+      test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
+      test-smoketest: "python ./ci/wheel_smoke_test_pylibraft.py"
+  wheel-build-raft-dask:
+    needs: wheel-tests-pylibraft
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.02
+    with:
+      build_type: pull-request
+      package-name: raft_dask
+      package-dir: python/raft-dask
+      before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-wheelhouse"
+      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+  wheel-tests-raft-dask:
+    needs: wheel-build-raft-dask
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    with:
+      build_type: pull-request
+      package-name: raft_dask
+      # Test against the dask/distributed versions pinned for this release (2023.1.1).
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.1.1 git+https://github.com/dask/distributed.git@2023.1.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.1.1 git+https://github.com/dask/distributed.git@2023.1.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
+      test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
new file mode 100644
index 0000000000..739f50861e
--- /dev/null
+++ b/.github/workflows/test.yaml
@@ -0,0 +1,56 @@
+name: test
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        required: true
+        type: string
+      date:
+        required: true
+        type: string
+      sha:
+        required: true
+        type: string
+
+jobs:
+  conda-cpp-tests:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.02
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+  conda-python-tests:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+  wheel-tests-pylibraft:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: pylibraft
+      test-before-amd64: "pip install cupy-cuda11x"
+      test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64"
+      test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
+  wheel-tests-raft-dask:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.02
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: raft_dask
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.1.1 git+https://github.com/dask/distributed.git@2023.1.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.1.1 git+https://github.com/dask/distributed.git@2023.1.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
deleted file mode 100644
index 0a681b864b..0000000000
--- a/.github/workflows/wheels.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: RAFT wheels
-
-on:
-  workflow_call:
-    inputs:
-      versioneer-override:
-        type: string
-        default: ''
-      build-tag:
-        type: string
-        default: ''
-      branch:
-        required: true
-        type: string
-      date:
-        required: true
-        type: string
-      sha:
-        required: true
-        type: string
-      build-type:
-        type: string
-        default: nightly
-
-concurrency:
-  group: "raft-${{ github.workflow }}-${{ github.ref }}"
-  cancel-in-progress: true
-
-jobs:
-  pylibraft-wheel:
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main
-    with:
-      repo: rapidsai/raft
-
-      build-type: ${{ inputs.build-type }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-
-      package-dir: python/pylibraft
-      package-name: pylibraft
-
-      python-package-versioneer-override: ${{ inputs.versioneer-override }}
-      python-package-build-tag: ${{ inputs.build-tag }}
-
-      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-
-      test-extras: test
-      test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
-    secrets: inherit
-  raft-dask-wheel:
-    needs: pylibraft-wheel
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main
-    with:
-      repo: rapidsai/raft
-
-      build-type: ${{ inputs.build-type }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-
-      package-dir: python/raft-dask
-      package-name: raft_dask
-
-      python-package-versioneer-override: ${{ inputs.versioneer-override }}
-      python-package-build-tag: ${{ inputs.build-tag }}
-
-      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-
-      test-extras: test
-      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
-    secrets: inherit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1c244200d1..b766bfc066 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 
 repos:
       - repo: https://github.com/PyCQA/isort
-        rev: 5.10.1
+        rev: 5.12.0
         hooks:
               - id: isort
                 # Use the config file specific to each subproject so that each
@@ -97,6 +97,8 @@ repos:
         rev: v2.1.0
         hooks:
               - id: codespell
+                exclude: (?x)^(^CHANGELOG.md$)
+
 
 default_language_version:
       python: python3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ff454f7a0..c4701f587f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,105 @@
+# raft 23.02.00 (9 Feb 2023)
+
+## 🚨 Breaking Changes
+
+- Remove faiss ANN code from knnIndex ([#1121](https://github.com/rapidsai/raft/pull/1121)) [@benfred](https://github.com/benfred)
+- Use `GenPC` (Permuted Congruential) as the default random number generator everywhere ([#1099](https://github.com/rapidsai/raft/pull/1099)) [@Nyrio](https://github.com/Nyrio)
+
+## 🐛 Bug Fixes
+
+- Reverting a few commits from 23.02 and speeding up end-to-end build time ([#1232](https://github.com/rapidsai/raft/pull/1232)) [@cjnolet](https://github.com/cjnolet)
+- Update README.md: fix a missing word ([#1185](https://github.com/rapidsai/raft/pull/1185)) [@achirkin](https://github.com/achirkin)
+- balanced-k-means: fix a too large initial memory pool size ([#1148](https://github.com/rapidsai/raft/pull/1148)) [@achirkin](https://github.com/achirkin)
+- Catch signal handler change error ([#1147](https://github.com/rapidsai/raft/pull/1147)) [@tfeher](https://github.com/tfeher)
+- Squared norm fix follow-up (change was lost in merge conflict) ([#1144](https://github.com/rapidsai/raft/pull/1144)) [@Nyrio](https://github.com/Nyrio)
+- IVF-Flat bug fix: the *squared* norm is required for expanded distance calculations ([#1141](https://github.com/rapidsai/raft/pull/1141)) [@Nyrio](https://github.com/Nyrio)
+- build.sh switch to use `RAPIDS` magic value ([#1132](https://github.com/rapidsai/raft/pull/1132)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix `euclidean_dist` in IVF-Flat search ([#1122](https://github.com/rapidsai/raft/pull/1122)) [@Nyrio](https://github.com/Nyrio)
+- Update handle docstring ([#1103](https://github.com/rapidsai/raft/pull/1103)) [@dantegd](https://github.com/dantegd)
+- Pin libcusparse and libcusolver to avoid CUDA 12 ([#1095](https://github.com/rapidsai/raft/pull/1095)) [@wphicks](https://github.com/wphicks)
+- Fix race condition in `raft::random::discrete` ([#1094](https://github.com/rapidsai/raft/pull/1094)) [@Nyrio](https://github.com/Nyrio)
+- Fixing libraft conda recipes ([#1084](https://github.com/rapidsai/raft/pull/1084)) [@cjnolet](https://github.com/cjnolet)
+- Ensure that we get the cuda version of faiss. ([#1078](https://github.com/rapidsai/raft/pull/1078)) [@vyasr](https://github.com/vyasr)
+- Fix double definition error in ANN refinement header ([#1067](https://github.com/rapidsai/raft/pull/1067)) [@tfeher](https://github.com/tfeher)
+- Specify correct global targets names to raft_export ([#1054](https://github.com/rapidsai/raft/pull/1054)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix concurrency issues in k-means++ initialization ([#1048](https://github.com/rapidsai/raft/pull/1048)) [@Nyrio](https://github.com/Nyrio)
+
+## 📖 Documentation
+
+- Adding small comms tutorial to docs ([#1204](https://github.com/rapidsai/raft/pull/1204)) [@cjnolet](https://github.com/cjnolet)
+- Separating more namespaces into easier-to-consume sections ([#1091](https://github.com/rapidsai/raft/pull/1091)) [@cjnolet](https://github.com/cjnolet)
+- Paying down some tech debt on docs, runtime API, and cython ([#1055](https://github.com/rapidsai/raft/pull/1055)) [@cjnolet](https://github.com/cjnolet)
+
+## 🚀 New Features
+
+- Add function to convert mdspan to a const view ([#1188](https://github.com/rapidsai/raft/pull/1188)) [@lowener](https://github.com/lowener)
+- Internal library to share headers between test and bench ([#1162](https://github.com/rapidsai/raft/pull/1162)) [@achirkin](https://github.com/achirkin)
+- Add public API and tests for hierarchical balanced k-means ([#1113](https://github.com/rapidsai/raft/pull/1113)) [@Nyrio](https://github.com/Nyrio)
+- Export NCCL dependency as part of raft::distributed. ([#1077](https://github.com/rapidsai/raft/pull/1077)) [@vyasr](https://github.com/vyasr)
+- Serialization of IVF Flat and IVF PQ ([#919](https://github.com/rapidsai/raft/pull/919)) [@tfeher](https://github.com/tfeher)
+
+## 🛠️ Improvements
+
+- Pin `dask` and `distributed` for release ([#1242](https://github.com/rapidsai/raft/pull/1242)) [@galipremsagar](https://github.com/galipremsagar)
+- Update shared workflow branches ([#1241](https://github.com/rapidsai/raft/pull/1241)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Removing interruptible from basic handle sync. ([#1224](https://github.com/rapidsai/raft/pull/1224)) [@cjnolet](https://github.com/cjnolet)
+- pre-commit: Update isort version to 5.12.0 ([#1215](https://github.com/rapidsai/raft/pull/1215)) [@wence-](https://github.com/wence-)
+- Pin wheel dependencies to same RAPIDS release ([#1200](https://github.com/rapidsai/raft/pull/1200)) [@sevagh](https://github.com/sevagh)
+- Serializer for mdspans ([#1173](https://github.com/rapidsai/raft/pull/1173)) [@hcho3](https://github.com/hcho3)
+- Use CTK 118/cp310 branch of wheel workflows ([#1169](https://github.com/rapidsai/raft/pull/1169)) [@sevagh](https://github.com/sevagh)
+- Enable shallow copy of `handle_t`'s resources with different workspace_resource ([#1165](https://github.com/rapidsai/raft/pull/1165)) [@cjnolet](https://github.com/cjnolet)
+- Protect balanced k-means out-of-memory in some cases ([#1161](https://github.com/rapidsai/raft/pull/1161)) [@achirkin](https://github.com/achirkin)
+- Use squeuclidean for metric name in ivf_pq python bindings ([#1160](https://github.com/rapidsai/raft/pull/1160)) [@benfred](https://github.com/benfred)
+- ANN tests: make the min_recall check strict ([#1156](https://github.com/rapidsai/raft/pull/1156)) [@achirkin](https://github.com/achirkin)
+- Make cutlass use static ctk ([#1155](https://github.com/rapidsai/raft/pull/1155)) [@sevagh](https://github.com/sevagh)
+- Fix various build errors ([#1152](https://github.com/rapidsai/raft/pull/1152)) [@hcho3](https://github.com/hcho3)
+- Remove faiss bfKnn call from fused_l2_knn unittest ([#1150](https://github.com/rapidsai/raft/pull/1150)) [@benfred](https://github.com/benfred)
+- Fix `unary_op` docs and add `map_offset` as an improved version of `write_only_unary_op` ([#1149](https://github.com/rapidsai/raft/pull/1149)) [@Nyrio](https://github.com/Nyrio)
+- Improvement of the math API wrappers ([#1146](https://github.com/rapidsai/raft/pull/1146)) [@Nyrio](https://github.com/Nyrio)
+- Changing handle_t to device_resources everywhere ([#1140](https://github.com/rapidsai/raft/pull/1140)) [@cjnolet](https://github.com/cjnolet)
+- Add L2SqrtExpanded support to ivf_pq ([#1138](https://github.com/rapidsai/raft/pull/1138)) [@benfred](https://github.com/benfred)
+- Adding workspace resource ([#1137](https://github.com/rapidsai/raft/pull/1137)) [@cjnolet](https://github.com/cjnolet)
+- Add raft::void_op functor ([#1136](https://github.com/rapidsai/raft/pull/1136)) [@ahendriksen](https://github.com/ahendriksen)
+- IVF-PQ: tighten the test criteria ([#1135](https://github.com/rapidsai/raft/pull/1135)) [@achirkin](https://github.com/achirkin)
+- Fix documentation author ([#1134](https://github.com/rapidsai/raft/pull/1134)) [@bdice](https://github.com/bdice)
+- Add L2SqrtExpanded support to ivf_flat ANN indices ([#1133](https://github.com/rapidsai/raft/pull/1133)) [@benfred](https://github.com/benfred)
+- Improvements in `matrix::gather`: test coverage, compilation errors, performance ([#1126](https://github.com/rapidsai/raft/pull/1126)) [@Nyrio](https://github.com/Nyrio)
+- Adding ability to use an existing stream in the pylibraft Handle ([#1125](https://github.com/rapidsai/raft/pull/1125)) [@cjnolet](https://github.com/cjnolet)
+- Remove faiss ANN code from knnIndex ([#1121](https://github.com/rapidsai/raft/pull/1121)) [@benfred](https://github.com/benfred)
+- Update builds for CUDA `11.8` and Python `3.10` ([#1120](https://github.com/rapidsai/raft/pull/1120)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Update workflows for nightly tests ([#1119](https://github.com/rapidsai/raft/pull/1119)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Enable `Recently Updated` Check ([#1117](https://github.com/rapidsai/raft/pull/1117)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Build wheels alongside conda CI ([#1116](https://github.com/rapidsai/raft/pull/1116)) [@sevagh](https://github.com/sevagh)
+- Allow host dataset for IVF-PQ ([#1114](https://github.com/rapidsai/raft/pull/1114)) [@tfeher](https://github.com/tfeher)
+- Decoupling raft handle from underlying resources ([#1111](https://github.com/rapidsai/raft/pull/1111)) [@cjnolet](https://github.com/cjnolet)
+- Fixing an index error introduced in PR #1109 ([#1110](https://github.com/rapidsai/raft/pull/1110)) [@vinaydes](https://github.com/vinaydes)
+- Fixing the sample-without-replacement test failures ([#1109](https://github.com/rapidsai/raft/pull/1109)) [@vinaydes](https://github.com/vinaydes)
+- Remove faiss dependency from fused_l2_knn.cuh, selection_faiss.cuh, ball_cover.cuh and haversine_distance.cuh ([#1108](https://github.com/rapidsai/raft/pull/1108)) [@benfred](https://github.com/benfred)
+- Remove redundant operators in sparse/distance and move others to raft/core ([#1105](https://github.com/rapidsai/raft/pull/1105)) [@Nyrio](https://github.com/Nyrio)
+- Speedup `make_blobs` by up to 2x by fixing inefficient kernel launch configuration ([#1100](https://github.com/rapidsai/raft/pull/1100)) [@Nyrio](https://github.com/Nyrio)
+- Use `GenPC` (Permuted Congruential) as the default random number generator everywhere ([#1099](https://github.com/rapidsai/raft/pull/1099)) [@Nyrio](https://github.com/Nyrio)
+- Cleanup faiss includes ([#1098](https://github.com/rapidsai/raft/pull/1098)) [@benfred](https://github.com/benfred)
+- matrix::select_k: move selection and warp-sort primitives ([#1085](https://github.com/rapidsai/raft/pull/1085)) [@achirkin](https://github.com/achirkin)
+- Exclude changelog from pre-commit spellcheck ([#1083](https://github.com/rapidsai/raft/pull/1083)) [@benfred](https://github.com/benfred)
+- Add GitHub Actions Workflows. ([#1076](https://github.com/rapidsai/raft/pull/1076)) [@bdice](https://github.com/bdice)
+- Adding uninstall option to build.sh ([#1075](https://github.com/rapidsai/raft/pull/1075)) [@cjnolet](https://github.com/cjnolet)
+- Use doctest for testing python example docstrings ([#1073](https://github.com/rapidsai/raft/pull/1073)) [@benfred](https://github.com/benfred)
+- Minor cython fixes / cleanup ([#1072](https://github.com/rapidsai/raft/pull/1072)) [@benfred](https://github.com/benfred)
+- IVF-PQ: tweak launch configuration ([#1069](https://github.com/rapidsai/raft/pull/1069)) [@achirkin](https://github.com/achirkin)
+- Unpin `dask` and `distributed` for development ([#1068](https://github.com/rapidsai/raft/pull/1068)) [@galipremsagar](https://github.com/galipremsagar)
+- Bifurcate Dependency Lists ([#1065](https://github.com/rapidsai/raft/pull/1065)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Add support for 64bit svdeig ([#1060](https://github.com/rapidsai/raft/pull/1060)) [@lowener](https://github.com/lowener)
+- switch mma instruction shape to 1684 from current 1688 for 3xTF32 L2/cosine kernel ([#1057](https://github.com/rapidsai/raft/pull/1057)) [@mdoijade](https://github.com/mdoijade)
+- Make IVF-PQ build index in batches when necessary ([#1056](https://github.com/rapidsai/raft/pull/1056)) [@achirkin](https://github.com/achirkin)
+- Remove unused setuputils modules ([#1053](https://github.com/rapidsai/raft/pull/1053)) [@vyasr](https://github.com/vyasr)
+- Branch 23.02 merge 22.12 ([#1051](https://github.com/rapidsai/raft/pull/1051)) [@benfred](https://github.com/benfred)
+- Shared-memory-cached kernel for `reduce_cols_by_key` to limit atomic conflicts ([#1050](https://github.com/rapidsai/raft/pull/1050)) [@Nyrio](https://github.com/Nyrio)
+- Unify use of common functors ([#1049](https://github.com/rapidsai/raft/pull/1049)) [@Nyrio](https://github.com/Nyrio)
+- Replace k-means++ CPU bottleneck with a `random::discrete` prim ([#1039](https://github.com/rapidsai/raft/pull/1039)) [@Nyrio](https://github.com/Nyrio)
+- Add python bindings for kmeans fit ([#1016](https://github.com/rapidsai/raft/pull/1016)) [@benfred](https://github.com/benfred)
+- Add MaskedL2NN ([#838](https://github.com/rapidsai/raft/pull/838)) [@ahendriksen](https://github.com/ahendriksen)
+- Move contractions tiling logic outside of Contractions_NT ([#837](https://github.com/rapidsai/raft/pull/837)) [@ahendriksen](https://github.com/ahendriksen)
+
 # raft 22.12.00 (8 Dec 2022)
 
 ## 🚨 Breaking Changes
diff --git a/README.md b/README.md
index e48a1b6193..ccd0df4926 100755
--- a/README.md
+++ b/README.md
@@ -25,8 +25,8 @@ While not exhaustive, the following general categories help summarize the accele
 | Category | Examples |
 | --- | --- |
 | **Data Formats** | sparse & dense, conversions, data generation |
-| **Dense Operations** | linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems |
-| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, components & labeling |
+| **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems |
+| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling |
 | **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction |
 | **Basic Clustering** | spectral clustering, hierarchical clustering, k-means |
 | **Solvers** | combinatorial optimization, iterative solvers |
@@ -37,7 +37,7 @@ While not exhaustive, the following general categories help summarize the accele
 All of RAFT's C++ APIs can be accessed header-only and optional pre-compiled shared libraries can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers.
 
 In addition to the C++ library, RAFT also provides 2 Python libraries:
-- `pylibraft` - lightweight low-level Python wrappers around RAFT's host-accessible APIs.
+- `pylibraft` - lightweight low-level Python wrappers around RAFT's host-accessible "runtime" APIs.
 - `raft-dask` - multi-node multi-GPU communicator infrastructure for building distributed algorithms on the GPU with Dask.
 
 ## Getting started
@@ -65,17 +65,17 @@ auto matrix = raft::make_device_matrix<float>(handle, n_rows, n_cols);
 
 ### C++ Example
 
-Most of the primitives in RAFT accept a `raft::handle_t` object for the management of resources which are expensive to create, such CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`.
+Most of the primitives in RAFT accept a `raft::device_resources` object for the management of resources which are expensive to create, such as CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`.
 
 The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing
 pairwise Euclidean distances:
 ```c++
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/random/make_blobs.cuh>
 #include <raft/distance/distance.cuh>
 
-raft::handle_t handle;
+raft::device_resources handle;
 
 int n_samples = 5000;
 int n_features = 50;
@@ -93,12 +93,12 @@ raft::distance::pairwise_distance(handle, input.view(), input.view(), output.vie
 It's also possible to create `raft::device_mdspan` views to invoke the same API with raw pointers and shape information:
 
 ```c++
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/random/make_blobs.cuh>
 #include <raft/distance/distance.cuh>
 
-raft::handle_t handle;
+raft::device_resources handle;
 
 int n_samples = 5000;
 int n_features = 50;
@@ -142,7 +142,7 @@ in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
 output = pairwise_distance(in1, in2, metric="euclidean")
 ```
 
-The `output` array supports [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2) so it is interoperable with other libraries like CuPy, Numba, and PyTorch that also support it. 
+The `output` array in the above example is of type `pylibraft.common.device_ndarray`, which supports [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2), making it interoperable with other libraries like CuPy, Numba, and PyTorch that also support it. CuPy supports DLPack, which also enables zero-copy conversion from `pylibraft.common.device_ndarray` to JAX and TensorFlow.
 
 Below is an example of converting the output `pylibraft.device_ndarray` to a CuPy array:
 ```python
@@ -156,6 +156,18 @@ import torch
 torch_tensor = torch.as_tensor(output, device='cuda')
 ```
 
+When the corresponding library is installed and available in your environment, this conversion can also be done automatically by all RAFT compute APIs by setting a global configuration option:
+```python
+import pylibraft.config
+pylibraft.config.set_output_as("cupy")  # All compute APIs will return cupy arrays
+pylibraft.config.set_output_as("torch") # All compute APIs will return torch tensors
+```
+
+You can also specify a `callable` that accepts a `pylibraft.common.device_ndarray` and performs a custom conversion. The following example converts all output to `numpy` arrays:
+```python
+pylibraft.config.set_output_as(lambda device_ndarray: device_ndarray.copy_to_host())
+```
+
 `pylibraft` also supports writing to a pre-allocated output array so any `__cuda_array_interface__` supported array can be written to in-place:
 
 ```python
@@ -176,7 +188,7 @@ pairwise_distance(in1, in2, out=output, metric="euclidean")
 
 ## Installing
 
-RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on building RAFT and using it in downstream projects.
+RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for a more comprehensive guide on installing and building RAFT and using it in downstream projects.
 
 ### Conda
 
@@ -257,14 +269,15 @@ Several CMake targets can be made available by adding components in the table be
 | --- | --- | --- | --- |
 | n/a | `raft::raft` | Full RAFT header library | CUDA toolkit library, RMM, Thrust (optional), NVTools (optional) |
 | distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::raft, cuCollections (optional)  |
-| nn | `raft::nn` | Pre-compiled template specializations for raft::spatial::knn | raft::raft, FAISS (optional) |
+| nn | `raft::nn` | Pre-compiled template specializations for raft::neighbors | raft::raft, FAISS (optional) |
+| distributed | `raft::distributed` | No specializations | raft::raft, UCX, NCCL |
 
 ### Source
 
 The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository:
-1. Create an environment with the needed dependencies: 
+1. Create an environment with the needed dependencies:
 ```
-mamba env create --name raft_dev_env -f conda/environments/raft_dev_cuda11.5.yml
+mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml
 mamba activate raft_dev_env
 ```
 ```
@@ -302,6 +315,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders:
       - `solver`: Sparse solvers for optimization and approximation
     - `stats`: Moments, summary statistics, model performance measures
     - `util`: Various reusable tools and utilities for accelerated algorithm development
+  - `internal`: A private header-only component that hosts the code shared between benchmarks and tests.
   - `scripts`: Helpful scripts for development
   - `src`: Compiled APIs and template specializations for the shared libraries
   - `test`: Googletests source code
diff --git a/build.sh b/build.sh
index 0708c1b89e..b47e1ed862 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 # raft build script
 
@@ -18,7 +18,7 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps"
+VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean --uninstall  -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps"
 HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench=<targets>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
@@ -34,6 +34,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
    -v                          - verbose build mode
    -g                          - build for debug
    -n                          - no install step
+   --uninstall                 - uninstall files for specified targets which were built and installed prior
    --compile-libs              - compile shared libraries for all components
    --compile-nn                - compile shared library for nn component
    --compile-dist              - compile shared library for distance and current random components
@@ -56,9 +57,9 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
 "
 LIBRAFT_BUILD_DIR=${LIBRAFT_BUILD_DIR:=${REPODIR}/cpp/build}
 SPHINX_BUILD_DIR=${REPODIR}/docs
-PY_RAFT_BUILD_DIR=${REPODIR}/python/raft/build
-PY_LIBRAFT_BUILD_DIR=${REPODIR}/python/pylibraft/_skbuild
-BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PY_LIBRAFT_BUILD_DIR}"
+RAFT_DASK_BUILD_DIR=${REPODIR}/python/raft-dask/_skbuild
+PYLIBRAFT_BUILD_DIR=${REPODIR}/python/pylibraft/_skbuild
+BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PYLIBRAFT_BUILD_DIR} ${RAFT_DASK_BUILD_DIR}"
 
 # Set defaults for vars modified by flags to this script
 CMAKE_LOG_LEVEL=""
@@ -152,6 +153,7 @@ function limitTests {
             # Remove the full LIMIT_TEST_TARGETS argument from list of args so that it passes validArgs function
             ARGS=${ARGS//--limit-tests=$LIMIT_TEST_TARGETS/}
             TEST_TARGETS=${LIMIT_TEST_TARGETS}
+            echo "Limiting tests to $TEST_TARGETS"
         fi
     fi
 }
@@ -190,6 +192,65 @@ if (( ${NUMARGS} != 0 )); then
     done
 fi
 
+# This should run before build/install
+if hasArg --uninstall; then
+    UNINSTALL=1
+
+    if hasArg pylibraft || hasArg libraft || (( ${NUMARGS} == 1 )); then
+
+      echo "Removing libraft files..."
+      if [ -e ${LIBRAFT_BUILD_DIR}/install_manifest.txt ]; then
+          xargs rm -fv < ${LIBRAFT_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1
+      fi
+    fi
+
+    if hasArg pylibraft || (( ${NUMARGS} == 1 )); then
+      echo "Uninstalling pylibraft package..."
+      if [ -e ${PYLIBRAFT_BUILD_DIR}/install_manifest.txt ]; then
+          xargs rm -fv < ${PYLIBRAFT_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1
+      fi
+
+      # Try to uninstall via pip if it is installed
+      if [ -x "$(command -v pip)" ]; then
+        echo "Using pip to uninstall pylibraft"
+        pip uninstall -y pylibraft
+
+      # Otherwise, try to uninstall through conda if that's where things are installed
+      elif [ -x "$(command -v conda)" ] && [ "$INSTALL_PREFIX" == "$CONDA_PREFIX" ]; then
+        echo "Using conda to uninstall pylibraft"
+        conda uninstall -y pylibraft
+
+      # Otherwise, fail
+      else
+        echo "Could not uninstall pylibraft from pip or conda. pylibraft package will need to be manually uninstalled"
+      fi
+    fi
+
+    if hasArg raft-dask || (( ${NUMARGS} == 1 )); then
+      echo "Uninstalling raft-dask package..."
+      if [ -e ${RAFT_DASK_BUILD_DIR}/install_manifest.txt ]; then
+          xargs rm -fv < ${RAFT_DASK_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1
+      fi
+
+      # Try to uninstall via pip if it is installed
+      if [ -x "$(command -v pip)" ]; then
+        echo "Using pip to uninstall raft-dask"
+        pip uninstall -y raft-dask
+
+      # Otherwise, try to uninstall through conda if that's where things are installed
+      elif [ -x "$(command -v conda)" ] && [ "$INSTALL_PREFIX" == "$CONDA_PREFIX" ]; then
+        echo "Using conda to uninstall raft-dask"
+        conda uninstall -y raft-dask
+
+      # Otherwise, fail
+      else
+        echo "Could not uninstall raft-dask from pip or conda. raft-dask package will need to be manually uninstalled."
+      fi
+    fi
+    exit 0
+fi
+
+
 # Process flags
 if hasArg -n; then
     INSTALL_TARGET=""
@@ -286,9 +347,8 @@ fi
 if hasArg clean; then
     CLEAN=1
 fi
-if hasArg uninstall; then
-    UNINSTALL=1
-fi
+
+
 
 if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
@@ -328,7 +388,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."
     else
-        RAFT_CMAKE_CUDA_ARCHITECTURES="ALL"
+        RAFT_CMAKE_CUDA_ARCHITECTURES="RAPIDS"
         echo "Building for *ALL* supported GPU architectures..."
     fi
 
@@ -370,7 +430,7 @@ if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then
     fi
 
     cd ${REPODIR}/python/raft-dask
-    python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${LIBRAFT_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
+    python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${LIBRAFT_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
     if [[ ${INSTALL_TARGET} != "" ]]; then
         python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS}
     fi
@@ -384,7 +444,7 @@ if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then
     fi
 
     cd ${REPODIR}/python/pylibraft
-    python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${LIBRAFT_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
+    python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${LIBRAFT_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
     if [[ ${INSTALL_TARGET} != "" ]]; then
         python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS}
     fi
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
new file mode 100755
index 0000000000..853ae095d3
--- /dev/null
+++ b/ci/build_cpp.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+source rapids-env-update
+
+export CMAKE_GENERATOR=Ninja
+
+rapids-print-env
+
+rapids-logger "Begin cpp build"
+
+rapids-mamba-retry mambabuild conda/recipes/libraft
+
+rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
new file mode 100755
index 0000000000..b20fd51bca
--- /dev/null
+++ b/ci/build_python.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+source rapids-env-update
+
+export CMAKE_GENERATOR=Ninja
+
+rapids-print-env
+
+rapids-logger "Begin py build"
+
+CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
+
+# TODO: Remove `--no-test` flags once importing on a CPU
+# node works correctly
+rapids-mamba-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  conda/recipes/pylibraft
+
+rapids-mamba-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
+  conda/recipes/raft-dask
+
+rapids-upload-conda-to-s3 python
diff --git a/ci/check_style.sh b/ci/check_style.sh
new file mode 100755
index 0000000000..be3ac3f4b8
--- /dev/null
+++ b/ci/check_style.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+rapids-logger "Create checks conda environment"
+. /opt/conda/etc/profile.d/conda.sh
+
+rapids-dependency-file-generator \
+  --output conda \
+  --file_key checks \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
+
+rapids-mamba-retry env create --force -f env.yaml -n checks
+conda activate checks
+
+# Run pre-commit checks
+pre-commit run --hook-stage manual --all-files --show-diff-on-failure
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
index bfef5392f5..43a4a186f8 100644
--- a/ci/checks/copyright.py
+++ b/ci/checks/copyright.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@
     re.compile(r"setup[.]cfg$"),
     re.compile(r"meta[.]yaml$")
 ]
-ExemptFiles = ["cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh"]
+ExemptFiles = ["cpp/include/raft/spatial/knn/detail/faiss_select/"]
 
 # this will break starting at year 10000, which is probably OK :)
 CheckSimple = re.compile(
diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index b87b16d138..2f0e2b94ca 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -31,12 +31,19 @@ fi
 export GPUCI_CONDA_RETRY_MAX=1
 export GPUCI_CONDA_RETRY_SLEEP=30
 
+# Workaround to keep Jenkins builds working
+# until we migrate fully to GitHub Actions
+export RAPIDS_CUDA_VERSION="${CUDA}"
+export SCCACHE_BUCKET=rapids-sccache
+export SCCACHE_REGION=us-west-2
+export SCCACHE_IDLE_TIMEOUT=32768
+
 # Use Ninja to build
 export CMAKE_GENERATOR="Ninja"
 export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
 
 # ucx-py version
-export UCX_PY_VERSION='0.29.*'
+export UCX_PY_VERSION='0.30.*'
 
 ################################################################################
 # SETUP - Check environment
@@ -123,5 +130,6 @@ fi
 # UPLOAD - Conda packages
 ################################################################################
 
-gpuci_logger "Upload conda packages"
-source ci/cpu/upload.sh
+# Uploads disabled due to new GH Actions implementation
+# gpuci_logger "Upload conda packages"
+# source ci/cpu/upload.sh
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index eb0117cdc3..154edbc7f2 100644
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -21,6 +21,13 @@ export PARALLEL_LEVEL=${PARALLEL_LEVEL:-8}
 export CUDA_REL=${CUDA_VERSION%.*}
 CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/raft/cpu/.conda-bld/ # notice there is no `linux-64` here
 
+# Workaround to keep Jenkins builds working
+# until we migrate fully to GitHub Actions
+export RAPIDS_CUDA_VERSION="${CUDA}"
+export SCCACHE_BUCKET=rapids-sccache
+export SCCACHE_REGION=us-west-2
+export SCCACHE_IDLE_TIMEOUT=32768
+
 # Set home to the job's workspace
 export HOME=$WORKSPACE
 
@@ -31,13 +38,13 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 unset GIT_DESCRIBE_TAG
 
 # ucx-py version
-export UCX_PY_VERSION='0.29.*'
+export UCX_PY_VERSION='0.30.*'
 
 # Whether to install dask nightly or stable packages.
 export INSTALL_DASK_MAIN=0
 
 # Dask version to install when `INSTALL_DASK_MAIN=0`
-export DASK_STABLE_VERSION="2022.11.1"
+export DASK_STABLE_VERSION="2023.1.1"
 
 ################################################################################
 # SETUP - Check environment
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 4e0ecd8e15..bd0ff1db7b 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 ########################
 # RAFT Version Updater #
 ########################
@@ -17,12 +17,14 @@ CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}')
 CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
 CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
 CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
+CURRENT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${CURRENT_SHORT_TAG}).*"
 
 #Get <major>.<minor> for next version
 NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
 NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
-NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"
+NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
+NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*"
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 
@@ -41,7 +43,7 @@ sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cma
 sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
 sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
 
-for FILE in conda/environments/*.yml; do
+for FILE in conda/environments/*.yaml dependencies.yaml; do
   sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
   sed_runner "s/rapids-build-env=${CURRENT_SHORT_TAG}/rapids-build-env=${NEXT_SHORT_TAG}/g" ${FILE};
   sed_runner "s/rapids-doc-env=${CURRENT_SHORT_TAG}/rapids-doc-env=${NEXT_SHORT_TAG}/g" ${FILE};
@@ -52,3 +54,22 @@ done
 
 sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh
 sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/cpu/build.sh
+sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
+
+# Wheel builds install dask-cuda from source, update its branch
+sed_runner "s/dask-cuda.git@branch-[^\"[:space:]]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml
+
+# Need to distutils-normalize the original version
+NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
+NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))")
+
+# Wheel builds install intra-RAPIDS dependencies from same release
+sed_runner "s/{cuda_suffix}[^\"].*\",/{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pylibraft/setup.py
+sed_runner "s/{cuda_suffix}.*\"\]/{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\"\]/g" python/pylibraft/_custom_build/backend.py
+sed_runner "s/dask-cuda==.*\",/dask-cuda==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py
+sed_runner "s/pylibraft{cuda_suffix}.*\",/pylibraft{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py
+sed_runner "s/ucx-py{cuda_suffix}.*\",/ucx-py{cuda_suffix}==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py
+
+for FILE in .github/workflows/*.yaml; do
+  sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
+done
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
new file mode 100755
index 0000000000..d8538bdf47
--- /dev/null
+++ b/ci/test_cpp.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+. /opt/conda/etc/profile.d/conda.sh
+
+rapids-logger "Generate C++ testing dependencies"
+rapids-dependency-file-generator \
+  --output conda \
+  --file_key test_cpp \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml
+
+rapids-mamba-retry env create --force -f env.yaml -n test
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate test
+set -u
+
+CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
+mkdir -p "${RAPIDS_TESTS_DIR}"
+SUITEERROR=0
+
+rapids-print-env
+
+rapids-mamba-retry install \
+  --channel "${CPP_CHANNEL}" \
+  libraft-headers libraft-distance libraft-nn libraft-tests
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+set +e
+
+# Run libraft gtests from libraft-tests package
+rapids-logger "Run gtests"
+
+# TODO: exit code handling is too verbose. Find a cleaner solution.
+
+for gt in "$CONDA_PREFIX"/bin/gtests/libraft/* ; do
+    test_name=$(basename "${gt}")
+    echo "Running gtest $test_name"
+    "${gt}" --gtest_output=xml:"${RAPIDS_TESTS_DIR}"
+
+    exitcode=$?
+    if (( ${exitcode} != 0 )); then
+        SUITEERROR=${exitcode}
+        echo "FAILED: GTest ${gt}"
+    fi
+done
+
+exit ${SUITEERROR}
diff --git a/ci/test_python.sh b/ci/test_python.sh
new file mode 100755
index 0000000000..eb458d2a5a
--- /dev/null
+++ b/ci/test_python.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+. /opt/conda/etc/profile.d/conda.sh
+
+rapids-logger "Generate Python testing dependencies"
+rapids-dependency-file-generator \
+  --output conda \
+  --file_key test_python \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
+
+rapids-mamba-retry env create --force -f env.yaml -n test
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate test
+set -u
+
+rapids-logger "Downloading artifacts from previous jobs"
+CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
+PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
+
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"}
+mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}"
+SUITEERROR=0
+
+rapids-print-env
+
+rapids-mamba-retry install \
+  --channel "${CPP_CHANNEL}" \
+  --channel "${PYTHON_CHANNEL}" \
+  libraft-distance libraft-headers pylibraft raft-dask
+
+rapids-logger "Check GPU usage"
+nvidia-smi
+
+set +e
+
+rapids-logger "pytest pylibraft"
+pushd python/pylibraft/pylibraft
+pytest \
+  --cache-clear \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-pylibraft.xml" \
+  --cov-config=../.coveragerc \
+  --cov=pylibraft \
+  --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/pylibraft-coverage.xml" \
+  --cov-report=term \
+  test
+exitcode=$?
+
+if (( ${exitcode} != 0 )); then
+    SUITEERROR=${exitcode}
+    echo "FAILED: 1 or more tests in pylibraft"
+fi
+popd
+
+rapids-logger "pytest raft-dask"
+pushd python/raft-dask/raft_dask
+pytest \
+  --cache-clear \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask.xml" \
+  --cov-config=../.coveragerc \
+  --cov=raft_dask \
+  --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-coverage.xml" \
+  --cov-report=term \
+  test
+exitcode=$?
+
+if (( ${exitcode} != 0 )); then
+    SUITEERROR=${exitcode}
+    echo "FAILED: 1 or more tests in raft-dask"
+fi
+popd
+
+exit ${SUITEERROR}
diff --git a/ci/wheel_smoke_test_pylibraft.py b/ci/wheel_smoke_test_pylibraft.py
new file mode 100644
index 0000000000..7fee674691
--- /dev/null
+++ b/ci/wheel_smoke_test_pylibraft.py
@@ -0,0 +1,38 @@
+import numpy as np
+from scipy.spatial.distance import cdist
+
+from pylibraft.common import Handle, Stream, device_ndarray
+from pylibraft.distance import pairwise_distance
+
+
+if __name__ == "__main__":
+    metric = "euclidean"
+    n_rows = 1337
+    n_cols = 1337
+
+    input1 = np.random.random_sample((n_rows, n_cols))
+    input1 = np.asarray(input1, order="C").astype(np.float64)
+
+    output = np.zeros((n_rows, n_rows), dtype=np.float64)
+
+    expected = cdist(input1, input1, metric)
+
+    expected[expected <= 1e-5] = 0.0
+
+    input1_device = device_ndarray(input1)
+    output_device = None
+
+    s2 = Stream()
+    handle = Handle(stream=s2)
+    ret_output = pairwise_distance(
+        input1_device, input1_device, output_device, metric, handle=handle
+    )
+    handle.sync()
+
+    output_device = ret_output
+
+    actual = output_device.copy_to_host()
+
+    actual[actual <= 1e-5] = 0.0
+
+    assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py
new file mode 100644
index 0000000000..32c13e61ca
--- /dev/null
+++ b/ci/wheel_smoke_test_raft_dask.py
@@ -0,0 +1,92 @@
+from dask.distributed import Client, wait
+from dask_cuda import LocalCUDACluster, initialize
+
+from raft_dask.common import (
+    Comms,
+    local_handle,
+    perform_test_comm_split,
+    perform_test_comms_allgather,
+    perform_test_comms_allreduce,
+    perform_test_comms_bcast,
+    perform_test_comms_device_multicast_sendrecv,
+    perform_test_comms_device_send_or_recv,
+    perform_test_comms_device_sendrecv,
+    perform_test_comms_gather,
+    perform_test_comms_gatherv,
+    perform_test_comms_reduce,
+    perform_test_comms_reducescatter,
+    perform_test_comms_send_recv,
+)
+
+import os
+os.environ["UCX_LOG_LEVEL"] = "error"
+
+
+def func_test_send_recv(sessionId, n_trials):
+    handle = local_handle(sessionId)
+    return perform_test_comms_send_recv(handle, n_trials)
+
+
+def func_test_collective(func, sessionId, root):
+    handle = local_handle(sessionId)
+    return func(handle, root)
+
+
+if __name__ == "__main__":
+    # initial setup
+    cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
+    client = Client(cluster)
+
+    n_trials = 5
+    root_location = "client"
+
+    # p2p test for ucx
+    cb = Comms(comms_p2p=True, verbose=True)
+    cb.init()
+
+    dfs = [
+        client.submit(
+            func_test_send_recv,
+            cb.sessionId,
+            n_trials,
+            pure=False,
+            workers=[w],
+        )
+        for w in cb.worker_addresses
+    ]
+
+    wait(dfs, timeout=5)
+
+    assert all(x.result() for x in dfs)
+
+    cb.destroy()
+
+    # collectives test for nccl
+
+    cb = Comms(
+        verbose=True, client=client, nccl_root_location=root_location
+    )
+    cb.init()
+
+    for k, v in cb.worker_info(cb.worker_addresses).items():
+
+        dfs = [
+            client.submit(
+                func_test_collective,
+                perform_test_comms_allgather,
+                cb.sessionId,
+                v["rank"],
+                pure=False,
+                workers=[w],
+            )
+            for w in cb.worker_addresses
+        ]
+        wait(dfs, timeout=5)
+
+        assert all([x.result() for x in dfs])
+
+    cb.destroy()
+
+    # final client and cluster teardown
+    client.close()
+    cluster.close()
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
new file mode 100644
index 0000000000..7dc305bf97
--- /dev/null
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -0,0 +1,48 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- breathe
+- c-compiler
+- clang-tools=11.1.0
+- clang=11.1.0
+- cmake>=3.23.1,!=3.25.0
+- cuda-profiler-api=11.8.86
+- cuda-python >=11.7.1,<12.0
+- cudatoolkit=11.8
+- cupy
+- cxx-compiler
+- cython>=0.29,<0.30
+- dask-cuda=23.02
+- dask==2023.1.1
+- distributed==2023.1.1
+- doxygen>=1.8.20
+- faiss-proc=*=cuda
+- gcc_linux-64=9
+- libcublas-dev=11.11.3.6
+- libcublas=11.11.3.6
+- libcurand-dev=10.3.0.86
+- libcurand=10.3.0.86
+- libcusolver-dev=11.4.1.48
+- libcusolver=11.4.1.48
+- libcusparse-dev=11.7.5.86
+- libcusparse=11.7.5.86
+- libfaiss>=1.7.1=cuda*
+- ninja
+- pytest
+- pytest-cov
+- rmm=23.02
+- scikit-build>=0.13.1
+- scikit-learn
+- scipy
+- sphinx-markdown-tables
+- sysroot_linux-64==2.17
+- ucx-proc=*=gpu
+- ucx-py=0.30
+- ucx>=1.13.0
+name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml
deleted file mode 100644
index 5330227aa4..0000000000
--- a/conda/environments/raft_dev_cuda11.2.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: raft_dev
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- c-compiler
-- cxx-compiler
-- cudatoolkit=11.2
-- cuda-python >=11.7.1,<12.0
-- ninja
-- clang=11.1.0
-- clang-tools=11.1.0
-- cython>=0.29,<0.30
-- cmake>=3.23.1,!=3.25.0
-- dask==2022.11.1
-- distributed==2022.11.1
-- scikit-build>=0.13.1
-- rapids-build-env=22.12.*
-- rapids-notebook-env=22.12.*
-- rapids-doc-env=22.12.*
-- rmm=22.12.*
-- dask-cuda=22.12.*
-- ucx>=1.13.0
-- ucx-py=0.29.*
-- ucx-proc=*=gpu
-- doxygen>=1.8.20
-- libfaiss>=1.7.0
-- faiss-proc=*=cuda
-- ccache
-- pip
-- pip:
-    - sphinx_markdown_tables
-    - breathe
-
-# rapids-build-env, notebook-env and doc-env are defined in
-# https://docs.rapids.ai/maintainers/depmgmt/
-
-# To install different versions of packages contained in those meta packages,
-# it is recommended to remove those meta packages (without removing the actual
-# packages contained in the environment) first with:
-# conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env
diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml
deleted file mode 100644
index 83eca86a7f..0000000000
--- a/conda/environments/raft_dev_cuda11.4.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: raft_dev
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- c-compiler
-- cxx-compiler
-- cudatoolkit=11.4
-- cuda-python >=11.7.1,<12.0
-- ninja
-- clang=11.1.0
-- clang-tools=11.1.0
-- cython>=0.29,<0.30
-- cmake>=3.23.1,!=3.25.0
-- dask==2022.11.1
-- distributed==2022.11.1
-- scikit-build>=0.13.1
-- rapids-build-env=22.12.*
-- rapids-notebook-env=22.12.*
-- rapids-doc-env=22.12.*
-- rmm=22.12.*
-- dask-cuda=22.12.*
-- ucx>=1.13.0
-- ucx-py=0.29.*
-- ucx-proc=*=gpu
-- doxygen>=1.8.20
-- libfaiss>=1.7.0
-- faiss-proc=*=cuda
-- ccache
-- pip
-- pip:
-    - sphinx_markdown_tables
-    - breathe
-
-# rapids-build-env, notebook-env and doc-env are defined in
-# https://docs.rapids.ai/maintainers/depmgmt/
-
-# To install different versions of packages contained in those meta packages,
-# it is recommended to remove those meta packages (without removing the actual
-# packages contained in the environment) first with:
-# conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env
diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml
deleted file mode 100644
index f8ef71bac2..0000000000
--- a/conda/environments/raft_dev_cuda11.5.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: raft_dev
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- c-compiler
-- cxx-compiler
-- cudatoolkit=11.5
-- cuda-python >=11.7.1,<12.0
-- ninja
-- clang=11.1.0
-- clang-tools=11.1.0
-- cython>=0.29,<0.30
-- cmake>=3.23.1,!=3.25.0
-- dask==2022.11.1
-- distributed==2022.11.1
-- scikit-build>=0.13.1
-- rapids-build-env=22.12.*
-- rapids-notebook-env=22.12.*
-- rapids-doc-env=22.12.*
-- rmm=22.12.*
-- dask-cuda=22.12.*
-- ucx>=1.13.0
-- ucx-py=0.29.*
-- ucx-proc=*=gpu
-- doxygen>=1.8.20
-- libfaiss>=1.7.0
-- faiss-proc=*=cuda
-- ccache
-- pip
-- pip:
-    - sphinx_markdown_tables
-    - breathe
-
-# rapids-build-env, notebook-env and doc-env are defined in
-# https://docs.rapids.ai/maintainers/depmgmt/
-
-# To install different versions of packages contained in those meta packages,
-# it is recommended to remove those meta packages (without removing the actual
-# packages contained in the environment) first with:
-# conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env
diff --git a/conda/recipes/libraft/build_libraft_distance.sh b/conda/recipes/libraft/build_libraft_distance.sh
index 35bf354e9b..d7e995fc03 100644
--- a/conda/recipes/libraft/build_libraft_distance.sh
+++ b/conda/recipes/libraft/build_libraft_distance.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 ./build.sh libraft -v --allgpuarch --compile-dist --no-nvtx
diff --git a/conda/recipes/libraft/build_libraft_nn.sh b/conda/recipes/libraft/build_libraft_nn.sh
index 773d6ab02e..9865922cd0 100644
--- a/conda/recipes/libraft/build_libraft_nn.sh
+++ b/conda/recipes/libraft/build_libraft_nn.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 ./build.sh libraft -v --allgpuarch --compile-nn --no-nvtx
diff --git a/conda/recipes/libraft/build_libraft_tests.sh b/conda/recipes/libraft/build_libraft_tests.sh
index 040a2f8b8c..6adbbe78e1 100644
--- a/conda/recipes/libraft/build_libraft_tests.sh
+++ b/conda/recipes/libraft/build_libraft_tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 ./build.sh tests bench -v --allgpuarch --no-nvtx
 cmake --install cpp/build --component testing
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index facb478562..1012bddb40 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -19,11 +19,43 @@ nccl_version:
 gtest_version:
   - "=1.10.0"
 
-libcusolver_version:
-  - ">=11.2.1,<=11.4.1.48"
+libfaiss_version:
+  - "1.7.2 *_cuda"
 
-libcusparse_version:
-  - ">=11.5.0,<12.0"
+# The CTK libraries below are missing from the conda-forge::cudatoolkit
+# package. The "*_host_*" version specifiers correspond to `11.8` packages and the
+# "*_run_*" version specifiers correspond to `11.x` packages.
 
-libfaiss_version:
-  - "1.7.0 *_cuda"
+libcublas_host_version:
+  - "=11.11.3.6"
+
+libcublas_run_version:
+  - ">=11.5.2.43,<12.0.0"
+
+libcurand_host_version:
+  - "=10.3.0.86"
+
+libcurand_run_version:
+  - ">=10.2.5.43,<10.3.1"
+
+libcusolver_host_version:
+  - "=11.4.1.48"
+
+libcusolver_run_version:
+  - ">=11.2.0.43,<11.4.2"
+
+libcusparse_host_version:
+  - "=11.7.5.86"
+
+libcusparse_run_version:
+  - ">=11.6.0.43,<12.0.0"
+
+# `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all
+# architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the
+# "*_run_*" version specifiers correspond to `11.x` packages.
+
+cuda_profiler_api_host_version:
+  - "=11.8.86"
+
+cuda_profiler_api_run_version:
+  - ">=11.4.240,<12"
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index 339fa76065..b0d6c47ee9 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -4,9 +4,8 @@
 #   conda build . -c conda-forge -c nvidia -c rapidsai
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set cuda_version = '.'.join(environ.get('CUDA', '9.2').split('.')[:2]) %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
 {% set cuda_major = cuda_version.split('.')[0] %}
-{% set ucx_py_version = environ.get('UCX_PY_VERSION') %}
 {% set cuda_spec = ">=" + cuda_major ~ ",<" + (cuda_major | int + 1) ~ ".0a0" %} # i.e. >=11,<12.0a0
 
 package:
@@ -21,18 +20,18 @@ outputs:
     script: build_libraft_headers.sh
     build:
       script_env: &script_env
-        - PARALLEL_LEVEL
-        - VERSION_SUFFIX
-        - PROJECT_FLASH
-        - CMAKE_GENERATOR
+        - AWS_ACCESS_KEY_ID
+        - AWS_SECRET_ACCESS_KEY
         - CMAKE_C_COMPILER_LAUNCHER
-        - CMAKE_CXX_COMPILER_LAUNCHER
         - CMAKE_CUDA_COMPILER_LAUNCHER
+        - CMAKE_CXX_COMPILER_LAUNCHER
+        - CMAKE_GENERATOR
+        - PARALLEL_LEVEL
+        - SCCACHE_BUCKET
+        - SCCACHE_IDLE_TIMEOUT
+        - SCCACHE_REGION
         - SCCACHE_S3_KEY_PREFIX=libraft-aarch64 # [aarch64]
         - SCCACHE_S3_KEY_PREFIX=libraft-linux64 # [linux64]
-        - SCCACHE_BUCKET=rapids-sccache
-        - SCCACHE_REGION=us-west-2
-        - SCCACHE_IDLE_TIMEOUT=32768
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
@@ -42,26 +41,35 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         - {{ compiler('cuda') }} {{ cuda_version }}
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
         - cmake {{ cmake_version }}
+        - ninja
+        - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
-        - cudatoolkit {{ cuda_version }}.*
-        - libcusolver {{ libcusolver_version }}
-        - libcusparse {{ libcusparse_version }}
-        - librmm {{ minor_version }}
-        - nccl {{ nccl_version }}
-        - ucx-proc=*=gpu
-        - ucx-py {{ ucx_py_version }}
+        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
+        - cudatoolkit ={{ cuda_version }}
+        - libcublas {{ libcublas_host_version }}
+        - libcublas-dev {{ libcublas_host_version }}
+        - libcurand {{ libcurand_host_version }}
+        - libcurand-dev {{ libcurand_host_version }}
+        - libcusolver {{ libcusolver_host_version }}
+        - libcusolver-dev {{ libcusolver_host_version }}
+        - libcusparse {{ libcusparse_host_version }}
+        - libcusparse-dev {{ libcusparse_host_version }}
+        - librmm ={{ minor_version }}
       run:
-        - cudatoolkit {{ cuda_spec }}
-        - libcusolver {{ libcusolver_version }}
-        - libcusparse {{ libcusparse_version }}
-        - librmm {{ minor_version }}
-        - nccl {{ nccl_version }}
-        - ucx-proc=*=gpu
-        - ucx-py {{ ucx_py_version }}
+        - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+        - cuda-profiler-api {{ cuda_profiler_api_run_version }}
+        - libcublas {{ libcublas_run_version }}
+        - libcublas-dev {{ libcublas_run_version }}
+        - libcurand {{ libcurand_run_version }}
+        - libcurand-dev {{ libcurand_run_version }}
+        - libcusolver {{ libcusolver_run_version }}
+        - libcusolver-dev {{ libcusolver_run_version }}
+        - libcusparse {{ libcusparse_run_version }}
+        - libcusparse-dev {{ libcusparse_run_version }}
+        - librmm ={{ minor_version }}
     about:
-      home: http://rapids.ai/
+      home: https://rapids.ai/
       license: Apache-2.0
       summary: libraft-headers library
   - name: libraft-distance
@@ -75,29 +83,27 @@ outputs:
         - {{ compiler('cuda') }}
     requirements:
       build:
-        - cmake {{ cmake_version }}
         - {{ compiler('c') }}
-        - {{ compiler('cxx') }}
         - {{ compiler('cuda') }} {{ cuda_version }}
+        - {{ compiler('cxx') }}
+        - cmake {{ cmake_version }}
+        - ninja
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
-        - cudatoolkit {{ cuda_version }}.*
-        - librmm {{ minor_version }}
-        - nccl {{ nccl_version }}
-        - ucx-proc=*=gpu
-        - ucx-py {{ ucx_py_version }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
+        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
+        - libcublas {{ libcublas_host_version }}
+        - libcublas-dev {{ libcublas_host_version }}
+        - libcurand {{ libcurand_host_version }}
+        - libcurand-dev {{ libcurand_host_version }}
+        - libcusolver {{ libcusolver_host_version }}
+        - libcusolver-dev {{ libcusolver_host_version }}
+        - libcusparse {{ libcusparse_host_version }}
+        - libcusparse-dev {{ libcusparse_host_version }}
       run:
-        - cudatoolkit {{ cuda_spec }}
-        - librmm {{ minor_version }}
-        - nccl {{ nccl_version }}
-        - ucx-proc=*=gpu
-        - ucx-py {{ ucx_py_version }}
-        - libcusolver {{ libcusolver_version }}
-        - libcusparse {{ libcusparse_version }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
     about:
-      home: http://rapids.ai/
+      home: https://rapids.ai/
       license: Apache-2.0
       summary: libraft-distance library
   - name: libraft-nn
@@ -111,28 +117,32 @@ outputs:
         - {{ compiler('cuda') }}
     requirements:
       build:
-        - cmake {{ cmake_version }}
         - {{ compiler('c') }}
-        - {{ compiler('cxx') }}
         - {{ compiler('cuda') }} {{ cuda_version }}
+        - {{ compiler('cxx') }}
+        - cmake {{ cmake_version }}
+        - ninja
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
-        - cudatoolkit {{ cuda_version }}.*
+        - {{ pin_subpackage('libraft-headers', exact=True) }}
+        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
         - faiss-proc=*=cuda
         - lapack
+        - libcublas {{ libcublas_host_version }}
+        - libcublas-dev {{ libcublas_host_version }}
+        - libcurand {{ libcurand_host_version }}
+        - libcurand-dev {{ libcurand_host_version }}
+        - libcusolver {{ libcusolver_host_version }}
+        - libcusolver-dev {{ libcusolver_host_version }}
+        - libcusparse {{ libcusparse_host_version }}
+        - libcusparse-dev {{ libcusparse_host_version }}
         - libfaiss {{ libfaiss_version }}
-        - librmm {{ minor_version }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
       run:
-        - cudatoolkit {{ cuda_spec }}
         - faiss-proc=*=cuda
-        - libcusolver {{ libcusolver_version }}
-        - libcusparse {{ libcusparse_version }}
         - libfaiss {{ libfaiss_version }}
-        - librmm {{ minor_version }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
     about:
-      home: http://rapids.ai/
+      home: https://rapids.ai/
       license: Apache-2.0
       summary: libraft-nn library
   - name: libraft-tests
@@ -146,28 +156,34 @@ outputs:
         - {{ compiler('cuda') }}
     requirements:
       build:
-        - cmake {{ cmake_version }}
         - {{ compiler('c') }}
-        - {{ compiler('cxx') }}
         - {{ compiler('cuda') }} {{ cuda_version }}
+        - {{ compiler('cxx') }}
+        - cmake {{ cmake_version }}
+        - ninja
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
-        - cudatoolkit {{ cuda_version }}.*
-        - gmock {{ gtest_version }}
-        - gtest {{ gtest_version }}
         - {{ pin_subpackage('libraft-distance', exact=True) }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
         - {{ pin_subpackage('libraft-nn', exact=True) }}
-      run:
-        - cudatoolkit {{ cuda_spec }}
+        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
         - gmock {{ gtest_version }}
         - gtest {{ gtest_version }}
-        - libcusolver {{ libcusolver_version }}
-        - libcusparse {{ libcusparse_version }}
+        - libcublas {{ libcublas_host_version }}
+        - libcublas-dev {{ libcublas_host_version }}
+        - libcurand {{ libcurand_host_version }}
+        - libcurand-dev {{ libcurand_host_version }}
+        - libcusolver {{ libcusolver_host_version }}
+        - libcusolver-dev {{ libcusolver_host_version }}
+        - libcusparse {{ libcusparse_host_version }}
+        - libcusparse-dev {{ libcusparse_host_version }}
+      run:
         - {{ pin_subpackage('libraft-distance', exact=True) }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
         - {{ pin_subpackage('libraft-nn', exact=True) }}
+        - gmock {{ gtest_version }}
+        - gtest {{ gtest_version }}
     about:
-      home: http://rapids.ai/
+      home: https://rapids.ai/
       license: Apache-2.0
       summary: libraft tests
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 68e2d5952d..6bc091a219 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -3,10 +3,10 @@
 # Usage:
 #   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version =  version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %}
-{% set cuda_major=cuda_version.split('.')[0] %}
-{% set py_version=environ.get('CONDA_PY', 36) %}
+{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set py_version = environ['CONDA_PY'] %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
+{% set cuda_major = cuda_version.split('.')[0] %}
 
 package:
   name: pylibraft
@@ -23,36 +23,38 @@ build:
 
 requirements:
   build:
-    - cmake {{ cmake_version }}
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     - {{ compiler('cuda') }} {{ cuda_version }}
+    - cmake {{ cmake_version }}
+    - ninja
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
+    - cuda-python >=11.7.1,<12.0
+    - cudatoolkit ={{ cuda_version }}
+    - cython >=0.29,<0.30
+    - libraft-distance {{ version }}
+    - libraft-headers {{ version }}
     - python x.x
+    - rmm ={{ minor_version }}
+    - scikit-build >=0.13.1
     - setuptools
-    - cython>=0.29,<0.30
-    - scikit-build>=0.13.1
-    - rmm {{ minor_version }}
-    - libraft-headers {{ version }}
-    - libraft-distance {{ version }}
-    - cudatoolkit {{ cuda_version }}.*
-    - cuda-python >=11.7.1,<12.0
   run:
-    - python x.x
-    - libraft-headers {{ version }}
-    - libraft-distance {{ version }}
-    - cuda-python >=11.7.1,<12.0
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+    - cuda-python >=11.7.1,<12.0
+    - libraft-distance {{ version }}
+    - libraft-headers {{ version }}
+    - python x.x
 
+# TODO: Remove the linux64 tags on tests after disabling gpuCI / Jenkins
 tests:                                 # [linux64]
   requirements:                        # [linux64]
-    - cudatoolkit {{ cuda_version }}.* # [linux64]
+    - cudatoolkit ={{ cuda_version }}  # [linux64]
   imports:                             # [linux64]
     - pylibraft                        # [linux64]
 
 about:
-  home: http://rapids.ai/
+  home: https://rapids.ai/
   license: Apache-2.0
   # license_file: LICENSE
   summary: pylibraft library
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index 3b42dab182..42d7e3a900 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -13,5 +13,8 @@ sysroot_version:
 ucx_version:
   - "1.13.0"
 
+ucx_py_version:
+  - "0.30.*"
+
 cmake_version:
   - ">=3.23.1,!=3.25.0"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index f9a7c58e24..a8bc626eaa 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -3,11 +3,10 @@
 # Usage:
 #   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version =  version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %}
-{% set cuda_major=cuda_version.split('.')[0] %}
-{% set py_version=environ.get('CONDA_PY', 36) %}
-{% set ucx_py_version=environ.get('UCX_PY_VERSION') %}
+{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set py_version = environ['CONDA_PY'] %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
+{% set cuda_major = cuda_version.split('.')[0] %}
 
 package:
   name: raft-dask
@@ -24,47 +23,49 @@ build:
 
 requirements:
   build:
-    - cmake {{ cmake_version }}
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     - {{ compiler('cuda') }} {{ cuda_version }}
+    - cmake {{ cmake_version }}
+    - ninja
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
+    - cuda-python >=11.7.1,<12.0
+    - cudatoolkit ={{ cuda_version }}
+    - cython >=0.29,<0.30
+    - nccl >=2.9.9
+    - pylibraft {{ version }}
     - python x.x
+    - rmm ={{ minor_version }}
+    - scikit-build >=0.13.1
     - setuptools
-    - cython>=0.29,<0.30
-    - scikit-build>=0.13.1
-    - rmm {{ minor_version }}
-    - pylibraft {{ version }}
-    - cudatoolkit {{ cuda_version }}.*
-    - cuda-python >=11.7.1,<12.0
-    - nccl>=2.9.9
     - ucx {{ ucx_version }}
-    - ucx-py {{ ucx_py_version }}
     - ucx-proc=*=gpu
+    - ucx-py {{ ucx_py_version }}
   run:
-    - python x.x
-    - dask-cuda {{ minor_version }}
+    - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+    - cuda-python >=11.7.1,<12.0
+    - dask ==2023.1.1
+    - dask-cuda ={{ minor_version }}
+    - distributed ==2023.1.1
+    - joblib >=0.11
+    - nccl >=2.9.9
     - pylibraft {{ version }}
-    - nccl>=2.9.9
-    - rmm {{ minor_version }}
+    - python x.x
+    - rmm ={{ minor_version }}
     - ucx >={{ ucx_version }}
-    - ucx-py {{ ucx_py_version }}
     - ucx-proc=*=gpu
-    - dask==2022.11.1
-    - distributed==2022.11.1
-    - cuda-python >=11.7.1,<12.0
-    - joblib >=0.11
-    - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+    - ucx-py {{ ucx_py_version }}
 
+# TODO: Remove the linux64 tags on tests after disabling gpuCI / Jenkins
 tests:                                 # [linux64]
   requirements:                        # [linux64]
-    - cudatoolkit {{ cuda_version }}.* # [linux64]
+    - cudatoolkit ={{ cuda_version }}  # [linux64]
   imports:                             # [linux64]
     - raft_dask                        # [linux64]
 
 about:
-  home: http://rapids.ai/
+  home: https://rapids.ai/
   license: Apache-2.0
   # license_file: LICENSE
   summary: raft-dask library
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2fd10fe067..1d54409ae6 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -10,9 +10,8 @@
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
-# =============================================================================
-set(RAPIDS_VERSION "22.12")
-set(RAFT_VERSION "22.12.01")
+set(RAPIDS_VERSION "23.02")
+set(RAFT_VERSION "23.02.00")
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 include(../fetch_rapids.cmake)
@@ -219,6 +218,15 @@ target_link_libraries(
 
 target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 
+# Endian detection
+include(TestBigEndian)
+test_big_endian(BIG_ENDIAN)
+if(BIG_ENDIAN)
+  target_compile_definitions(raft INTERFACE RAFT_SYSTEM_LITTLE_ENDIAN=0)
+else()
+  target_compile_definitions(raft INTERFACE RAFT_SYSTEM_LITTLE_ENDIAN=1)
+endif()
+
 if(RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY)
   file(
     WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
@@ -279,79 +287,98 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance)
 if(RAFT_COMPILE_DIST_LIBRARY)
   add_library(
     raft_distance_lib
-    src/distance/pairwise_distance.cu
-    src/distance/fused_l2_min_arg.cu
-    src/distance/update_centroids_float.cu
-    src/distance/update_centroids_double.cu
-    src/distance/cluster_cost_float.cu
-    src/distance/cluster_cost_double.cu
-    src/distance/specializations/detail/canberra.cu
-    src/distance/specializations/detail/chebyshev.cu
-    src/distance/specializations/detail/correlation.cu
-    src/distance/specializations/detail/cosine.cu
-    src/distance/specializations/detail/cosine.cu
-    src/distance/specializations/detail/hamming_unexpanded.cu
-    src/distance/specializations/detail/hellinger_expanded.cu
-    src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
-    src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
-    src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
-    src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
-    src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
-    src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
-    src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
+    src/distance/distance/pairwise_distance.cu
+    src/distance/distance/fused_l2_min_arg.cu
+    src/distance/cluster/update_centroids_float.cu
+    src/distance/cluster/update_centroids_double.cu
+    src/distance/cluster/cluster_cost_float.cu
+    src/distance/cluster/cluster_cost_double.cu
+    src/distance/neighbors/refine_d_uint64_t_float.cu
+    src/distance/neighbors/refine_d_uint64_t_int8_t.cu
+    src/distance/neighbors/refine_d_uint64_t_uint8_t.cu
+    src/distance/neighbors/refine_h_uint64_t_float.cu
+    src/distance/neighbors/refine_h_uint64_t_int8_t.cu
+    src/distance/neighbors/refine_h_uint64_t_uint8_t.cu
+    src/distance/neighbors/specializations/refine_d_uint64_t_float.cu
+    src/distance/neighbors/specializations/refine_d_uint64_t_int8_t.cu
+    src/distance/neighbors/specializations/refine_d_uint64_t_uint8_t.cu
+    src/distance/neighbors/specializations/refine_h_uint64_t_float.cu
+    src/distance/neighbors/specializations/refine_h_uint64_t_int8_t.cu
+    src/distance/neighbors/specializations/refine_h_uint64_t_uint8_t.cu
+    src/distance/cluster/kmeans_fit_float.cu
+    src/distance/cluster/kmeans_fit_double.cu
+    src/distance/distance/specializations/detail/canberra.cu
+    src/distance/distance/specializations/detail/chebyshev.cu
+    src/distance/distance/specializations/detail/correlation.cu
+    src/distance/distance/specializations/detail/cosine.cu
+    src/distance/distance/specializations/detail/cosine.cu
+    src/distance/distance/specializations/detail/hamming_unexpanded.cu
+    src/distance/distance/specializations/detail/hellinger_expanded.cu
+    src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
+    src/distance/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
+    src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu
+    src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu
+    src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
+    src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
     # These are somehow missing a kernel definition which is causing a compile error.
     # src/distance/specializations/detail/kernels/rbf_kernel_double.cu
     # src/distance/specializations/detail/kernels/rbf_kernel_float.cu
-    src/distance/specializations/detail/kernels/tanh_kernel_double.cu
-    src/distance/specializations/detail/kernels/tanh_kernel_float.cu
-    src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
-    src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
-    src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
-    src/distance/specializations/detail/l1_float_float_float_int.cu
-    src/distance/specializations/detail/l1_float_float_float_uint32.cu
-    src/distance/specializations/detail/l1_double_double_double_int.cu
-    src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
-    src/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu
-    src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu
-    src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu
-    src/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu
-    src/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu
-    src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu
-    src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/russel_rao_double_double_double_int.cu
-    src/distance/specializations/detail/russel_rao_float_float_float_uint32.cu
-    src/distance/specializations/detail/russel_rao_float_float_float_int.cu
-    src/distance/specializations/fused_l2_nn_double_int.cu
-    src/distance/specializations/fused_l2_nn_double_int64.cu
-    src/distance/specializations/fused_l2_nn_float_int.cu
-    src/distance/specializations/fused_l2_nn_float_int64.cu
-    src/nn/specializations/detail/ivfpq_build.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_search.cu
-    src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
-    src/nn/specializations/refine.cu
-    src/random/specializations/rmat_rectangular_generator_int_double.cu
-    src/random/specializations/rmat_rectangular_generator_int64_double.cu
-    src/random/specializations/rmat_rectangular_generator_int_float.cu
-    src/random/specializations/rmat_rectangular_generator_int64_float.cu
+    src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu
+    src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu
+    src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu
+    src/distance/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l1_float_float_float_int.cu
+    src/distance/distance/specializations/detail/l1_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/l1_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
+    src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu
+    src/distance/distance/specializations/detail/russel_rao_float_float_float_uint32.cu
+    src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu
+    src/distance/distance/specializations/fused_l2_nn_double_int.cu
+    src/distance/distance/specializations/fused_l2_nn_double_int64.cu
+    src/distance/distance/specializations/fused_l2_nn_float_int.cu
+    src/distance/distance/specializations/fused_l2_nn_float_int64.cu
+    src/distance/neighbors/ivfpq_build.cu
+    src/distance/neighbors/ivfpq_deserialize.cu
+    src/distance/neighbors/ivfpq_serialize.cu
+    src/distance/neighbors/ivfpq_search_float_uint64_t.cu
+    src/distance/neighbors/ivfpq_search_int8_t_uint64_t.cu
+    src/distance/neighbors/ivfpq_search_uint8_t_uint64_t.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_fast.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_fast.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
+    src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
+    src/distance/neighbors/specializations/detail/ivfpq_search_float_int64_t.cu
+    src/distance/neighbors/specializations/detail/ivfpq_search_float_uint64_t.cu
+    src/distance/neighbors/specializations/detail/ivfpq_search_float_uint32_t.cu
+    src/distance/random/rmat_rectangular_generator_int_double.cu
+    src/distance/random/rmat_rectangular_generator_int64_double.cu
+    src/distance/random/rmat_rectangular_generator_int_float.cu
+    src/distance/random/rmat_rectangular_generator_int64_float.cu
   )
   set_target_properties(
     raft_distance_lib
@@ -404,33 +431,21 @@ set_target_properties(raft_nn PROPERTIES EXPORT_NAME nn)
 if(RAFT_COMPILE_NN_LIBRARY)
   add_library(
     raft_nn_lib
-    src/nn/specializations/ball_cover.cu
     src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
     src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
     src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
     src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
-    src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
-    src/nn/specializations/detail/ivfpq_build.cu
-    src/nn/specializations/detail/ivfpq_search.cu
-    src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
-    src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
-    src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
+    src/nn/specializations/ball_cover_all_knn_query.cu
+    src/nn/specializations/ball_cover_build_index.cu
+    src/nn/specializations/ball_cover_knn_query.cu
     src/nn/specializations/fused_l2_knn_long_float_true.cu
     src/nn/specializations/fused_l2_knn_long_float_false.cu
     src/nn/specializations/fused_l2_knn_int_float_true.cu
     src/nn/specializations/fused_l2_knn_int_float_false.cu
-    src/nn/specializations/knn.cu
+    src/nn/specializations/brute_force_knn_long_float_int.cu
+    src/nn/specializations/brute_force_knn_long_float_uint.cu
+    src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
+    src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu
   )
   set_target_properties(
     raft_nn_lib
@@ -478,10 +493,20 @@ endif()
 
 set_target_properties(raft_distributed PROPERTIES EXPORT_NAME distributed)
 
+rapids_find_generate_module(
+  NCCL
+  HEADER_NAMES nccl.h
+  LIBRARY_NAMES nccl
+  BUILD_EXPORT_SET raft-distributed-exports
+  INSTALL_EXPORT_SET raft-distributed-exports
+)
+
 rapids_export_package(BUILD ucx raft-distributed-exports)
 rapids_export_package(INSTALL ucx raft-distributed-exports)
+rapids_export_package(BUILD NCCL raft-distributed-exports)
+rapids_export_package(INSTALL NCCL raft-distributed-exports)
 
-target_link_libraries(raft_distributed INTERFACE ucx::ucp)
+target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL)
 
 # ##################################################################################################
 # * install targets-----------------------------------------------------------
@@ -518,7 +543,7 @@ if(TARGET raft_distance_lib)
     EXPORT raft-distance-lib-exports
   )
   install(
-    DIRECTORY include/raft_distance
+    DIRECTORY include/raft_runtime
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
     COMPONENT distance
   )
@@ -665,27 +690,17 @@ raft_export(
 # ##################################################################################################
 # * build export -------------------------------------------------------------
 raft_export(
-  BUILD
-  raft
-  EXPORT_SET
-  raft-exports
-  COMPONENTS
-  nn
-  distance
-  distributed
-  GLOBAL_TARGETS
-  raft
-  raft_distance
-  distributed
-  raft_nn
-  DOCUMENTATION
-  doc_string
-  NAMESPACE
-  raft::
-  FINAL_CODE_BLOCK
-  code_string
+  BUILD raft EXPORT_SET raft-exports COMPONENTS nn distance distributed GLOBAL_TARGETS raft
+  distance distributed nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string
 )
 
+# ##################################################################################################
+# * shared test/bench headers ------------------------------------------------
+
+if(BUILD_TESTS OR BUILD_BENCH)
+  include(internal/CMakeLists.txt)
+endif()
+
 # ##################################################################################################
 # * build test executable ----------------------------------------------------
 
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 4e6b6ceb40..b1ffc72ba9 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -30,6 +30,7 @@ function(ConfigureBench)
   target_link_libraries(
     ${BENCH_NAME}
     PRIVATE raft::raft
+            raft_internal
             $<$<BOOL:${ConfigureBench_DIST}>:raft::distance>
             $<$<BOOL:${ConfigureBench_NN}>:raft::nn>
             benchmark::benchmark
@@ -96,12 +97,16 @@ if(BUILD_BENCH)
     bench/linalg/matrix_vector_op.cu
     bench/linalg/norm.cu
     bench/linalg/normalize.cu
+    bench/linalg/reduce_cols_by_key.cu
     bench/linalg/reduce_rows_by_key.cu
     bench/linalg/reduce.cu
     bench/main.cpp
   )
 
-  ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/main.cpp)
+  ConfigureBench(
+    NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu
+    bench/main.cpp
+  )
 
   ConfigureBench(
     NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu
@@ -125,7 +130,6 @@ if(BUILD_BENCH)
     bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu
     bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu
     bench/neighbors/refine.cu
-    bench/neighbors/selection.cu
     bench/main.cpp
     OPTIONAL
     DIST
diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/cluster/kmeans_balanced.cu
index 210b40ced8..9c53e86d8c 100644
--- a/cpp/bench/cluster/kmeans_balanced.cu
+++ b/cpp/bench/cluster/kmeans_balanced.cu
@@ -15,20 +15,19 @@
  */
 
 #include <common/benchmark.hpp>
+#include <raft/cluster/kmeans_balanced.cuh>
 #include <raft/random/rng.cuh>
-#include <raft/spatial/knn/detail/ann_kmeans_balanced.cuh>
 
-#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED
-#include <raft/cluster/specializations.cuh>
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
 #endif
 
 namespace raft::bench::cluster {
 
 struct KMeansBalancedBenchParams {
   DatasetParams data;
-  uint32_t max_iter;
   uint32_t n_lists;
-  raft::distance::DistanceType metric;
+  raft::cluster::kmeans_balanced_params kb_params;
 };
 
 template <typename T, typename IndexT = int>
@@ -38,15 +37,10 @@ struct KMeansBalanced : public fixture {
   void run_benchmark(::benchmark::State& state) override
   {
     this->loop_on_state(state, [this]() {
-      raft::spatial::knn::detail::kmeans::build_hierarchical<T>(this->handle,
-                                                                this->params.max_iter,
-                                                                (uint32_t)this->params.data.cols,
-                                                                this->X.data_handle(),
-                                                                this->params.data.rows,
-                                                                this->centroids.data_handle(),
-                                                                this->params.n_lists,
-                                                                this->params.metric,
-                                                                this->handle.get_stream());
+      raft::device_matrix_view<const T, IndexT> X_view   = this->X.view();
+      raft::device_matrix_view<T, IndexT> centroids_view = this->centroids.view();
+      raft::cluster::kmeans_balanced::fit(
+        this->handle, this->params.kb_params, X_view, centroids_view);
     });
   }
 
@@ -84,8 +78,8 @@ std::vector<KMeansBalancedBenchParams> getKMeansBalancedInputs()
   std::vector<KMeansBalancedBenchParams> out;
   KMeansBalancedBenchParams p;
   p.data.row_major                          = true;
-  p.max_iter                                = 20;
-  p.metric                                  = raft::distance::DistanceType::L2Expanded;
+  p.kb_params.n_iters                       = 20;
+  p.kb_params.metric                        = raft::distance::DistanceType::L2Expanded;
   std::vector<std::pair<int, int>> row_cols = {
     {100000, 128}, {1000000, 128}, {10000000, 128},
     // The following dataset sizes are too large for most GPUs.
@@ -104,7 +98,5 @@ std::vector<KMeansBalancedBenchParams> getKMeansBalancedInputs()
 
 // Note: the datasets sizes are too large for 32-bit index types.
 RAFT_BENCH_REGISTER((KMeansBalanced<float, int64_t>), "", getKMeansBalancedInputs());
-RAFT_BENCH_REGISTER((KMeansBalanced<int8_t, int64_t>), "", getKMeansBalancedInputs());
-RAFT_BENCH_REGISTER((KMeansBalanced<uint8_t, int64_t>), "", getKMeansBalancedInputs());
 
 }  // namespace raft::bench::cluster
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index 13ca40a033..85d5381e2c 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/cudart_utils.h>
 #include <raft/interruptible.hpp>
 #include <raft/random/make_blobs.cuh>
@@ -110,7 +110,7 @@ class fixture {
   rmm::device_buffer scratch_buf_;
 
  public:
-  raft::handle_t handle;
+  raft::device_resources handle;
   rmm::cuda_stream_view stream;
 
   fixture() : stream{handle.get_stream()}
diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh
index 73faacce37..7ddecd7579 100644
--- a/cpp/bench/distance/distance_common.cuh
+++ b/cpp/bench/distance/distance_common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/distance/kernels.cu
index 5c9c2cc2ed..027f93171e 100644
--- a/cpp/bench/distance/kernels.cu
+++ b/cpp/bench/distance/kernels.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 
 #include <common/benchmark.hpp>
 #include <memory>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/kernels.cuh>
 #include <raft/random/rng.cuh>
@@ -77,7 +77,7 @@ struct GramMatrix : public fixture {
   }
 
  private:
-  const raft::handle_t handle;
+  const raft::device_resources handle;
   std::unique_ptr<GramMatrixBase<T>> kernel;
   GramTestParams params;
 
diff --git a/cpp/bench/linalg/norm.cu b/cpp/bench/linalg/norm.cu
index cce4195cf1..efecee88c9 100644
--- a/cpp/bench/linalg/norm.cu
+++ b/cpp/bench/linalg/norm.cu
@@ -18,6 +18,7 @@
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -60,7 +61,7 @@ struct rowNorm : public fixture {
                          output_view,
                          raft::linalg::L2Norm,
                          raft::linalg::Apply::ALONG_ROWS,
-                         raft::SqrtOp<T>());
+                         raft::sqrt_op());
     });
   }
 
diff --git a/cpp/bench/linalg/reduce_cols_by_key.cu b/cpp/bench/linalg/reduce_cols_by_key.cu
new file mode 100644
index 0000000000..43aeb69ab0
--- /dev/null
+++ b/cpp/bench/linalg/reduce_cols_by_key.cu
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/reduce_cols_by_key.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cstdint>
+#include <ostream>
+#include <sstream>
+#include <vector>
+
+namespace raft::bench::linalg {
+
+/** Problem size of one benchmark case; the fixture buffers are sized from these three values. */
+template <typename IdxT>
+struct rcbk_params {
+  IdxT rows, cols;
+  IdxT keys;
+};
+
+/** Streams the parameters as "rows#cols#keys"; used to build the benchmark label. */
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const rcbk_params<IdxT>& p) -> std::ostream&
+{
+  os << p.rows << "#" << p.cols << "#" << p.keys;
+  return os;
+}
+
+/**
+ * Benchmark fixture that times raft::linalg::reduce_cols_by_key.
+ *
+ * @tparam T    element type of the input and output buffers
+ * @tparam KeyT element type of the key buffer
+ * @tparam IdxT integer type of the problem dimensions
+ */
+template <typename T, typename KeyT, typename IdxT>
+struct reduce_cols_by_key : public fixture {
+  reduce_cols_by_key(const rcbk_params<IdxT>& p)
+    : params(p), in(p.rows * p.cols, stream), out(p.rows * p.keys, stream), keys(p.cols, stream)
+  {
+    raft::random::RngState rng{42};
+    // One key per input column; generated first so the key sequence for seed 42 is unchanged.
+    raft::random::uniformInt(rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys, stream);
+    // Fill the input so the timed kernel never reads uninitialized device memory.
+    raft::random::uniform(rng, in.data(), p.rows * p.cols, T(-1), T(1), stream);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Label the run with its problem size.
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    loop_on_state(state, [this]() {
+      // NOTE(review): the trailing `false` is presumably the reset-sums flag; confirm against
+      // reduce_cols_by_key.cuh.
+      raft::linalg::reduce_cols_by_key(
+        in.data(), keys.data(), out.data(), params.rows, params.cols, params.keys, stream, false);
+    });
+  }
+
+ protected:
+  rcbk_params<IdxT> params;
+  rmm::device_uvector<T> in, out;
+  rmm::device_uvector<KeyT> keys;
+};  // struct reduce_cols_by_key
+
+// Cartesian product of the value lists below (presumably rows x cols x keys, in field order).
+const std::vector<rcbk_params<int>> rcbk_inputs_i32 =
+  raft::util::itertools::product<rcbk_params<int>>(
+    {1, 10, 100, 1000}, {1000, 10000, 100000}, {8, 32, 128, 512, 2048});
+const std::vector<rcbk_params<int64_t>> rcbk_inputs_i64 =
+  raft::util::itertools::product<rcbk_params<int64_t>>(
+    {1, 10, 100, 1000}, {1000, 10000, 100000}, {8, 32, 128, 512, 2048});
+
+RAFT_BENCH_REGISTER((reduce_cols_by_key<float, uint32_t, int>), "", rcbk_inputs_i32);
+RAFT_BENCH_REGISTER((reduce_cols_by_key<double, uint32_t, int>), "", rcbk_inputs_i32);
+RAFT_BENCH_REGISTER((reduce_cols_by_key<float, uint32_t, int64_t>), "", rcbk_inputs_i64);
+RAFT_BENCH_REGISTER((reduce_cols_by_key<double, uint32_t, int64_t>), "", rcbk_inputs_i64);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/matrix/argmin.cu
index 0d0dea0fdb..3869f0c5e1 100644
--- a/cpp/bench/matrix/argmin.cu
+++ b/cpp/bench/matrix/argmin.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,10 +17,11 @@
 #include <common/benchmark.hpp>
 #include <raft/matrix/argmin.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
 
-namespace raft::bench::linalg {
+namespace raft::bench::matrix {
 
 template <typename IdxT>
 struct ArgminParams {
@@ -45,9 +46,7 @@ struct Argmin : public fixture {
   void run_benchmark(::benchmark::State& state) override
   {
     loop_on_state(state, [this]() {
-      auto matrix_const_view = raft::make_device_matrix_view<const T, IdxT, row_major>(
-        matrix.data_handle(), matrix.extent(0), matrix.extent(1));
-      raft::matrix::argmin(handle, matrix_const_view, indices.view());
+      raft::matrix::argmin(handle, raft::make_const_mdspan(matrix.view()), indices.view());
     });
   }
 
@@ -57,15 +56,11 @@ struct Argmin : public fixture {
   raft::device_vector<OutT, IdxT> indices;
 };  // struct Argmin
 
-const std::vector<ArgminParams<int64_t>> argmin_inputs_i64{
-  {1000, 64},     {1000, 128},     {1000, 256},     {1000, 512},     {1000, 1024},
-  {10000, 64},    {10000, 128},    {10000, 256},    {10000, 512},    {10000, 1024},
-  {100000, 64},   {100000, 128},   {100000, 256},   {100000, 512},   {100000, 1024},
-  {1000000, 64},  {1000000, 128},  {1000000, 256},  {1000000, 512},  {1000000, 1024},
-  {10000000, 64}, {10000000, 128}, {10000000, 256}, {10000000, 512}, {10000000, 1024},
-};
+const std::vector<ArgminParams<int64_t>> argmin_inputs_i64 =
+  raft::util::itertools::product<ArgminParams<int64_t>>({1000, 10000, 100000, 1000000, 10000000},
+                                                        {64, 128, 256, 512, 1024});
 
 RAFT_BENCH_REGISTER((Argmin<float, uint32_t, int64_t>), "", argmin_inputs_i64);
 RAFT_BENCH_REGISTER((Argmin<double, uint32_t, int64_t>), "", argmin_inputs_i64);
 
-}  // namespace raft::bench::linalg
+}  // namespace raft::bench::matrix
diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/matrix/gather.cu
new file mode 100644
index 0000000000..c5d80744cd
--- /dev/null
+++ b/cpp/bench/matrix/gather.cu
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cstdint>
+#include <ostream>
+#include <sstream>
+#include <vector>
+
+namespace raft::bench::matrix {
+
+/** Problem size of one benchmark case: source matrix shape and number of gathered rows. */
+template <typename IdxT>
+struct GatherParams {
+  IdxT rows, cols, map_length;
+};
+
+/** Streams the parameters as "rows#cols#map_length"; used to build the benchmark label. */
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const GatherParams<IdxT>& p) -> std::ostream&
+{
+  os << p.rows << "#" << p.cols << "#" << p.map_length;
+  return os;
+}
+
+/**
+ * Benchmark fixture that times raft::matrix::gather, or raft::matrix::gather_if when
+ * `Conditional` is true.
+ *
+ * @tparam T           element type of the matrices (and of the stencil)
+ * @tparam MapT        element type of the index map
+ * @tparam IdxT        integer type of the extents
+ * @tparam Conditional selects the gather_if code path
+ */
+template <typename T, typename MapT, typename IdxT, bool Conditional = false>
+struct Gather : public fixture {
+  Gather(const GatherParams<IdxT>& p) : params(p) {}
+
+  void allocate_data(const ::benchmark::State& state) override
+  {
+    matrix  = raft::make_device_matrix<T, IdxT>(handle, params.rows, params.cols);
+    map     = raft::make_device_vector<MapT, IdxT>(handle, params.map_length);
+    out     = raft::make_device_matrix<T, IdxT>(handle, params.map_length, params.cols);
+    // The stencil is only read by gather_if, so it stays empty for the unconditional benchmark.
+    stencil = raft::make_device_vector<T, IdxT>(handle, Conditional ? params.map_length : IdxT(0));
+
+    // Use the handle-based RNG overloads throughout; the handle's stream is the fixture stream.
+    raft::random::RngState rng{1234};
+    raft::random::uniform(
+      handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1));
+    raft::random::uniformInt(
+      handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows);
+    if constexpr (Conditional) {
+      raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1));
+    }
+    handle.sync_stream(stream);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Label the run with its problem size.
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    loop_on_state(state, [this]() {
+      auto matrix_const_view = raft::make_const_mdspan(matrix.view());
+      auto map_const_view    = raft::make_const_mdspan(map.view());
+      if constexpr (Conditional) {
+        auto stencil_const_view = raft::make_const_mdspan(stencil.view());
+        // NOTE(review): presumably builds a predicate testing each stencil value against 0;
+        // confirm the operand order in raft/core/operators.hpp.
+        auto pred_op            = raft::plug_const_op(T(0.0), raft::greater_op());
+        raft::matrix::gather_if(
+          handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op);
+      } else {
+        raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+      }
+    });
+  }
+
+ private:
+  GatherParams<IdxT> params;
+  raft::device_matrix<T, IdxT> matrix, out;
+  raft::device_vector<T, IdxT> stencil;
+  raft::device_vector<MapT, IdxT> map;
+};  // struct Gather
+
+/** Shorthand for the fixture with the gather_if code path enabled. */
+template <typename T, typename MapT, typename IdxT>
+using GatherIf = Gather<T, MapT, IdxT, true>;
+
+const std::vector<GatherParams<int64_t>> gather_inputs_i64 =
+  raft::util::itertools::product<GatherParams<int64_t>>(
+    {1000000}, {10, 20, 50, 100, 200, 500}, {1000, 10000, 100000, 1000000});
+
+RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "", gather_inputs_i64);
+RAFT_BENCH_REGISTER((Gather<double, uint32_t, int64_t>), "", gather_inputs_i64);
+RAFT_BENCH_REGISTER((GatherIf<float, uint32_t, int64_t>), "", gather_inputs_i64);
+RAFT_BENCH_REGISTER((GatherIf<double, uint32_t, int64_t>), "", gather_inputs_i64);
+}  // namespace raft::bench::matrix
diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/matrix/select_k.cu
new file mode 100644
index 0000000000..2c8b8bb67b
--- /dev/null
+++ b/cpp/bench/matrix/select_k.cu
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft_internal/matrix/select_k.cuh>
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cudart_utils.hpp>
+
+#include <raft/matrix/detail/select_radix.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/matrix/select_k.cuh>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cstddef>
+#include <sstream>
+#include <vector>
+
+namespace raft::matrix {
+
+using namespace raft::bench;  // NOLINT
+
+/**
+ * Benchmark fixture that times one select-k implementation via select::select_k_impl.
+ *
+ * @tparam KeyT type of the values being selected
+ * @tparam IdxT type of the indices accompanying the values
+ * @tparam Algo implementation forwarded to select::select_k_impl
+ */
+template <typename KeyT, typename IdxT, select::Algo Algo>
+struct selection : public fixture {
+  explicit selection(const select::params& p)
+    : params_(p),
+      in_dists_(p.batch_size * p.len, stream),
+      out_dists_(p.batch_size * p.k, stream),
+      in_ids_(p.batch_size * p.len, stream),
+      out_ids_(p.batch_size * p.k, stream)
+  {
+    // NOTE(review): presumably writes 0..len-1 into each of the batch_size rows; confirm in
+    // raft/sparse/detail/utils.h.
+    raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream);
+    raft::random::RngState state{42};
+    raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0));
+  }
+
+  void run_benchmark(::benchmark::State& state) override  // NOLINT
+  {
+    // This local handle shadows fixture::handle; it is the one captured by the lambda below.
+    device_resources handle{stream};
+    using_pool_memory_res res;
+    try {
+      std::ostringstream label_stream;
+      label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k;
+      state.SetLabel(label_stream.str());
+      loop_on_state(state, [this, &handle]() {
+        select::select_k_impl<KeyT, IdxT>(handle,
+                                          Algo,
+                                          in_dists_.data(),
+                                          in_ids_.data(),
+                                          params_.batch_size,
+                                          params_.len,
+                                          params_.k,
+                                          out_dists_.data(),
+                                          out_ids_.data(),
+                                          params_.select_min);
+      });
+    } catch (const raft::exception& e) {
+      // Report the failure on this benchmark case instead of aborting the whole run.
+      state.SkipWithError(e.what());
+    }
+  }
+
+ private:
+  const select::params params_;
+  rmm::device_uvector<KeyT> in_dists_, out_dists_;
+  rmm::device_uvector<IdxT> in_ids_, out_ids_;
+};
+
+// NOTE(review): each entry is presumably {batch_size, len, k, select_min}; confirm against the
+// select::params definition in raft_internal/matrix/select_k.cuh.
+const std::vector<select::params> kInputs{
+  {20000, 500, 1, true},   {20000, 500, 2, true},    {20000, 500, 4, true},
+  {20000, 500, 8, true},   {20000, 500, 16, true},   {20000, 500, 32, true},
+  {20000, 500, 64, true},  {20000, 500, 128, true},  {20000, 500, 256, true},
+
+  {1000, 10000, 1, true},  {1000, 10000, 2, true},   {1000, 10000, 4, true},
+  {1000, 10000, 8, true},  {1000, 10000, 16, true},  {1000, 10000, 32, true},
+  {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true},
+
+  {100, 100000, 1, true},  {100, 100000, 2, true},   {100, 100000, 4, true},
+  {100, 100000, 8, true},  {100, 100000, 16, true},  {100, 100000, 32, true},
+  {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true},
+
+  {10, 1000000, 1, true},  {10, 1000000, 2, true},   {10, 1000000, 4, true},
+  {10, 1000000, 8, true},  {10, 1000000, 16, true},  {10, 1000000, 32, true},
+  {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true},
+};
+
+// Registers the fixture for one (KeyT, IdxT, algorithm) combination over kInputs. The alias lives
+// in a namespace named by BENCHMARK_PRIVATE_NAME, presumably so repeated uses do not collide.
+#define SELECTION_REGISTER(KeyT, IdxT, A)                          \
+  namespace BENCHMARK_PRIVATE_NAME(selection)                      \
+  {                                                                \
+    using SelectK = selection<KeyT, IdxT, select::Algo::A>;        \
+    RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \
+  }
+
+SELECTION_REGISTER(float, int, kPublicApi);           // NOLINT
+SELECTION_REGISTER(float, int, kRadix8bits);          // NOLINT
+SELECTION_REGISTER(float, int, kRadix11bits);         // NOLINT
+SELECTION_REGISTER(float, int, kWarpAuto);            // NOLINT
+SELECTION_REGISTER(float, int, kWarpImmediate);       // NOLINT
+SELECTION_REGISTER(float, int, kWarpFiltered);        // NOLINT
+SELECTION_REGISTER(float, int, kWarpDistributed);     // NOLINT
+SELECTION_REGISTER(float, int, kWarpDistributedShm);  // NOLINT
+
+SELECTION_REGISTER(double, int, kRadix8bits);   // NOLINT
+SELECTION_REGISTER(double, int, kRadix11bits);  // NOLINT
+SELECTION_REGISTER(double, int, kWarpAuto);     // NOLINT
+
+SELECTION_REGISTER(double, size_t, kRadix8bits);          // NOLINT
+SELECTION_REGISTER(double, size_t, kRadix11bits);         // NOLINT
+SELECTION_REGISTER(double, size_t, kWarpImmediate);       // NOLINT
+SELECTION_REGISTER(double, size_t, kWarpFiltered);        // NOLINT
+SELECTION_REGISTER(double, size_t, kWarpDistributed);     // NOLINT
+SELECTION_REGISTER(double, size_t, kWarpDistributedShm);  // NOLINT
+
+}  // namespace raft::matrix
diff --git a/cpp/bench/neighbors/knn.cuh b/cpp/bench/neighbors/knn.cuh
index d38631b289..633ea33670 100644
--- a/cpp/bench/neighbors/knn.cuh
+++ b/cpp/bench/neighbors/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 #include <raft/spatial/knn/specializations.cuh>
 #if defined RAFT_DISTANCE_COMPILED
 #include <raft/cluster/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
 #endif
 #endif
 
@@ -148,7 +149,7 @@ struct ivf_flat_knn {
   raft::neighbors::ivf_flat::search_params search_params;
   params ps;
 
-  ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  ivf_flat_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps)
   {
     index_params.n_lists = 4096;
     index_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -156,7 +157,7 @@ struct ivf_flat_knn {
       handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
   }
 
-  void search(const raft::handle_t& handle,
+  void search(const raft::device_resources& handle,
               const ValT* search_items,
               dist_t* out_dists,
               IdxT* out_idxs)
@@ -176,7 +177,7 @@ struct ivf_pq_knn {
   raft::neighbors::ivf_pq::search_params search_params;
   params ps;
 
-  ivf_pq_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
+  ivf_pq_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps)
   {
     index_params.n_lists = 4096;
     index_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -184,7 +185,7 @@ struct ivf_pq_knn {
       handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
   }
 
-  void search(const raft::handle_t& handle,
+  void search(const raft::device_resources& handle,
               const ValT* search_items,
               dist_t* out_dists,
               IdxT* out_idxs)
@@ -202,12 +203,12 @@ struct brute_force_knn {
   ValT* index;
   params ps;
 
-  brute_force_knn(const raft::handle_t& handle, const params& ps, const ValT* data)
+  brute_force_knn(const raft::device_resources& handle, const params& ps, const ValT* data)
     : index(const_cast<ValT*>(data)), ps(ps)
   {
   }
 
-  void search(const raft::handle_t& handle,
+  void search(const raft::device_resources& handle,
               const ValT* search_items,
               dist_t* out_dists,
               IdxT* out_idxs)
@@ -287,7 +288,7 @@ struct knn : public fixture {
       std::ostringstream label_stream;
       label_stream << params_ << "#" << strategy_ << "#" << scope_;
       state.SetLabel(label_stream.str());
-      raft::handle_t handle(stream);
+      raft::device_resources handle(stream);
       std::optional<ImplT> index;
 
       if (scope_ == Scope::SEARCH) {  // also implies TransferStrategy::NO_COPY
diff --git a/cpp/bench/neighbors/refine.cu b/cpp/bench/neighbors/refine.cu
index a038905ace..255004361c 100644
--- a/cpp/bench/neighbors/refine.cu
+++ b/cpp/bench/neighbors/refine.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,15 +14,16 @@
  * limitations under the License.
  */
 
-#include <common/benchmark.hpp>
+#include <raft_internal/neighbors/refine_helper.cuh>
 
-#include <raft/random/rng.cuh>
+#include <common/benchmark.hpp>
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/detail/refine.cuh>
 #include <raft/neighbors/refine.cuh>
+#include <raft/random/rng.cuh>
 
 #if defined RAFT_DISTANCE_COMPILED
 #include <raft/distance/specializations.cuh>
@@ -36,12 +37,10 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-#include "../../test/neighbors/refine_helper.cuh"
-
 #include <iostream>
 #include <sstream>
 
-using namespace raft::neighbors::detail;
+using namespace raft::neighbors;
 
 namespace raft::bench::neighbors {
 
@@ -53,7 +52,7 @@ inline auto operator<<(std::ostream& os, const RefineInputs<IdxT>& p) -> std::os
   return os;
 }
 
-RefineInputs<int64_t> p;
+RefineInputs<uint64_t> p;
 
 template <typename DataT, typename DistanceT, typename IdxT>
 class RefineAnn : public fixture {
@@ -95,28 +94,28 @@ class RefineAnn : public fixture {
   }
 
  private:
-  raft::handle_t handle_;
+  raft::device_resources handle_;
   RefineHelper<DataT, DistanceT, IdxT> data;
 };
 
-std::vector<RefineInputs<int64_t>> getInputs()
+std::vector<RefineInputs<uint64_t>> getInputs()
 {
-  std::vector<RefineInputs<int64_t>> out;
+  std::vector<RefineInputs<uint64_t>> out;
   raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded;
   for (bool host_data : {true, false}) {
-    for (int64_t n_queries : {1000, 10000}) {
-      for (int64_t dim : {128, 512}) {
-        out.push_back(RefineInputs<int64_t>{n_queries, 2000000, dim, 32, 128, metric, host_data});
-        out.push_back(RefineInputs<int64_t>{n_queries, 2000000, dim, 10, 40, metric, host_data});
+    for (uint64_t n_queries : {1000, 10000}) {
+      for (uint64_t dim : {128, 512}) {
+        out.push_back(RefineInputs<uint64_t>{n_queries, 2000000, dim, 32, 128, metric, host_data});
+        out.push_back(RefineInputs<uint64_t>{n_queries, 2000000, dim, 10, 40, metric, host_data});
       }
     }
   }
   return out;
 }
 
-using refine_float_int64 = RefineAnn<float, float, int64_t>;
+using refine_float_int64 = RefineAnn<float, float, uint64_t>;
 RAFT_BENCH_REGISTER(refine_float_int64, "", getInputs());
 
-using refine_uint8_int64 = RefineAnn<uint8_t, float, int64_t>;
+using refine_uint8_int64 = RefineAnn<uint8_t, float, uint64_t>;
 RAFT_BENCH_REGISTER(refine_uint8_int64, "", getInputs());
 }  // namespace raft::bench::neighbors
diff --git a/cpp/bench/neighbors/selection.cu b/cpp/bench/neighbors/selection.cu
deleted file mode 100644
index 1f116c199f..0000000000
--- a/cpp/bench/neighbors/selection.cu
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-#include <raft/spatial/knn/knn.cuh>
-
-#if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.cuh>
-#endif
-
-#include <raft/random/rng.cuh>
-#include <raft/sparse/detail/utils.h>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-namespace raft::bench::spatial {
-
-struct params {
-  int n_inputs;
-  int input_len;
-  int k;
-  int select_min;
-};
-
-template <typename KeyT, typename IdxT, raft::spatial::knn::SelectKAlgo Algo>
-struct selection : public fixture {
-  explicit selection(const params& p)
-    : params_(p),
-      in_dists_(p.n_inputs * p.input_len, stream),
-      in_ids_(p.n_inputs * p.input_len, stream),
-      out_dists_(p.n_inputs * p.k, stream),
-      out_ids_(p.n_inputs * p.k, stream)
-  {
-    raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream);
-    raft::random::RngState state{42};
-    raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0));
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    using_pool_memory_res res;
-    try {
-      std::ostringstream label_stream;
-      label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k;
-      state.SetLabel(label_stream.str());
-      loop_on_state(state, [this]() {
-        raft::spatial::knn::select_k<IdxT, KeyT>(in_dists_.data(),
-                                                 in_ids_.data(),
-                                                 params_.n_inputs,
-                                                 params_.input_len,
-                                                 out_dists_.data(),
-                                                 out_ids_.data(),
-                                                 params_.select_min,
-                                                 params_.k,
-                                                 stream,
-                                                 Algo);
-      });
-    } catch (raft::exception& e) {
-      state.SkipWithError(e.what());
-    }
-  }
-
- private:
-  const params params_;
-  rmm::device_uvector<KeyT> in_dists_, out_dists_;
-  rmm::device_uvector<IdxT> in_ids_, out_ids_;
-};
-
-const std::vector<params> kInputs{
-  {20000, 500, 1, true},   {20000, 500, 2, true},    {20000, 500, 4, true},
-  {20000, 500, 8, true},   {20000, 500, 16, true},   {20000, 500, 32, true},
-  {20000, 500, 64, true},  {20000, 500, 128, true},  {20000, 500, 256, true},
-
-  {1000, 10000, 1, true},  {1000, 10000, 2, true},   {1000, 10000, 4, true},
-  {1000, 10000, 8, true},  {1000, 10000, 16, true},  {1000, 10000, 32, true},
-  {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true},
-
-  {100, 100000, 1, true},  {100, 100000, 2, true},   {100, 100000, 4, true},
-  {100, 100000, 8, true},  {100, 100000, 16, true},  {100, 100000, 32, true},
-  {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true},
-
-  {10, 1000000, 1, true},  {10, 1000000, 2, true},   {10, 1000000, 4, true},
-  {10, 1000000, 8, true},  {10, 1000000, 16, true},  {10, 1000000, 32, true},
-  {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true},
-};
-
-#define SELECTION_REGISTER(KeyT, IdxT, Algo)                                      \
-  namespace BENCHMARK_PRIVATE_NAME(selection)                                     \
-  {                                                                               \
-    using SelectK = selection<KeyT, IdxT, raft::spatial::knn::SelectKAlgo::Algo>; \
-    RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs);             \
-  }
-
-SELECTION_REGISTER(float, int, FAISS);
-SELECTION_REGISTER(float, int, RADIX_8_BITS);
-SELECTION_REGISTER(float, int, RADIX_11_BITS);
-SELECTION_REGISTER(float, int, WARP_SORT);
-
-SELECTION_REGISTER(double, int, FAISS);
-SELECTION_REGISTER(double, int, RADIX_8_BITS);
-SELECTION_REGISTER(double, int, RADIX_11_BITS);
-SELECTION_REGISTER(double, int, WARP_SORT);
-
-SELECTION_REGISTER(double, size_t, FAISS);
-SELECTION_REGISTER(double, size_t, RADIX_8_BITS);
-SELECTION_REGISTER(double, size_t, RADIX_11_BITS);
-SELECTION_REGISTER(double, size_t, WARP_SORT);
-
-}  // namespace raft::bench::spatial
diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu
index fdd4ef61d2..950d80c499 100644
--- a/cpp/bench/random/make_blobs.cu
+++ b/cpp/bench/random/make_blobs.cu
@@ -25,6 +25,12 @@ struct make_blobs_inputs {
   bool row_major;
 };  // struct make_blobs_inputs
 
+inline auto operator<<(std::ostream& os, const make_blobs_inputs& p) -> std::ostream&
+{
+  // Emits the fields as rows#cols#clusters#row_major and hands the stream back for chaining.
+  return os << p.rows << "#" << p.cols << "#" << p.clusters << "#" << p.row_major;
+}
+
 template <typename T>
 struct make_blobs : public fixture {
   make_blobs(const make_blobs_inputs& p)
@@ -34,6 +40,10 @@ struct make_blobs : public fixture {
 
   void run_benchmark(::benchmark::State& state) override
   {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
     loop_on_state(state, [this]() {
       raft::random::make_blobs(data.data(),
                                labels.data(),
diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu
index 5364bb44e3..cb9e21868b 100644
--- a/cpp/bench/random/permute.cu
+++ b/cpp/bench/random/permute.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ struct permute : public fixture {
   }
 
  private:
-  raft::handle_t handle;
+  raft::device_resources handle;
   permute_inputs params;
   rmm::device_uvector<T> out, in;
   rmm::device_uvector<int> perms;
diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu
index 830fab13cc..c9dcae6985 100644
--- a/cpp/bench/sparse/convert_csr.cu
+++ b/cpp/bench/sparse/convert_csr.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -107,7 +107,7 @@ struct bench_base : public fixture {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   bench_param<index_t> params;
   rmm::device_uvector<bool> adj;
   rmm::device_uvector<index_t> row_ind;
diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake
index 811a5466c3..3e02ce064e 100644
--- a/cpp/cmake/thirdparty/get_cutlass.cmake
+++ b/cpp/cmake/thirdparty/get_cutlass.cmake
@@ -30,6 +30,10 @@ function(find_and_configure_cutlass)
       CACHE BOOL "Disable CUTLASS to build with cuBLAS library."
   )
 
+  if (CUDA_STATIC_RUNTIME)
+    set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE)
+  endif()
+
   rapids_cpm_find(
     NvidiaCutlass ${PKG_VERSION}
     GLOBAL_TARGETS nvidia::cutlass::cutlass
diff --git a/cpp/include/raft/cluster/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh
index 618f852bba..f4b2ecf051 100644
--- a/cpp/include/raft/cluster/detail/agglomerative.cuh
+++ b/cpp/include/raft/cluster/detail/agglomerative.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
@@ -100,7 +100,7 @@ class UnionFind {
  * @param[out] out_size cluster sizes of output
  */
 template <typename value_idx, typename value_t>
-void build_dendrogram_host(const handle_t& handle,
+void build_dendrogram_host(raft::device_resources const& handle,
                            const value_idx* rows,
                            const value_idx* cols,
                            const value_t* data,
@@ -236,7 +236,7 @@ struct init_label_roots {
  * @param n_leaves
  */
 template <typename value_idx, int tpb = 256>
-void extract_flattened_clusters(const raft::handle_t& handle,
+void extract_flattened_clusters(raft::device_resources const& handle,
                                 value_idx* labels,
                                 const value_idx* children,
                                 size_t n_clusters,
diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh
index a07045f0d2..163670f29a 100644
--- a/cpp/include/raft/cluster/detail/connectivities.cuh
+++ b/cpp/include/raft/cluster/detail/connectivities.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
@@ -24,6 +24,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <raft/cluster/single_linkage_types.hpp>
+#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
@@ -39,7 +40,7 @@ namespace raft::cluster::detail {
 
 template <raft::cluster::LinkageDistance dist_type, typename value_idx, typename value_t>
 struct distance_graph_impl {
-  void run(const raft::handle_t& handle,
+  void run(raft::device_resources const& handle,
            const value_t* X,
            size_t m,
            size_t n,
@@ -57,7 +58,7 @@ struct distance_graph_impl {
  */
 template <typename value_idx, typename value_t>
 struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx, value_t> {
-  void run(const raft::handle_t& handle,
+  void run(raft::device_resources const& handle,
            const value_t* X,
            size_t m,
            size_t n,
@@ -103,6 +104,98 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
   }
 };
 
+template <typename value_idx>
+__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz)
+{
+  // Flat index in 64 bits: the 32-bit blockIdx.x * blockDim.x product can wrap for large nnz.
+  size_t tid = (static_cast<size_t>(blockIdx.x) * blockDim.x) + threadIdx.x;
+  if (tid >= nnz) return;  // threads past the end of the array have nothing to write
+  indices[tid] = static_cast<value_idx>(tid % m);  // entries cycle through 0..m-1
+}
+
+/**
+ * Compute connected CSR of pairwise distances: a dense m x m distance matrix stored as CSR
+ * @tparam value_idx integer type of the CSR offsets and column indices
+ * @tparam value_t type of the input values and of the computed distances
+ * @param handle supplies the CUDA stream and thrust execution policy used for all work here
+ * @param X input passed as both operands of the distance computation (presumably m rows x n cols)
+ * @param m number of rows in X; the graph has m * m entries
+ * @param n forwarded to pairwise_distance as its last size argument (presumably columns of X)
+ * @param metric distance metric forwarded to pairwise_distance
+ * @param[out] indptr CSR row offsets; m + 1 entries are written (0, m, 2m, ..., m * m)
+ * @param[out] indices CSR column ids; m * m entries are written, each holding (position % m)
+ * @param[out] data CSR values; m * m distances, with self-loop entries set to the value_t maximum
+ */
+template <typename value_idx, typename value_t>
+void pairwise_distances(raft::device_resources const& handle,
+                        const value_t* X,
+                        size_t m,
+                        size_t n,
+                        raft::distance::DistanceType metric,
+                        value_idx* indptr,
+                        value_idx* indices,
+                        value_t* data)
+{
+  auto stream      = handle.get_stream();
+  auto exec_policy = handle.get_thrust_policy();
+
+  value_idx nnz = m * m;  // every row holds all m columns
+
+  value_idx blocks = raft::ceildiv(nnz, (value_idx)256);
+  fill_indices2<value_idx><<<blocks, 256, 0, stream>>>(indices, m, nnz);
+
+  thrust::sequence(exec_policy, indptr, indptr + m, value_idx{0}, static_cast<value_idx>(m));
+
+  raft::update_device(indptr + m, &nnz, 1, stream);  // final offset indptr[m] = nnz
+
+  // TODO: It would ultimately be nice if the MST could accept
+  // dense inputs directly so we don't need to double the memory
+  // usage to hand it a sparse array here.
+  distance::pairwise_distance<value_t, value_idx>(handle, X, X, data, m, m, n, metric);
+  // self-loops get max distance
+  auto transform_in = thrust::make_zip_iterator(
+    thrust::make_tuple(thrust::make_counting_iterator<value_idx>(0), data));
+
+  thrust::transform(exec_policy,
+                    transform_in,
+                    transform_in + nnz,
+                    data,
+                    [=] __device__(const thrust::tuple<value_idx, value_t>& tup) {
+                      value_idx idx  = thrust::get<0>(tup);
+                      bool self_loop = idx % m == idx / m;
+                      // select rather than multiply: 0 * inf/NaN would poison the result
+                      return self_loop ? std::numeric_limits<value_t>::max() : thrust::get<1>(tup);
+                    });
+}
+
+/**
+ * Specialization of the connectivities graph builder for the PAIRWISE linkage distance
+ * @tparam value_idx element type of the indptr / indices buffers
+ * @tparam value_t element type of X and of the data buffer
+ */
+template <typename value_idx, typename value_t>
+struct distance_graph_impl<raft::cluster::LinkageDistance::PAIRWISE, value_idx, value_t> {
+  void run(raft::device_resources const& handle,
+           const value_t* X,
+           size_t m,
+           size_t n,
+           raft::distance::DistanceType metric,
+           rmm::device_uvector<value_idx>& indptr,
+           rmm::device_uvector<value_idx>& indices,
+           rmm::device_uvector<value_t>& data,
+           int c)
+  {
+    // c is not used here; indptr is not resized here although m + 1 offsets get written to it
+    const size_t n_entries = m * m;
+
+    auto cuda_stream = handle.get_stream();
+    indices.resize(n_entries, cuda_stream);
+    data.resize(n_entries, cuda_stream);
+
+    pairwise_distances(handle, X, m, n, metric, indptr.data(), indices.data(), data.data());
+  }
+};
+
 /**
  * Returns a CSR connectivities graph based on the given linkage distance.
  * @tparam value_idx
@@ -120,7 +213,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
  *             which will guarantee k <= log(n) + c
  */
 template <typename value_idx, typename value_t, raft::cluster::LinkageDistance dist_type>
-void get_distance_graph(const raft::handle_t& handle,
+void get_distance_graph(raft::device_resources const& handle,
                         const value_t* X,
                         size_t m,
                         size_t n,
diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh
index 5aa9870b46..9632fedb9d 100644
--- a/cpp/include/raft/cluster/detail/kmeans.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,17 +31,19 @@
 #include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <rmm/device_scalar.hpp>
@@ -57,7 +59,7 @@ namespace detail {
 
 // Selects 'n_clusters' samples randomly from X
 template <typename DataT, typename IndexT>
-void initRandom(const raft::handle_t& handle,
+void initRandom(raft::device_resources const& handle,
                 const KMeansParams& params,
                 raft::device_matrix_view<const DataT, IndexT> X,
                 raft::device_matrix_view<DataT, IndexT> centroids)
@@ -83,7 +85,7 @@ void initRandom(const raft::handle_t& handle,
  * 5: end for
  */
 template <typename DataT, typename IndexT>
-void kmeansPlusPlus(const raft::handle_t& handle,
+void kmeansPlusPlus(raft::device_resources const& handle,
                     const KMeansParams& params,
                     raft::device_matrix_view<const DataT, IndexT> X,
                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
@@ -109,7 +111,7 @@ void kmeansPlusPlus(const raft::handle_t& handle,
   auto dataBatchSize = getDataBatchSize(params.batch_samples, n_samples);
 
   // temporary buffers
-  std::vector<DataT> h_wt(n_samples);
+  auto indices            = raft::make_device_vector<IndexT, IndexT>(handle, n_trials);
   auto centroidCandidates = raft::make_device_matrix<DataT, IndexT>(handle, n_trials, n_features);
   auto costPerCandidate   = raft::make_device_vector<DataT, IndexT>(handle, n_trials);
   auto minClusterDistance = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
@@ -119,6 +121,17 @@ void kmeansPlusPlus(const raft::handle_t& handle,
   rmm::device_scalar<DataT> clusterCost(stream);
   rmm::device_scalar<cub::KeyValuePair<int, DataT>> minClusterIndexAndDistance(stream);
 
+  // Device and matrix views
+  raft::device_vector_view<IndexT, IndexT> indices_view(indices.data_handle(), n_trials);
+  auto const_weights_view =
+    raft::make_device_vector_view<const DataT, IndexT>(minClusterDistance.data_handle(), n_samples);
+  auto const_indices_view =
+    raft::make_device_vector_view<const IndexT, IndexT>(indices.data_handle(), n_trials);
+  auto const_X_view =
+    raft::make_device_matrix_view<const DataT, IndexT>(X.data_handle(), n_samples, n_features);
+  raft::device_matrix_view<DataT, IndexT> candidates_view(
+    centroidCandidates.data_handle(), n_trials, n_features);
+
   // L2 norm of X: ||c||^2
   auto L2NormX = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
 
@@ -133,6 +146,7 @@ void kmeansPlusPlus(const raft::handle_t& handle,
                           stream);
   }
 
+  raft::random::RngState rng(params.rng_state.seed, params.rng_state.type);
   std::mt19937 gen(params.rng_state.seed);
   std::uniform_int_distribution<> dis(0, n_samples - 1);
 
@@ -169,20 +183,9 @@ void kmeansPlusPlus(const raft::handle_t& handle,
     // <<< Step-3 >>> : Sample x in X with probability p_x = d^2(x, C) / phi_X (C)
     // Choose 'n_trials' centroid candidates from X with probability proportional to the squared
     // distance to the nearest existing cluster
-    raft::copy(h_wt.data(), minClusterDistance.data_handle(), minClusterDistance.size(), stream);
-    handle.sync_stream(stream);
 
-    // Note - n_trials is relative small here, we don't need raft::gather call
-    std::discrete_distribution<> d(h_wt.begin(), h_wt.end());
-    for (int cIdx = 0; cIdx < n_trials; ++cIdx) {
-      auto rand_idx     = d(gen);
-      auto randCentroid = raft::make_device_matrix_view<const DataT, IndexT>(
-        X.data_handle() + n_features * rand_idx, 1, n_features);
-      raft::copy(centroidCandidates.data_handle() + cIdx * n_features,
-                 randCentroid.data_handle(),
-                 randCentroid.size(),
-                 stream);
-    }
+    raft::random::discrete(handle, rng, indices_view, const_weights_view);
+    raft::matrix::gather(handle, const_X_view, const_indices_view, candidates_view);
 
     // Calculate pairwise distance between X and the centroid candidates
     // Output - pwd [n_trials x n_samples]
@@ -195,16 +198,15 @@ void kmeansPlusPlus(const raft::handle_t& handle,
     // Outputs minDistanceBuf[n_trials x n_samples] where minDistance[i, :] contains updated
     // minClusterDistance that includes candidate-i
     auto minDistBuf = distBuffer.view();
-    raft::linalg::matrixVectorOp(
-      minDistBuf.data_handle(),
-      pwd.data_handle(),
-      minClusterDistance.data_handle(),
-      pwd.extent(1),
-      pwd.extent(0),
-      true,
-      true,
-      [=] __device__(DataT mat, DataT vec) { return vec <= mat ? vec : mat; },
-      stream);
+    raft::linalg::matrixVectorOp(minDistBuf.data_handle(),
+                                 pwd.data_handle(),
+                                 minClusterDistance.data_handle(),
+                                 pwd.extent(1),
+                                 pwd.extent(0),
+                                 true,
+                                 true,
+                                 raft::min_op{},
+                                 stream);
 
     // Calculate costPerCandidate[n_trials] where costPerCandidate[i] is the cluster cost when using
     // centroid candidate-i
@@ -226,7 +228,8 @@ void kmeansPlusPlus(const raft::handle_t& handle,
                                 temp_storage_bytes,
                                 costPerCandidate.data_handle(),
                                 minClusterIndexAndDistance.data(),
-                                costPerCandidate.extent(0));
+                                costPerCandidate.extent(0),
+                                stream);
 
       // Allocate temporary storage
       workspace.resize(temp_storage_bytes, stream);
@@ -236,10 +239,12 @@ void kmeansPlusPlus(const raft::handle_t& handle,
                                 temp_storage_bytes,
                                 costPerCandidate.data_handle(),
                                 minClusterIndexAndDistance.data(),
-                                costPerCandidate.extent(0));
+                                costPerCandidate.extent(0),
+                                stream);
 
       int bestCandidateIdx = -1;
       raft::copy(&bestCandidateIdx, &minClusterIndexAndDistance.data()->key, 1, stream);
+      handle.sync_stream();
       /// <<< End of Step-3 >>>
 
       /// <<< Step-4 >>>: C = C U {x}
@@ -277,7 +282,7 @@ void kmeansPlusPlus(const raft::handle_t& handle,
  * @param[inout] workspace
  */
 template <typename DataT, typename IndexT, typename LabelsIterator>
-void update_centroids(const raft::handle_t& handle,
+void update_centroids(raft::device_resources const& handle,
                       raft::device_matrix_view<const DataT, IndexT, row_major> X,
                       raft::device_vector_view<const DataT, IndexT> sample_weights,
                       raft::device_matrix_view<const DataT, IndexT, row_major> centroids,
@@ -321,21 +326,15 @@ void update_centroids(const raft::handle_t& handle,
   //   weight_per_cluster[n_clusters] - 1D array, weight_per_cluster[i] contains sum of weights in
   //   cluster-i.
   // Note - when weight_per_cluster[i] is 0, new_centroids[i] is reset to 0
-  raft::linalg::matrixVectorOp(
-    new_centroids.data_handle(),
-    new_centroids.data_handle(),
-    weight_per_cluster.data_handle(),
-    new_centroids.extent(1),
-    new_centroids.extent(0),
-    true,
-    false,
-    [=] __device__(DataT mat, DataT vec) {
-      if (vec == 0)
-        return DataT(0);
-      else
-        return mat / vec;
-    },
-    handle.get_stream());
+  raft::linalg::matrixVectorOp(new_centroids.data_handle(),
+                               new_centroids.data_handle(),
+                               weight_per_cluster.data_handle(),
+                               new_centroids.extent(1),
+                               new_centroids.extent(0),
+                               true,
+                               false,
+                               raft::div_checkzero_op{},
+                               handle.get_stream());
 
   // copy centroids[i] to new_centroids[i] when weight_per_cluster[i] is 0
   cub::ArgIndexInputIterator<DataT*> itr_wt(weight_per_cluster.data_handle());
@@ -351,15 +350,13 @@ void update_centroids(const raft::handle_t& handle,
       // copy when the sum of weights in the cluster is 0
       return map.value == 0;
     },
-    [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
-      return map.key;
-    },
+    raft::key_op{},
     handle.get_stream());
 }
 
 // TODO: Resizing is needed to use mdarray instead of rmm::device_uvector
 template <typename DataT, typename IndexT>
-void kmeans_fit_main(const raft::handle_t& handle,
+void kmeans_fit_main(raft::device_resources const& handle,
                      const KMeansParams& params,
                      raft::device_matrix_view<const DataT, IndexT> X,
                      raft::device_vector_view<const DataT, IndexT> weight,
@@ -394,7 +391,7 @@ void kmeans_fit_main(const raft::handle_t& handle,
   // resource
   auto wtInCluster = raft::make_device_vector<DataT, IndexT>(handle, n_clusters);
 
-  rmm::device_scalar<raft::KeyValuePair<IndexT, DataT>> clusterCostD(stream);
+  rmm::device_scalar<DataT> clusterCostD(stream);
 
   // L2 norm of X: ||x||^2
   auto L2NormX = raft::make_device_vector<DataT, IndexT>(handle, n_samples);
@@ -465,16 +462,12 @@ void kmeans_fit_main(const raft::handle_t& handle,
     // compute the squared norm between the newCentroids and the original
     // centroids, destructor releases the resource
     auto sqrdNorm = raft::make_device_scalar(handle, DataT(0));
-    raft::linalg::mapThenSumReduce(
-      sqrdNorm.data_handle(),
-      newCentroids.size(),
-      [=] __device__(const DataT a, const DataT b) {
-        DataT diff = a - b;
-        return diff * diff;
-      },
-      stream,
-      centroids.data_handle(),
-      newCentroids.data_handle());
+    raft::linalg::mapThenSumReduce(sqrdNorm.data_handle(),
+                                   newCentroids.size(),
+                                   raft::sqdiff_op{},
+                                   stream,
+                                   centroids.data_handle(),
+                                   newCentroids.data_handle());
 
     DataT sqrdNormError = 0;
     raft::copy(&sqrdNormError, sqrdNorm.data_handle(), sqrdNorm.size(), stream);
@@ -489,18 +482,11 @@ void kmeans_fit_main(const raft::handle_t& handle,
                                  minClusterAndDistance.view(),
                                  workspace,
                                  raft::make_device_scalar_view(clusterCostD.data()),
-                                 [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
-                                               const raft::KeyValuePair<IndexT, DataT>& b) {
-                                   raft::KeyValuePair<IndexT, DataT> res;
-                                   res.key   = 0;
-                                   res.value = a.value + b.value;
-                                   return res;
-                                 });
-
-      DataT curClusteringCost = 0;
-      raft::copy(&curClusteringCost, &(clusterCostD.data()->value), 1, stream);
-
-      handle.sync_stream(stream);
+                                 raft::value_op{},
+                                 raft::add_op{});
+
+      DataT curClusteringCost = clusterCostD.value(stream);
+
       ASSERT(curClusteringCost != (DataT)0.0,
              "Too few points and centroids being found is getting 0 cost from "
              "centers");
@@ -553,15 +539,10 @@ void kmeans_fit_main(const raft::handle_t& handle,
                              minClusterAndDistance.view(),
                              workspace,
                              raft::make_device_scalar_view(clusterCostD.data()),
-                             [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
-                                           const raft::KeyValuePair<IndexT, DataT>& b) {
-                               raft::KeyValuePair<IndexT, DataT> res;
-                               res.key   = 0;
-                               res.value = a.value + b.value;
-                               return res;
-                             });
+                             raft::value_op{},
+                             raft::add_op{});
 
-  raft::copy(inertia.data_handle(), &(clusterCostD.data()->value), 1, stream);
+  inertia[0] = clusterCostD.value(stream);
 
   RAFT_LOG_DEBUG("KMeans.fit: completed after %d iterations with %f inertia[0] ",
                  n_iter[0] > params.max_iter ? n_iter[0] - 1 : n_iter[0],
@@ -592,7 +573,7 @@ void kmeans_fit_main(const raft::handle_t& handle,
 
  */
 template <typename DataT, typename IndexT>
-void initScalableKMeansPlusPlus(const raft::handle_t& handle,
+void initScalableKMeansPlusPlus(raft::device_resources const& handle,
                                 const KMeansParams& params,
                                 raft::device_matrix_view<const DataT, IndexT> X,
                                 raft::device_matrix_view<DataT, IndexT> centroidsRawData,
@@ -673,7 +654,8 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle,
                              minClusterDistanceVec.view(),
                              workspace,
                              raft::make_device_scalar_view(clusterCost.data()),
-                             [] __device__(const DataT& a, const DataT& b) { return a + b; });
+                             raft::identity_op{},
+                             raft::add_op{});
 
   auto psi = clusterCost.value(stream);
 
@@ -705,7 +687,8 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle,
                                minClusterDistanceVec.view(),
                                workspace,
                                raft::make_device_scalar_view<DataT>(clusterCost.data()),
-                               [] __device__(const DataT& a, const DataT& b) { return a + b; });
+                               raft::identity_op{},
+                               raft::add_op{});
 
     psi = clusterCost.value(stream);
 
@@ -833,7 +816,7 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle,
  * @param[out]    n_iter        Number of iterations run.
  */
 template <typename DataT, typename IndexT>
-void kmeans_fit(handle_t const& handle,
+void kmeans_fit(raft::device_resources const& handle,
                 const KMeansParams& params,
                 raft::device_matrix_view<const DataT> X,
                 std::optional<raft::device_vector_view<const DataT>> sample_weight,
@@ -972,7 +955,7 @@ void kmeans_fit(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_fit(handle_t const& handle,
+void kmeans_fit(raft::device_resources const& handle,
                 const KMeansParams& params,
                 const DataT* X,
                 const DataT* sample_weight,
@@ -997,7 +980,7 @@ void kmeans_fit(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT>
-void kmeans_predict(handle_t const& handle,
+void kmeans_predict(raft::device_resources const& handle,
                     const KMeansParams& params,
                     raft::device_matrix_view<const DataT> X,
                     std::optional<raft::device_vector_view<const DataT>> sample_weight,
@@ -1074,7 +1057,7 @@ void kmeans_predict(handle_t const& handle,
                                                       workspace);
 
   // calculate cluster cost phi_x(C)
-  rmm::device_scalar<raft::KeyValuePair<IndexT, DataT>> clusterCostD(stream);
+  rmm::device_scalar<DataT> clusterCostD(stream);
   // TODO: add different templates for InType of binaryOp to avoid thrust transform
   thrust::transform(handle.get_thrust_policy(),
                     minClusterAndDistance.data_handle(),
@@ -1092,25 +1075,20 @@ void kmeans_predict(handle_t const& handle,
                              minClusterAndDistance.view(),
                              workspace,
                              raft::make_device_scalar_view(clusterCostD.data()),
-                             [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
-                                           const raft::KeyValuePair<IndexT, DataT>& b) {
-                               raft::KeyValuePair<IndexT, DataT> res;
-                               res.key   = 0;
-                               res.value = a.value + b.value;
-                               return res;
-                             });
-
-  raft::copy(inertia.data_handle(), &(clusterCostD.data()->value), 1, stream);
+                             raft::value_op{},
+                             raft::add_op{});
 
   thrust::transform(handle.get_thrust_policy(),
                     minClusterAndDistance.data_handle(),
                     minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
                     labels.data_handle(),
-                    [=] __device__(raft::KeyValuePair<IndexT, DataT> pair) { return pair.key; });
+                    raft::key_op{});
+
+  inertia[0] = clusterCostD.value(stream);
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_predict(handle_t const& handle,
+void kmeans_predict(raft::device_resources const& handle,
                     const KMeansParams& params,
                     const DataT* X,
                     const DataT* sample_weight,
@@ -1142,7 +1120,7 @@ void kmeans_predict(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_fit_predict(handle_t const& handle,
+void kmeans_fit_predict(raft::device_resources const& handle,
                         const KMeansParams& params,
                         raft::device_matrix_view<const DataT> X,
                         std::optional<raft::device_vector_view<const DataT>> sample_weight,
@@ -1169,7 +1147,7 @@ void kmeans_fit_predict(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_fit_predict(handle_t const& handle,
+void kmeans_fit_predict(raft::device_resources const& handle,
                         const KMeansParams& params,
                         const DataT* X,
                         const DataT* sample_weight,
@@ -1209,7 +1187,7 @@ void kmeans_fit_predict(handle_t const& handle,
  * @param[out]    X_new         X transformed in the new space..
  */
 template <typename DataT, typename IndexT = int>
-void kmeans_transform(const raft::handle_t& handle,
+void kmeans_transform(raft::device_resources const& handle,
                       const KMeansParams& params,
                       raft::device_matrix_view<const DataT> X,
                       raft::device_matrix_view<const DataT> centroids,
@@ -1250,7 +1228,7 @@ void kmeans_transform(const raft::handle_t& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_transform(const raft::handle_t& handle,
+void kmeans_transform(raft::device_resources const& handle,
                       const KMeansParams& params,
                       const DataT* X,
                       const DataT* centroids,
diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
new file mode 100644
index 0000000000..3d23c809c3
--- /dev/null
+++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -0,0 +1,1095 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <limits>
+#include <type_traits>
+
+#include <raft/cluster/detail/kmeans_common.cuh>
+#include <raft/cluster/kmeans_balanced_types.hpp>
+#include <raft/common/nvtx.hpp>
+#include <raft/core/cudart_utils.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/distance/distance.cuh>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/linalg/add.cuh>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/normalize.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/argmin.cuh>
+#include <raft/matrix/gather.cuh>
+#include <raft/matrix/matrix.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_vector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <thrust/gather.h>
+#include <thrust/transform.h>
+
+#include <tuple>
+
+namespace raft::cluster::detail {
+
+constexpr static inline float kAdjustCentersWeight = 7.0f;  // cap on the center weight `wc` in adjust_centers_kernel
+
+/**
+ * @brief Predict labels for the dataset; floating-point types only.
+ *
+ * NB: no minibatch splitting is done here, so this may require a large amount of temporary memory
+ * (n_rows * n_clusters * sizeof(MathT)).
+ *
+ * @tparam MathT  type of the centroids and mapped data
+ * @tparam IdxT   index type
+ * @tparam LabelT label type
+ *
+ * @param[in] handle The raft handle.
+ * @param[in] params Structure containing the hyper-parameters
+ * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim]
+ * @param[in] n_clusters Number of clusters/centers
+ * @param[in] dim Dimensionality of the data
+ * @param[in] dataset Pointer to the data [n_rows, dim]
+ * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
+ * @param[in] n_rows Number of samples in the `dataset`
+ * @param[out] labels Output predictions [n_rows]
+ * @param[inout] mr Memory resource to use for temporary allocations (no default value here)
+ */
+template <typename MathT, typename IdxT, typename LabelT>
+inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
+  const raft::device_resources& handle,
+  const kmeans_balanced_params& params,
+  const MathT* centers,
+  IdxT n_clusters,
+  IdxT dim,
+  const MathT* dataset,
+  const MathT* dataset_norm,
+  IdxT n_rows,
+  LabelT* labels,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto stream = handle.get_stream();
+  switch (params.metric) {
+    case raft::distance::DistanceType::L2Expanded:
+    case raft::distance::DistanceType::L2SqrtExpanded: {
+      auto workspace = raft::make_device_mdarray<char, IdxT>(
+        handle, mr, make_extents<IdxT>((sizeof(int)) * n_rows));  // one int per row
+
+      auto minClusterAndDistance = raft::make_device_mdarray<raft::KeyValuePair<IdxT, MathT>, IdxT>(
+        handle, mr, make_extents<IdxT>(n_rows));
+      raft::KeyValuePair<IdxT, MathT> initial_value(0, std::numeric_limits<MathT>::max());
+      thrust::fill(handle.get_thrust_policy(),
+                   minClusterAndDistance.data_handle(),
+                   minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
+                   initial_value);
+
+      auto centroidsNorm =
+        raft::make_device_mdarray<MathT, IdxT>(handle, mr, make_extents<IdxT>(n_clusters));
+      raft::linalg::rowNorm<MathT, IdxT>(
+        centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream);
+
+      raft::distance::fusedL2NNMinReduce<MathT, raft::KeyValuePair<IdxT, MathT>, IdxT>(
+        minClusterAndDistance.data_handle(),
+        dataset,
+        centers,
+        dataset_norm,
+        centroidsNorm.data_handle(),
+        n_rows,
+        n_clusters,
+        dim,
+        (void*)workspace.data_handle(),
+        (params.metric == raft::distance::DistanceType::L2Expanded) ? false : true,  // presumably the sqrt flag - confirm
+        false,
+        stream);
+
+      // todo(lsugy): use KVP + iterator in caller.
+      // Copy keys to output labels
+      thrust::transform(handle.get_thrust_policy(),
+                        minClusterAndDistance.data_handle(),
+                        minClusterAndDistance.data_handle() + n_rows,
+                        labels,
+                        raft::compose_op<raft::cast_op<LabelT>, raft::key_op>());
+      break;
+    }
+    case raft::distance::DistanceType::InnerProduct: {
+      // TODO: pass buffer
+      rmm::device_uvector<MathT> distances(n_rows * n_clusters, stream, mr);
+
+      MathT alpha = -1.0;  // negated products, so the argmin below picks the largest inner product
+      MathT beta  = 0.0;
+
+      linalg::gemm(handle,
+                   true,
+                   false,
+                   n_clusters,
+                   n_rows,
+                   dim,
+                   &alpha,
+                   centers,
+                   dim,
+                   dataset,
+                   dim,
+                   &beta,
+                   distances.data(),
+                   n_clusters,
+                   stream);
+
+      auto distances_const_view = raft::make_device_matrix_view<const MathT, IdxT, row_major>(
+        distances.data(), n_rows, n_clusters);
+      auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
+      raft::matrix::argmin(handle, distances_const_view, labels_view);
+      break;
+    }
+    default: {
+      RAFT_FAIL("The chosen distance metric is not supported (%d)", int(params.metric));
+    }
+  }
+}
+
+/**
+ * @brief Suggest a minibatch size for kmeans prediction.
+ *
+ * This function is used as a heuristic to split the work over a large dataset
+ * to reduce the size of temporary memory allocations.
+ *
+ * @tparam MathT type of the centroids and mapped data
+ * @tparam IdxT  index type
+ *
+ * @param[in] n_clusters number of clusters in kmeans clustering
+ * @param[in] n_rows Number of samples in the dataset
+ * @param[in] dim Number of features in the dataset
+ * @param[in] metric Distance metric
+ * @param[in] needs_conversion Whether the data needs to be converted to MathT
+ * @return A suggested minibatch size and the expected memory cost per-row (in bytes)
+ */
+template <typename MathT, typename IdxT>
+constexpr auto calc_minibatch_size(IdxT n_clusters,
+                                   IdxT n_rows,
+                                   IdxT dim,
+                                   raft::distance::DistanceType metric,
+                                   bool needs_conversion) -> std::tuple<IdxT, size_t>
+{
+  n_clusters = std::max<IdxT>(1, n_clusters);  // keeps the default-branch estimate below non-zero
+
+  // Estimate memory needs per row (i.e element of the batch).
+  size_t mem_per_row = 0;
+  switch (metric) {
+    // fusedL2NN needs a mutex and a key-value pair for each row.
+    case distance::DistanceType::L2Expanded:
+    case distance::DistanceType::L2SqrtExpanded: {
+      mem_per_row += sizeof(int);
+      mem_per_row += sizeof(raft::KeyValuePair<IdxT, MathT>);
+    } break;
+    // Other metrics require storing a distance matrix.
+    default: {
+      mem_per_row += sizeof(MathT) * n_clusters;
+    }
+  }
+
+  // If we need to convert to MathT, space required for the converted batch.
+  if (!needs_conversion) { mem_per_row += sizeof(MathT) * dim; }  // NOTE(review): looks inverted vs. the parameter name; predict() passes is_same_v<T, MathT> - confirm
+
+  // Heuristic: calculate the minibatch size in order to use at most 1GB of memory.
+  IdxT minibatch_size = (1 << 30) / mem_per_row;
+  minibatch_size      = 64 * div_rounding_up_safe(minibatch_size, IdxT{64});  // presumably rounds up to a multiple of 64
+  minibatch_size      = std::min<IdxT>(minibatch_size, n_rows);  // never larger than the dataset
+  return std::make_tuple(minibatch_size, mem_per_row);
+}
+
+/**
+ * @brief Given the data and labels, calculate cluster centers and sizes in one sweep.
+ *
+ * @note all pointers must be accessible on the device.
+ *
+ * @tparam T          element type
+ * @tparam MathT      type of the centroids and mapped data
+ * @tparam IdxT       index type
+ * @tparam LabelT     label type
+ * @tparam CounterT   counter type supported by CUDA's native atomicAdd
+ * @tparam MappingOpT type of the mapping operation
+ *
+ * @param[in] handle The raft handle.
+ * @param[inout] centers Pointer to the output [n_clusters, dim]
+ * @param[inout] cluster_sizes Number of rows in each cluster [n_clusters]
+ * @param[in] n_clusters Number of clusters/centers
+ * @param[in] dim Dimensionality of the data
+ * @param[in] dataset Pointer to the data [n_rows, dim]
+ * @param[in] n_rows Number of samples in the `dataset`
+ * @param[in] labels Cluster label of each row of the `dataset` [n_rows]
+ * @param[in] reset_counters Whether to clear the output arrays before calculating.
+ *    When set to `false`, this function may be used to update existing centers and sizes using
+ *    the weighted average principle.
+ * @param[in] mapping_op Mapping operation from T to MathT
+ * @param[inout] mr (optional) Memory resource for temporary allocations (default: handle workspace)
+ */
+template <typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+void calc_centers_and_sizes(const raft::device_resources& handle,
+                            MathT* centers,
+                            CounterT* cluster_sizes,
+                            IdxT n_clusters,
+                            IdxT dim,
+                            const T* dataset,
+                            IdxT n_rows,
+                            const LabelT* labels,
+                            bool reset_counters,
+                            MappingOpT mapping_op,
+                            rmm::mr::device_memory_resource* mr = nullptr)
+{
+  auto stream = handle.get_stream();
+  if (mr == nullptr) { mr = handle.get_workspace_resource(); }
+
+  if (!reset_counters) {  // presumably turns the stored means back into sums (mean * size) - confirm
+    raft::linalg::matrixVectorOp(
+      centers, centers, cluster_sizes, dim, n_clusters, true, false, raft::mul_op(), stream);
+  }
+
+  rmm::device_uvector<char> workspace(0, stream, mr);  // starts empty; handed to countLabels below
+
+  // If we reset the counters, we can compute directly the new sizes in cluster_sizes.
+  // If we don't reset, we compute in a temporary buffer and add in a separate step.
+  rmm::device_uvector<CounterT> temp_cluster_sizes(0, stream, mr);
+  CounterT* temp_sizes = cluster_sizes;
+  if (!reset_counters) {
+    temp_cluster_sizes.resize(n_clusters, stream);
+    temp_sizes = temp_cluster_sizes.data();
+  }
+
+  // Apply mapping only when the data and math types are different.
+  if constexpr (std::is_same_v<T, MathT>) {
+    raft::linalg::reduce_rows_by_key(
+      dataset, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
+  } else {
+    // todo(lsugy): use iterator from KV output of fusedL2NN
+    cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset, mapping_op);
+    raft::linalg::reduce_rows_by_key(
+      mapping_itr, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
+  }
+
+  // Compute weight of each cluster
+  raft::cluster::detail::countLabels(handle, labels, temp_sizes, n_rows, n_clusters, workspace);
+
+  // Add previous sizes if necessary
+  if (!reset_counters) {
+    raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream);
+  }
+
+  raft::linalg::matrixVectorOp(centers,
+                               centers,
+                               cluster_sizes,
+                               dim,
+                               n_clusters,
+                               true,
+                               false,
+                               raft::div_checkzero_op(),  // presumably skips the division for size 0 - confirm
+                               stream);
+}
+
+/** Computes row norms (raft::linalg::L2Norm) of the dataset, mapping T to MathT first if needed */
+template <typename T, typename MathT, typename IdxT, typename MappingOpT>
+void compute_norm(const raft::device_resources& handle,
+                  MathT* dataset_norm,
+                  const T* dataset,
+                  IdxT dim,
+                  IdxT n_rows,
+                  MappingOpT mapping_op,
+                  rmm::mr::device_memory_resource* mr = nullptr)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("compute_norm");
+  auto stream = handle.get_stream();
+  if (mr == nullptr) { mr = handle.get_workspace_resource(); }
+  rmm::device_uvector<MathT> mapped_dataset(0, stream, mr);  // stays empty when T == MathT
+
+  const MathT* dataset_ptr = nullptr;
+
+  if (std::is_same_v<MathT, T>) {  // plain (non-constexpr) `if`: both branches are compiled for every T
+    dataset_ptr = reinterpret_cast<const MathT*>(dataset);
+  } else {
+    mapped_dataset.resize(n_rows * dim, stream);
+
+    linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream);
+
+    dataset_ptr = (const MathT*)mapped_dataset.data();
+  }
+
+  raft::linalg::rowNorm<MathT, IdxT>(
+    dataset_norm, dataset_ptr, dim, n_rows, raft::linalg::L2Norm, true, stream);
+}
+
+/**
+ * @brief Predict labels for the dataset.
+ *
+ * @tparam T element type
+ * @tparam MathT type of the centroids and mapped data
+ * @tparam IdxT index type
+ * @tparam LabelT label type
+ * @tparam MappingOpT type of the mapping operation
+ *
+ * @param[in] handle The raft handle
+ * @param[in] params Structure containing the hyper-parameters
+ * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim]
+ * @param[in] n_clusters Number of clusters/centers
+ * @param[in] dim Dimensionality of the data
+ * @param[in] dataset Pointer to the data [n_rows, dim]
+ * @param[in] n_rows Number of samples in the `dataset`
+ * @param[out] labels Output predictions [n_rows]
+ * @param[in] mapping_op Mapping operation from T to MathT
+ * @param[inout] mr (optional) memory resource to use for temporary allocations
+ * @param[in] dataset_norm (optional) Pre-computed norms of each row in the dataset [n_rows]
+ */
+template <typename T, typename MathT, typename IdxT, typename LabelT, typename MappingOpT>
+void predict(const raft::device_resources& handle,
+             const kmeans_balanced_params& params,
+             const MathT* centers,
+             IdxT n_clusters,
+             IdxT dim,
+             const T* dataset,
+             IdxT n_rows,
+             LabelT* labels,
+             MappingOpT mapping_op,
+             rmm::mr::device_memory_resource* mr = nullptr,
+             const MathT* dataset_norm           = nullptr)
+{
+  auto stream = handle.get_stream();
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);  // NOTE(review): %u assumes a 32-bit IdxT - confirm
+  if (mr == nullptr) { mr = handle.get_workspace_resource(); }
+  auto [max_minibatch_size, _mem_per_row] =  // NOTE(review): last argument is "T == MathT", named needs_conversion in the callee - confirm
+    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+  rmm::device_uvector<MathT> cur_dataset(
+    std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mr);  // conversion buffer, empty if T == MathT
+  bool need_compute_norm =
+    dataset_norm == nullptr && (params.metric == raft::distance::DistanceType::L2Expanded ||
+                                params.metric == raft::distance::DistanceType::L2SqrtExpanded);
+  rmm::device_uvector<MathT> cur_dataset_norm(
+    need_compute_norm ? max_minibatch_size : 0, stream, mr);
+  const MathT* dataset_norm_ptr = nullptr;
+  auto cur_dataset_ptr          = cur_dataset.data();
+  for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
+    IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);  // last batch may be shorter
+
+    if constexpr (std::is_same_v<T, MathT>) {
+      cur_dataset_ptr = const_cast<MathT*>(dataset + offset * dim);  // no copy: point into the input
+    } else {
+      linalg::unaryOp(
+        cur_dataset_ptr, dataset + offset * dim, minibatch_size * dim, mapping_op, stream);
+    }
+
+    // Compute the norm now if it hasn't been pre-computed.
+    if (need_compute_norm) {
+      compute_norm(
+        handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr);
+      dataset_norm_ptr = cur_dataset_norm.data();
+    } else if (dataset_norm != nullptr) {
+      dataset_norm_ptr = dataset_norm + offset;
+    }
+
+    predict_core(handle,
+                 params,
+                 centers,
+                 n_clusters,
+                 dim,
+                 cur_dataset_ptr,
+                 dataset_norm_ptr,
+                 minibatch_size,
+                 labels + offset,
+                 mr);
+  }
+}
+
+template <uint32_t BlockDimY,
+          typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+__global__ void __launch_bounds__((WarpSize * BlockDimY))
+  adjust_centers_kernel(MathT* centers,  // [n_clusters, dim]
+                        IdxT n_clusters,
+                        IdxT dim,
+                        const T* dataset,  // [n_rows, dim]
+                        IdxT n_rows,
+                        const LabelT* labels,           // [n_rows]
+                        const CounterT* cluster_sizes,  // [n_clusters]
+                        MathT threshold,
+                        IdxT average,
+                        IdxT seed,
+                        IdxT* count,
+                        MappingOpT mapping_op)
+{
+  IdxT l = threadIdx.y + BlockDimY * static_cast<IdxT>(blockIdx.y);  // cluster index for this y-row
+  if (l >= n_clusters) return;
+  auto csize = static_cast<IdxT>(cluster_sizes[l]);
+  // skip big clusters
+  if (csize > static_cast<IdxT>(average * threshold)) return;
+
+  // choose a "random" i that belongs to a rather large cluster
+  IdxT i;
+  IdxT j = laneId();
+  if (j == 0) {
+    do {
+      auto old = atomicAdd(count, IdxT{1});  // counts every attempt; the host only tests count > 0
+      i        = (seed * (old + 1)) % n_rows;
+    } while (static_cast<IdxT>(cluster_sizes[labels[i]]) < average);
+  }
+  i = raft::shfl(i, 0);  // hand lane 0's pick to the other lanes
+
+  // Adjust the center of the selected smaller cluster to gravitate towards
+  // a sample from the selected larger cluster.
+  const IdxT li = static_cast<IdxT>(labels[i]);
+  // Weight of the current center for the weighted average.
+  // It is reduced for anomalously small clusters (csize < kAdjustCentersWeight), constant otherwise.
+  const MathT wc = min(static_cast<MathT>(csize), static_cast<MathT>(kAdjustCentersWeight));
+  // Weight for the datapoint used to shift the center.
+  const MathT wd = 1.0;
+  for (; j < dim; j += WarpSize) {
+    MathT val = 0;
+    val += wc * centers[j + dim * li];  // NB: reads the center of the larger cluster `li`, not of `l`
+    val += wd * mapping_op(dataset[j + dim * i]);
+    val /= wc + wd;
+    centers[j + dim * l] = val;
+  }
+}
+
+/**
+ * @brief Adjust centers for clusters that have small number of entries.
+ *
+ * For each cluster, where the cluster size is not bigger than a threshold, the center is moved
+ * towards a data point that belongs to a large cluster.
+ *
+ * NB: if this function returns `true`, you should update the labels.
+ *
+ * NB: all pointers must be on the device side.
+ *
+ * @tparam T element type
+ * @tparam MathT type of the centroids and mapped data
+ * @tparam IdxT index type
+ * @tparam LabelT label type
+ * @tparam CounterT counter type supported by CUDA's native atomicAdd
+ * @tparam MappingOpT type of the mapping operation
+ *
+ * @param[inout] centers cluster centers [n_clusters, dim]
+ * @param[in] n_clusters number of rows in `centers`
+ * @param[in] dim number of columns in `centers` and `dataset`
+ * @param[in] dataset a device-accessible pointer to the row-major data matrix [n_rows, dim]
+ * @param[in] n_rows number of rows in `dataset`
+ * @param[in] labels a device-accessible pointer to the cluster indices [n_rows]
+ * @param[in] cluster_sizes number of rows in each cluster [n_clusters]
+ * @param[in] threshold defines a criterion for adjusting a cluster
+ *                   (cluster_sizes <= average_size * threshold)
+ *                   0 <= threshold < 1
+ * @param[in] mapping_op Mapping operation from T to MathT
+ * @param[in] stream CUDA stream
+ * @param[inout] device_memory  memory resource to use for temporary allocations
+ *
+ * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated).
+ */
+template <typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+auto adjust_centers(MathT* centers,
+                    IdxT n_clusters,
+                    IdxT dim,
+                    const T* dataset,
+                    IdxT n_rows,
+                    const LabelT* labels,
+                    const CounterT* cluster_sizes,
+                    MathT threshold,
+                    MappingOpT mapping_op,
+                    rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource* device_memory) -> bool
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
+  if (n_clusters == 0) { return false; }
+  constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
+                                      601,  659,  733,  809,  863,  941,  1013, 1069, 1151, 1223,
+                                      1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987,
+                                      2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
+  static IdxT i        = 0;  // NOTE(review): not used anywhere in this function
+  static IdxT i_primes = 0;  // persists across calls; no synchronization between host threads here
+
+  bool adjusted = false;
+  IdxT average  = n_rows / n_clusters;
+  IdxT ofst;
+  do {
+    i_primes = (i_primes + 1) % kPrimes.size();
+    ofst     = kPrimes[i_primes];
+  } while (n_rows % ofst == 0);  // keep cycling until the prime does not divide n_rows
+
+  constexpr uint32_t kBlockDimY = 4;
+  const dim3 block_dim(WarpSize, kBlockDimY, 1);
+  const dim3 grid_dim(1, raft::ceildiv(n_clusters, static_cast<IdxT>(kBlockDimY)), 1);
+  rmm::device_scalar<IdxT> update_count(0, stream, device_memory);
+  adjust_centers_kernel<kBlockDimY><<<grid_dim, block_dim, 0, stream>>>(centers,
+                                                                        n_clusters,
+                                                                        dim,
+                                                                        dataset,
+                                                                        n_rows,
+                                                                        labels,
+                                                                        cluster_sizes,
+                                                                        threshold,
+                                                                        average,
+                                                                        ofst,
+                                                                        update_count.data(),
+                                                                        mapping_op);
+  adjusted = update_count.value(stream) > 0;  // NB: rmm scalar performs the sync
+
+  return adjusted;
+}
+
+/**
+ * @brief Expectation-maximization-balancing combined in an iterative process.
+ *
+ * Note, the `cluster_centers` is assumed to be already initialized here.
+ * Thus, this function can be used for fine-tuning existing clusters;
+ * to train from scratch, use `build_clusters` function below.
+ *
+ * @tparam T      element type
+ * @tparam MathT  type of the centroids and mapped data
+ * @tparam IdxT   index type
+ * @tparam LabelT label type
+ * @tparam CounterT counter type supported by CUDA's native atomicAdd
+ * @tparam MappingOpT type of the mapping operation
+ *
+ * @param[in] handle The raft handle
+ * @param[in] params Structure containing the hyper-parameters
+ * @param[in] n_iters Requested number of iterations (can differ from params.n_iter!)
+ * @param[in] dim Dimensionality of the dataset
+ * @param[in] dataset Pointer to a managed row-major array [n_rows, dim]
+ * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows]
+ * @param[in] n_rows Number of rows in the dataset
+ * @param[in] n_clusters Requested number of clusters
+ * @param[inout] cluster_centers Pointer to a managed row-major array [n_clusters, dim]
+ * @param[out] cluster_labels Pointer to a managed row-major array [n_rows]
+ * @param[out] cluster_sizes Pointer to a managed row-major array [n_clusters]
+ * @param[in] balancing_pullback
+ *   if the cluster centers are rebalanced on this number of iterations,
+ *   one extra iteration is performed (this could happen several times) (default should be `2`).
+ *   In other words, the first and then every `balancing_pullback`-th rebalancing operation adds
+ *   one more iteration to the main cycle.
+ * @param[in] balancing_threshold
+ *   the rebalancing takes place if any cluster is smaller than `avg_size * balancing_threshold`
+ *   on a given iteration (default should be `~ 0.25`).
+ * @param[in] mapping_op Mapping operation from T to MathT
+ * @param[inout] device_memory
+ *   A memory resource for device allocations (makes sense to provide a memory pool here)
+ */
+template <typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+void balancing_em_iters(const raft::device_resources& handle,
+                        const kmeans_balanced_params& params,
+                        uint32_t n_iters,
+                        IdxT dim,
+                        const T* dataset,
+                        const MathT* dataset_norm,
+                        IdxT n_rows,
+                        IdxT n_clusters,
+                        MathT* cluster_centers,
+                        LabelT* cluster_labels,
+                        CounterT* cluster_sizes,
+                        uint32_t balancing_pullback,
+                        MathT balancing_threshold,
+                        MappingOpT mapping_op,
+                        rmm::mr::device_memory_resource* device_memory)
+{
+  auto stream                = handle.get_stream();
+  uint32_t balancing_counter = balancing_pullback;  // starts at the limit, so the first rebalancing adds an iteration
+  for (uint32_t iter = 0; iter < n_iters; iter++) {
+    // Balancing step - move the centers around to equalize cluster sizes
+    // (but not on the first iteration)
+    if (iter > 0 && adjust_centers(cluster_centers,
+                                   n_clusters,
+                                   dim,
+                                   dataset,
+                                   n_rows,
+                                   cluster_labels,
+                                   cluster_sizes,
+                                   balancing_threshold,
+                                   mapping_op,
+                                   stream,
+                                   device_memory)) {
+      if (balancing_counter++ >= balancing_pullback) {
+        balancing_counter -= balancing_pullback;
+        n_iters++;  // n_iters is a by-value copy; this extends the loop by one pass
+      }
+    }
+    switch (params.metric) {
+      // For some metrics, cluster calculation and adjustment tends to favor zero center vectors.
+      // To avoid converging to zero, we normalize the center vectors on every iteration.
+      case raft::distance::DistanceType::InnerProduct:
+      case raft::distance::DistanceType::CosineExpanded:
+      case raft::distance::DistanceType::CorrelationExpanded: {
+        auto clusters_in_view = raft::make_device_matrix_view<const MathT, IdxT, raft::row_major>(
+          cluster_centers, n_clusters, dim);
+        auto clusters_out_view = raft::make_device_matrix_view<MathT, IdxT, raft::row_major>(
+          cluster_centers, n_clusters, dim);  // same buffer as the input view
+        raft::linalg::row_normalize(
+          handle, clusters_in_view, clusters_out_view, raft::linalg::L2Norm);
+        break;
+      }
+      default: break;
+    }
+    // E: Expectation step - predict labels
+    predict(handle,
+            params,
+            cluster_centers,
+            n_clusters,
+            dim,
+            dataset,
+            n_rows,
+            cluster_labels,
+            mapping_op,
+            device_memory,
+            dataset_norm);
+    // M: Maximization step - calculate optimal cluster centers
+    calc_centers_and_sizes(handle,
+                           cluster_centers,
+                           cluster_sizes,
+                           n_clusters,
+                           dim,
+                           dataset,
+                           n_rows,
+                           cluster_labels,
+                           true,  // reset_counters
+                           mapping_op,
+                           device_memory);
+  }
+}
+
+/** Initialize labels, compute the matching cluster centers, then call `balancing_em_iters`. */
+template <typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+void build_clusters(const raft::device_resources& handle,
+                    const kmeans_balanced_params& params,
+                    IdxT dim,
+                    const T* dataset,
+                    IdxT n_rows,
+                    IdxT n_clusters,
+                    MathT* cluster_centers,
+                    LabelT* cluster_labels,
+                    CounterT* cluster_sizes,
+                    MappingOpT mapping_op,
+                    rmm::mr::device_memory_resource* device_memory,
+                    const MathT* dataset_norm = nullptr)
+{
+  // "Randomly" initialize labels: the op given to `map_offset` is a modulo by `n_clusters`
+  // followed by a cast to `LabelT`, so the starting assignment is in fact a deterministic
+  // function of the element offset.
+  auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(cluster_labels, n_rows);
+  linalg::map_offset(
+    handle,
+    labels_view,
+    raft::compose_op(raft::cast_op<LabelT>(), raft::mod_const_op<IdxT>(n_clusters)));
+
+  // update centers to match the initialized labels.
+  calc_centers_and_sizes(handle,
+                         cluster_centers,
+                         cluster_sizes,
+                         n_clusters,
+                         dim,
+                         dataset,
+                         n_rows,
+                         cluster_labels,
+                         true,
+                         mapping_op,
+                         device_memory);
+
+  // run EM
+  balancing_em_iters(handle,
+                     params,
+                     params.n_iters,
+                     dim,
+                     dataset,
+                     dataset_norm,
+                     n_rows,
+                     n_clusters,
+                     cluster_centers,
+                     cluster_labels,
+                     cluster_sizes,
+                     2,
+                     MathT{0.25},
+                     mapping_op,
+                     device_memory);
+}
+
+/** Calculate how many fine clusters should belong to each mesocluster. */
+template <typename IdxT, typename CounterT>
+inline auto arrange_fine_clusters(IdxT n_clusters,
+                                  IdxT n_mesoclusters,
+                                  IdxT n_rows,
+                                  const CounterT* mesocluster_sizes)
+{
+  std::vector<IdxT> fine_clusters_nums(n_mesoclusters);
+  std::vector<IdxT> fine_clusters_csum(n_mesoclusters + 1);
+  fine_clusters_csum[0] = 0;
+
+  IdxT n_lists_rem       = n_clusters;
+  IdxT n_nonempty_ms_rem = 0;
+  for (IdxT i = 0; i < n_mesoclusters; i++) {
+    n_nonempty_ms_rem += mesocluster_sizes[i] > CounterT{0} ? 1 : 0;
+  }
+  IdxT n_rows_rem               = n_rows;
+  CounterT mesocluster_size_sum = 0;
+  CounterT mesocluster_size_max = 0;
+  IdxT fine_clusters_nums_max   = 0;
+  for (IdxT i = 0; i < n_mesoclusters; i++) {
+    if (i < n_mesoclusters - 1) {
+      // Although the algorithm is meant to produce balanced clusters, we may still get empty
+      // clusters (e.g. during development/debugging). The code below ensures a proportional
+      // arrangement of fine cluster numbers per mesocluster, even if some clusters are empty.
+      // The share is computed in double precision: the integer product could overflow.
+      if (mesocluster_sizes[i] == 0) {
+        fine_clusters_nums[i] = 0;
+      } else {
+        n_nonempty_ms_rem--;
+        auto s = static_cast<IdxT>(static_cast<double>(n_lists_rem) *
+                                   static_cast<double>(mesocluster_sizes[i]) / n_rows_rem + .5);
+        s                     = std::min<IdxT>(s, n_lists_rem - n_nonempty_ms_rem);
+        fine_clusters_nums[i] = std::max(s, IdxT{1});
+      }
+    } else {
+      fine_clusters_nums[i] = n_lists_rem;
+    }
+    n_lists_rem -= fine_clusters_nums[i];
+    n_rows_rem -= mesocluster_sizes[i];
+    mesocluster_size_max = std::max(mesocluster_size_max, mesocluster_sizes[i]);
+    mesocluster_size_sum += mesocluster_sizes[i];
+    fine_clusters_nums_max    = std::max(fine_clusters_nums_max, fine_clusters_nums[i]);
+    fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
+  }
+
+  RAFT_EXPECTS(static_cast<IdxT>(mesocluster_size_sum) == n_rows,
+               "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)",
+               static_cast<size_t>(mesocluster_size_sum),
+               static_cast<size_t>(n_rows));
+  RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters,
+               "fine cluster numbers do not add up (%zu) to the total number of clusters (%zu)",
+               static_cast<size_t>(fine_clusters_csum[n_mesoclusters]),
+               static_cast<size_t>(n_clusters));
+  // Result: (max mesocluster size, max fine cluster count, per-mesocluster counts, prefix sums)
+  return std::make_tuple(static_cast<IdxT>(mesocluster_size_max),
+                         fine_clusters_nums_max,
+                         std::move(fine_clusters_nums),
+                         std::move(fine_clusters_csum));
+}
+
+/**
+ *  Given the (coarse) mesoclusters and the distribution of fine clusters within them,
+ *  build the fine clusters.
+ *
+ *  Processing one mesocluster at a time:
+ *   1. Copy mesocluster data into a separate buffer
+ *   2. Predict fine cluster
+ *   3. Refine the fine cluster centers
+ *
+ *  As a result, the fine clusters are what is returned by `build_hierarchical`;
+ *  this function returns the total number of fine clusters, which can be checked to be
+ *  the same as the requested number of clusters.
+ *
+ *  Note: this function uses at most `mesocluster_size_max` points per mesocluster for training;
+ *  if one of the clusters is larger than that (as given by `mesocluster_sizes`), the extra data
+ *  is ignored and a warning is reported.
+ */
+template <typename T,
+          typename MathT,
+          typename IdxT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+auto build_fine_clusters(const raft::device_resources& handle,
+                         const kmeans_balanced_params& params,
+                         IdxT dim,
+                         const T* dataset_mptr,
+                         const MathT* dataset_norm_mptr,
+                         const LabelT* labels_mptr,
+                         IdxT n_rows,
+                         const IdxT* fine_clusters_nums,
+                         const IdxT* fine_clusters_csum,
+                         const CounterT* mesocluster_sizes,
+                         IdxT n_mesoclusters,
+                         IdxT mesocluster_size_max,
+                         IdxT fine_clusters_nums_max,
+                         MathT* cluster_centers,
+                         MappingOpT mapping_op,
+                         rmm::mr::device_memory_resource* managed_memory,
+                         rmm::mr::device_memory_resource* device_memory) -> IdxT
+{
+  auto stream = handle.get_stream();
+  rmm::device_uvector<IdxT> mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory);
+  rmm::device_uvector<MathT> mc_trainset_buf(mesocluster_size_max * dim, stream, device_memory);
+  rmm::device_uvector<MathT> mc_trainset_norm_buf(mesocluster_size_max, stream, device_memory);
+  auto mc_trainset_ids  = mc_trainset_ids_buf.data();
+  auto mc_trainset      = mc_trainset_buf.data();
+  auto mc_trainset_norm = mc_trainset_norm_buf.data();
+
+  // label (cluster ID) of each vector
+  rmm::device_uvector<LabelT> mc_trainset_labels(mesocluster_size_max, stream, device_memory);
+
+  rmm::device_uvector<MathT> mc_trainset_ccenters(
+    fine_clusters_nums_max * dim, stream, device_memory);
+  // number of vectors in each cluster
+  rmm::device_uvector<CounterT> mc_trainset_csizes_tmp(
+    fine_clusters_nums_max, stream, device_memory);
+
+  // Train the fine clusters of each mesocluster (row ids are collected by this host-side loop)
+  IdxT n_clusters_done = 0;
+  for (IdxT i = 0; i < n_mesoclusters; i++) {
+    IdxT k = 0;
+    for (IdxT j = 0; j < n_rows && k < mesocluster_size_max; j++) {
+      if (labels_mptr[j] == LabelT(i)) { mc_trainset_ids[k++] = j; }
+    }
+    if (k != static_cast<IdxT>(mesocluster_sizes[i]))
+      RAFT_LOG_WARN("Incorrect mesocluster size at %d. %zu vs %zu",
+                    static_cast<int>(i),
+                    static_cast<size_t>(k),
+                    static_cast<size_t>(mesocluster_sizes[i]));
+    if (k == 0) {
+      RAFT_LOG_DEBUG("Empty cluster %d", static_cast<int>(i));
+      RAFT_EXPECTS(fine_clusters_nums[i] == 0,
+                   "Number of fine clusters must be zero for the empty mesocluster (got %d)",
+                   static_cast<int>(fine_clusters_nums[i]));
+      continue;
+    } else {
+      RAFT_EXPECTS(fine_clusters_nums[i] > 0,
+                   "Number of fine clusters must be non-zero for a non-empty mesocluster");
+    }
+
+    cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset_mptr, mapping_op);
+    raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
+    if (params.metric == raft::distance::DistanceType::L2Expanded ||
+        params.metric == raft::distance::DistanceType::L2SqrtExpanded) {
+      thrust::gather(handle.get_thrust_policy(),
+                     mc_trainset_ids,
+                     mc_trainset_ids + k,
+                     dataset_norm_mptr,
+                     mc_trainset_norm);
+    }
+
+    build_clusters(handle,
+                   params,
+                   dim,
+                   mc_trainset,
+                   k,
+                   fine_clusters_nums[i],
+                   mc_trainset_ccenters.data(),
+                   mc_trainset_labels.data(),
+                   mc_trainset_csizes_tmp.data(),
+                   mapping_op,
+                   device_memory,
+                   mc_trainset_norm);
+
+    raft::copy(cluster_centers + (dim * fine_clusters_csum[i]),
+               mc_trainset_ccenters.data(),
+               fine_clusters_nums[i] * dim,
+               stream);
+    handle.sync_stream(stream);
+    n_clusters_done += fine_clusters_nums[i];
+  }
+  return n_clusters_done;
+}
+
+/**
+ * @brief Hierarchical balanced k-means
+ *
+ * Builds coarse mesoclusters first, then fine clusters within each mesocluster, and finally
+ * fine-tunes all fine cluster centers together; labels are stored as `uint32_t` internally.
+ *
+ * @tparam T      element type
+ * @tparam MathT  type of the centroids and mapped data
+ * @tparam IdxT   index type
+ * @tparam MappingOpT type of the mapping operation
+ *
+ * @param[in] handle The raft handle (also provides the stream and the workspace resource).
+ * @param[in] params Structure containing the hyper-parameters (incl. `metric` and `n_iters`)
+ * @param dim number of columns in `centers` and `dataset`
+ * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
+ * @param n_rows number of rows in the input
+ * @param[out] cluster_centers a device pointer to the found cluster centers [n_clusters, dim]
+ * @param n_clusters requested number of (fine) clusters
+ * @param mapping_op Mapping operation from T to MathT
+ */
+template <typename T, typename MathT, typename IdxT, typename MappingOpT>
+void build_hierarchical(const raft::device_resources& handle,
+                        const kmeans_balanced_params& params,
+                        IdxT dim,
+                        const T* dataset,
+                        IdxT n_rows,
+                        MathT* cluster_centers,
+                        IdxT n_clusters,
+                        MappingOpT mapping_op)
+{
+  auto stream  = handle.get_stream();
+  using LabelT = uint32_t;
+
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "build_hierarchical(%zu, %zu)", static_cast<size_t>(n_rows), static_cast<size_t>(n_clusters));
+
+  IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));
+  RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %zu", static_cast<size_t>(n_mesoclusters));
+
+  rmm::mr::managed_memory_resource managed_memory;
+  rmm::mr::device_memory_resource* device_memory = handle.get_workspace_resource();
+  auto [max_minibatch_size, mem_per_row] =
+    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+  auto pool_guard =
+    raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size));
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("build_hierarchical: using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
+  }
+
+  // Precompute the L2 norm of the dataset if relevant.
+  const MathT* dataset_norm = nullptr;
+  rmm::device_uvector<MathT> dataset_norm_buf(0, stream, device_memory);
+  if (params.metric == raft::distance::DistanceType::L2Expanded ||
+      params.metric == raft::distance::DistanceType::L2SqrtExpanded) {
+    dataset_norm_buf.resize(n_rows, stream);
+    for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
+      IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);
+      compute_norm(handle,
+                   dataset_norm_buf.data() + offset,
+                   dataset + dim * offset,
+                   dim,
+                   minibatch_size,
+                   mapping_op,
+                   device_memory);
+    }
+    dataset_norm = dataset_norm_buf.data();
+  }
+
+  /* Temporary workaround to cub::DeviceHistogram not supporting any type that isn't natively
+   * supported by atomicAdd: find a supported CounterT based on the IdxT. */
+  using CounterT =
+    std::conditional_t<sizeof(IdxT) == 8, unsigned long long int, unsigned int>;
+
+  // build coarse clusters (mesoclusters)
+  rmm::device_uvector<LabelT> mesocluster_labels_buf(n_rows, stream, &managed_memory);
+  rmm::device_uvector<CounterT> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
+  {
+    rmm::device_uvector<MathT> mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory);
+    build_clusters(handle,
+                   params,
+                   dim,
+                   dataset,
+                   n_rows,
+                   n_mesoclusters,
+                   mesocluster_centers_buf.data(),
+                   mesocluster_labels_buf.data(),
+                   mesocluster_sizes_buf.data(),
+                   mapping_op,
+                   device_memory,
+                   dataset_norm);
+  }
+
+  auto mesocluster_sizes  = mesocluster_sizes_buf.data();
+  auto mesocluster_labels = mesocluster_labels_buf.data();
+
+  handle.sync_stream(stream);
+
+  // build fine clusters
+  auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] =
+    arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes);
+
+  const IdxT mesocluster_size_max_balanced = div_rounding_up_safe<size_t>(
+    2lu * size_t(n_rows), std::max<size_t>(size_t(n_mesoclusters), 1lu));
+  if (mesocluster_size_max > mesocluster_size_max_balanced) {
+    RAFT_LOG_WARN(
+      "build_hierarchical: built unbalanced mesoclusters (max_mesocluster_size == %zu > %zu). "
+      "At most %zu points will be used for training within each mesocluster. "
+      "Consider increasing the number of training iterations `n_iters`.",
+      static_cast<size_t>(mesocluster_size_max),
+      static_cast<size_t>(mesocluster_size_max_balanced),
+      static_cast<size_t>(mesocluster_size_max_balanced));
+    RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters);
+    RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters);
+    mesocluster_size_max = mesocluster_size_max_balanced;
+  }
+
+  auto n_clusters_done = build_fine_clusters(handle,
+                                             params,
+                                             dim,
+                                             dataset,
+                                             dataset_norm,
+                                             mesocluster_labels,
+                                             n_rows,
+                                             fine_clusters_nums.data(),
+                                             fine_clusters_csum.data(),
+                                             mesocluster_sizes,
+                                             n_mesoclusters,
+                                             mesocluster_size_max,
+                                             fine_clusters_nums_max,
+                                             cluster_centers,
+                                             mapping_op,
+                                             &managed_memory,
+                                             device_memory);
+  RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
+
+  rmm::device_uvector<CounterT> cluster_sizes(n_clusters, stream, device_memory);
+  rmm::device_uvector<LabelT> labels(n_rows, stream, device_memory);
+
+  // Fine-tuning k-means for all clusters
+  //
+  // (*) Since the likely cluster centroids have been calculated hierarchically already, the number
+  // of iterations for fine-tuning kmeans for whole clusters should be reduced. However, there is a
+  // possibility that the clusters could be unbalanced here, in which case the actual number of
+  // iterations would be increased.
+  //
+  balancing_em_iters(handle,
+                     params,
+                     std::max<uint32_t>(params.n_iters / 10, 2),
+                     dim,
+                     dataset,
+                     dataset_norm,
+                     n_rows,
+                     n_clusters,
+                     cluster_centers,
+                     labels.data(),
+                     cluster_sizes.data(),
+                     5,
+                     MathT{0.2},
+                     mapping_op,
+                     device_memory);
+}
+
+}  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh
index 2973be8c23..76fc22e99e 100644
--- a/cpp/include/raft/cluster/detail/kmeans_common.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,14 +30,14 @@
 #include <raft/cluster/kmeans_types.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
-#include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/linalg/reduce_rows_by_key.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/gather.cuh>
@@ -88,7 +88,7 @@ struct KeyValueIndexOp {
 
 // Computes the intensity histogram from a sequence of labels
 template <typename SampleIteratorT, typename CounterT, typename IndexT>
-void countLabels(const raft::handle_t& handle,
+void countLabels(raft::device_resources const& handle,
                  SampleIteratorT labels,
                  CounterT* count,
                  IndexT n_samples,
@@ -96,9 +96,13 @@ void countLabels(const raft::handle_t& handle,
                  rmm::device_uvector<char>& workspace)
 {
   cudaStream_t stream = handle.get_stream();
-  IndexT num_levels   = n_clusters + 1;
-  IndexT lower_level  = 0;
-  IndexT upper_level  = n_clusters;
+
+  // CUB::DeviceHistogram requires a signed index type
+  typedef typename std::make_signed_t<IndexT> CubIndexT;
+
+  CubIndexT num_levels  = n_clusters + 1;
+  CubIndexT lower_level = 0;
+  CubIndexT upper_level = n_clusters;
 
   size_t temp_storage_bytes = 0;
   RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
@@ -108,7 +112,7 @@ void countLabels(const raft::handle_t& handle,
                                                     num_levels,
                                                     lower_level,
                                                     upper_level,
-                                                    n_samples,
+                                                    static_cast<CubIndexT>(n_samples),
                                                     stream));
 
   workspace.resize(temp_storage_bytes, stream);
@@ -120,12 +124,12 @@ void countLabels(const raft::handle_t& handle,
                                                     num_levels,
                                                     lower_level,
                                                     upper_level,
-                                                    n_samples,
+                                                    static_cast<CubIndexT>(n_samples),
                                                     stream));
 }
 
 template <typename DataT, typename IndexT>
-void checkWeight(const raft::handle_t& handle,
+void checkWeight(raft::device_resources const& handle,
                  raft::device_vector_view<DataT, IndexT> weight,
                  rmm::device_uvector<char>& workspace)
 {
@@ -156,12 +160,11 @@ void checkWeight(const raft::handle_t& handle,
       n_samples);
 
     auto scale = static_cast<DataT>(n_samples) / wt_sum;
-    raft::linalg::unaryOp(
-      weight.data_handle(),
-      weight.data_handle(),
-      n_samples,
-      [=] __device__(const DataT& wt) { return wt * scale; },
-      stream);
+    raft::linalg::unaryOp(weight.data_handle(),
+                          weight.data_handle(),
+                          n_samples,
+                          raft::mul_const_op<DataT>{scale},
+                          stream);
   }
 }
 
@@ -179,38 +182,47 @@ IndexT getCentroidsBatchSize(int batch_centroids, IndexT n_local_clusters)
   return (minVal == 0) ? n_local_clusters : minVal;
 }
 
-template <typename DataT, typename ReductionOpT, typename IndexT = int>
-void computeClusterCost(const raft::handle_t& handle,
-                        raft::device_vector_view<DataT, IndexT> minClusterDistance,
+template <typename InputT,
+          typename OutputT,
+          typename MainOpT,
+          typename ReductionOpT,
+          typename IndexT = int>
+void computeClusterCost(raft::device_resources const& handle,
+                        raft::device_vector_view<InputT, IndexT> minClusterDistance,
                         rmm::device_uvector<char>& workspace,
-                        raft::device_scalar_view<DataT> clusterCost,
+                        raft::device_scalar_view<OutputT> clusterCost,
+                        MainOpT main_op,
                         ReductionOpT reduction_op)
 {
-  cudaStream_t stream       = handle.get_stream();
+  cudaStream_t stream = handle.get_stream();
+
+  cub::TransformInputIterator<OutputT, MainOpT, InputT*> itr(minClusterDistance.data_handle(),
+                                                             main_op);
+
   size_t temp_storage_bytes = 0;
   RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(nullptr,
                                           temp_storage_bytes,
-                                          minClusterDistance.data_handle(),
+                                          itr,
                                           clusterCost.data_handle(),
                                           minClusterDistance.size(),
                                           reduction_op,
-                                          DataT(),
+                                          OutputT(),
                                           stream));
 
   workspace.resize(temp_storage_bytes, stream);
 
   RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(workspace.data(),
                                           temp_storage_bytes,
-                                          minClusterDistance.data_handle(),
+                                          itr,
                                           clusterCost.data_handle(),
                                           minClusterDistance.size(),
                                           reduction_op,
-                                          DataT(),
+                                          OutputT(),
                                           stream));
 }
 
 template <typename DataT, typename IndexT>
-void sampleCentroids(const raft::handle_t& handle,
+void sampleCentroids(raft::device_resources const& handle,
                      raft::device_matrix_view<const DataT, IndexT> X,
                      raft::device_vector_view<DataT, IndexT> minClusterDistance,
                      raft::device_vector_view<uint8_t, IndexT> isSampleCentroid,
@@ -267,16 +279,14 @@ void sampleCentroids(const raft::handle_t& handle,
                        sampledMinClusterDistance.data_handle(),
                        nPtsSampledInRank,
                        inRankCp.data(),
-                       [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> val) {  // MapTransformOp
-                         return val.key;
-                       },
+                       raft::key_op{},
                        stream);
 }
 
 // calculate pairwise distance between 'dataset[n x d]' and 'centroids[k x d]',
 // result will be stored in 'pairwiseDistance[n x k]'
 template <typename DataT, typename IndexT>
-void pairwise_distance_kmeans(const raft::handle_t& handle,
+void pairwise_distance_kmeans(raft::device_resources const& handle,
                               raft::device_matrix_view<const DataT, IndexT> X,
                               raft::device_matrix_view<const DataT, IndexT> centroids,
                               raft::device_matrix_view<DataT, IndexT> pairwiseDistance,
@@ -304,7 +314,7 @@ void pairwise_distance_kmeans(const raft::handle_t& handle,
 // shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores
 // in 'out' does not modify the input
 template <typename DataT, typename IndexT>
-void shuffleAndGather(const raft::handle_t& handle,
+void shuffleAndGather(raft::device_resources const& handle,
                       raft::device_matrix_view<const DataT, IndexT> in,
                       raft::device_matrix_view<DataT, IndexT> out,
                       uint32_t n_samples_to_gather,
@@ -329,7 +339,7 @@ void shuffleAndGather(const raft::handle_t& handle,
                        in.extent(1),
                        in.extent(0),
                        indices.data_handle(),
-                       n_samples_to_gather,
+                       static_cast<IndexT>(n_samples_to_gather),
                        out.data_handle(),
                        stream);
 }
@@ -339,7 +349,7 @@ void shuffleAndGather(const raft::handle_t& handle,
 // is the distance between the sample and the 'centroid[key]'
 template <typename DataT, typename IndexT>
 void minClusterAndDistanceCompute(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const DataT, IndexT> X,
   raft::device_matrix_view<const DataT, IndexT> centroids,
   raft::device_vector_view<raft::KeyValuePair<IndexT, DataT>, IndexT> minClusterAndDistance,
@@ -464,17 +474,15 @@ void minClusterAndDistanceCompute(
             pair.value = val;
             return pair;
           },
-          [=] __device__(raft::KeyValuePair<IndexT, DataT> a, raft::KeyValuePair<IndexT, DataT> b) {
-            return (b.value < a.value) ? b : a;
-          },
-          [=] __device__(raft::KeyValuePair<IndexT, DataT> pair) { return pair; });
+          raft::argmin_op{},
+          raft::identity_op{});
       }
     }
   }
 }
 
 template <typename DataT, typename IndexT>
-void minClusterDistanceCompute(const raft::handle_t& handle,
+void minClusterDistanceCompute(raft::device_resources const& handle,
                                raft::device_matrix_view<const DataT, IndexT> X,
                                raft::device_matrix_view<DataT, IndexT> centroids,
                                raft::device_vector_view<DataT, IndexT> minClusterDistance,
@@ -542,7 +550,6 @@ void minClusterDistanceCompute(const raft::handle_t& handle,
     if (is_fused) {
       workspace.resize((sizeof(IndexT)) * ns, stream);
 
-      // todo(lsugy): remove cIdx
       raft::distance::fusedL2NNMinReduce<DataT, DataT, IndexT>(
         minClusterDistanceView.data_handle(),
         datasetView.data_handle(),
@@ -577,30 +584,23 @@ void minClusterDistanceCompute(const raft::handle_t& handle,
         pairwise_distance_kmeans<DataT, IndexT>(
           handle, datasetView, centroidsView, pairwiseDistanceView, workspace, metric);
 
-        raft::linalg::coalescedReduction(
-          minClusterDistanceView.data_handle(),
-          pairwiseDistanceView.data_handle(),
-          pairwiseDistanceView.extent(1),
-          pairwiseDistanceView.extent(0),
-          std::numeric_limits<DataT>::max(),
-          stream,
-          true,
-          [=] __device__(DataT val, IndexT i) {  // MainLambda
-            return val;
-          },
-          [=] __device__(DataT a, DataT b) {  // ReduceLambda
-            return (b < a) ? b : a;
-          },
-          [=] __device__(DataT val) {  // FinalLambda
-            return val;
-          });
+        raft::linalg::coalescedReduction(minClusterDistanceView.data_handle(),
+                                         pairwiseDistanceView.data_handle(),
+                                         pairwiseDistanceView.extent(1),
+                                         pairwiseDistanceView.extent(0),
+                                         std::numeric_limits<DataT>::max(),
+                                         stream,
+                                         true,
+                                         raft::identity_op{},
+                                         raft::min_op{},
+                                         raft::identity_op{});
       }
     }
   }
 }
 
 template <typename DataT, typename IndexT>
-void countSamplesInCluster(const raft::handle_t& handle,
+void countSamplesInCluster(raft::device_resources const& handle,
                            const KMeansParams& params,
                            raft::device_matrix_view<const DataT, IndexT> X,
                            raft::device_vector_view<const DataT, IndexT> L2NormX,
diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
index 2746b6f657..a9d8777304 100644
--- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@
 #include <thrust/sort.h>
 #include <thrust/transform.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/spectral/detail/warn_dbg.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
@@ -360,7 +360,7 @@ static __global__ void divideCentroids(index_type_t d,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int chooseNewCentroid(handle_t const& handle,
+static int chooseNewCentroid(raft::device_resources const& handle,
                              index_type_t n,
                              index_type_t d,
                              value_type_t rand,
@@ -457,7 +457,7 @@ static int chooseNewCentroid(handle_t const& handle,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int initializeCentroids(handle_t const& handle,
+static int initializeCentroids(raft::device_resources const& handle,
                                index_type_t n,
                                index_type_t d,
                                index_type_t k,
@@ -568,7 +568,7 @@ static int initializeCentroids(handle_t const& handle,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int assignCentroids(handle_t const& handle,
+static int assignCentroids(raft::device_resources const& handle,
                            index_type_t n,
                            index_type_t d,
                            index_type_t k,
@@ -640,7 +640,7 @@ static int assignCentroids(handle_t const& handle,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int updateCentroids(handle_t const& handle,
+static int updateCentroids(raft::device_resources const& handle,
                            index_type_t n,
                            index_type_t d,
                            index_type_t k,
@@ -783,7 +783,7 @@ static int updateCentroids(handle_t const& handle,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int kmeans(handle_t const& handle,
+int kmeans(raft::device_resources const& handle,
            index_type_t n,
            index_type_t d,
            index_type_t k,
@@ -950,7 +950,7 @@ int kmeans(handle_t const& handle,
  *  @return error flag
  */
 template <typename index_type_t, typename value_type_t>
-int kmeans(handle_t const& handle,
+int kmeans(raft::device_resources const& handle,
            index_type_t n,
            index_type_t d,
            index_type_t k,
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index 8143d21641..46e31b672e 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ void merge_msts(sparse::solver::Graph_COO<value_idx, value_idx, value_t>& coo1,
  */
 template <typename value_idx, typename value_t, typename red_op>
 void connect_knn_graph(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const value_t* X,
   sparse::solver::Graph_COO<value_idx, value_idx, value_t>& msf,
   size_t m,
@@ -130,7 +130,7 @@ void connect_knn_graph(
  */
 template <typename value_idx, typename value_t, typename red_op>
 void build_sorted_mst(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const value_t* X,
   const value_idx* indptr,
   const value_idx* indices,
diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh
index d12db85e1b..473d858827 100644
--- a/cpp/include/raft/cluster/detail/single_linkage.cuh
+++ b/cpp/include/raft/cluster/detail/single_linkage.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@ static const size_t EMPTY = 0;
  * @param[in] n_clusters number of clusters to assign data samples
  */
 template <typename value_idx, typename value_t, LinkageDistance dist_type>
-void single_linkage(const raft::handle_t& handle,
+void single_linkage(raft::device_resources const& handle,
                     const value_t* X,
                     size_t m,
                     size_t n,
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index d64815244b..ac9e66d5da 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <raft/cluster/kmeans_types.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/operators.hpp>
 
 namespace raft::cluster::kmeans {
 
@@ -43,12 +44,12 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  *   k-means++ algorithm.
  *
  * @code{.cpp}
- *   #include <raft/core/handle.hpp>
+ *   #include <raft/core/device_resources.hpp>
  *   #include <raft/cluster/kmeans.cuh>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::handle_t handle;
+ *   raft::device_resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -82,7 +83,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  * @param[out]    n_iter        Number of iterations run.
  */
 template <typename DataT, typename IndexT>
-void fit(handle_t const& handle,
+void fit(raft::device_resources const& handle,
          const KMeansParams& params,
          raft::device_matrix_view<const DataT, IndexT> X,
          std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -97,12 +98,12 @@ void fit(handle_t const& handle,
  * @brief Predict the closest cluster each sample in X belongs to.
  *
  * @code{.cpp}
- *   #include <raft/core/handle.hpp>
+ *   #include <raft/core/device_resources.hpp>
  *   #include <raft/cluster/kmeans.cuh>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::handle_t handle;
+ *   raft::device_resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -146,7 +147,7 @@ void fit(handle_t const& handle,
  *                                 their closest cluster center.
  */
 template <typename DataT, typename IndexT>
-void predict(handle_t const& handle,
+void predict(raft::device_resources const& handle,
              const KMeansParams& params,
              raft::device_matrix_view<const DataT, IndexT> X,
              std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -164,12 +165,12 @@ void predict(handle_t const& handle,
  * in the input.
  *
  * @code{.cpp}
- *   #include <raft/core/handle.hpp>
+ *   #include <raft/core/device_resources.hpp>
  *   #include <raft/cluster/kmeans.cuh>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::handle_t handle;
+ *   raft::device_resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -209,7 +210,7 @@ void predict(handle_t const& handle,
  * @param[out]    n_iter        Number of iterations run.
  */
 template <typename DataT, typename IndexT>
-void fit_predict(handle_t const& handle,
+void fit_predict(raft::device_resources const& handle,
                  const KMeansParams& params,
                  raft::device_matrix_view<const DataT, IndexT> X,
                  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -238,7 +239,7 @@ void fit_predict(handle_t const& handle,
  *                              [dim = n_samples x n_features]
  */
 template <typename DataT, typename IndexT>
-void transform(const raft::handle_t& handle,
+void transform(raft::device_resources const& handle,
                const KMeansParams& params,
                raft::device_matrix_view<const DataT, IndexT> X,
                raft::device_matrix_view<const DataT, IndexT> centroids,
@@ -248,7 +249,7 @@ void transform(const raft::handle_t& handle,
 }
 
 template <typename DataT, typename IndexT>
-void transform(const raft::handle_t& handle,
+void transform(raft::device_resources const& handle,
                const KMeansParams& params,
                const DataT* X,
                const DataT* centroids,
@@ -280,7 +281,7 @@ void transform(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT>
-void sample_centroids(const raft::handle_t& handle,
+void sample_centroids(raft::device_resources const& handle,
                       raft::device_matrix_view<const DataT, IndexT> X,
                       raft::device_vector_view<DataT, IndexT> minClusterDistance,
                       raft::device_vector_view<std::uint8_t, IndexT> isSampleCentroid,
@@ -307,13 +308,14 @@ void sample_centroids(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT, typename ReductionOpT>
-void cluster_cost(const raft::handle_t& handle,
+void cluster_cost(raft::device_resources const& handle,
                   raft::device_vector_view<DataT, IndexT> minClusterDistance,
                   rmm::device_uvector<char>& workspace,
                   raft::device_scalar_view<DataT> clusterCost,
                   ReductionOpT reduction_op)
 {
-  detail::computeClusterCost(handle, minClusterDistance, workspace, clusterCost, reduction_op);
+  detail::computeClusterCost(
+    handle, minClusterDistance, workspace, clusterCost, raft::identity_op{}, reduction_op);
 }
 
 /**
@@ -332,7 +334,7 @@ void cluster_cost(const raft::handle_t& handle,
  * @param[out] new_centroids: output matrix of updated centroids (size n_clusters, n_features)
  */
 template <typename DataT, typename IndexT, typename LabelsIterator>
-void update_centroids(const raft::handle_t& handle,
+void update_centroids(raft::device_resources const& handle,
                       raft::device_matrix_view<const DataT, IndexT, row_major> X,
                       raft::device_vector_view<const DataT, IndexT> sample_weights,
                       raft::device_matrix_view<const DataT, IndexT, row_major> centroids,
@@ -373,7 +375,7 @@ void update_centroids(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT>
-void min_cluster_distance(const raft::handle_t& handle,
+void min_cluster_distance(raft::device_resources const& handle,
                           raft::device_matrix_view<const DataT, IndexT> X,
                           raft::device_matrix_view<DataT, IndexT> centroids,
                           raft::device_vector_view<DataT, IndexT> minClusterDistance,
@@ -424,7 +426,7 @@ void min_cluster_distance(const raft::handle_t& handle,
  */
 template <typename DataT, typename IndexT>
 void min_cluster_and_distance(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const DataT, IndexT> X,
   raft::device_matrix_view<const DataT, IndexT> centroids,
   raft::device_vector_view<raft::KeyValuePair<IndexT, DataT>, IndexT> minClusterAndDistance,
@@ -464,7 +466,7 @@ void min_cluster_and_distance(
  *
  */
 template <typename DataT, typename IndexT>
-void shuffle_and_gather(const raft::handle_t& handle,
+void shuffle_and_gather(raft::device_resources const& handle,
                         raft::device_matrix_view<const DataT, IndexT> in,
                         raft::device_matrix_view<DataT, IndexT> out,
                         uint32_t n_samples_to_gather,
@@ -493,7 +495,7 @@ void shuffle_and_gather(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT>
-void count_samples_in_cluster(const raft::handle_t& handle,
+void count_samples_in_cluster(raft::device_resources const& handle,
                               const KMeansParams& params,
                               raft::device_matrix_view<const DataT, IndexT> X,
                               raft::device_vector_view<DataT, IndexT> L2NormX,
@@ -523,7 +525,7 @@ void count_samples_in_cluster(const raft::handle_t& handle,
  * @param[in]  workspace             Temporary workspace buffer which can get resized
  */
 template <typename DataT, typename IndexT>
-void init_plus_plus(const raft::handle_t& handle,
+void init_plus_plus(raft::device_resources const& handle,
                     const KMeansParams& params,
                     raft::device_matrix_view<const DataT, IndexT> X,
                     raft::device_matrix_view<DataT, IndexT> centroids,
@@ -556,7 +558,7 @@ void init_plus_plus(const raft::handle_t& handle,
  * @param[in]     workspace     Temporary workspace buffer which can get resized
  */
 template <typename DataT, typename IndexT>
-void fit_main(const raft::handle_t& handle,
+void fit_main(raft::device_resources const& handle,
               const KMeansParams& params,
               raft::device_matrix_view<const DataT, IndexT> X,
               raft::device_vector_view<const DataT, IndexT> sample_weights,
@@ -603,7 +605,7 @@ namespace raft::cluster {
  * @param[out]    n_iter        Number of iterations run.
  */
 template <typename DataT, typename IndexT = int>
-void kmeans_fit(handle_t const& handle,
+void kmeans_fit(raft::device_resources const& handle,
                 const KMeansParams& params,
                 raft::device_matrix_view<const DataT, IndexT> X,
                 std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -615,7 +617,7 @@ void kmeans_fit(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_fit(handle_t const& handle,
+void kmeans_fit(raft::device_resources const& handle,
                 const KMeansParams& params,
                 const DataT* X,
                 const DataT* sample_weight,
@@ -650,7 +652,7 @@ void kmeans_fit(handle_t const& handle,
  *                                 their closest cluster center.
  */
 template <typename DataT, typename IndexT = int>
-void kmeans_predict(handle_t const& handle,
+void kmeans_predict(raft::device_resources const& handle,
                     const KMeansParams& params,
                     raft::device_matrix_view<const DataT, IndexT> X,
                     std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -664,7 +666,7 @@ void kmeans_predict(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_predict(handle_t const& handle,
+void kmeans_predict(raft::device_resources const& handle,
                     const KMeansParams& params,
                     const DataT* X,
                     const DataT* sample_weight,
@@ -715,7 +717,7 @@ void kmeans_predict(handle_t const& handle,
  * @param[out]    n_iter        Number of iterations run.
  */
 template <typename DataT, typename IndexT = int>
-void kmeans_fit_predict(handle_t const& handle,
+void kmeans_fit_predict(raft::device_resources const& handle,
                         const KMeansParams& params,
                         raft::device_matrix_view<const DataT, IndexT> X,
                         std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
@@ -729,7 +731,7 @@ void kmeans_fit_predict(handle_t const& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_fit_predict(handle_t const& handle,
+void kmeans_fit_predict(raft::device_resources const& handle,
                         const KMeansParams& params,
                         const DataT* X,
                         const DataT* sample_weight,
@@ -760,7 +762,7 @@ void kmeans_fit_predict(handle_t const& handle,
  *                              [dim = n_samples x n_features]
  */
 template <typename DataT, typename IndexT = int>
-void kmeans_transform(const raft::handle_t& handle,
+void kmeans_transform(raft::device_resources const& handle,
                       const KMeansParams& params,
                       raft::device_matrix_view<const DataT, IndexT> X,
                       raft::device_matrix_view<const DataT, IndexT> centroids,
@@ -770,7 +772,7 @@ void kmeans_transform(const raft::handle_t& handle,
 }
 
 template <typename DataT, typename IndexT = int>
-void kmeans_transform(const raft::handle_t& handle,
+void kmeans_transform(raft::device_resources const& handle,
                       const KMeansParams& params,
                       const DataT* X,
                       const DataT* centroids,
@@ -807,7 +809,7 @@ using KeyValueIndexOp = kmeans::KeyValueIndexOp<IndexT, DataT>;
  *
  */
 template <typename DataT, typename IndexT>
-void sampleCentroids(const raft::handle_t& handle,
+void sampleCentroids(raft::device_resources const& handle,
                      raft::device_matrix_view<const DataT, IndexT> X,
                      raft::device_vector_view<DataT, IndexT> minClusterDistance,
                      raft::device_vector_view<std::uint8_t, IndexT> isSampleCentroid,
@@ -834,7 +836,7 @@ void sampleCentroids(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT, typename ReductionOpT>
-void computeClusterCost(const raft::handle_t& handle,
+void computeClusterCost(raft::device_resources const& handle,
                         raft::device_vector_view<DataT, IndexT> minClusterDistance,
                         rmm::device_uvector<char>& workspace,
                         raft::device_scalar_view<DataT> clusterCost,
@@ -865,7 +867,7 @@ void computeClusterCost(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT>
-void minClusterDistanceCompute(const raft::handle_t& handle,
+void minClusterDistanceCompute(raft::device_resources const& handle,
                                const KMeansParams& params,
                                raft::device_matrix_view<const DataT, IndexT> X,
                                raft::device_matrix_view<DataT, IndexT> centroids,
@@ -912,7 +914,7 @@ void minClusterDistanceCompute(const raft::handle_t& handle,
  */
 template <typename DataT, typename IndexT>
 void minClusterAndDistanceCompute(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const KMeansParams& params,
   raft::device_matrix_view<const DataT, IndexT> X,
   raft::device_matrix_view<const DataT, IndexT> centroids,
@@ -950,7 +952,7 @@ void minClusterAndDistanceCompute(
  *
  */
 template <typename DataT, typename IndexT>
-void shuffleAndGather(const raft::handle_t& handle,
+void shuffleAndGather(raft::device_resources const& handle,
                       raft::device_matrix_view<const DataT, IndexT> in,
                       raft::device_matrix_view<DataT, IndexT> out,
                       uint32_t n_samples_to_gather,
@@ -979,7 +981,7 @@ void shuffleAndGather(const raft::handle_t& handle,
  *
  */
 template <typename DataT, typename IndexT>
-void countSamplesInCluster(const raft::handle_t& handle,
+void countSamplesInCluster(raft::device_resources const& handle,
                            const KMeansParams& params,
                            raft::device_matrix_view<const DataT, IndexT> X,
                            raft::device_vector_view<DataT, IndexT> L2NormX,
@@ -1010,7 +1012,7 @@ void countSamplesInCluster(const raft::handle_t& handle,
  * @param[in]  workspace             Temporary workspace buffer which can get resized
  */
 template <typename DataT, typename IndexT>
-void kmeansPlusPlus(const raft::handle_t& handle,
+void kmeansPlusPlus(raft::device_resources const& handle,
                     const KMeansParams& params,
                     raft::device_matrix_view<const DataT, IndexT> X,
                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
@@ -1043,7 +1045,7 @@ void kmeansPlusPlus(const raft::handle_t& handle,
  * @param[in]     workspace     Temporary workspace buffer which can get resized
  */
 template <typename DataT, typename IndexT>
-void kmeans_fit_main(const raft::handle_t& handle,
+void kmeans_fit_main(raft::device_resources const& handle,
                      const KMeansParams& params,
                      raft::device_matrix_view<const DataT, IndexT> X,
                      raft::device_vector_view<const DataT, IndexT> weight,
diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh
new file mode 100644
index 0000000000..405c7a8018
--- /dev/null
+++ b/cpp/include/raft/cluster/kmeans_balanced.cuh
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <utility>
+
+#include <raft/cluster/detail/kmeans_balanced.cuh>
+#include <raft/core/mdarray.hpp>
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::cluster::kmeans_balanced {
+
+/**
+ * @brief Find clusters of balanced sizes with a hierarchical k-means algorithm.
+ *
+ * This variant of the k-means algorithm first clusters the dataset in mesoclusters, then clusters
+ * the subsets associated to each mesocluster into fine clusters, and finally runs a few k-means
+ * iterations over the whole dataset and with all the centroids to obtain the final clusters.
+ *
+ * Each k-means iteration applies expectation-maximization-balancing:
+ *  - Balancing: adjust centers for clusters that have a small number of entries. If the size of a
+ *    cluster is below a threshold, the center is moved towards a bigger cluster.
+ *  - Expectation: predict the labels (i.e. find closest cluster centroid to each point)
+ *  - Maximization: calculate optimal centroids (i.e. find the center of gravity of each cluster)
+ *
+ * The number of mesoclusters is chosen by rounding the square root of the number of clusters. E.g.
+ * for 512 clusters, we would have 23 mesoclusters. The number of fine clusters per mesocluster is
+ * chosen proportionally to the number of points in each mesocluster.
+ *
+ * This variant of k-means uses random initialization and a fixed number of iterations, though
+ * iterations can be repeated if the balancing step moved the centroids.
+ *
+ * Additionally, this algorithm supports quantized datasets in arbitrary types but the core part of
+ * the algorithm will work with a floating-point type, hence a conversion function can be provided
+ * to map the data type to the math type.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/device_resources.hpp>
+ *   #include <raft/cluster/kmeans_balanced.cuh>
+ *   #include <raft/cluster/kmeans_balanced_types.hpp>
+ *   ...
+ *   raft::device_resources handle;
+ *   raft::cluster::kmeans_balanced_params params;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, n_clusters, n_features);
+ *   raft::cluster::kmeans_balanced::fit(handle, params, X, centroids.view());
+ * @endcode
+ *
+ * @tparam DataT Type of the input data.
+ * @tparam MathT Type of the centroids and mapped data.
+ * @tparam IndexT Type used for indexing.
+ * @tparam MappingOpT Type of the mapping function.
+ * @param[in]  handle     The raft resources
+ * @param[in]  params     Structure containing the hyper-parameters
+ * @param[in]  X          Training instances to cluster. The data must be in row-major format.
+ *                        [dim = n_samples x n_features]
+ * @param[out] centroids  The generated centroids [dim = n_clusters x n_features]
+ * @param[in]  mapping_op (optional) Functor to convert from the input datatype to the arithmetic
+ *                        datatype. If DataT == MathT, this must be the identity.
+ */
+template <typename DataT, typename MathT, typename IndexT, typename MappingOpT = raft::identity_op>
+void fit(const raft::device_resources& handle,
+         kmeans_balanced_params const& params,
+         raft::device_matrix_view<const DataT, IndexT> X,
+         raft::device_matrix_view<MathT, IndexT> centroids,
+         MappingOpT mapping_op = raft::identity_op())
+{
+  RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
+               "Number of features in dataset and centroids are different");
+  RAFT_EXPECTS(static_cast<uint64_t>(X.extent(0)) * static_cast<uint64_t>(X.extent(1)) <=
+                 static_cast<uint64_t>(std::numeric_limits<IndexT>::max()),
+               "The chosen index type cannot represent all indices for the given dataset");
+  RAFT_EXPECTS(centroids.extent(0) > IndexT{0} && centroids.extent(0) <= X.extent(0),
+               "The number of centroids must be strictly positive and cannot exceed the number of "
+               "points in the training dataset.");
+  // Preconditions checked above; forward raw pointers and extents to the detail implementation.
+  detail::build_hierarchical(handle,
+                             params,
+                             X.extent(1),  // n_features
+                             X.data_handle(),
+                             X.extent(0),  // n_samples
+                             centroids.data_handle(),
+                             centroids.extent(0),  // n_clusters
+                             mapping_op);
+}
+
+/**
+ * @brief Predict the closest cluster each sample in X belongs to.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/device_resources.hpp>
+ *   #include <raft/cluster/kmeans_balanced.cuh>
+ *   #include <raft/cluster/kmeans_balanced_types.hpp>
+ *   ...
+ *   raft::device_resources handle;
+ *   raft::cluster::kmeans_balanced_params params;
+ *   auto labels = raft::make_device_vector<int, int>(handle, n_rows);
+ *   raft::cluster::kmeans_balanced::predict(handle, params, X, centroids, labels.view());
+ * @endcode
+ *
+ * @tparam DataT Type of the input data.
+ * @tparam MathT Type of the centroids and mapped data.
+ * @tparam IndexT Type used for indexing.
+ * @tparam LabelT Type of the output labels.
+ * @tparam MappingOpT Type of the mapping function.
+ * @param[in]  handle     The raft resources
+ * @param[in]  params     Structure containing the hyper-parameters
+ * @param[in]  X          Dataset for which to infer the closest clusters.
+ *                        [dim = n_samples x n_features]
+ * @param[in]  centroids  The input centroids [dim = n_clusters x n_features]
+ * @param[out] labels     The output labels [dim = n_samples]
+ * @param[in]  mapping_op (optional) Functor to convert from the input datatype to the arithmetic
+ *                        datatype. If DataT == MathT, this must be the identity.
+ */
+template <typename DataT,
+          typename MathT,
+          typename IndexT,
+          typename LabelT,
+          typename MappingOpT = raft::identity_op>
+void predict(const raft::device_resources& handle,
+             kmeans_balanced_params const& params,
+             raft::device_matrix_view<const DataT, IndexT> X,
+             raft::device_matrix_view<const MathT, IndexT> centroids,
+             raft::device_vector_view<LabelT, IndexT> labels,
+             MappingOpT mapping_op = raft::identity_op())
+{
+  RAFT_EXPECTS(X.extent(0) == labels.extent(0),
+               "Number of rows in dataset and labels are different");
+  RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
+               "Number of features in dataset and centroids are different");
+  RAFT_EXPECTS(static_cast<uint64_t>(X.extent(0)) * static_cast<uint64_t>(X.extent(1)) <=
+                 static_cast<uint64_t>(std::numeric_limits<IndexT>::max()),
+               "The chosen index type cannot represent all indices for the given dataset");
+  RAFT_EXPECTS(static_cast<uint64_t>(centroids.extent(0)) <=
+                 static_cast<uint64_t>(std::numeric_limits<LabelT>::max()),
+               "The chosen label type cannot represent all cluster labels");
+  // Preconditions checked above; forward raw pointers and extents to the detail implementation.
+  detail::predict(handle,
+                  params,
+                  centroids.data_handle(),
+                  centroids.extent(0),  // n_clusters
+                  X.extent(1),          // n_features
+                  X.data_handle(),
+                  X.extent(0),  // n_samples
+                  labels.data_handle(),
+                  mapping_op);
+}
+
+/**
+ * @brief Compute hierarchical balanced k-means clustering and predict cluster index for each sample
+ * in the input.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/device_resources.hpp>
+ *   #include <raft/cluster/kmeans_balanced.cuh>
+ *   #include <raft/cluster/kmeans_balanced_types.hpp>
+ *   ...
+ *   raft::device_resources handle;
+ *   raft::cluster::kmeans_balanced_params params;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, n_clusters, n_features);
+ *   auto labels = raft::make_device_vector<int, int>(handle, n_rows);
+ *   raft::cluster::kmeans_balanced::fit_predict(
+ *       handle, params, X, centroids.view(), labels.view());
+ * @endcode
+ *
+ * @tparam DataT Type of the input data.
+ * @tparam MathT Type of the centroids and mapped data.
+ * @tparam IndexT Type used for indexing.
+ * @tparam LabelT Type of the output labels.
+ * @tparam MappingOpT Type of the mapping function.
+ * @param[in]  handle     The raft resources
+ * @param[in]  params     Structure containing the hyper-parameters
+ * @param[in]  X          Training instances to cluster. The data must be in row-major format.
+ *                        [dim = n_samples x n_features]
+ * @param[out] centroids  The output centroids [dim = n_clusters x n_features]
+ * @param[out] labels     The output labels [dim = n_samples]
+ * @param[in]  mapping_op (optional) Functor to convert from the input datatype to the arithmetic
+ *                        datatype. If DataT and MathT are the same, this must be the identity.
+ */
+template <typename DataT,
+          typename MathT,
+          typename IndexT,
+          typename LabelT,
+          typename MappingOpT = raft::identity_op>
+void fit_predict(const raft::device_resources& handle,
+                 kmeans_balanced_params const& params,
+                 raft::device_matrix_view<const DataT, IndexT> X,
+                 raft::device_matrix_view<MathT, IndexT> centroids,
+                 raft::device_vector_view<LabelT, IndexT> labels,
+                 MappingOpT mapping_op = raft::identity_op())
+{
+  raft::cluster::kmeans_balanced::fit(handle, params, X, centroids, mapping_op);  // train first
+  auto trained = raft::make_device_matrix_view<const MathT, IndexT>(  // read-only alias
+    centroids.data_handle(), centroids.extent(0), centroids.extent(1));
+  raft::cluster::kmeans_balanced::predict(handle, params, X, trained, labels, mapping_op);
+}
+
+namespace helpers {
+
+/**
+ * @brief Randomly initialize centers and apply expectation-maximization-balancing iterations
+ *
+ * This is essentially the non-hierarchical balanced k-means algorithm which is used by the
+ * hierarchical algorithm once to build the mesoclusters and once per mesocluster to build the fine
+ * clusters.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/device_resources.hpp>
+ *   #include <raft/cluster/kmeans_balanced.cuh>
+ *   #include <raft/cluster/kmeans_balanced_types.hpp>
+ *   ...
+ *   raft::device_resources handle;
+ *   raft::cluster::kmeans_balanced_params params;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, n_clusters, n_features);
+ *   auto labels = raft::make_device_vector<int, int>(handle, n_samples);
+ *   auto sizes = raft::make_device_vector<int, int>(handle, n_clusters);
+ *   raft::cluster::kmeans_balanced::helpers::build_clusters(
+ *       handle, params, X, centroids.view(), labels.view(), sizes.view());
+ * @endcode
+ *
+ * @tparam DataT Type of the input data.
+ * @tparam MathT Type of the centroids and mapped data.
+ * @tparam IndexT Type used for indexing.
+ * @tparam LabelT Type of the output labels.
+ * @tparam CounterT Counter type supported by CUDA's native atomicAdd.
+ * @tparam MappingOpT Type of the mapping function.
+ * @param[in]  handle        The raft resources
+ * @param[in]  params        Structure containing the hyper-parameters
+ * @param[in]  X             Training instances to cluster. The data must be in row-major format.
+ *                           [dim = n_samples x n_features]
+ * @param[out] centroids     The output centroids [dim = n_clusters x n_features]
+ * @param[out] labels        The output labels [dim = n_samples]
+ * @param[out] cluster_sizes Size of each cluster [dim = n_clusters]
+ * @param[in]  mapping_op    (optional) Functor to convert from the input datatype to the
+ *                           arithmetic datatype. If DataT == MathT, this must be the identity.
+ * @param[in]  X_norm        (optional) Dataset's row norms [dim = n_samples]
+ */
+template <typename DataT,
+          typename MathT,
+          typename IndexT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT>
+void build_clusters(const raft::device_resources& handle,
+                    const kmeans_balanced_params& params,
+                    raft::device_matrix_view<const DataT, IndexT> X,
+                    raft::device_matrix_view<MathT, IndexT> centroids,
+                    raft::device_vector_view<LabelT, IndexT> labels,
+                    raft::device_vector_view<CounterT, IndexT> cluster_sizes,
+                    MappingOpT mapping_op = raft::identity_op(),
+                    std::optional<raft::device_vector_view<const MathT>> X_norm = std::nullopt)
+{
+  RAFT_EXPECTS(X.extent(0) == labels.extent(0),
+               "Number of rows in dataset and labels are different");
+  RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
+               "Number of features in dataset and centroids are different");
+  RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0),
+               "Number of rows in centroids and clusyer_sizes are different");
+
+  detail::build_clusters(handle,
+                         params,
+                         X.extent(1),
+                         X.data_handle(),
+                         X.extent(0),
+                         centroids.extent(0),
+                         centroids.data_handle(),
+                         labels.data_handle(),
+                         cluster_sizes.data_handle(),
+                         mapping_op,
+                         handle.get_workspace_resource(),
+                         X_norm.has_value() ? X_norm.value().data_handle() : nullptr);
+}
+
+/**
+ * @brief Given the data and labels, calculate cluster centers and sizes in one sweep.
+ *
+ * Let `S_i = {x_k | x_k \in X & labels[k] == i}` be the vectors in the dataset with label i.
+ *
+ * On exit,
+ *   `centers_i = (\sum_{x \in S_i} x + w_i * center_i) / (|S_i| + w_i)`,
+ *     where  `w_i = reset_counters ?  0 : cluster_size[i]`.
+ *
+ * In other words, the updated cluster centers are a weighted average of the existing cluster
+ * center, and the coordinates of the points labeled with i. _This allows calling this function
+ * multiple times with different datasets with the same effect as if calling this function once
+ * on the combined dataset_.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/handle.hpp>
+ *   #include <raft/cluster/kmeans_balanced.cuh>
+ *   ...
+ *   raft::handle_t handle;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, n_clusters, n_features);
+ *   auto sizes = raft::make_device_vector<int, int>(handle, n_clusters);
+ *   raft::cluster::kmeans_balanced::helpers::calc_centers_and_sizes(
+ *       handle, X, labels, centroids.view(), sizes.view(), true);
+ * @endcode
+ *
+ * @tparam DataT Type of the input data.
+ * @tparam MathT Type of the centroids and mapped data.
+ * @tparam IndexT Type used for indexing.
+ * @tparam LabelT Type of the input labels.
+ * @tparam CounterT Counter type supported by CUDA's native atomicAdd.
+ * @tparam MappingOpT Type of the mapping function.
+ * @param[in]    handle         The raft resources
+ * @param[in]    X              Dataset for which to calculate cluster centers. The data must be in
+ *                              row-major format. [dim = n_samples x n_features]
+ * @param[in]    labels         The input labels [dim = n_samples]
+ * @param[inout] centroids      The centroids to compute or update [dim = n_clusters x n_features]
+ * @param[inout] cluster_sizes  Size of each cluster [dim = n_clusters]
+ * @param[in]    reset_counters Whether to clear the output arrays before calculating.
+ *                              When set to `false`, this function may be used to update existing
+ *                              centers and sizes using the weighted average principle.
+ * @param[in]    mapping_op     (optional) Functor to convert from the input datatype to the
+ *                              arithmetic datatype. If DataT == MathT, this must be the identity.
+ */
+template <typename DataT,
+          typename MathT,
+          typename IndexT,
+          typename LabelT,
+          typename CounterT,
+          typename MappingOpT = raft::identity_op>
+void calc_centers_and_sizes(const raft::device_resources& handle,
+                            raft::device_matrix_view<const DataT, IndexT> X,
+                            raft::device_vector_view<const LabelT, IndexT> labels,
+                            raft::device_matrix_view<MathT, IndexT> centroids,
+                            raft::device_vector_view<CounterT, IndexT> cluster_sizes,
+                            bool reset_counters   = true,
+                            MappingOpT mapping_op = raft::identity_op())
+{
+  RAFT_EXPECTS(X.extent(0) == labels.extent(0),
+               "Number of rows in dataset and labels are different");
+  RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
+               "Number of features in dataset and centroids are different");
+  RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0),
+               "Number of rows in centroids and cluster_sizes are different");
+
+  detail::calc_centers_and_sizes(handle,
+                                 centroids.data_handle(),
+                                 cluster_sizes.data_handle(),
+                                 centroids.extent(0),
+                                 X.extent(1),
+                                 X.data_handle(),
+                                 X.extent(0),
+                                 labels.data_handle(),
+                                 reset_counters,
+                                 mapping_op);
+}
+
+}  // namespace helpers
+
+}  // namespace raft::cluster::kmeans_balanced
diff --git a/cpp/include/raft/cluster/kmeans_balanced_types.hpp b/cpp/include/raft/cluster/kmeans_balanced_types.hpp
new file mode 100644
index 0000000000..11b77e288a
--- /dev/null
+++ b/cpp/include/raft/cluster/kmeans_balanced_types.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cluster/kmeans_types.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/random/rng_state.hpp>
+
+namespace raft::cluster::kmeans_balanced {
+
+/**
+ * Simple object to specify hyper-parameters to the balanced k-means algorithm.
+ *
+ * The following metrics are currently supported in k-means balanced:
+ *  - InnerProduct
+ *  - L2Expanded
+ *  - L2SqrtExpanded
+ */
+struct kmeans_balanced_params : kmeans_base_params {
+  /**
+   * How many training iterations to run (20 unless overridden).
+   */
+  uint32_t n_iters{20};
+};
+
+}  // namespace raft::cluster::kmeans_balanced
+
+namespace raft::cluster {
+
+using kmeans_balanced::kmeans_balanced_params;
+
+}  // namespace raft::cluster
diff --git a/cpp/include/raft/cluster/kmeans_deprecated.cuh b/cpp/include/raft/cluster/kmeans_deprecated.cuh
index a4cac4cb0f..8e0861ada1 100644
--- a/cpp/include/raft/cluster/kmeans_deprecated.cuh
+++ b/cpp/include/raft/cluster/kmeans_deprecated.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ namespace cluster {
  *  @return error flag
  */
 template <typename index_type_t, typename value_type_t>
-int kmeans(handle_t const& handle,
+int kmeans(raft::device_resources const& handle,
            index_type_t n,
            index_type_t d,
            index_type_t k,
diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp
index f411b12b5c..4d956ad7a0 100644
--- a/cpp/include/raft/cluster/kmeans_types.hpp
+++ b/cpp/include/raft/cluster/kmeans_types.hpp
@@ -18,12 +18,24 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/random/rng_state.hpp>
 
+namespace raft::cluster {
+
+/** Base structure for parameters that are common to all k-means algorithms */
+struct kmeans_base_params {
+  /**
+   * Distance metric, L2Expanded by default. Which metrics are supported depends on the algorithm.
+   */
+  raft::distance::DistanceType metric{raft::distance::DistanceType::L2Expanded};
+};
+
+}  // namespace raft::cluster
+
 namespace raft::cluster::kmeans {
 
 /**
  * Simple object to specify hyper-parameters to the kmeans algorithm.
  */
-struct KMeansParams {
+struct KMeansParams : kmeans_base_params {
   enum InitMethod {
 
     /**
@@ -75,13 +87,7 @@ struct KMeansParams {
   /**
    * Seed to the random number generator.
    */
-  raft::random::RngState rng_state =
-    raft::random::RngState(0, raft::random::GeneratorType::GenPhilox);
-
-  /**
-   * Metric to use for distance computation.
-   */
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded;
+  raft::random::RngState rng_state{0};
 
   /**
    * Number of instance k-means algorithm will be run with different seeds.
diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh
index 2d74c364b2..91241b853b 100644
--- a/cpp/include/raft/cluster/single_linkage.cuh
+++ b/cpp/include/raft/cluster/single_linkage.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ namespace raft::cluster {
 template <typename value_idx,
           typename value_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(const raft::handle_t& handle,
+void single_linkage(raft::device_resources const& handle,
                     const value_t* X,
                     size_t m,
                     size_t n,
@@ -87,7 +87,7 @@ constexpr int DEFAULT_CONST_C = 15;
  control of k. The algorithm will set `k = log(n) + c`
  */
 template <typename value_t, typename idx_t, LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(const raft::handle_t& handle,
+void single_linkage(raft::device_resources const& handle,
                     raft::device_matrix_view<const value_t, idx_t, row_major> X,
                     raft::device_matrix_view<idx_t, idx_t, row_major> dendrogram,
                     raft::device_vector_view<idx_t, idx_t> labels,
diff --git a/cpp/include/raft/comms/comms_test.hpp b/cpp/include/raft/comms/comms_test.hpp
index c7e5dd3ab6..c61bb32f79 100644
--- a/cpp/include/raft/comms/comms_test.hpp
+++ b/cpp/include/raft/comms/comms_test.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include <raft/comms/comms.hpp>
 #include <raft/comms/detail/test.hpp>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace comms {
@@ -31,7 +31,7 @@ namespace comms {
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_allreduce(const handle_t& handle, int root)
+bool test_collective_allreduce(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_allreduce(handle, root);
 }
@@ -43,7 +43,7 @@ bool test_collective_allreduce(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_broadcast(const handle_t& handle, int root)
+bool test_collective_broadcast(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_broadcast(handle, root);
 }
@@ -55,7 +55,7 @@ bool test_collective_broadcast(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_reduce(const handle_t& handle, int root)
+bool test_collective_reduce(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_reduce(handle, root);
 }
@@ -67,7 +67,7 @@ bool test_collective_reduce(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_allgather(const handle_t& handle, int root)
+bool test_collective_allgather(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_allgather(handle, root);
 }
@@ -79,7 +79,7 @@ bool test_collective_allgather(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_gather(const handle_t& handle, int root)
+bool test_collective_gather(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_gather(handle, root);
 }
@@ -91,7 +91,7 @@ bool test_collective_gather(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_gatherv(const handle_t& handle, int root)
+bool test_collective_gatherv(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_gatherv(handle, root);
 }
@@ -103,7 +103,7 @@ bool test_collective_gatherv(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_reducescatter(const handle_t& handle, int root)
+bool test_collective_reducescatter(raft::device_resources const& handle, int root)
 {
   return detail::test_collective_reducescatter(handle, root);
 }
@@ -115,7 +115,7 @@ bool test_collective_reducescatter(const handle_t& handle, int root)
  *        initialized comms instance.
  * @param[in] numTrials number of iterations of all-to-all messaging to perform
  */
-bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
+bool test_pointToPoint_simple_send_recv(raft::device_resources const& h, int numTrials)
 {
   return detail::test_pointToPoint_simple_send_recv(h, numTrials);
 }
@@ -127,7 +127,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_send_or_recv(raft::device_resources const& h, int numTrials)
 {
   return detail::test_pointToPoint_device_send_or_recv(h, numTrials);
 }
@@ -139,7 +139,7 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_sendrecv(raft::device_resources const& h, int numTrials)
 {
   return detail::test_pointToPoint_device_sendrecv(h, numTrials);
 }
@@ -151,7 +151,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_multicast_sendrecv(raft::device_resources const& h, int numTrials)
 {
   return detail::test_pointToPoint_device_multicast_sendrecv(h, numTrials);
 }
@@ -163,6 +163,9 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial
  *        initialized comms instance.
  * @param n_colors number of different colors to test
  */
-bool test_commsplit(const handle_t& h, int n_colors) { return detail::test_commsplit(h, n_colors); }
+bool test_commsplit(raft::device_resources const& h, int n_colors)
+{  // NOTE(review): non-inline definition in a header (like its siblings); confirm single-TU use
+  return detail::test_commsplit(h, n_colors);  // forwards unchanged to the detail implementation
+}
 }  // namespace comms
 };  // namespace raft
diff --git a/cpp/include/raft/comms/detail/mpi_comms.hpp b/cpp/include/raft/comms/detail/mpi_comms.hpp
index 508a9ce717..4062389eea 100644
--- a/cpp/include/raft/comms/detail/mpi_comms.hpp
+++ b/cpp/include/raft/comms/detail/mpi_comms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,8 +28,8 @@
 
 #include <raft/comms/comms.hpp>
 #include <raft/comms/detail/util.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/error.hpp>
-#include <raft/core/handle.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp
index 33892597d8..0db27f0a45 100644
--- a/cpp/include/raft/comms/detail/std_comms.hpp
+++ b/cpp/include/raft/comms/detail/std_comms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #include <raft/comms/detail/ucp_helper.hpp>
 #include <raft/comms/detail/util.hpp>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/comms/detail/test.hpp b/cpp/include/raft/comms/detail/test.hpp
index 6ba4be3886..2b12bf2d2a 100644
--- a/cpp/include/raft/comms/detail/test.hpp
+++ b/cpp/include/raft/comms/detail/test.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/comms/comms.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -38,7 +38,7 @@ namespace detail {
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_allreduce(const handle_t& handle, int root)
+bool test_collective_allreduce(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -69,7 +69,7 @@ bool test_collective_allreduce(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_broadcast(const handle_t& handle, int root)
+bool test_collective_broadcast(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -104,7 +104,7 @@ bool test_collective_broadcast(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_reduce(const handle_t& handle, int root)
+bool test_collective_reduce(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -140,7 +140,7 @@ bool test_collective_reduce(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_allgather(const handle_t& handle, int root)
+bool test_collective_allgather(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -177,7 +177,7 @@ bool test_collective_allgather(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_gather(const handle_t& handle, int root)
+bool test_collective_gather(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -214,7 +214,7 @@ bool test_collective_gather(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_gatherv(const handle_t& handle, int root)
+bool test_collective_gatherv(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -273,7 +273,7 @@ bool test_collective_gatherv(const handle_t& handle, int root)
  *        initialized comms instance.
  *  @param[in] root the root rank id
  */
-bool test_collective_reducescatter(const handle_t& handle, int root)
+bool test_collective_reducescatter(raft::device_resources const& handle, int root)
 {
   comms_t const& communicator = handle.get_comms();
 
@@ -308,7 +308,7 @@ bool test_collective_reducescatter(const handle_t& handle, int root)
  *        initialized comms instance.
  * @param[in] numTrials number of iterations of all-to-all messaging to perform
  */
-bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
+bool test_pointToPoint_simple_send_recv(raft::device_resources const& h, int numTrials)
 {
   comms_t const& communicator = h.get_comms();
   int const rank              = communicator.get_rank();
@@ -373,7 +373,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_send_or_recv(raft::device_resources const& h, int numTrials)
 {
   comms_t const& communicator = h.get_comms();
   int const rank              = communicator.get_rank();
@@ -415,7 +415,7 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_sendrecv(raft::device_resources const& h, int numTrials)
 {
   comms_t const& communicator = h.get_comms();
   int const rank              = communicator.get_rank();
@@ -461,7 +461,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials)
+bool test_pointToPoint_device_multicast_sendrecv(raft::device_resources const& h, int numTrials)
 {
   comms_t const& communicator = h.get_comms();
   int const rank              = communicator.get_rank();
@@ -520,7 +520,7 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial
  *        initialized comms instance.
  * @param n_colors number of different colors to test
  */
-bool test_commsplit(const handle_t& h, int n_colors)
+bool test_commsplit(raft::device_resources const& h, int n_colors)
 {
   comms_t const& communicator = h.get_comms();
   int const rank              = communicator.get_rank();
diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp
deleted file mode 100644
index f6b63ac971..0000000000
--- a/cpp/include/raft/comms/helper.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/comms/std_comms.hpp>
-#include <raft/core/handle.hpp>
-
-#include <iostream>
-#include <nccl.h>
-#include <ucp/api/ucp.h>
-
-namespace raft {
-namespace comms {
-
-/**
- * Function to construct comms_t and inject it on a handle_t. This
- * is used for convenience in the Python layer.
- *
- * @param handle raft::handle_t for injecting the comms
- * @param nccl_comm initialized NCCL communicator to use for collectives
- * @param num_ranks number of ranks in communicator clique
- * @param rank rank of local instance
- */
-void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
-{
-  cudaStream_t stream = handle->get_stream();
-
-  auto communicator = std::make_shared<comms_t>(
-    std::unique_ptr<comms_iface>(new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream)));
-  handle->set_comms(communicator);
-}
-
-/**
- * Function to construct comms_t and inject it on a handle_t. This
- * is used for convenience in the Python layer.
- *
- * @param handle raft::handle_t for injecting the comms
- * @param nccl_comm initialized NCCL communicator to use for collectives
- * @param ucp_worker of local process
- *        Note: This is purposefully left as void* so that the ucp_worker_h
- *        doesn't need to be exposed through the cython layer
- * @param eps array of ucp_ep_h instances.
- *        Note: This is purposefully left as void* so that
- *        the ucp_ep_h doesn't need to be exposed through the cython layer.
- * @param num_ranks number of ranks in communicator clique
- * @param rank rank of local instance
- */
-void build_comms_nccl_ucx(
-  handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
-{
-  auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
-
-  auto size_t_ep_arr = reinterpret_cast<size_t*>(eps);
-
-  for (int i = 0; i < num_ranks; i++) {
-    size_t ptr    = size_t_ep_arr[i];
-    auto ucp_ep_v = reinterpret_cast<ucp_ep_h*>(*eps_sp);
-
-    if (ptr != 0) {
-      auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
-      ucp_ep_v[i]  = eps_ptr;
-    } else {
-      ucp_ep_v[i] = nullptr;
-    }
-  }
-
-  cudaStream_t stream = handle->get_stream();
-
-  auto communicator =
-    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new raft::comms::std_comms(
-      nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream)));
-  handle->set_comms(communicator);
-}
-
-inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size)
-{
-  memcpy(id->internal, uniqueId, size);
-}
-
-inline void get_unique_id(char* uid, int size)
-{
-  ncclUniqueId id;
-  ncclGetUniqueId(&id);
-
-  memcpy(uid, id.internal, size);
-}
-};  // namespace comms
-};  // end namespace raft
diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp
index ca5275cd06..9076176ea6 100644
--- a/cpp/include/raft/comms/mpi_comms.hpp
+++ b/cpp/include/raft/comms/mpi_comms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,12 +24,47 @@ namespace comms {
 
 using mpi_comms = detail::mpi_comms;
 
-inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm)
+/**
+ * @defgroup mpi_comms_factory MPI Comms Factory Functions
+ * @{
+ */
+
+/**
+ * Given a properly initialized MPI_Comm, construct an instance of RAFT's
+ * MPI Communicator and inject it into the given RAFT handle instance
+ * @param handle raft handle for managing expensive resources
+ * @param comm an initialized MPI communicator
+ *
+ * @code{.cpp}
+ * #include <raft/comms/mpi_comms.hpp>
+ * #include <raft/core/device_mdarray.hpp>
+ *
+ * MPI_Comm mpi_comm;
+ * raft::device_resources handle;
+ *
+ * initialize_mpi_comms(&handle, mpi_comm);
+ * ...
+ * const auto& comm = handle.get_comms();
+ * auto gather_data = raft::make_device_vector<float>(handle, comm.get_size());
+ * ...
+ * comm.allgather(gather_data.data_handle() + comm.get_rank(),
+ *                gather_data.data_handle(),
+ *                1,
+ *                handle.get_stream());
+ *
+ * comm.sync_stream(handle.get_stream());
+ * @endcode
+ */
+inline void initialize_mpi_comms(device_resources* handle, MPI_Comm comm)
 {
   auto communicator = std::make_shared<comms_t>(
     std::unique_ptr<comms_iface>(new mpi_comms(comm, false, handle->get_stream())));
   handle->set_comms(communicator);
 };
 
+/**
+ * @}
+ */
+
 };  // namespace comms
 };  // end namespace raft
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index edace60fbd..6370d4a8e6 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <raft/comms/comms.hpp>
 #include <raft/comms/detail/std_comms.hpp>
@@ -31,15 +31,40 @@ namespace comms {
 using std_comms = detail::std_comms;
 
 /**
- * Function to construct comms_t and inject it on a handle_t. This
- * is used for convenience in the Python layer.
+ * @defgroup std_comms_factory std_comms Factory functions
+ * @{
+ */
+
+/**
+ * Factory function to construct a RAFT NCCL communicator and inject it into a
+ * RAFT handle.
  *
- * @param handle raft::handle_t for injecting the comms
+ * @param handle raft::device_resources for injecting the comms
  * @param nccl_comm initialized NCCL communicator to use for collectives
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
+ *
+ * @code{.cpp}
+ * #include <raft/comms/std_comms.hpp>
+ * #include <raft/core/device_mdarray.hpp>
+ *
+ * ncclComm_t nccl_comm;
+ * raft::device_resources handle;
+ *
+ * build_comms_nccl_only(&handle, nccl_comm, 5, 0);
+ * ...
+ * const auto& comm = handle.get_comms();
+ * auto gather_data = raft::make_device_vector<float>(handle, comm.get_size());
+ * ...
+ * comm.allgather(gather_data.data_handle() + comm.get_rank(),
+ *                gather_data.data_handle(),
+ *                1,
+ *                handle.get_stream());
+ *
+ * comm.sync_stream(handle.get_stream());
+ * @endcode
  */
-void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
+void build_comms_nccl_only(device_resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
 {
   cudaStream_t stream = handle->get_stream();
 
@@ -49,10 +74,10 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks
 }
 
 /**
- * Function to construct comms_t and inject it on a handle_t. This
- * is used for convenience in the Python layer.
+ * Factory function to construct a RAFT NCCL+UCX communicator and inject it
+ * into a RAFT handle.
  *
- * @param handle raft::handle_t for injecting the comms
+ * @param handle raft::device_resources for injecting the comms
  * @param nccl_comm initialized NCCL communicator to use for collectives
  * @param ucp_worker of local process
  *        Note: This is purposefully left as void* so that the ucp_worker_h
@@ -62,9 +87,35 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks
  *        the ucp_ep_h doesn't need to be exposed through the cython layer.
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
+ *
+ * @code{.cpp}
+ * #include <raft/comms/std_comms.hpp>
+ * #include <raft/core/device_mdarray.hpp>
+ *
+ * ncclComm_t nccl_comm;
+ * raft::device_resources handle;
+ * ucp_worker_h ucp_worker;
+ * ucp_ep_h *ucp_endpoints_arr;
+ *
+ * build_comms_nccl_ucx(&handle, nccl_comm, &ucp_worker, ucp_endpoints_arr, 5, 0);
+ * ...
+ * const auto& comm = handle.get_comms();
+ * auto gather_data = raft::make_device_vector<float>(handle, comm.get_size());
+ * ...
+ * comm.allgather(gather_data.data_handle() + comm.get_rank(),
+ *                gather_data.data_handle(),
+ *                1,
+ *                handle.get_stream());
+ *
+ * comm.sync_stream(handle.get_stream());
+ * @endcode
  */
-void build_comms_nccl_ucx(
-  handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
+void build_comms_nccl_ucx(device_resources* handle,
+                          ncclComm_t nccl_comm,
+                          void* ucp_worker,
+                          void* eps,
+                          int num_ranks,
+                          int rank)
 {
   auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
 
@@ -90,6 +141,10 @@ void build_comms_nccl_ucx(
   handle->set_comms(communicator);
 }
 
+/**
+ * @}
+ */
+
 inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size)
 {
   memcpy(id->internal, uniqueId, size);
diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp
index 78ce91dbf2..463c17f2f6 100644
--- a/cpp/include/raft/core/comms.hpp
+++ b/cpp/include/raft/core/comms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuda_runtime.h>
 #include <memory>
 #include <raft/core/error.hpp>
 #include <vector>
@@ -23,6 +24,11 @@
 namespace raft {
 namespace comms {
 
+/**
+ * @defgroup comms_types Common mnmg comms types
+ * @{
+ */
+
 typedef unsigned int request_t;
 enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 };
 enum class op_t { SUM, PROD, MIN, MAX };
@@ -105,6 +111,15 @@ get_type<double>()
   return datatype_t::FLOAT64;
 }
 
+/**
+ * @}
+ */
+
+/**
+ * @defgroup comms_iface MNMG Communicator Interface
+ * @{
+ */
+
 class comms_iface {
  public:
   virtual ~comms_iface() {}
@@ -215,6 +230,15 @@ class comms_iface {
   virtual void group_end() const = 0;
 };
 
+/**
+ * @}
+ */
+
+/**
+ * @defgroup comms_t Base Communicator Proxy
+ * @{
+ */
+
 class comms_t {
  public:
   comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release())
@@ -647,5 +671,9 @@ class comms_t {
   std::unique_ptr<comms_iface> impl_;
 };
 
+/**
+ * @}
+ */
+
 }  // namespace comms
 }  // namespace raft
diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp
index d2456433ab..855c1228f7 100644
--- a/cpp/include/raft/core/cublas_macros.hpp
+++ b/cpp/include/raft/core/cublas_macros.hpp
@@ -32,6 +32,11 @@
 
 namespace raft {
 
+/**
+ * @ingroup error_handling
+ * @{
+ */
+
 /**
  * @brief Exception thrown when a cuBLAS error is encountered.
  */
@@ -40,6 +45,10 @@ struct cublas_error : public raft::exception {
   explicit cublas_error(std::string const& message) : raft::exception(message) {}
 };
 
+/**
+ * @}
+ */
+
 namespace linalg {
 namespace detail {
 
@@ -66,6 +75,11 @@ inline const char* cublas_error_to_string(cublasStatus_t err)
 
 #undef _CUBLAS_ERR_TO_STR
 
+/**
+ * @ingroup assertion
+ * @{
+ */
+
 /**
  * @brief Error checking macro for cuBLAS runtime API functions.
  *
@@ -108,6 +122,9 @@ inline const char* cublas_error_to_string(cublasStatus_t err)
     }                                                                \
   } while (0)
 
+/**
+ * @}
+ */
 /** FIXME: remove after cuml rename */
 #ifndef CUBLAS_CHECK
 #define CUBLAS_CHECK(call) CUBLAS_TRY(call)
diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp
index 505485e6a0..8f7caf65f3 100644
--- a/cpp/include/raft/core/cusolver_macros.hpp
+++ b/cpp/include/raft/core/cusolver_macros.hpp
@@ -31,6 +31,11 @@
 
 namespace raft {
 
+/**
+ * @ingroup error_handling
+ * @{
+ */
+
 /**
  * @brief Exception thrown when a cuSOLVER error is encountered.
  */
@@ -39,6 +44,10 @@ struct cusolver_error : public raft::exception {
   explicit cusolver_error(std::string const& message) : raft::exception(message) {}
 };
 
+/**
+ * @}
+ */
+
 namespace linalg {
 namespace detail {
 
@@ -65,6 +74,11 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
 
 #undef _CUSOLVER_ERR_TO_STR
 
+/**
+ * @ingroup assertion
+ * @{
+ */
+
 /**
  * @brief Error checking macro for cuSOLVER runtime API functions.
  *
@@ -107,6 +121,10 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
     }                                                                  \
   } while (0)
 
+/**
+ * @}
+ */
+
 // FIXME: remove after cuml rename
 #ifndef CUSOLVER_CHECK
 #define CUSOLVER_CHECK(call) CUSOLVER_TRY(call)
diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp
index cf5195582b..8a9aab55f7 100644
--- a/cpp/include/raft/core/cusparse_macros.hpp
+++ b/cpp/include/raft/core/cusparse_macros.hpp
@@ -37,6 +37,11 @@
 
 namespace raft {
 
+/**
+ * @ingroup error_handling
+ * @{
+ */
+
 /**
  * @brief Exception thrown when a cuSparse error is encountered.
  */
@@ -45,6 +50,9 @@ struct cusparse_error : public raft::exception {
   explicit cusparse_error(std::string const& message) : raft::exception(message) {}
 };
 
+/**
+ * @}
+ */
 namespace sparse {
 namespace detail {
 
@@ -73,6 +81,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
 
 #undef _CUSPARSE_ERR_TO_STR
 
+/**
+ * @ingroup assertion
+ * @{
+ */
+
 /**
  * @brief Error checking macro for cuSparse runtime API functions.
  *
@@ -94,6 +107,10 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
     }                                                                        \
   } while (0)
 
+/**
+ * @}
+ */
+
 // FIXME: Remove after consumer rename
 #ifndef CUSPARSE_TRY
 #define CUSPARSE_TRY(call) RAFT_CUSPARSE_TRY(call)
@@ -104,6 +121,10 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
 #define CUSPARSE_CHECK(call) CUSPARSE_TRY(call)
 #endif
 
+/**
+ * @ingroup assertion
+ * @{
+ */
 //@todo: use logger here once logging is enabled
 /** check for cusparse runtime API errors but do not assert */
 #define RAFT_CUSPARSE_TRY_NO_THROW(call)                           \
@@ -117,6 +138,10 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
     }                                                              \
   } while (0)
 
+/**
+ * @}
+ */
+
 // FIXME: Remove after consumer rename
 #ifndef CUSPARSE_CHECK_NO_THROW
 #define CUSPARSE_CHECK_NO_THROW(call) RAFT_CUSPARSE_TRY_NO_THROW(call)
diff --git a/cpp/include/raft/core/detail/device_mdarray.hpp b/cpp/include/raft/core/detail/device_mdarray.hpp
index ad6831794e..31dfaba70a 100644
--- a/cpp/include/raft/core/detail/device_mdarray.hpp
+++ b/cpp/include/raft/core/detail/device_mdarray.hpp
@@ -6,7 +6,7 @@
  */
 
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
  */
 #pragma once
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <raft/core/detail/span.hpp>  // dynamic_extent
diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
new file mode 100644
index 0000000000..df89811636
--- /dev/null
+++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+namespace raft {
+
+namespace detail {
+
+namespace numpy_serializer {
+
+/**
+ * A small implementation of NumPy serialization format.
+ * Reference: https://numpy.org/doc/1.23/reference/generated/numpy.lib.format.html
+ *
+ * Adapted from https://github.com/llohse/libnpy/blob/master/include/npy.hpp, using the following
+ * license:
+ *
+ * MIT License
+ *
+ * Copyright (c) 2021 Leon Merten Lohse
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define RAFT_NUMPY_LITTLE_ENDIAN_CHAR  '<'
+#define RAFT_NUMPY_BIG_ENDIAN_CHAR     '>'
+#define RAFT_NUMPY_NO_ENDIAN_CHAR      '|'
+#define RAFT_NUMPY_MAGIC_STRING        "\x93NUMPY"
+#define RAFT_NUMPY_MAGIC_STRING_LENGTH 6
+
+#if RAFT_SYSTEM_LITTLE_ENDIAN == 1
+#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR
+#else  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
+#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR
+#endif  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
+
+using ndarray_len_t = std::uint64_t;
+
+struct dtype_t {
+  const char byteorder;         // '<' little-endian, '>' big-endian, '|' not applicable
+  const char kind;              // 'f' float, 'i' signed int, 'u' unsigned int, 'c' complex
+  const unsigned int itemsize;  // element size in bytes
+
+  std::string to_string() const
+  {
+    // NumPy typestr <byteorder><kind><itemsize>, e.g. "<f4"; no fixed-size buffer needed.
+    std::string typestr{byteorder, kind};
+    return typestr + std::to_string(itemsize);
+  }
+
+  bool operator==(const dtype_t& other) const
+  {
+    return (byteorder == other.byteorder && kind == other.kind && itemsize == other.itemsize);
+  }
+};
+
+struct header_t {  // parsed form of the .npy header dictionary
+  const dtype_t dtype;                     // element type ('descr' entry of the header dict)
+  const bool fortran_order;                // true when the data is column-major
+  const std::vector<ndarray_len_t> shape;  // extent of each dimension
+
+  bool operator==(const header_t& other) const
+  {
+    return (dtype == other.dtype && fortran_order == other.fortran_order && shape == other.shape);
+  }
+};
+
+template <class T>
+struct is_complex : std::false_type {  // primary template: T is not a std::complex
+};
+template <class T>
+struct is_complex<std::complex<T>> : std::true_type {  // matches std::complex<T>
+};
+
+template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+inline dtype_t get_numpy_dtype()  // maps C++ element type T to its NumPy dtype descriptor
+{
+  return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'f', sizeof(T)};  // floating point: kind 'f'
+}
+
+template <typename T,
+          typename std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, bool> = true>
+inline dtype_t get_numpy_dtype()
+{
+  const char endian_char =
+    (sizeof(T) == 1 ? RAFT_NUMPY_NO_ENDIAN_CHAR : RAFT_NUMPY_HOST_ENDIAN_CHAR);
+  return {endian_char, 'i', sizeof(T)};  // signed integer; 1-byte types get '|'
+}
+
+template <typename T,
+          typename std::enable_if_t<std::is_integral_v<T> && std::is_unsigned_v<T>, bool> = true>
+inline dtype_t get_numpy_dtype()
+{
+  const char endian_char =
+    (sizeof(T) == 1 ? RAFT_NUMPY_NO_ENDIAN_CHAR : RAFT_NUMPY_HOST_ENDIAN_CHAR);
+  return {endian_char, 'u', sizeof(T)};  // unsigned integer; 1-byte types get '|'
+}
+
+template <typename T, typename std::enable_if_t<is_complex<T>{}, bool> = true>
+inline dtype_t get_numpy_dtype()
+{
+  return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'c', sizeof(T)};  // complex: itemsize is sizeof(T)
+}
+
+template <typename T, typename std::enable_if_t<std::is_enum_v<T>, bool> = true>
+inline dtype_t get_numpy_dtype()
+{
+  return get_numpy_dtype<std::underlying_type_t<T>>();  // enums reuse their underlying type
+}
+
+template <typename T>
+inline std::string tuple_to_string(const std::vector<T>& tuple)
+{
+  // Format the elements as a Python tuple literal:
+  //   {}        -> "()"
+  //   {a}       -> "(a,)"   (one-element tuples need the trailing comma)
+  //   {a, b, c} -> "(a, b, c)"
+  std::ostringstream out;
+  out << "(";
+  for (std::size_t idx = 0; idx < tuple.size(); ++idx) {
+    if (idx != 0) { out << ", "; }
+    out << tuple[idx];
+  }
+  if (tuple.size() == 1) { out << ","; }
+  out << ")";
+  return out.str();
+}
+
+inline std::string header_to_string(const header_t& header)
+{
+  // Serialize the header as the Python dict literal stored in a .npy file.
+  std::string dict_repr = "{'descr': '" + header.dtype.to_string();
+  dict_repr += std::string("', 'fortran_order': ") + (header.fortran_order ? "True" : "False");
+  dict_repr += ", 'shape': " + tuple_to_string(header.shape) + "}";
+  return dict_repr;
+}
+
+inline std::string trim(const std::string& str)
+{
+  // Strip leading and trailing spaces/tabs; an all-blank input yields "".
+  const char* const blanks = " \t";
+  const auto first         = str.find_first_not_of(blanks);
+  if (first == std::string::npos) { return ""; }
+  const auto last = str.find_last_not_of(blanks);
+  return str.substr(first, last - first + 1);
+}
+
+// A poor man's parser for a Python dictionary literal; returns key -> raw value text
+// TODO(hcho3): Consider writing a proper parser
+// Limitation: can only parse a flat dictionary; all values are assumed to be non-objects
+// Limitation: every key must be listed in `keys`; unlisted entries leak into adjacent values
+inline std::map<std::string, std::string> parse_pydict(std::string str,
+                                                       const std::vector<std::string>& keys)
+{
+  std::map<std::string, std::string> result;
+  // Unwrap dictionary; the emptiness check keeps front()/back() well-defined
+  str = trim(str);
+  RAFT_EXPECTS(!str.empty() && str.front() == '{' && str.back() == '}',
+               "Expected a Python dictionary");
+  str = str.substr(1, str.length() - 2);
+
+  // Get the position of each key and put it in the list
+  std::vector<std::pair<std::size_t, std::string>> positions;
+  for (auto const& key : keys) {
+    std::size_t pos = str.find("'" + key + "'");
+    RAFT_EXPECTS(pos != std::string::npos, "Missing '%s' key.", key.c_str());
+    positions.emplace_back(pos, key);
+  }
+  // Sort by position so that consecutive entries delimit each key's text
+  std::sort(positions.begin(), positions.end());
+
+  // Extract each key-value pair: the text runs from this key to the next key (or the end)
+  for (std::size_t i = 0; i < positions.size(); ++i) {
+    std::string key = positions[i].second;
+
+    std::size_t begin     = positions[i].first;
+    std::size_t end       = (i + 1 < positions.size() ? positions[i + 1].first : std::string::npos);
+    std::string raw_value = trim(str.substr(begin, end - begin));
+    if (raw_value.back() == ',') { raw_value.pop_back(); }
+    std::size_t sep_pos = raw_value.find_first_of(":");
+    if (sep_pos == std::string::npos) {
+      result[key] = "";
+    } else {
+      result[key] = trim(raw_value.substr(sep_pos + 1));
+    }
+  }
+
+  return result;
+}
+
+inline std::string parse_pystring(std::string str)  // 'abc' -> abc; raises unless quoted
+{
+  const bool quoted = str.length() >= 2 && str.front() == '\'' && str.back() == '\'';
+  RAFT_EXPECTS(quoted, "Invalid Python string: %s", str.c_str());
+  return str.substr(1, str.length() - 2);
+}
+inline bool parse_pybool(std::string str)
+{
+  // Accept exactly the Python literals "True" and "False"; any other text
+  // (different capitalization, surrounding whitespace, ...) is rejected.
+  const bool is_true  = (str == "True");
+  const bool is_false = (str == "False");
+  if (!is_true && !is_false) { RAFT_FAIL("Invalid Python boolean: %s", str.c_str()); }
+
+  return is_true;
+}
+
+inline std::vector<std::string> parse_pytuple(std::string str)
+{
+  // Split "(a, b, ...)" into trimmed element strings; "()" yields an empty vector.
+  str = trim(str);
+  RAFT_EXPECTS(!str.empty() && str.front() == '(' && str.back() == ')',
+               "Invalid Python tuple: %s",
+               str.c_str());
+  str = str.substr(1, str.length() - 2);
+  std::vector<std::string> result;
+  std::istringstream iss(str);
+  for (std::string token; std::getline(iss, token, ',');) {
+    result.push_back(trim(token));
+  }
+  return result;
+}
+
+inline dtype_t parse_descr(std::string typestr)  // "<f4" -> {'<', 'f', 4}
+{
+  RAFT_EXPECTS(typestr.length() >= 3, "Invalid typestr: Too short");
+  char byteorder_c       = typestr.at(0);
+  char kind_c            = typestr.at(1);
+  std::string itemsize_s = typestr.substr(2);
+  // Accepted byte-order markers and numeric kind codes
+  const char endian_chars[] = {
+    RAFT_NUMPY_LITTLE_ENDIAN_CHAR, RAFT_NUMPY_BIG_ENDIAN_CHAR, RAFT_NUMPY_NO_ENDIAN_CHAR};
+  const char numtype_chars[] = {'f', 'i', 'u', 'c'};
+  // Reject any typestr whose first two characters are not in the tables above
+  RAFT_EXPECTS(std::find(std::begin(endian_chars), std::end(endian_chars), byteorder_c) !=
+                 std::end(endian_chars),
+               "Invalid typestr: unrecognized byteorder %c",
+               byteorder_c);
+  RAFT_EXPECTS(std::find(std::begin(numtype_chars), std::end(numtype_chars), kind_c) !=
+                 std::end(numtype_chars),
+               "Invalid typestr: unrecognized kind %c",
+               kind_c);
+  unsigned int itemsize = std::stoul(itemsize_s);  // throws a std:: exception on bad digits
+
+  return {byteorder_c, kind_c, itemsize};
+}
+
+inline void write_magic(std::ostream& os)  // emits the magic string plus two version bytes
+{
+  os.write(RAFT_NUMPY_MAGIC_STRING, RAFT_NUMPY_MAGIC_STRING_LENGTH);
+  RAFT_EXPECTS(os.good(), "Error writing magic string");
+  // Use version 1.0: major byte first, then minor byte
+  os.put(1);
+  os.put(0);
+  RAFT_EXPECTS(os.good(), "Error writing magic string");
+}
+
+inline void read_magic(std::istream& is)  // consumes and validates magic + version bytes
+{
+  char magic_buf[RAFT_NUMPY_MAGIC_STRING_LENGTH + 2] = {0};
+  is.read(magic_buf, RAFT_NUMPY_MAGIC_STRING_LENGTH + 2);
+  RAFT_EXPECTS(is.good(), "Error reading magic string");
+
+  RAFT_EXPECTS(std::memcmp(magic_buf, RAFT_NUMPY_MAGIC_STRING, RAFT_NUMPY_MAGIC_STRING_LENGTH) == 0,
+               "The given stream does not have a valid NumPy format.");
+  // The two bytes after the magic string are the major and minor format version
+  std::uint8_t version_major = magic_buf[RAFT_NUMPY_MAGIC_STRING_LENGTH];
+  std::uint8_t version_minor = magic_buf[RAFT_NUMPY_MAGIC_STRING_LENGTH + 1];
+  RAFT_EXPECTS(version_major == 1 && version_minor == 0,
+               "Unsupported NumPy version: %d.%d",
+               version_major,
+               version_minor);
+}
+
+inline void write_header(std::ostream& os, const header_t& header)  // emits the .npy preamble
+{
+  std::string header_dict     = header_to_string(header);
+  std::size_t preamble_length = RAFT_NUMPY_MAGIC_STRING_LENGTH + 2 + 2 + header_dict.length() + 1;
+  RAFT_EXPECTS(preamble_length < 255 * 255, "Header too long");
+  // Enforce 64-byte alignment of magic + version + HEADER_LEN + dict + newline (space padding)
+  std::size_t padding_len = 64 - preamble_length % 64;
+  std::string padding(padding_len, ' ');
+
+  write_magic(os);
+
+  // Write header length (HEADER_LEN) as a little-endian 16-bit value
+  std::uint8_t header_len_le16[2];
+  std::uint16_t header_len =
+    static_cast<std::uint16_t>(header_dict.length() + padding.length() + 1);
+  header_len_le16[0] = (header_len >> 0) & 0xff;
+  header_len_le16[1] = (header_len >> 8) & 0xff;
+  os.put(header_len_le16[0]);
+  os.put(header_len_le16[1]);
+  RAFT_EXPECTS(os.good(), "Error writing HEADER_LEN");
+  // Finally the dict text itself, the space padding and the terminating newline
+  os << header_dict << padding << "\n";
+  RAFT_EXPECTS(os.good(), "Error writing header dict");
+}
+
+inline std::string read_header_bytes(std::istream& is)  // returns the raw header dict bytes
+{
+  read_magic(is);
+
+  // Read header length: two bytes, little-endian (HEADER_LEN)
+  std::uint8_t header_len_le16[2];
+  is.read(reinterpret_cast<char*>(header_len_le16), 2);
+  RAFT_EXPECTS(is.good(), "Error while reading HEADER_LEN");
+  const std::uint32_t header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);
+
+  std::vector<char> header_bytes(header_length);
+  is.read(header_bytes.data(), header_length);
+  RAFT_EXPECTS(is.good(), "Error while reading the header");
+
+  return std::string(header_bytes.data(), header_length);
+}
+
+inline header_t read_header(std::istream& is)  // parses and validates the .npy preamble
+{
+  std::string header_bytes = read_header_bytes(is);
+
+  // remove trailing newline; a zero-length header must not reach back()
+  RAFT_EXPECTS(!header_bytes.empty() && header_bytes.back() == '\n', "Invalid NumPy header");
+  header_bytes.pop_back();
+
+  // parse the header dict; extents are read with stoull because ndarray_len_t is 64-bit
+  auto header_dict   = parse_pydict(header_bytes, {"descr", "fortran_order", "shape"});
+  dtype_t descr      = parse_descr(parse_pystring(header_dict["descr"]));
+  bool fortran_order = parse_pybool(header_dict["fortran_order"]);
+  std::vector<ndarray_len_t> shape;
+  auto shape_tup_str = parse_pytuple(header_dict["shape"]);
+  for (const auto& e : shape_tup_str) {
+    shape.push_back(static_cast<ndarray_len_t>(std::stoull(e)));
+  }
+  // Only host-endian data (or '|', no byte order) is accepted
+  RAFT_EXPECTS(
+    descr.byteorder == RAFT_NUMPY_HOST_ENDIAN_CHAR || descr.byteorder == RAFT_NUMPY_NO_ENDIAN_CHAR,
+    "The mdspan was serialized on a %s machine but you're attempting to load it on "
+    "a %s machine. This use case is not currently supported.",
+    (RAFT_SYSTEM_LITTLE_ENDIAN ? "big-endian" : "little-endian"),
+    (RAFT_SYSTEM_LITTLE_ENDIAN ? "little-endian" : "big-endian"));
+
+  return {descr, fortran_order, shape};
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void serialize_host_mdspan(
+  std::ostream& os,
+  const raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  static_assert(std::is_same_v<LayoutPolicy, raft::layout_c_contiguous> ||
+                  std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>,
+                "The serializer only supports row-major and column-major layouts");
+  // Emits a .npy header describing obj, followed by obj's elements as raw bytes.
+  using obj_t = raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  // Header fields: dtype from ElementType, layout flag from LayoutPolicy, extents as shape
+  const auto dtype         = get_numpy_dtype<ElementType>();
+  const bool fortran_order = std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>;
+  std::vector<ndarray_len_t> shape;
+  for (typename obj_t::rank_type i = 0; i < obj.rank(); ++i) {
+    shape.push_back(obj.extent(i));
+  }
+  const header_t header = {dtype, fortran_order, shape};
+  write_header(os, header);
+
+  // For contiguous layouts, size() == product of dimensions
+  os.write(reinterpret_cast<const char*>(obj.data_handle()), obj.size() * sizeof(ElementType));
+  RAFT_EXPECTS(os.good(), "Error writing content of mdspan");
+}
+
+template <typename T>
+inline void serialize_scalar(std::ostream& os, const T& value)  // written as a 0-d .npy array
+{
+  const auto dtype         = get_numpy_dtype<T>();
+  const bool fortran_order = false;
+  const std::vector<ndarray_len_t> shape{};  // empty shape, rendered as ()
+  const header_t header = {dtype, fortran_order, shape};
+  write_header(os, header);
+  os.write(reinterpret_cast<const char*>(&value), sizeof(T));  // raw bytes of the value
+  RAFT_EXPECTS(os.good(), "Error serializing a scalar");
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_host_mdspan(
+  std::istream& is,
+  const raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  static_assert(std::is_same_v<LayoutPolicy, raft::layout_c_contiguous> ||
+                  std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>,
+                "The serializer only supports row-major and column-major layouts");
+  // Fills the memory viewed by obj from a .npy stream; raises if the stream's dtype,
+  // layout, rank or extents differ from those of obj.
+  using obj_t = raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  // Check if given dtype and fortran_order are compatible with the mdspan
+  const auto expected_dtype         = get_numpy_dtype<ElementType>();
+  const bool expected_fortran_order = std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>;
+  header_t header                   = read_header(is);
+  RAFT_EXPECTS(header.dtype == expected_dtype,
+               "Expected dtype %s but got %s instead",
+               expected_dtype.to_string().c_str(),
+               header.dtype.to_string().c_str());
+  RAFT_EXPECTS(header.fortran_order == expected_fortran_order,
+               "Wrong matrix layout; expected %s but got a different layout",
+               (expected_fortran_order ? "Fortran layout" : "C layout"));
+
+  // Check if dimensions are correct
+  RAFT_EXPECTS(obj.rank() == header.shape.size(),
+               "Incorrect rank: expected %zu but got %zu",
+               obj.rank(),
+               header.shape.size());
+  for (typename obj_t::rank_type i = 0; i < obj.rank(); ++i) {
+    RAFT_EXPECTS(static_cast<ndarray_len_t>(obj.extent(i)) == header.shape[i],
+                 "Incorrect dimension: expected %zu but got %zu",
+                 static_cast<ndarray_len_t>(obj.extent(i)),
+                 header.shape[i]);
+  }
+
+  // For contiguous layouts, size() == product of dimensions
+  is.read(reinterpret_cast<char*>(obj.data_handle()), obj.size() * sizeof(ElementType));
+  RAFT_EXPECTS(is.good(), "Error while reading mdspan content");
+}
+
+template <typename T>
+inline T deserialize_scalar(std::istream& is)  // reads a 0-dimensional .npy array as a T
+{
+  // Check if dtype is correct (message lists the expected dtype first, then the stream's)
+  const auto expected_dtype = get_numpy_dtype<T>();
+  header_t header           = read_header(is);
+  RAFT_EXPECTS(header.dtype == expected_dtype,
+               "Expected dtype %s but got %s instead",
+               expected_dtype.to_string().c_str(),
+               header.dtype.to_string().c_str());
+  // Check if dimensions are correct; shape should be ()
+  RAFT_EXPECTS(header.shape.empty(), "Incorrect rank: expected 0 but got %zu", header.shape.size());
+
+  T value;
+  is.read(reinterpret_cast<char*>(&value), sizeof(T));
+  RAFT_EXPECTS(is.good(), "Error while deserializing scalar");
+  return value;
+}
+
+}  // end namespace numpy_serializer
+}  // end namespace detail
+}  // end namespace raft
diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp
index 693e50a506..03cb09eecb 100644
--- a/cpp/include/raft/core/device_mdarray.hpp
+++ b/cpp/include/raft/core/device_mdarray.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,7 +72,7 @@ using device_matrix = device_mdarray<ElementType, matrix_extent<IndexType>, Layo
  * @tparam ElementType the data type of the matrix elements
  * @tparam IndexType the index type of the extents
  * @tparam LayoutPolicy policy for strides and layout ordering
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param exts dimensionality of the array (series of integers)
  * @return raft::device_mdarray
  */
@@ -80,7 +80,7 @@ template <typename ElementType,
           typename IndexType    = std::uint32_t,
           typename LayoutPolicy = layout_c_contiguous,
           size_t... Extents>
-auto make_device_mdarray(const raft::handle_t& handle, extents<IndexType, Extents...> exts)
+auto make_device_mdarray(raft::device_resources const& handle, extents<IndexType, Extents...> exts)
 {
   using mdarray_t = device_mdarray<ElementType, decltype(exts), LayoutPolicy>;
 
@@ -95,7 +95,7 @@ auto make_device_mdarray(const raft::handle_t& handle, extents<IndexType, Extent
  * @tparam ElementType the data type of the matrix elements
  * @tparam IndexType the index type of the extents
  * @tparam LayoutPolicy policy for strides and layout ordering
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param mr rmm memory resource used for allocating the memory for the array
  * @param exts dimensionality of the array (series of integers)
  * @return raft::device_mdarray
@@ -104,7 +104,7 @@ template <typename ElementType,
           typename IndexType    = std::uint32_t,
           typename LayoutPolicy = layout_c_contiguous,
           size_t... Extents>
-auto make_device_mdarray(const raft::handle_t& handle,
+auto make_device_mdarray(raft::device_resources const& handle,
                          rmm::mr::device_memory_resource* mr,
                          extents<IndexType, Extents...> exts)
 {
@@ -130,7 +130,7 @@ auto make_device_mdarray(const raft::handle_t& handle,
 template <typename ElementType,
           typename IndexType    = std::uint32_t,
           typename LayoutPolicy = layout_c_contiguous>
-auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexType n_cols)
+auto make_device_matrix(raft::device_resources const& handle, IndexType n_rows, IndexType n_cols)
 {
   return make_device_mdarray<ElementType, IndexType, LayoutPolicy>(
     handle.get_stream(), make_extents<IndexType>(n_rows, n_cols));
@@ -146,7 +146,7 @@ auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexTyp
  * @return raft::device_scalar
  */
 template <typename ElementType, typename IndexType = std::uint32_t>
-auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
+auto make_device_scalar(raft::device_resources const& handle, ElementType const& v)
 {
   scalar_extent<IndexType> extents;
   using policy_t = typename device_scalar<ElementType>::container_policy_type;
@@ -168,7 +168,7 @@ auto make_device_scalar(raft::handle_t const& handle, ElementType const& v)
 template <typename ElementType,
           typename IndexType    = std::uint32_t,
           typename LayoutPolicy = layout_c_contiguous>
-auto make_device_vector(raft::handle_t const& handle, IndexType n)
+auto make_device_vector(raft::device_resources const& handle, IndexType n)
 {
   return make_device_mdarray<ElementType, IndexType, LayoutPolicy>(handle.get_stream(),
                                                                    make_extents<IndexType>(n));
diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp
index f64f15d0d5..f72ae36d64 100644
--- a/cpp/include/raft/core/device_mdspan.hpp
+++ b/cpp/include/raft/core/device_mdspan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -197,7 +197,9 @@ auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexTy
                                                  detail::alignment::value>::data_handle_type;
   static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
                 std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
-  assert(ptr == alignTo(ptr, detail::alignment::value));
+  assert(reinterpret_cast<std::uintptr_t>(ptr) ==
+         std::experimental::details::alignTo(reinterpret_cast<std::uintptr_t>(ptr),
+                                             detail::alignment::value));
 
   data_handle_type aligned_pointer = ptr;
 
diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
new file mode 100644
index 0000000000..68c56dc9b6
--- /dev/null
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RAFT_DEVICE_RESOURCES
+#define __RAFT_DEVICE_RESOURCES
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+#include <cusolverSp.h>
+#include <cusparse.h>
+
+#include <raft/core/comms.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <raft/core/resource/comms.hpp>
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_event.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resource/cusolver_sp_handle.hpp>
+#include <raft/core/resource/cusparse_handle.hpp>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/device_properties.hpp>
+#include <raft/core/resource/sub_comms.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft {
+
+/**
+ * @brief Main resource container object that stores all necessary resources
+ * used for calling necessary device functions, cuda kernels and/or libraries
+ */
+class device_resources : public resources {
+ public:
+  device_resources(const device_resources& handle,
+                   rmm::mr::device_memory_resource* workspace_resource)
+    : resources{handle}
+  {
+    // replace the resource factory for the workspace_resources
+    resources::add_resource_factory(
+      std::make_shared<resource::workspace_resource_factory>(workspace_resource));
+  }
+
+  device_resources(const device_resources& handle) : resources{handle} {}
+
+  device_resources(device_resources&&) = delete;
+  device_resources& operator=(device_resources&&) = delete;
+
+  /**
+   * @brief Construct a resources instance with a stream view and stream pool
+   *
+   * @param[in] stream_view the default stream (which has the default per-thread stream if
+   * unspecified)
+   * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified)
+   * @param[in] workspace_resource an optional resource used by some functions for allocating
+   *            temporary workspaces.
+   */
+  device_resources(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
+                   std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
+                   rmm::mr::device_memory_resource* workspace_resource = nullptr)
+    : resources{}
+  {
+    resources::add_resource_factory(std::make_shared<resource::device_id_resource_factory>());
+    resources::add_resource_factory(
+      std::make_shared<resource::cuda_stream_resource_factory>(stream_view));
+    resources::add_resource_factory(
+      std::make_shared<resource::cuda_stream_pool_resource_factory>(stream_pool));
+    resources::add_resource_factory(
+      std::make_shared<resource::workspace_resource_factory>(workspace_resource));
+  }
+
+  /** Destroys all held-up resources */
+  virtual ~device_resources() {}
+
+  int get_device() const { return resource::get_device_id(*this); }
+
+  cublasHandle_t get_cublas_handle() const { return resource::get_cublas_handle(*this); }
+
+  cusolverDnHandle_t get_cusolver_dn_handle() const
+  {
+    return resource::get_cusolver_dn_handle(*this);
+  }
+
+  cusolverSpHandle_t get_cusolver_sp_handle() const
+  {
+    return resource::get_cusolver_sp_handle(*this);
+  }
+
+  cusparseHandle_t get_cusparse_handle() const { return resource::get_cusparse_handle(*this); }
+
+  rmm::exec_policy& get_thrust_policy() const { return resource::get_thrust_policy(*this); }
+
+  /**
+   * @brief synchronize a stream on the current container
+   */
+  void sync_stream(rmm::cuda_stream_view stream) const { resource::sync_stream(*this, stream); }
+
+  /**
+   * @brief synchronize main stream on the current container
+   */
+  void sync_stream() const { resource::sync_stream(*this); }
+
+  /**
+   * @brief returns main stream on the current container
+   */
+  rmm::cuda_stream_view get_stream() const { return resource::get_cuda_stream(*this); }
+
+  /**
+   * @brief returns whether stream pool was initialized on the current container
+   */
+
+  bool is_stream_pool_initialized() const { return resource::is_stream_pool_initialized(*this); }
+
+  /**
+   * @brief returns stream pool on the current container
+   */
+  const rmm::cuda_stream_pool& get_stream_pool() const
+  {
+    return resource::get_cuda_stream_pool(*this);
+  }
+
+  std::size_t get_stream_pool_size() const { return resource::get_stream_pool_size(*this); }
+
+  /**
+   * @brief return stream from pool
+   */
+  rmm::cuda_stream_view get_stream_from_stream_pool() const
+  {
+    return resource::get_stream_from_stream_pool(*this);
+  }
+
+  /**
+   * @brief return stream from pool at index
+   */
+  rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const
+  {
+    return resource::get_stream_from_stream_pool(*this, stream_idx);
+  }
+
+  /**
+   * @brief return stream from pool if size > 0, else main stream on current container
+   */
+  rmm::cuda_stream_view get_next_usable_stream() const
+  {
+    return resource::get_next_usable_stream(*this);
+  }
+
+  /**
+   * @brief return stream from pool at index if size > 0, else main stream on current container
+   *
+   * @param[in] stream_idx the required index of the stream in the stream pool if available
+   */
+  rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const
+  {
+    return resource::get_next_usable_stream(*this, stream_idx);
+  }
+
+  /**
+   * @brief synchronize the stream pool on the current container
+   */
+  void sync_stream_pool() const { return resource::sync_stream_pool(*this); }
+
+  /**
+   * @brief synchronize a subset of the stream pool on the current container
+   *
+   * @param[in] stream_indices the indices of the streams in the stream pool to synchronize
+   */
+  void sync_stream_pool(const std::vector<std::size_t>& stream_indices) const
+  {
+    return resource::sync_stream_pool(*this, stream_indices);
+  }
+
+  /**
+   * @brief ask stream pool to wait on last event in main stream
+   */
+  void wait_stream_pool_on_stream() const { return resource::wait_stream_pool_on_stream(*this); }
+
+  void set_comms(std::shared_ptr<comms::comms_t> communicator)
+  {
+    resource::set_comms(*this, communicator);
+  }
+
+  const comms::comms_t& get_comms() const { return resource::get_comms(*this); }
+
+  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
+  {
+    resource::set_subcomm(*this, key, subcomm);
+  }
+
+  const comms::comms_t& get_subcomm(std::string key) const
+  {
+    return resource::get_subcomm(*this, key);
+  }
+
+  rmm::mr::device_memory_resource* get_workspace_resource() const
+  {
+    return resource::get_workspace_resource(*this);
+  }
+
+  bool comms_initialized() const { return resource::comms_initialized(*this); }
+
+  const cudaDeviceProp& get_device_properties() const
+  {
+    return resource::get_device_properties(*this);
+  }
+};  // class device_resources
+
+/**
+ * @brief RAII helper: syncs the main stream when built and the stream pool when destroyed
+ */
+class stream_syncer {
+ public:
+  explicit stream_syncer(const device_resources& handle) : handle_(handle)
+  {
+    handle_.sync_stream();
+  }
+  ~stream_syncer()
+  {
+    handle_.wait_stream_pool_on_stream();  // pool streams first wait on the main stream
+    handle_.sync_stream_pool();
+  }
+
+  stream_syncer(const stream_syncer& other) = delete;
+  stream_syncer& operator=(const stream_syncer& other) = delete;
+
+ private:
+  const device_resources& handle_;  // held by reference, not owned
+};  // class stream_syncer
+
+}  // namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp
index b932309d24..84b244f4dc 100644
--- a/cpp/include/raft/core/error.hpp
+++ b/cpp/include/raft/core/error.hpp
@@ -30,6 +30,11 @@
 
 namespace raft {
 
+/**
+ * @defgroup error_handling Exceptions & Error Handling
+ * @{
+ */
+
 /** base exception class for the whole of raft */
 class exception : public std::exception {
  public:
@@ -93,6 +98,10 @@ struct logic_error : public raft::exception {
   explicit logic_error(std::string const& message) : raft::exception(message) {}
 };
 
+/**
+ * @}
+ */
+
 }  // namespace raft
 
 // FIXME: Need to be replaced with RAFT_FAIL
@@ -143,6 +152,11 @@ struct logic_error : public raft::exception {
     msg += std::string(buf.data(), buf.data() + size - 1); /* -1 to remove final '\0' */         \
   } while (0)
 
+/**
+ * @defgroup assertion Assertion and error macros
+ * @{
+ */
+
 /**
  * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false
  *
@@ -174,4 +188,8 @@ struct logic_error : public raft::exception {
     throw raft::logic_error(msg);                               \
   } while (0)
 
+/**
+ * @}
+ */
+
 #endif
diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp
index 08cb812bb7..02efebec9e 100644
--- a/cpp/include/raft/core/handle.hpp
+++ b/cpp/include/raft/core/handle.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,326 +14,52 @@
  * limitations under the License.
  */
 
-#ifndef __RAFT_RT_HANDLE
-#define __RAFT_RT_HANDLE
-
 #pragma once
 
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <cusolverDn.h>
-#include <cusolverSp.h>
-#include <cusparse.h>
-
-///@todo: enable once we have migrated cuml-comms layer too
-//#include <common/cuml_comms_int.hpp>
-
-#include <raft/core/cudart_utils.hpp>
-
-#include <raft/core/comms.hpp>
-#include <raft/core/cublas_macros.hpp>
-#include <raft/core/cusolver_macros.hpp>
-#include <raft/core/cusparse_macros.hpp>
-#include <raft/core/interruptible.hpp>
-#include <rmm/cuda_stream_pool.hpp>
-#include <rmm/exec_policy.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 
 /**
- * @brief Main handle object that stores all necessary context used for calling
- *        necessary cuda kernels and/or libraries
+ * raft::handle_t is being kept around for backwards
+ * compatibility and will be removed in a future version.
+ *
+ * Extending `raft::device_resources` instead of `using` to
+ * minimize needed changes downstream
+ * (e.g. existing forward declarations, etc...)
+ *
+ * Use of `raft::resources` or `raft::device_resources` is preferred.
  */
-class handle_t {
+class handle_t : public raft::device_resources {
  public:
-  // delete copy/move constructors and assignment operators as
-  // copying and moving underlying resources is unsafe
-  handle_t(const handle_t&) = delete;
-  handle_t& operator=(const handle_t&) = delete;
-  handle_t(handle_t&&)                 = delete;
+  handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource)
+    : device_resources(handle, workspace_resource)
+  {
+  }
+
+  handle_t(const handle_t& handle) : device_resources{handle} {}
+
+  handle_t(handle_t&&) = delete;
   handle_t& operator=(handle_t&&) = delete;
 
   /**
-   * @brief Construct a handle with a stream view and stream pool
+   * @brief Construct a resources instance with a stream view and stream pool
    *
    * @param[in] stream_view the default stream (which has the default per-thread stream if
    * unspecified)
    * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified)
+   * @param[in] workspace_resource an optional resource used by some functions for allocating
+   *            temporary workspaces.
    */
-  handle_t(rmm::cuda_stream_view stream_view                  = rmm::cuda_stream_per_thread,
-           std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr})
-    : dev_id_([]() -> int {
-        int cur_dev = -1;
-        RAFT_CUDA_TRY(cudaGetDevice(&cur_dev));
-        return cur_dev;
-      }()),
-      stream_view_{stream_view},
-      stream_pool_{stream_pool}
+  handle_t(rmm::cuda_stream_view stream_view                   = rmm::cuda_stream_per_thread,
+           std::shared_ptr<rmm::cuda_stream_pool> stream_pool  = {nullptr},
+           rmm::mr::device_memory_resource* workspace_resource = nullptr)
+    : device_resources{stream_view, stream_pool, workspace_resource}
   {
-    create_resources();
   }
 
   /** Destroys all held-up resources */
-  virtual ~handle_t() { destroy_resources(); }
-
-  int get_device() const { return dev_id_; }
-
-  cublasHandle_t get_cublas_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cublas_initialized_) {
-      RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_));
-      RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_));
-      cublas_initialized_ = true;
-    }
-    return cublas_handle_;
-  }
-
-  cusolverDnHandle_t get_cusolver_dn_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusolver_dn_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_));
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_));
-      cusolver_dn_initialized_ = true;
-    }
-    return cusolver_dn_handle_;
-  }
-
-  cusolverSpHandle_t get_cusolver_sp_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusolver_sp_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_));
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_));
-      cusolver_sp_initialized_ = true;
-    }
-    return cusolver_sp_handle_;
-  }
-
-  cusparseHandle_t get_cusparse_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusparse_initialized_) {
-      RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_));
-      RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_));
-      cusparse_initialized_ = true;
-    }
-    return cusparse_handle_;
-  }
-
-  rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; }
-
-  /**
-   * @brief synchronize a stream on the handle
-   */
-  void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); }
-
-  /**
-   * @brief synchronize main stream on the handle
-   */
-  void sync_stream() const { sync_stream(stream_view_); }
-
-  /**
-   * @brief returns main stream on the handle
-   */
-  rmm::cuda_stream_view get_stream() const { return stream_view_; }
-
-  /**
-   * @brief returns whether stream pool was initialized on the handle
-   */
-
-  bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; }
-
-  /**
-   * @brief returns stream pool on the handle
-   */
-  const rmm::cuda_stream_pool& get_stream_pool() const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return *stream_pool_;
-  }
-
-  std::size_t get_stream_pool_size() const
-  {
-    return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0;
-  }
-
-  /**
-   * @brief return stream from pool
-   */
-  rmm::cuda_stream_view get_stream_from_stream_pool() const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return stream_pool_->get_stream();
-  }
-
-  /**
-   * @brief return stream from pool at index
-   */
-  rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return stream_pool_->get_stream(stream_idx);
-  }
-
-  /**
-   * @brief return stream from pool if size > 0, else main stream on handle
-   */
-  rmm::cuda_stream_view get_next_usable_stream() const
-  {
-    return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_;
-  }
-
-  /**
-   * @brief return stream from pool at index if size > 0, else main stream on handle
-   *
-   * @param[in] stream_idx the required index of the stream in the stream pool if available
-   */
-  rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const
-  {
-    return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_;
-  }
-
-  /**
-   * @brief synchronize the stream pool on the handle
-   */
-  void sync_stream_pool() const
-  {
-    for (std::size_t i = 0; i < get_stream_pool_size(); i++) {
-      sync_stream(stream_pool_->get_stream(i));
-    }
-  }
-
-  /**
-   * @brief synchronize subset of stream pool
-   *
-   * @param[in] stream_indices the indices of the streams in the stream pool to synchronize
-   */
-  void sync_stream_pool(const std::vector<std::size_t> stream_indices) const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    for (const auto& stream_index : stream_indices) {
-      sync_stream(stream_pool_->get_stream(stream_index));
-    }
-  }
-
-  /**
-   * @brief ask stream pool to wait on last event in main stream
-   */
-  void wait_stream_pool_on_stream() const
-  {
-    RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_));
-    for (std::size_t i = 0; i < get_stream_pool_size(); i++) {
-      RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0));
-    }
-  }
-
-  void set_comms(std::shared_ptr<comms::comms_t> communicator) { communicator_ = communicator; }
-
-  const comms::comms_t& get_comms() const
-  {
-    RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n");
-    return *communicator_;
-  }
-
-  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
-  {
-    subcomms_[key] = subcomm;
-  }
-
-  const comms::comms_t& get_subcomm(std::string key) const
-  {
-    RAFT_EXPECTS(
-      subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str());
-
-    auto subcomm = subcomms_.at(key);
-
-    RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized");
-
-    return *subcomm;
-  }
-
-  bool comms_initialized() const { return (nullptr != communicator_.get()); }
-
-  const cudaDeviceProp& get_device_properties() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!device_prop_initialized_) {
-      RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_));
-      device_prop_initialized_ = true;
-    }
-    return prop_;
-  }
-
- private:
-  std::shared_ptr<comms::comms_t> communicator_;
-  std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> subcomms_;
-
-  const int dev_id_;
-  mutable cublasHandle_t cublas_handle_;
-  mutable bool cublas_initialized_{false};
-  mutable cusolverDnHandle_t cusolver_dn_handle_;
-  mutable bool cusolver_dn_initialized_{false};
-  mutable cusolverSpHandle_t cusolver_sp_handle_;
-  mutable bool cusolver_sp_initialized_{false};
-  mutable cusparseHandle_t cusparse_handle_;
-  mutable bool cusparse_initialized_{false};
-  std::unique_ptr<rmm::exec_policy> thrust_policy_{nullptr};
-  rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread};
-  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
-  cudaEvent_t event_;
-  mutable cudaDeviceProp prop_;
-  mutable bool device_prop_initialized_{false};
-  mutable std::mutex mutex_;
-
-  void create_resources()
-  {
-    thrust_policy_ = std::make_unique<rmm::exec_policy>(stream_view_);
-
-    RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-  }
-
-  void destroy_resources()
-  {
-    if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); }
-    if (cusolver_dn_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
-    }
-    if (cusolver_sp_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
-    }
-    if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); }
-    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_));
-  }
-};  // class handle_t
-
-/**
- * @brief RAII approach to synchronizing across all streams in the handle
- */
-class stream_syncer {
- public:
-  explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); }
-  ~stream_syncer()
-  {
-    handle_.wait_stream_pool_on_stream();
-    handle_.sync_stream_pool();
-  }
-
-  stream_syncer(const stream_syncer& other) = delete;
-  stream_syncer& operator=(const stream_syncer& other) = delete;
-
- private:
-  const handle_t& handle_;
-};  // class stream_syncer
-
-}  // namespace raft
+  ~handle_t() override {}
+};
 
-#endif
\ No newline at end of file
+}  // end NAMESPACE raft
diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp
index 1a0ea6432f..a6cdec7a84 100644
--- a/cpp/include/raft/core/host_mdspan.hpp
+++ b/cpp/include/raft/core/host_mdspan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -144,7 +144,9 @@ auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType
 
   static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
                 std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
-  assert(ptr == alignTo(ptr, detail::alignment::value));
+  assert(reinterpret_cast<std::uintptr_t>(ptr) ==
+         std::experimental::details::alignTo(reinterpret_cast<std::uintptr_t>(ptr),
+                                             detail::alignment::value));
   data_handle_type aligned_pointer = ptr;
 
   matrix_extent<IndexType> extents{n_rows, n_cols};
diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp
index f6ea841dc4..8d3321eb77 100644
--- a/cpp/include/raft/core/kvp.hpp
+++ b/cpp/include/raft/core/kvp.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #ifdef _RAFT_HAS_CUDA
 #include <cub/cub.cuh>
+#include <raft/util/cuda_utils.cuh>
 #endif
 namespace raft {
 /**
@@ -58,5 +59,27 @@ struct KeyValuePair {
   {
     return (value != b.value) || (key != b.key);
   }
+
+  RAFT_INLINE_FUNCTION bool operator<(const KeyValuePair<_Key, _Value>& b) const
+  {
+    return (key < b.key) || ((key == b.key) && value < b.value);
+  }
+
+  RAFT_INLINE_FUNCTION bool operator>(const KeyValuePair<_Key, _Value>& b) const
+  {
+    return (key > b.key) || ((key == b.key) && value > b.value);
+  }
 };
+
+#ifdef _RAFT_HAS_CUDA
+template <typename _Key, typename _Value>
+RAFT_INLINE_FUNCTION KeyValuePair<_Key, _Value> shfl_xor(const KeyValuePair<_Key, _Value>& input,
+                                                         int laneMask,
+                                                         int width     = WarpSize,
+                                                         uint32_t mask = 0xffffffffu)
+{
+  return KeyValuePair<_Key, _Value>(shfl_xor(input.key, laneMask, width, mask),
+                                    shfl_xor(input.value, laneMask, width, mask));
+}
+#endif
 }  // end namespace raft
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
new file mode 100644
index 0000000000..c5f08b84b7
--- /dev/null
+++ b/cpp/include/raft/core/math.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+
+#include <raft/core/detail/macros.hpp>
+
+namespace raft {
+
+/**
+ * @defgroup Absolute Absolute value
+ * @{
+ */
+template <typename T>
+RAFT_INLINE_FUNCTION auto abs(T x)
+  -> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
+                        std::is_same_v<int, T> || std::is_same_v<long int, T> ||
+                        std::is_same_v<long long int, T>,
+                      T>
+{
+#ifdef __CUDA_ARCH__
+  return ::abs(x);
+#else
+  return std::abs(x);
+#endif
+}
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto abs(T x)
+  -> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
+                        !std::is_same_v<int, T> && !std::is_same_v<long int, T> &&
+                        !std::is_same_v<long long int, T>,
+                      T>
+{
+  return x < T{0} ? -x : x;
+}
+/** @} */
+
+/**
+ * @defgroup Trigonometry Trigonometry functions
+ * @{
+ */
+/** Inverse cosine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto acos(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::acos(x);
+#else
+  return std::acos(x);
+#endif
+}
+
+/** Inverse sine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto asin(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::asin(x);
+#else
+  return std::asin(x);
+#endif
+}
+
+/** Inverse hyperbolic tangent */
+template <typename T>
+RAFT_INLINE_FUNCTION auto atanh(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::atanh(x);
+#else
+  return std::atanh(x);
+#endif
+}
+
+/** Cosine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto cos(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::cos(x);
+#else
+  return std::cos(x);
+#endif
+}
+
+/** Sine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sin(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::sin(x);
+#else
+  return std::sin(x);
+#endif
+}
+
+/** Sine and cosine of x, stored in *s and *c respectively (float and double only) */
+template <typename T>
+RAFT_INLINE_FUNCTION std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T>> sincos(
+  const T& x, T* s, T* c)
+{
+#ifdef __CUDA_ARCH__
+  ::sincos(x, s, c);
+#else
+  *s = std::sin(x);
+  *c = std::cos(x);
+#endif
+}
+
+/** Hyperbolic tangent */
+template <typename T>
+RAFT_INLINE_FUNCTION auto tanh(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::tanh(x);
+#else
+  return std::tanh(x);
+#endif
+}
+/** @} */
+
+/**
+ * @defgroup Exponential Exponential and logarithm
+ * @{
+ */
+/** Exponential function */
+template <typename T>
+RAFT_INLINE_FUNCTION auto exp(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::exp(x);
+#else
+  return std::exp(x);
+#endif
+}
+
+/** Natural logarithm */
+template <typename T>
+RAFT_INLINE_FUNCTION auto log(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::log(x);
+#else
+  return std::log(x);
+#endif
+}
+/** @} */
+
+/**
+ * @defgroup Maximum Maximum of two or more values.
+ *
+ * The CUDA Math API has overloads for all combinations of float/double. We provide similar
+ * functionality while wrapping around std::max, which only supports arguments of the same type.
+ * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is
+ * very error-prone so we do not support that and require the user to cast instead. (e.g. the max of
+ * -1 and 1u is 4294967295u...)
+ *
+ * When no overload matches, we provide a generic implementation but require that both types be the
+ * same (and that the less-than operator be defined).
+ * @{
+ */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
+{
+#ifdef __CUDA_ARCH__
+  // Combinations of types supported by the CUDA Math API
+  if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2> && std::is_same_v<T1, T2>) ||
+                ((std::is_same_v<T1, float> || std::is_same_v<T1, double>)&&(
+                  std::is_same_v<T2, float> || std::is_same_v<T2, double>))) {
+    return ::max(x, y);
+  }
+  // Else, check that the types are the same and provide a generic implementation
+  else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "No native max overload for these types. Both argument types must be the same to use "
+      "the generic max. Please cast appropriately.");
+    return (x < y) ? y : x;
+  }
+#else
+  if constexpr (std::is_same_v<T1, float> && std::is_same_v<T2, double>) {
+    return std::max(static_cast<double>(x), y);
+  } else if constexpr (std::is_same_v<T1, double> && std::is_same_v<T2, float>) {
+    return std::max(x, static_cast<double>(y));
+  } else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "std::max requires that both argument types be the same. Please cast appropriately.");
+    return std::max(x, y);
+  }
+#endif
+}
+
+/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
+template <typename T1, typename T2, typename... Args>
+RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args)
+{
+  return raft::max(x, raft::max(y, std::forward<Args>(args)...));
+}
+
+/** One-argument overload for convenience when using with variadic arguments */
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
+{
+  return x;
+}
+/** @} */
+
+/**
+ * @defgroup Minimum Minimum of two or more values.
+ *
+ * The CUDA Math API has overloads for all combinations of float/double. We provide similar
+ * functionality while wrapping around std::min, which only supports arguments of the same type.
+ * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is
+ * very error-prone so we do not support that and require the user to cast instead. (e.g. the min of
+ * -1 and 1u is 1u...)
+ *
+ * When no overload matches, we provide a generic implementation but require that both types be the
+ * same (and that the less-than operator be defined).
+ * @{
+ */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
+{
+#ifdef __CUDA_ARCH__
+  // Combinations of types supported by the CUDA Math API
+  if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2> && std::is_same_v<T1, T2>) ||
+                ((std::is_same_v<T1, float> || std::is_same_v<T1, double>)&&(
+                  std::is_same_v<T2, float> || std::is_same_v<T2, double>))) {
+    return ::min(x, y);
+  }
+  // Else, check that the types are the same and provide a generic implementation
+  else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "No native min overload for these types. Both argument types must be the same to use "
+      "the generic min. Please cast appropriately.");
+    return (y < x) ? y : x;
+  }
+#else
+  if constexpr (std::is_same_v<T1, float> && std::is_same_v<T2, double>) {
+    return std::min(static_cast<double>(x), y);
+  } else if constexpr (std::is_same_v<T1, double> && std::is_same_v<T2, float>) {
+    return std::min(x, static_cast<double>(y));
+  } else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "std::min requires that both argument types be the same. Please cast appropriately.");
+    return std::min(x, y);
+  }
+#endif
+}
+
+/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
+template <typename T1, typename T2, typename... Args>
+RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args)
+{
+  return raft::min(x, raft::min(y, std::forward<Args>(args)...));
+}
+
+/** One-argument overload for convenience when using with variadic arguments */
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
+{
+  return x;
+}
+/** @} */
+
+/**
+ * @defgroup Power Power and root functions
+ * @{
+ */
+/** Power */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y)
+{
+#ifdef __CUDA_ARCH__
+  return ::pow(x, y);
+#else
+  return std::pow(x, y);
+#endif
+}
+
+/** Square root */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sqrt(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::sqrt(x);
+#else
+  return std::sqrt(x);
+#endif
+}
+/** @} */
+
+/** Sign of val: 1 if val is positive, -1 if it is negative, 0 otherwise */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sgn(T val) -> int
+{
+  return (T(0) < val) - (val < T(0));
+}
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp
index 786ce69f89..f805d20064 100644
--- a/cpp/include/raft/core/mdspan.hpp
+++ b/cpp/include/raft/core/mdspan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -304,4 +304,52 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx,
   }
 }
 
+/**
+ * @brief Const accessor specialization for default_accessor
+ *
+ * Builds the accessor for `const ElementType` by list-initializing it from the given accessor.
+ *
+ * @tparam ElementType the element type of the input accessor
+ * @param a accessor to convert
+ * @return std::experimental::default_accessor<std::add_const_t<ElementType>>
+ */
+template <class ElementType>
+std::experimental::default_accessor<std::add_const_t<ElementType>> accessor_of_const(
+  std::experimental::default_accessor<ElementType> a)
+{
+  return {a};
+}
+
+/**
+ * @brief Const accessor specialization for host_device_accessor
+ *
+ * Builds the host_device_accessor over `const ElementType` by list-initializing it from the given
+ * accessor; the memory type is carried over unchanged.
+ *
+ * @tparam ElementType the data type of the mdspan elements
+ * @tparam MemType the type of memory where the elements are stored.
+ * @param a host_device_accessor
+ * @return host_device_accessor<std::experimental::default_accessor<std::add_const_t<ElementType>>,
+ * MemType>
+ */
+template <class ElementType, memory_type MemType>
+host_device_accessor<std::experimental::default_accessor<std::add_const_t<ElementType>>, MemType>
+accessor_of_const(host_device_accessor<std::experimental::default_accessor<ElementType>, MemType> a)
+{
+  return {a};
+}
+
+/**
+ * @brief Create a view over the same data as the given mdspan, with const element type
+ *
+ * The data handle and the mapping are reused as they are; only the element type and the accessor
+ * are replaced by their const counterparts.
+ *
+ * @tparam ElementType the element type of the input mdspan (const is added to it in the result)
+ * @tparam Extents raft::extents for dimensions
+ * @tparam Layout policy for strides and layout ordering
+ * @tparam Accessor Accessor policy of the input mdspan
+ * @param mds raft::mdspan object
+ * @return raft::mdspan with element type std::add_const_t<ElementType>
+ */
+template <class ElementType, class Extents, class Layout, class Accessor>
+auto make_const_mdspan(mdspan<ElementType, Extents, Layout, Accessor> mds)
+{
+  using const_element_t = std::add_const_t<ElementType>;
+  auto const_accessor   = accessor_of_const(mds.accessor());
+  using const_mdspan_t  = mdspan<const_element_t, Extents, Layout, decltype(const_accessor)>;
+  return const_mdspan_t{mds.data_handle(), mds.mapping(), const_accessor};
+}
+
 }  // namespace raft
diff --git a/cpp/include/raft/core/operators.cuh b/cpp/include/raft/core/operators.cuh
new file mode 100644
index 0000000000..cafb404ef6
--- /dev/null
+++ b/cpp/include/raft/core/operators.cuh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/detail/macros.hpp>
+#include <raft/util/device_atomics.cuh>
+
+namespace raft {
+
+/**
+ * @defgroup DeviceFunctors Commonly used device-only functors.
+ * @{
+ */
+
+/**
+ * Device-only functor forwarding to `atomicAdd(address, val)` and returning its result
+ * (for the CUDA built-in overloads that is the value previously stored at `address`).
+ */
+struct atomic_add_op {
+  template <typename Type>
+  _RAFT_DEVICE _RAFT_FORCEINLINE Type operator()(Type* address, const Type& val)
+  {
+    return atomicAdd(address, val);
+  }
+};
+
+/**
+ * Device-only functor forwarding to `atomicMax(address, val)` and returning its result
+ * (for the CUDA built-in overloads that is the value previously stored at `address`).
+ */
+struct atomic_max_op {
+  template <typename Type>
+  _RAFT_DEVICE _RAFT_FORCEINLINE Type operator()(Type* address, const Type& val)
+  {
+    return atomicMax(address, val);
+  }
+};
+
+/**
+ * Device-only functor forwarding to `atomicMin(address, val)` and returning its result
+ * (for the CUDA built-in overloads that is the value previously stored at `address`).
+ */
+struct atomic_min_op {
+  template <typename Type>
+  _RAFT_DEVICE _RAFT_FORCEINLINE Type operator()(Type* address, const Type& val)
+  {
+    return atomicMin(address, val);
+  }
+};
+/** @} */
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp
new file mode 100644
index 0000000000..7acc907c49
--- /dev/null
+++ b/cpp/include/raft/core/operators.hpp
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/math.hpp>
+
+namespace raft {
+
+/**
+ * @defgroup operators Commonly used functors.
+ * The optional unused arguments are useful for kernels that pass the index along with the value.
+ * @{
+ */
+
+/** Returns a copy of its first argument; any further arguments are ignored. */
+struct identity_op {
+  template <typename Type, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  {
+    return in;
+  }
+};
+
+/** Accepts any arguments, does nothing and returns nothing. */
+struct void_op {
+  template <typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION void operator()(UnusedArgs...) const
+  {
+    return;
+  }
+};
+
+/**
+ * Converts its first argument to `OutT` with `static_cast`; any further arguments are ignored.
+ *
+ * @tparam OutT Destination type of the conversion
+ */
+template <typename OutT>
+struct cast_op {
+  template <typename InT, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(InT in, UnusedArgs...) const
+  {
+    return static_cast<OutT>(in);
+  }
+};
+
+/** Returns the `key` member of its first argument; any further arguments are ignored. */
+struct key_op {
+  template <typename KVP, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const KVP& p, UnusedArgs...) const
+  {
+    return p.key;
+  }
+};
+
+/** Returns the `value` member of its first argument; any further arguments are ignored. */
+struct value_op {
+  template <typename KVP, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const KVP& p, UnusedArgs...) const
+  {
+    return p.value;
+  }
+};
+
+/** Returns `raft::sqrt` of its first argument; any further arguments are ignored. */
+struct sqrt_op {
+  template <typename Type, typename... UnusedArgs>
+  RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  {
+    return raft::sqrt(in);
+  }
+};
+
+/** Non-zero indicator: `Type(1)` when the first argument differs from zero, else `Type(0)`. */
+struct nz_op {
+  template <typename Type, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  {
+    if (in != Type(0)) { return Type(1); }
+    return Type(0);
+  }
+};
+
+/** Returns `raft::abs` of its first argument; any further arguments are ignored. */
+struct abs_op {
+  template <typename Type, typename... UnusedArgs>
+  RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  {
+    return raft::abs(in);
+  }
+};
+
+/** Returns the square (`in * in`) of its first argument; any further arguments are ignored. */
+struct sq_op {
+  template <typename Type, typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  {
+    return in * in;
+  }
+};
+
+/** Binary functor returning `a + b`; the operand types may differ. */
+struct add_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a + b;
+  }
+};
+
+/** Binary functor returning `a - b`; the operand types may differ. */
+struct sub_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a - b;
+  }
+};
+
+/** Binary functor returning `a * b`; the operand types may differ. */
+struct mul_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a * b;
+  }
+};
+
+/** Binary functor returning `a / b`; the divisor is not checked against zero. */
+struct div_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a / b;
+  }
+};
+
+/** Binary functor returning `a / b`, or zero when the divisor `b` compares equal to zero. */
+struct div_checkzero_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    // Zero is spelled `T1{0} / T2{1}` so that this return has the same type as `a / b`, which
+    // return type deduction requires.
+    if (b == T2{0}) { return T1{0} / T2{1}; }
+    return a / b;
+  }
+};
+
+/** Binary functor returning `raft::pow(a, b)`; both operands must have the same type. */
+struct pow_op {
+  template <typename Type>
+  RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  {
+    return raft::pow(a, b);
+  }
+};
+
+/** Binary functor returning `a % b`; the operand types may differ. */
+struct mod_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a % b;
+  }
+};
+
+/** Variadic functor forwarding all of its arguments to `raft::min`. */
+struct min_op {
+  template <typename... Args>
+  RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    return raft::min(std::forward<Args>(args)...);
+  }
+};
+
+/** Variadic functor forwarding all of its arguments to `raft::max`. */
+struct max_op {
+  template <typename... Args>
+  RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    return raft::max(std::forward<Args>(args)...);
+  }
+};
+
+/** Binary functor over key-value pairs that returns the pair holding the smaller value. */
+struct argmin_op {
+  template <typename KVP>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const KVP& a, const KVP& b) const
+  {
+    // `b` wins when its value is strictly smaller, or on equal values when its key is smaller.
+    const bool take_b = (b.value < a.value) || ((a.value == b.value) && (b.key < a.key));
+    return take_b ? b : a;
+  }
+};
+
+/** Binary functor over key-value pairs that returns the pair holding the larger value. */
+struct argmax_op {
+  template <typename KVP>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const KVP& a, const KVP& b) const
+  {
+    // `b` wins when its value is strictly larger, or on equal values when its key is smaller.
+    const bool take_b = (b.value > a.value) || ((a.value == b.value) && (b.key < a.key));
+    return take_b ? b : a;
+  }
+};
+
+/** Binary predicate returning `a > b`. */
+struct greater_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a > b;
+  }
+};
+
+/** Binary predicate returning `a < b`. */
+struct less_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a < b;
+  }
+};
+
+/** Binary predicate returning `a >= b`. */
+struct greater_or_equal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a >= b;
+  }
+};
+
+/** Binary predicate returning `a <= b`. */
+struct less_or_equal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a <= b;
+  }
+};
+
+/** Binary predicate returning `a == b`. */
+struct equal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a == b;
+  }
+};
+
+/** Binary predicate returning `a != b`. */
+struct notequal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a != b;
+  }
+};
+
+/**
+ * Functor that ignores all of its arguments and always returns the scalar it was built with.
+ *
+ * @tparam ScalarT Type of the stored constant
+ */
+template <typename ScalarT>
+struct const_op {
+  // Value returned by every call.
+  const ScalarT scalar;
+
+  constexpr explicit const_op(const ScalarT& s) : scalar{s} {}
+
+  template <typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(Args...) const
+  {
+    return scalar;
+  }
+};
+
+/**
+ * @brief Wraps around a binary operator, passing a constant on the right-hand side.
+ *
+ * Calling the wrapper with `a` evaluates `composed_op(a, c)`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/operators.hpp>
+ *
+ *  raft::plug_const_op<float, raft::mul_op> op(2.0f);
+ *  std::cout << op(2.1f) << std::endl;  // 4.2
+ * @endcode
+ *
+ * @tparam ConstT Type of the constant plugged in as the right-hand operand
+ * @tparam BinaryOpT Type of the wrapped binary operator
+ */
+template <typename ConstT, typename BinaryOpT>
+struct plug_const_op {
+  // Constant passed as the second argument of every call to composed_op.
+  const ConstT c;
+  // Wrapped binary operator.
+  const BinaryOpT composed_op;
+
+  // Constant-only constructor; participates only when BinaryOpT is default-constructible.
+  template <typename OpT     = BinaryOpT,
+            typename UnusedT = std::enable_if_t<std::is_default_constructible_v<OpT>>>
+  constexpr explicit plug_const_op(const ConstT& s)
+    : c{s}, composed_op{}  // The compiler complains if composed_op is not initialized explicitly
+  {
+  }
+  // Constructor taking both the constant and an operator instance to copy.
+  constexpr plug_const_op(const ConstT& s, BinaryOpT o) : c{s}, composed_op{o} {}
+
+  template <typename InT>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(InT a) const
+  {
+    return composed_op(a, c);
+  }
+};
+
+/*
+ * Convenience aliases: each one plugs a constant of type `Type` in as the right-hand operand of
+ * the named binary functor (see plug_const_op).
+ */
+template <typename Type>
+using add_const_op = plug_const_op<Type, add_op>;
+
+template <typename Type>
+using sub_const_op = plug_const_op<Type, sub_op>;
+
+template <typename Type>
+using mul_const_op = plug_const_op<Type, mul_op>;
+
+template <typename Type>
+using div_const_op = plug_const_op<Type, div_op>;
+
+template <typename Type>
+using div_checkzero_const_op = plug_const_op<Type, div_checkzero_op>;
+
+template <typename Type>
+using pow_const_op = plug_const_op<Type, pow_op>;
+
+template <typename Type>
+using mod_const_op = plug_const_op<Type, mod_op>;
+
+template <typename Type>
+using equal_const_op = plug_const_op<Type, equal_op>;
+
+/**
+ * @brief Constructs an operator by composing a chain of operators.
+ *
+ * The operators are applied from last to first: the last operator in the list receives the call
+ * arguments, and each earlier operator receives the result of the one after it.
+ * Note that all arguments are passed to the innermost operator.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/operators.hpp>
+ *
+ *  auto op = raft::compose_op(raft::sqrt_op(), raft::abs_op(), raft::cast_op<float>(),
+ *                             raft::add_const_op<int>(8));
+ *  std::cout << op(-50) << std::endl;  // 6.48074
+ * @endcode
+ *
+ * @tparam OpsT Any number of operation types.
+ */
+template <typename... OpsT>
+struct compose_op {
+  // Operator instances, stored in the order given; the last element is applied first.
+  const std::tuple<OpsT...> ops;
+
+  // Default constructor; participates only when every operator is default-constructible.
+  template <typename TupleT = std::tuple<OpsT...>,
+            typename CondT  = std::enable_if_t<std::is_default_constructible_v<TupleT>>>
+  constexpr compose_op()
+  {
+  }
+  constexpr explicit compose_op(OpsT... ops) : ops{ops...} {}
+
+  template <typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    return compose<sizeof...(OpsT)>(std::forward<Args>(args)...);
+  }
+
+ private:
+  // Applies operator number RemOps - 1 to the arguments, then recurses on its result with one
+  // operator fewer; with no operator left the value goes through identity_op.
+  template <size_t RemOps, typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto compose(Args&&... args) const
+  {
+    if constexpr (RemOps > 0) {
+      return compose<RemOps - 1>(std::get<RemOps - 1>(ops)(std::forward<Args>(args)...));
+    } else {
+      return identity_op{}(std::forward<Args>(args)...);
+    }
+  }
+};
+
+/** Absolute difference: applies sub_op to the two arguments, then abs_op to the result. */
+using absdiff_op = compose_op<abs_op, sub_op>;
+
+/** Squared difference: applies sub_op to the two arguments, then sq_op to the result. */
+using sqdiff_op = compose_op<sq_op, sub_op>;
+
+/**
+ * @brief Constructs an operator by composing an outer op with one inner op for each of its inputs.
+ *
+ * The i-th call argument is passed through the i-th inner op, and the outer op is then called with
+ * all of the mapped results. The number of call arguments must equal the number of inner ops; this
+ * is enforced at compile time.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/operators.hpp>
+ *
+ *  raft::map_args_op<raft::add_op, raft::sqrt_op, raft::cast_op<float>> op;
+ *  std::cout << op(42.0f, 10) << std::endl;  // 16.4807
+ * @endcode
+ *
+ * @tparam OuterOpT Outer operation type
+ * @tparam ArgOpsT Operation types for each input of the outer operation
+ */
+template <typename OuterOpT, typename... ArgOpsT>
+struct map_args_op {
+  // Operator applied to the mapped arguments.
+  const OuterOpT outer_op;
+  // One mapping operator per argument, in argument order.
+  const std::tuple<ArgOpsT...> arg_ops;
+
+  // Default constructor; participates only when the outer op and all inner ops are
+  // default-constructible.
+  template <typename T1    = OuterOpT,
+            typename T2    = std::tuple<ArgOpsT...>,
+            typename CondT = std::enable_if_t<std::is_default_constructible_v<T1> &&
+                                              std::is_default_constructible_v<T2>>>
+  constexpr map_args_op()
+    : outer_op{}  // The compiler complains if outer_op is not initialized explicitly
+  {
+  }
+  constexpr explicit map_args_op(OuterOpT outer_op, ArgOpsT... arg_ops)
+    : outer_op{outer_op}, arg_ops{arg_ops...}
+  {
+  }
+
+  template <typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    constexpr size_t kNumOps = sizeof...(ArgOpsT);
+    static_assert(kNumOps == sizeof...(Args),
+                  "The number of arguments does not match the number of mapping operators");
+    return map_args(std::make_index_sequence<kNumOps>{}, std::forward<Args>(args)...);
+  }
+
+ private:
+  // Expands the index pack and the argument pack together, pairing argument I with inner op I.
+  template <size_t... I, typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto map_args(std::index_sequence<I...>, Args&&... args) const
+  {
+    return outer_op(std::get<I>(arg_ops)(std::forward<Args>(args))...);
+  }
+};
+
+/** @} */
+}  // namespace raft
diff --git a/cpp/include/raft/core/resource/comms.hpp b/cpp/include/raft/core/resource/comms.hpp
new file mode 100644
index 0000000000..73de166c14
--- /dev/null
+++ b/cpp/include/raft/core/resource/comms.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/comms.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+/** Resource that holds a shared pointer to a comms::comms_t communicator. */
+class comms_resource : public resource {
+ public:
+  comms_resource(std::shared_ptr<comms::comms_t> communicator) : communicator_(communicator) {}
+
+  /** @return type-erased pointer to the stored std::shared_ptr<comms::comms_t> */
+  void* get_resource() override { return &communicator_; }
+
+  ~comms_resource() override {}
+
+ private:
+  std::shared_ptr<comms::comms_t> communicator_;
+};
+
+/**
+ * Factory that knows how to construct a comms_resource, used to populate a
+ * raft::resources instance.
+ */
+class comms_resource_factory : public resource_factory {
+ public:
+  comms_resource_factory(std::shared_ptr<comms::comms_t> communicator) : communicator_(communicator)
+  {
+  }
+
+  resource_type get_resource_type() override { return resource_type::COMMUNICATOR; }
+
+  // Returns a newly allocated comms_resource sharing this factory's communicator.
+  resource* make_resource() override { return new comms_resource(communicator_); }
+
+ private:
+  std::shared_ptr<comms::comms_t> communicator_;
+};
+
+/**
+ * @defgroup resource_comms Comms resource functions
+ * @{
+ */
+
+/**
+ * Tell whether a communicator factory has been registered on the resources instance.
+ * @param[in] res the raft resources object
+ * @return true if a factory for resource_type::COMMUNICATOR is present
+ */
+inline bool comms_initialized(resources const& res)
+{
+  return res.has_resource_factory(resource_type::COMMUNICATOR);
+}
+
+/**
+ * Get the communicator stored on the resources instance.
+ * Fails through RAFT_EXPECTS when no communicator has been set.
+ * @param[in] res the raft resources object
+ * @return reference to the communicator
+ */
+inline comms::comms_t const& get_comms(resources const& res)
+{
+  RAFT_EXPECTS(comms_initialized(res), "ERROR: Communicator was not initialized\n");
+  using comms_ptr_t = std::shared_ptr<comms::comms_t>;
+  // The resource stores a shared_ptr; dereference once for it and once for the communicator.
+  auto* holder = res.get_resource<comms_ptr_t>(resource_type::COMMUNICATOR);
+  return **holder;
+}
+
+/**
+ * Register a factory on the resources instance that provides the given communicator.
+ * @param[in] res the raft resources object
+ * @param[in] communicator shared pointer to the communicator to store
+ */
+inline void set_comms(resources const& res, std::shared_ptr<comms::comms_t> communicator)
+{
+  res.add_resource_factory(std::make_shared<comms_resource_factory>(communicator));
+}
+
+/**
+ * @}
+ */
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cublas_handle.hpp b/cpp/include/raft/core/resource/cublas_handle.hpp
new file mode 100644
index 0000000000..710fcc7e60
--- /dev/null
+++ b/cpp/include/raft/core/resource/cublas_handle.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cublas_v2.h>
+#include <raft/core/cublas_macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+
+/**
+ * Resource that creates a cuBLAS handle bound to the given stream on construction and destroys
+ * it on destruction.
+ */
+class cublas_resource : public resource {
+ public:
+  cublas_resource(rmm::cuda_stream_view stream)
+  {
+    // NOTE(review): the *_NO_THROW macro presumably reports failures without throwing, leaving
+    // the handle unusable if creation fails; confirm in raft/core/cublas_macros.hpp.
+    RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_res));
+    RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_res, stream));
+  }
+
+  ~cublas_resource() override { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_res)); }
+
+  /** @return type-erased pointer to the stored cublasHandle_t */
+  void* get_resource() override { return &cublas_res; }
+
+ private:
+  cublasHandle_t cublas_res;
+};
+
+/**
+ * Factory that knows how to construct a cublas_resource, used to populate a
+ * raft::resources instance.
+ */
+class cublas_resource_factory : public resource_factory {
+ public:
+  cublas_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {}
+  resource_type get_resource_type() override { return resource_type::CUBLAS_HANDLE; }
+  // Returns a newly allocated cublas_resource bound to this factory's stream.
+  resource* make_resource() override { return new cublas_resource(stream_); }
+
+ private:
+  rmm::cuda_stream_view stream_;
+};
+
+/**
+ * @defgroup resource_cublas cuBLAS handle resource functions
+ * @{
+ */
+
+/**
+ * Load a cublasHandle_t from the raft resources if a factory for it exists; otherwise register a
+ * factory bound to the main CUDA stream of the resources first, then return the handle.
+ * @param[in] res the raft resources object
+ * @return cublas handle
+ */
+inline cublasHandle_t get_cublas_handle(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::CUBLAS_HANDLE)) {
+    cudaStream_t stream = get_cuda_stream(res);
+    res.add_resource_factory(std::make_shared<cublas_resource_factory>(stream));
+  }
+  return *res.get_resource<cublasHandle_t>(resource_type::CUBLAS_HANDLE);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cuda_event.hpp b/cpp/include/raft/core/resource/cuda_event.hpp
new file mode 100644
index 0000000000..4859d95ee9
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_event.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::resource {
+
+/**
+ * Resource that creates a CUDA event with the cudaEventDisableTiming flag on construction and
+ * destroys it on destruction.
+ */
+class cuda_event_resource : public resource {
+ public:
+  cuda_event_resource()
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+  }
+  /** @return type-erased pointer to the stored cudaEvent_t */
+  void* get_resource() override { return &event_; }
+
+  ~cuda_event_resource() override { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); }
+
+ private:
+  cudaEvent_t event_;
+};
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp
new file mode 100644
index 0000000000..fc69f10d83
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_stream.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/interruptible.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::resource {
+/**
+ * Resource that stores an rmm::cuda_stream_view; defaults to rmm::cuda_stream_per_thread.
+ * Only the view is stored: nothing is created in the constructor or released in the destructor.
+ */
+class cuda_stream_resource : public resource {
+ public:
+  cuda_stream_resource(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread)
+    : stream(stream_view)
+  {
+  }
+  /** @return type-erased pointer to the stored rmm::cuda_stream_view */
+  void* get_resource() override { return &stream; }
+
+  ~cuda_stream_resource() override {}
+
+ private:
+  rmm::cuda_stream_view stream;
+};
+
+/**
+ * Factory that knows how to construct a cuda_stream_resource to populate
+ * the resources instance. The stream view defaults to rmm::cuda_stream_per_thread.
+ */
+class cuda_stream_resource_factory : public resource_factory {
+ public:
+  cuda_stream_resource_factory(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread)
+    : stream(stream_view)
+  {
+  }
+  resource_type get_resource_type() override { return resource_type::CUDA_STREAM_VIEW; }
+  // Returns a newly allocated cuda_stream_resource holding this factory's stream view.
+  resource* make_resource() override { return new cuda_stream_resource(stream); }
+
+ private:
+  rmm::cuda_stream_view stream;
+};
+
+/**
+ * @defgroup resource_cuda_stream CUDA stream resource functions
+ * @{
+ */
+/**
+ * Load a rmm::cuda_stream_view from a resources instance. If no stream factory has been
+ * registered yet, a default one (using rmm::cuda_stream_per_thread) is registered first.
+ * @param[in] res raft res object for managing resources
+ * @return the main CUDA stream view stored on the resources instance
+ */
+inline rmm::cuda_stream_view get_cuda_stream(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::CUDA_STREAM_VIEW)) {
+    res.add_resource_factory(std::make_shared<cuda_stream_resource_factory>());
+  }
+  return *res.get_resource<rmm::cuda_stream_view>(resource_type::CUDA_STREAM_VIEW);
+}
+
+/**
+ * Set the main CUDA stream of a resources instance by registering a stream factory for the
+ * given stream view.
+ * @param[in] res raft resources object for managing resources
+ * @param[in] stream_view cuda stream view
+ */
+inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_view)
+{
+  res.add_resource_factory(std::make_shared<cuda_stream_resource_factory>(stream_view));
+}
+
+/**
+ * @brief synchronize a specific stream
+ *
+ * Blocks the calling thread with cudaStreamSynchronize; a CUDA error is reported through
+ * RAFT_CUDA_TRY.
+ *
+ * @param[in] res the raft resources object (not used by the current implementation)
+ * @param[in] stream stream to synchronize
+ */
+inline void sync_stream(const resources& res, rmm::cuda_stream_view stream)
+{
+  // TODO: Fix interruptible segfault:
+  // https://github.com/rapidsai/raft/issues/1225
+  // interruptible::synchronize(stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+}
+
+/**
+ * @brief synchronize main stream on the resources instance
+ *
+ * @param[in] res the raft resources object whose main stream (see get_cuda_stream) is synchronized
+ */
+inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); }
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resource/cuda_stream_pool.hpp b/cpp/include/raft/core/resource/cuda_stream_pool.hpp
new file mode 100644
index 0000000000..dbce75b3a5
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_stream_pool.hpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/cuda_event.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/detail/stream_sync_event.hpp>
+
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+
+namespace raft::resource {
+
+/** Resource that holds a shared pointer (possibly null) to an rmm::cuda_stream_pool. */
+class cuda_stream_pool_resource : public resource {
+ public:
+  cuda_stream_pool_resource(std::shared_ptr<rmm::cuda_stream_pool> stream_pool)
+    : stream_pool_(stream_pool)
+  {
+  }
+
+  ~cuda_stream_pool_resource() override {}
+  /** @return type-erased pointer to the stored std::shared_ptr<rmm::cuda_stream_pool> */
+  void* get_resource() override { return &stream_pool_; }
+
+ private:
+  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
+};
+
+/**
+ * Factory that knows how to construct a cuda_stream_pool_resource, used to populate a
+ * raft::resources instance. When built without an argument it carries a null pool pointer.
+ */
+class cuda_stream_pool_resource_factory : public resource_factory {
+ public:
+  cuda_stream_pool_resource_factory(std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr})
+    : stream_pool_(stream_pool)
+  {
+  }
+
+  resource_type get_resource_type() override { return resource_type::CUDA_STREAM_POOL; }
+  // Returns a newly allocated cuda_stream_pool_resource sharing this factory's pool pointer.
+  resource* make_resource() override { return new cuda_stream_pool_resource(stream_pool_); }
+
+ private:
+  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
+};
+
+/**
+ * Tell whether a non-null stream pool is available on the given resources instance.
+ *
+ * @param[in] res the raft resources object
+ * @return true only if a stream pool factory has been registered on `res` and the pool pointer it
+ *         provides is not null
+ */
+inline bool is_stream_pool_initialized(const resources& res)
+{
+  // Query the resource only once a factory is known to exist, as get_cuda_stream_pool does;
+  // looking up a resource type that has no factory is not assumed to be safe.
+  if (!res.has_resource_factory(resource_type::CUDA_STREAM_POOL)) { return false; }
+  return *res.get_resource<std::shared_ptr<rmm::cuda_stream_pool>>(
+           resource_type::CUDA_STREAM_POOL) != nullptr;
+}
+
+/**
+ * @defgroup resource_stream_pool CUDA Stream pool resource functions
+ * @{
+ */
+
+/**
+ * Load the cuda_stream_pool held by the given resources instance.
+ *
+ * If no stream pool factory has been registered yet, a default factory is registered first. That
+ * default factory carries a null pool, so a usable pool must have been provided (see
+ * set_cuda_stream_pool) for this call to succeed.
+ *
+ * @param[in] res raft res object for managing resources
+ * @return reference to the stream pool
+ */
+inline const rmm::cuda_stream_pool& get_cuda_stream_pool(const resources& res)
+{
+  if (!res.has_resource_factory(resource_type::CUDA_STREAM_POOL)) {
+    res.add_resource_factory(std::make_shared<cuda_stream_pool_resource_factory>());
+  }
+  auto* pool =
+    res.get_resource<std::shared_ptr<rmm::cuda_stream_pool>>(resource_type::CUDA_STREAM_POOL);
+  // The stored shared_ptr is null when the factory was default-constructed; fail loudly instead
+  // of dereferencing it.
+  RAFT_EXPECTS(*pool != nullptr, "ERROR: rmm::cuda_stream_pool was not initialized");
+  return **pool;
+}
+
+/**
+ * Explicitly set a stream pool on the current res. Note that this will overwrite
+ * an existing stream pool on the res.
+ * @param[in] res the raft resources object
+ * @param[in] stream_pool shared pointer to the stream pool to store
+ */
+inline void set_cuda_stream_pool(const resources& res,
+                                 std::shared_ptr<rmm::cuda_stream_pool> stream_pool)
+{
+  res.add_resource_factory(std::make_shared<cuda_stream_pool_resource_factory>(stream_pool));
+}
+
+/**
+ * @brief return the number of streams in the stream pool, or 0 when no pool is initialized
+ *
+ * @param[in] res the raft resources object
+ */
+inline std::size_t get_stream_pool_size(const resources& res)
+{
+  if (!is_stream_pool_initialized(res)) { return 0; }
+  return get_cuda_stream_pool(res).get_pool_size();
+}
+
+/**
+ * @brief return stream from pool
+ *
+ * Fails through RAFT_EXPECTS when no stream pool is initialized.
+ *
+ * @param[in] res the raft resources object
+ */
+inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  return get_cuda_stream_pool(res).get_stream();
+}
+
+/**
+ * @brief return stream from pool at index
+ *
+ * Fails through RAFT_EXPECTS when no stream pool is initialized.
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream_idx index passed on to the pool's get_stream
+ */
+inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res,
+                                                         std::size_t stream_idx)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  return get_cuda_stream_pool(res).get_stream(stream_idx);
+}
+
+/**
+ * @brief return a stream from the pool when a pool is initialized, else the main stream on res
+ *
+ * @param[in] res the raft resources object
+ */
+inline rmm::cuda_stream_view get_next_usable_stream(const resources& res)
+{
+  if (is_stream_pool_initialized(res)) { return get_stream_from_stream_pool(res); }
+  return get_cuda_stream(res);
+}
+
+/**
+ * @brief return the stream at the given pool index when a pool is initialized, else the main
+ * stream on res
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream_idx the required index of the stream in the stream pool if available
+ */
+inline rmm::cuda_stream_view get_next_usable_stream(const resources& res, std::size_t stream_idx)
+{
+  if (is_stream_pool_initialized(res)) { return get_stream_from_stream_pool(res, stream_idx); }
+  return get_cuda_stream(res);
+}
+
+/**
+ * @brief synchronize the stream pool on the res
+ *
+ * Does nothing when no stream pool is initialized.
+ *
+ * @param[in] res the raft resources object
+ */
+inline void sync_stream_pool(const resources& res)
+{
+  if (!is_stream_pool_initialized(res)) { return; }
+  // Look the pool and its size up once instead of on every loop iteration.
+  const auto& pool            = get_cuda_stream_pool(res);
+  const std::size_t n_streams = pool.get_pool_size();
+  for (std::size_t i = 0; i < n_streams; i++) {
+    sync_stream(res, pool.get_stream(i));
+  }
+}
+
+/**
+ * @brief synchronize subset of stream pool
+ *
+ * Fails through RAFT_EXPECTS when no stream pool is initialized.
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream_indices the indices of the streams in the stream pool to synchronize
+ */
+inline void sync_stream_pool(const resources& res, const std::vector<std::size_t>& stream_indices)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  // Look the pool up once instead of on every loop iteration.
+  const auto& pool = get_cuda_stream_pool(res);
+  for (const auto& stream_index : stream_indices) {
+    sync_stream(res, pool.get_stream(stream_index));
+  }
+}
+
+/**
+ * @brief ask stream pool to wait on last event in main stream
+ *
+ * Records the sync event on the main stream, then makes every stream of the pool wait on it.
+ * When no stream pool is initialized only the event is recorded.
+ *
+ * @param[in] res the raft resources object
+ */
+inline void wait_stream_pool_on_stream(const resources& res)
+{
+  cudaEvent_t event = detail::get_cuda_stream_sync_event(res);
+  RAFT_CUDA_TRY(cudaEventRecord(event, get_cuda_stream(res)));
+  if (!is_stream_pool_initialized(res)) { return; }
+  // Look the pool and its size up once instead of on every loop iteration.
+  const auto& pool            = get_cuda_stream_pool(res);
+  const std::size_t n_streams = pool.get_pool_size();
+  for (std::size_t i = 0; i < n_streams; i++) {
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(pool.get_stream(i), event, 0));
+  }
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cusolver_dn_handle.hpp b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp
new file mode 100644
index 0000000000..7a33e2dd2a
--- /dev/null
+++ b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cuda_stream.hpp"
+#include <cusolverDn.h>
+#include <raft/core/cusolver_macros.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::resource {
+
+/**
+ * Resource owning a cusolverDnHandle_t that is created on, and bound to, a given CUDA stream.
+ */
+class cusolver_dn_resource : public resource {
+ public:
+  cusolver_dn_resource(rmm::cuda_stream_view stream)
+  {
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_res));
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_res, stream));
+  }
+
+  void* get_resource() override { return &cusolver_res; }
+
+  ~cusolver_dn_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_res)); }
+
+ private:
+  cusolverDnHandle_t cusolver_res{};  // null until cusolverDnCreate sets it, never indeterminate
+};
+
+/**
+ * @defgroup resource_cusolver_dn cuSolver DN handle resource functions
+ * @{
+ */
+
+/**
+ * Factory that creates cusolver_dn_resource
+ * instances bound to a fixed CUDA stream, used
+ * to populate a raft::resources container.
+ */
+class cusolver_dn_resource_factory : public resource_factory {
+ public:
+  cusolver_dn_resource_factory(rmm::cuda_stream_view stream) : bound_stream_(stream) {}
+  resource* make_resource() override { return new cusolver_dn_resource(bound_stream_); }
+  resource_type get_resource_type() override { return resource_type::CUSOLVER_DN_HANDLE; }
+
+ private:
+  rmm::cuda_stream_view bound_stream_;
+};
+
+/**
+ * Fetch the cusolverDnHandle_t stored on the raft res,
+ * registering a factory for it first if none exists yet.
+ * @param[in] res the raft resources object
+ * @return cusolver dn handle
+ */
+inline cusolverDnHandle_t get_cusolver_dn_handle(resources const& res)
+{
+  constexpr auto key = resource_type::CUSOLVER_DN_HANDLE;
+  if (!res.has_resource_factory(key)) {
+    res.add_resource_factory(std::make_shared<cusolver_dn_resource_factory>(get_cuda_stream(res)));
+  }
+  return *res.get_resource<cusolverDnHandle_t>(key);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cusolver_sp_handle.hpp b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp
new file mode 100644
index 0000000000..61fd95b44f
--- /dev/null
+++ b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cusolverSp.h>
+#include <raft/core/cusolver_macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+
+/**
+ * Resource owning a cusolverSpHandle_t that is created on, and bound to, a given CUDA stream.
+ */
+class cusolver_sp_resource : public resource {
+ public:
+  cusolver_sp_resource(rmm::cuda_stream_view stream)
+  {
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_res));
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_res, stream));
+  }
+
+  void* get_resource() override { return &cusolver_res; }
+
+  ~cusolver_sp_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_res)); }
+
+ private:
+  cusolverSpHandle_t cusolver_res{};  // null until cusolverSpCreate sets it, never indeterminate
+};
+
+/**
+ * Factory that creates cusolver_sp_resource
+ * instances bound to a fixed CUDA stream, used
+ * to populate a raft::resources container.
+ */
+class cusolver_sp_resource_factory : public resource_factory {
+ public:
+  cusolver_sp_resource_factory(rmm::cuda_stream_view stream) : bound_stream_(stream) {}
+  resource* make_resource() override { return new cusolver_sp_resource(bound_stream_); }
+  resource_type get_resource_type() override { return resource_type::CUSOLVER_SP_HANDLE; }
+
+ private:
+  rmm::cuda_stream_view bound_stream_;
+};
+
+/**
+ * @defgroup resource_cusolver_sp cuSolver SP handle resource functions
+ * @{
+ */
+
+/**
+ * Fetch the cusolverSpHandle_t stored on the raft res,
+ * registering a factory for it first if none exists yet.
+ * @param[in] res the raft resources object
+ * @return cusolver sp handle
+ */
+inline cusolverSpHandle_t get_cusolver_sp_handle(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::CUSOLVER_SP_HANDLE)) {
+    auto factory = std::make_shared<cusolver_sp_resource_factory>(get_cuda_stream(res));
+    res.add_resource_factory(factory);
+  }
+  return *res.get_resource<cusolverSpHandle_t>(resource_type::CUSOLVER_SP_HANDLE);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cusparse_handle.hpp b/cpp/include/raft/core/resource/cusparse_handle.hpp
new file mode 100644
index 0000000000..9893ed2f86
--- /dev/null
+++ b/cpp/include/raft/core/resource/cusparse_handle.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cusparse_v2.h>
+#include <raft/core/cusparse_macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+/** Owns a cusparseHandle_t that is created on, and bound to, a given CUDA stream. */
+class cusparse_resource : public resource {
+ public:
+  cusparse_resource(rmm::cuda_stream_view stream)
+  {
+    RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_res));
+    RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_res, stream));
+  }
+  ~cusparse_resource() override { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_res)); }
+  void* get_resource() override { return &cusparse_res; }
+
+ private:
+  cusparseHandle_t cusparse_res{};  // null until cusparseCreate sets it, never indeterminate
+};
+
+/**
+ * Factory that creates cusparse_resource
+ * instances bound to a fixed CUDA stream, used
+ * to populate a raft::resources container.
+ */
+class cusparse_resource_factory : public resource_factory {
+ public:
+  cusparse_resource_factory(rmm::cuda_stream_view stream) : bound_stream_(stream) {}
+  resource* make_resource() override { return new cusparse_resource(bound_stream_); }
+  resource_type get_resource_type() override { return resource_type::CUSPARSE_HANDLE; }
+
+ private:
+  rmm::cuda_stream_view bound_stream_;
+};
+
+/**
+ * @defgroup resource_cusparse cuSparse handle resource functions
+ * @{
+ */
+
+/**
+ * Fetch the cusparseHandle_t stored on the raft res,
+ * registering a factory for it first if none exists yet.
+ * @param[in] res the raft resources object
+ * @return cusparse handle
+ */
+inline cusparseHandle_t get_cusparse_handle(resources const& res)
+{
+  constexpr auto key = resource_type::CUSPARSE_HANDLE;
+  if (!res.has_resource_factory(key)) {
+    res.add_resource_factory(std::make_shared<cusparse_resource_factory>(get_cuda_stream(res)));
+  }
+  return *res.get_resource<cusparseHandle_t>(key);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/detail/stream_sync_event.hpp b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp
new file mode 100644
index 0000000000..1d02fef20d
--- /dev/null
+++ b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/cuda_event.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::resource::detail {
+
+/**
+ * Factory that builds the cuda_event_resource registered under the
+ * CUDA_STREAM_SYNC_EVENT key of a raft::resources instance.
+ */
+class cuda_stream_sync_event_resource_factory : public resource_factory {
+ public:
+  resource* make_resource() override { return new cuda_event_resource(); }
+  resource_type get_resource_type() override { return resource_type::CUDA_STREAM_SYNC_EVENT; }
+};
+
+/**
+ * Load the cudaEvent_t used for syncing the main cuda stream (registering its factory if needed).
+ * @param res raft resources instance for managing resources
+ * @return reference to the cuda event stored on the resources instance
+ */
+inline cudaEvent_t& get_cuda_stream_sync_event(resources const& res)
+{
+  constexpr auto key = resource_type::CUDA_STREAM_SYNC_EVENT;
+  if (!res.has_resource_factory(key)) {
+    res.add_resource_factory(std::make_shared<cuda_stream_sync_event_resource_factory>());
+  }
+  return *res.get_resource<cudaEvent_t>(key);
+}
+
+}  // namespace raft::resource::detail
diff --git a/cpp/include/raft/core/resource/device_id.hpp b/cpp/include/raft/core/resource/device_id.hpp
new file mode 100644
index 0000000000..b55e56ca45
--- /dev/null
+++ b/cpp/include/raft/core/resource/device_id.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::resource {
+
+/** Resource caching the id of the CUDA device reported by cudaGetDevice at construction. */
+class device_id_resource : public resource {
+ public:
+  device_id_resource()
+  {
+    // non-throwing query that writes the current device id straight into the member
+    RAFT_CUDA_TRY_NO_THROW(cudaGetDevice(&dev_id_));
+  }
+
+  ~device_id_resource() override = default;
+
+  void* get_resource() override { return &dev_id_; }
+
+ private:
+  // starts at -1; overwritten by the cudaGetDevice call in the constructor
+  int dev_id_{-1};
+};
+
+/**
+ * Factory that creates device_id_resource
+ * instances to populate a raft::resources
+ * container.
+ */
+class device_id_resource_factory : public resource_factory {
+ public:
+  resource* make_resource() override { return new device_id_resource(); }
+  resource_type get_resource_type() override { return resource_type::DEVICE_ID; }
+};
+
+/**
+ * @defgroup resource_device_id Device ID resource functions
+ * @{
+ */
+
+/**
+ * Fetch the device id cached on a res, registering its factory first if needed.
+ * @param res raft res object for managing resources
+ * @return device id
+ */
+inline int get_device_id(resources const& res)
+{
+  const bool registered = res.has_resource_factory(resource_type::DEVICE_ID);
+  if (!registered) { res.add_resource_factory(std::make_shared<device_id_resource_factory>()); }
+  const int* dev_id = res.get_resource<int>(resource_type::DEVICE_ID);
+  return *dev_id;
+}
+
+/**
+ * @}
+ */
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
new file mode 100644
index 0000000000..35ae3d715f
--- /dev/null
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace raft::resource {
+/** Non-owning wrapper around an rmm device memory resource (current device resource if null). */
+class device_memory_resource : public resource {
+ public:
+  device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr)
+    : mr(mr_ != nullptr ? mr_ : rmm::mr::get_current_device_resource())
+  {
+  }
+  ~device_memory_resource() override = default;
+  void* get_resource() override { return mr; }
+
+ private:
+  rmm::mr::device_memory_resource* mr;
+};
+
+/**
+ * Factory for the WORKSPACE_RESOURCE slot of a resources instance: it wraps the
+ * rmm memory resource given at construction (possibly null) in a device_memory_resource.
+ */
+class workspace_resource_factory : public resource_factory {
+ public:
+  workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {}
+  resource* make_resource() override { return new device_memory_resource(mr); }
+  resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; }
+
+ private:
+  rmm::mr::device_memory_resource* mr;
+};
+
+/**
+ * Fetch the temp workspace memory resource held by a resources instance,
+ * registering a default workspace factory on it first if none exists.
+ * @param res raft resources object for managing resources
+ * @return device memory resource object
+ */
+inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res)
+{
+  constexpr auto key    = resource_type::WORKSPACE_RESOURCE;
+  const bool registered = res.has_resource_factory(key);
+  if (!registered) { res.add_resource_factory(std::make_shared<workspace_resource_factory>()); }
+  return res.get_resource<rmm::mr::device_memory_resource>(key);
+}
+
+/**
+ * Register, on a resources instance, a workspace factory wrapping the given memory resource.
+ * @param res raft resources object for managing resources
+ * @param mr a valid rmm device_memory_resource
+ */
+inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr)
+{
+  auto factory = std::make_shared<workspace_resource_factory>(mr);
+  res.add_resource_factory(factory);
+}
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resource/device_properties.hpp b/cpp/include/raft/core/resource/device_properties.hpp
new file mode 100644
index 0000000000..c3b0b8f2b9
--- /dev/null
+++ b/cpp/include/raft/core/resource/device_properties.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::resource {
+
+/** Resource caching the cudaDeviceProp queried for a given device id at construction time. */
+class device_properties_resource : public resource {
+ public:
+  device_properties_resource(int dev_id)
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id));
+  }
+  ~device_properties_resource() override = default;
+  void* get_resource() override { return &prop_; }
+
+ private:
+  cudaDeviceProp prop_;
+};
+
+/**
+ * @defgroup resource_device_props Device properties resource functions
+ * @{
+ */
+
+/**
+ * Factory that creates device_properties_resource
+ * instances for a fixed device id, used to populate
+ * a raft::resources container.
+ */
+class device_properties_resource_factory : public resource_factory {
+ public:
+  device_properties_resource_factory(int dev_id) : device_(dev_id) {}
+  resource* make_resource() override { return new device_properties_resource(device_); }
+  resource_type get_resource_type() override { return resource_type::DEVICE_PROPERTIES; }
+
+ private:
+  int device_;
+};
+
+/**
+ * Fetch the cudaDeviceProp cached on a res, registering its factory first if needed.
+ * @param res raft res object for managing resources
+ * @return populated cuda device properties instance
+ */
+inline cudaDeviceProp& get_device_properties(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::DEVICE_PROPERTIES)) {
+    auto factory = std::make_shared<device_properties_resource_factory>(get_device_id(res));
+    res.add_resource_factory(factory);
+  }
+  return *res.get_resource<cudaDeviceProp>(resource_type::DEVICE_PROPERTIES);
+}
+
+/**
+ * @}
+ */
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
new file mode 100644
index 0000000000..cf302e25f9
--- /dev/null
+++ b/cpp/include/raft/core/resource/resource_types.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace raft::resource {
+
+/**
+ * @defgroup resource_types Core resource vocabulary types
+ * @{
+ */
+
+/**
+ * @brief Keys identifying each kind of resource; they need not be host- or device-specific.
+ */
+enum resource_type {
+  // device-specific resource types
+  CUBLAS_HANDLE = 0,       // cuBLAS handle
+  CUSOLVER_DN_HANDLE,      // cuSOLVER dense (DN) handle
+  CUSOLVER_SP_HANDLE,      // cuSOLVER sparse (SP) handle
+  CUSPARSE_HANDLE,         // cuSPARSE handle
+  CUDA_STREAM_VIEW,        // view of a cuda stream
+  CUDA_STREAM_POOL,        // cuda stream pool
+  CUDA_STREAM_SYNC_EVENT,  // cuda event for syncing streams
+  COMMUNICATOR,            // raft communicator
+  SUB_COMMUNICATOR,        // raft sub communicator
+  DEVICE_PROPERTIES,       // cuda device properties
+  DEVICE_ID,               // cuda device id
+  THRUST_POLICY,           // thrust execution policy
+  WORKSPACE_RESOURCE,      // rmm device memory resource
+
+  LAST_KEY  // sentinel, must stay last: marks "no resource type" and equals the number of keys
+};
+
+/**
+ * @brief Common interface for every resource: an implementation
+ * builds and holds one instance of some pre-determined object type
+ * and exposes it, type-erased, through get_resource().
+ */
+class resource {
+ public:
+  virtual ~resource() = default;
+
+  virtual void* get_resource() = 0;
+};
+
+/** Placeholder resource for slots that hold nothing yet; get_resource() yields nullptr. */
+class empty_resource : public resource {
+ public:
+  empty_resource() = default;
+  ~empty_resource() override = default;
+
+  void* get_resource() override { return nullptr; }
+};
+
+/**
+ * @brief A factory that knows how to construct one specific raft::resource::resource.
+ */
+class resource_factory {
+ public:
+  virtual ~resource_factory() = default;  // factories are held via base-class pointers
+  /**
+   * @brief Return the resource_type associated with the current factory
+   * @return resource_type corresponding to the current factory
+   */
+  virtual resource_type get_resource_type() = 0;
+
+  /**
+   * @brief Construct an instance of the factory's underlying resource.
+   * @return resource instance
+   */
+  virtual resource* make_resource() = 0;
+};
+
+/**
+ * @brief Placeholder factory used for resource types that have no real
+ * factory registered yet; it reports LAST_KEY and produces empty_resource.
+ */
+class empty_resource_factory : public resource_factory {
+ public:
+  empty_resource_factory() : resource_factory() {}
+
+  /**
+   * @brief Return the resource_type associated with the current factory
+   * @return always LAST_KEY, i.e. "no real resource type"
+   */
+  resource_type get_resource_type() override { return resource_type::LAST_KEY; }
+
+  /**
+   * @brief Construct an instance of the factory's underlying resource.
+   * The instance is heap-allocated because raft::resources wraps the pointer returned
+   * by a factory in an owning shared_ptr; handing out a member's address would be freed.
+   * @return newly allocated empty_resource
+   */
+  resource* make_resource() override { return new empty_resource(); }
+};
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp
new file mode 100644
index 0000000000..7070b61c54
--- /dev/null
+++ b/cpp/include/raft/core/resource/sub_comms.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/comms.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+/** Resource holding the map from sub-communicator key to its shared comms_t instance. */
+class sub_comms_resource : public resource {
+ public:
+  sub_comms_resource() = default;
+  ~sub_comms_resource() override = default;
+  void* get_resource() override { return &communicators_; }
+
+ private:
+  std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> communicators_{};
+};
+
+/**
+ * Factory that creates sub_comms_resource
+ * instances to populate a raft::resources
+ * container.
+ */
+class sub_comms_resource_factory : public resource_factory {
+ public:
+  resource* make_resource() override { return new sub_comms_resource(); }
+  resource_type get_resource_type() override { return resource_type::SUB_COMMUNICATOR; }
+};
+
+/**
+ * @defgroup resource_subcomms Subcommunicator resource functions
+ * @{
+ */
+
+/** Return the sub-communicator stored under `key` (map::at throws if the key is absent). */
+inline const comms::comms_t& get_subcomm(const resources& res, std::string key)
+{
+  using subcomm_map_t = std::unordered_map<std::string, std::shared_ptr<comms::comms_t>>;
+  constexpr auto rtype = resource_type::SUB_COMMUNICATOR;
+
+  if (!res.has_resource_factory(rtype)) {
+    res.add_resource_factory(std::make_shared<sub_comms_resource_factory>());
+  }
+
+  std::shared_ptr<comms::comms_t> entry = res.get_resource<subcomm_map_t>(rtype)->at(key);
+  RAFT_EXPECTS(nullptr != entry.get(), "ERROR: Subcommunicator was not initialized");
+  return *entry;
+}
+
+/** Store `subcomm` under `key`, replacing any sub-communicator already stored for it. */
+inline void set_subcomm(resources const& res,
+                        std::string key,
+                        std::shared_ptr<comms::comms_t> subcomm)
+{
+  using subcomm_map_t = std::unordered_map<std::string, std::shared_ptr<comms::comms_t>>;
+  if (!res.has_resource_factory(resource_type::SUB_COMMUNICATOR)) {
+    res.add_resource_factory(std::make_shared<sub_comms_resource_factory>());
+  }
+  // insert() silently keeps an existing entry, so a "set" on a known key would be a no-op
+  (*res.get_resource<subcomm_map_t>(resource_type::SUB_COMMUNICATOR))[key] = subcomm;
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resource/thrust_policy.hpp b/cpp/include/raft/core/resource/thrust_policy.hpp
new file mode 100644
index 0000000000..1e7441e5e4
--- /dev/null
+++ b/cpp/include/raft/core/resource/thrust_policy.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <rmm/exec_policy.hpp>
+namespace raft::resource {
+/** Resource owning an rmm::exec_policy that was created for a given CUDA stream. */
+class thrust_policy_resource : public resource {
+ public:
+  thrust_policy_resource(rmm::cuda_stream_view stream_view)
+    : thrust_policy_(std::make_unique<rmm::exec_policy>(stream_view))
+  {
+  }
+  ~thrust_policy_resource() override = default;
+  void* get_resource() override { return thrust_policy_.get(); }
+
+ private:
+  std::unique_ptr<rmm::exec_policy> thrust_policy_;
+};
+
+/**
+ * Factory that creates thrust_policy_resource
+ * instances for a fixed CUDA stream, used to
+ * populate a raft::resources container.
+ */
+class thrust_policy_resource_factory : public resource_factory {
+ public:
+  thrust_policy_resource_factory(rmm::cuda_stream_view stream_view) : stream_(stream_view) {}
+  resource* make_resource() override { return new thrust_policy_resource(stream_); }
+  resource_type get_resource_type() override { return resource_type::THRUST_POLICY; }
+
+ private:
+  rmm::cuda_stream_view stream_;
+};
+
+/**
+ * @defgroup resource_thrust_policy Thrust policy resource functions
+ * @{
+ */
+
+/**
+ * Fetch the thrust policy cached on a res, registering its factory first if needed.
+ * @param res raft res object for managing resources
+ * @return thrust execution policy
+ */
+inline rmm::exec_policy& get_thrust_policy(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::THRUST_POLICY)) {
+    auto factory = std::make_shared<thrust_policy_resource_factory>(get_cuda_stream(res));
+    res.add_resource_factory(factory);
+  }
+  return *res.get_resource<rmm::exec_policy>(resource_type::THRUST_POLICY);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
\ No newline at end of file
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
new file mode 100644
index 0000000000..64e281e934
--- /dev/null
+++ b/cpp/include/raft/core/resources.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "resource/resource_types.hpp"
+#include <algorithm>
+#include <mutex>
+#include <raft/core/logger.hpp>
+#include <string>
+#include <vector>
+
+namespace raft {
+
+/**
+ * @brief Resource container which allows lazy-loading and registration
+ * of resource_factory implementations, which in turn generate resource instances.
+ *
+ * This class is intended to be agnostic of the resources it contains and
+ * does not, itself, differentiate between host and device resources. Downstream
+ * accessor functions can then register and load resources as needed in order
+ * to keep its usage somewhat opaque to end-users.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <raft/core/resource/cuda_stream.hpp>
+ * #include <raft/core/resource/cublas_handle.hpp>
+ *
+ * raft::resources res;
+ * auto stream = raft::resource::get_cuda_stream(res);
+ * auto cublas_handle = raft::resource::get_cublas_handle(res);
+ * @endcode
+ */
+class resources {
+ public:
+  template <typename T>
+  using pair_res = std::pair<resource::resource_type, std::shared_ptr<T>>;
+
+  using pair_res_factory = pair_res<resource::resource_factory>;
+  using pair_resource    = pair_res<resource::resource>;
+
+  resources()
+    : factories_(resource::resource_type::LAST_KEY), resources_(resource::resource_type::LAST_KEY)
+  {
+    for (int i = 0; i < resource::resource_type::LAST_KEY; ++i) {
+      factories_.at(i) = std::make_pair(resource::resource_type::LAST_KEY,
+                                        std::make_shared<resource::empty_resource_factory>());
+      resources_.at(i) = std::make_pair(resource::resource_type::LAST_KEY,
+                                        std::make_shared<resource::empty_resource>());
+    }
+  }
+
+  /**
+   * @brief Shallow copy of underlying resources instance.
+   * Note that this does not create any new resources.
+   */
+  resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {}
+  resources(resources&&) = delete;
+  resources& operator=(resources&&) = delete;
+
+  /**
+   * @brief Returns true if a resource_factory has been registered for the given resource_type.
+   * @param resource_type resource type to check
+   * @return true if resource_factory is registered for the given resource_type
+   */
+  bool has_resource_factory(resource::resource_type resource_type) const
+  {
+    std::lock_guard<std::mutex> _(mutex_);
+    return factories_.at(resource_type).first != resource::resource_type::LAST_KEY;
+  }
+
+  /**
+   * @brief Register a resource_factory with the current instance.
+   * This will overwrite any existing resource factory of the same type and discard the
+   * resource it may already have produced, so the next get_resource() uses the new factory.
+   * @param factory resource factory to register on the current instance
+   */
+  void add_resource_factory(std::shared_ptr<resource::resource_factory> factory) const
+  {
+    std::lock_guard<std::mutex> _(mutex_);
+    resource::resource_type rtype = factory.get()->get_resource_type();
+    RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY,
+                 "LAST_KEY is a placeholder and not a valid resource factory type.");
+    factories_.at(rtype) = std::make_pair(rtype, factory);
+    // Reset the slot to the "not yet created" state; otherwise a resource built by the
+    // previously registered factory would keep being returned and the overwrite ignored.
+    resources_.at(rtype) = std::make_pair(resource::resource_type::LAST_KEY,
+                                          std::make_shared<resource::empty_resource>());
+  }
+
+  /**
+   * @brief Retrieve a resource for the given resource_type and cast to given pointer type.
+   * Resources are loaded lazily: one that doesn't yet exist on the current instance is
+   * created on demand using the corresponding factory, if it exists.
+   * @tparam res_t pointer type for which retrieved resource will be casted
+   * @param resource_type resource type to retrieve
+   * @return the given resource, if it exists.
+   */
+  template <typename res_t>
+  res_t* get_resource(resource::resource_type resource_type) const
+  {
+    std::lock_guard<std::mutex> _(mutex_);
+    if (resources_.at(resource_type).first == resource::resource_type::LAST_KEY) {
+      RAFT_EXPECTS(factories_.at(resource_type).first != resource::resource_type::LAST_KEY,
+                   "No resource factory has been registered for the given resource %d.",
+                   resource_type);
+      resource::resource_factory* factory = factories_.at(resource_type).second.get();
+      resources_.at(resource_type)        = std::make_pair(
+        resource_type, std::shared_ptr<resource::resource>(factory->make_resource()));
+    }
+    return reinterpret_cast<res_t*>(resources_.at(resource_type).second->get_resource());
+  }
+
+ protected:
+  mutable std::mutex mutex_;
+  mutable std::vector<pair_res_factory> factories_;
+  mutable std::vector<pair_resource> resources_;
+};
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/core/serialize.hpp b/cpp/include/raft/core/serialize.hpp
new file mode 100644
index 0000000000..05814e2845
--- /dev/null
+++ b/cpp/include/raft/core/serialize.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/detail/mdspan_numpy_serializer.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+#include <iostream>
+#include <vector>
+
+/**
+ * Collection of serialization functions for RAFT data types
+ */
+
+namespace raft {
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void serialize_mdspan(  // host overload: forwards to detail::numpy_serializer
+  const raft::device_resources&,  // not used by the host overload
+  std::ostream& os,               // stream the mdspan is written to
+  const raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  detail::numpy_serializer::serialize_host_mdspan(os, obj);  // obj is passed through as-is
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void serialize_mdspan(
+  const raft::device_resources& handle,
+  std::ostream& os,
+  const raft::device_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  // Device overload: stage the elements in a host buffer, then reuse the host serializer.
+  constexpr bool is_row_major = std::is_same_v<LayoutPolicy, raft::layout_c_contiguous>;
+  constexpr bool is_col_major = std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>;
+  static_assert(is_row_major || is_col_major,
+                "The serializer only supports row-major and column-major layouts");
+  using src_t      = raft::device_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  using host_acc_t = raft::host_accessor<typename src_t::accessor_type::accessor_type>;
+  using staged_t   = raft::host_mdspan<ElementType, Extents, LayoutPolicy, host_acc_t>;
+
+  // Contiguous layout: size() == product of the extents, so the buffer holds every element.
+  std::vector<typename src_t::value_type> staging(obj.size());
+  cudaStream_t copy_stream = handle.get_stream();
+  raft::update_host(staging.data(), obj.data_handle(), obj.size(), copy_stream);
+  handle.sync_stream();  // synchronize before the host reads `staging`
+  staged_t staged(staging.data(), obj.extents());
+  detail::numpy_serializer::serialize_host_mdspan(os, staged);
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void serialize_mdspan(
+  const raft::device_resources&,
+  std::ostream& os,
+  const raft::managed_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  // Wraps the managed memory in a host mdspan (no copy). NOTE(review): no stream sync here.
+  using src_t       = raft::managed_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  using host_acc_t  = raft::host_accessor<typename src_t::accessor_type::accessor_type>;
+  using host_view_t = raft::host_mdspan<ElementType, Extents, LayoutPolicy, host_acc_t>;
+  host_view_t host_view(obj.data_handle(), obj.extents());
+  detail::numpy_serializer::serialize_host_mdspan(os, host_view);
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(  // host overload: forwards to detail::numpy_serializer
+  const raft::device_resources&,  // not used by the host overload
+  std::istream& is,               // stream the mdspan is read from
+  raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  detail::numpy_serializer::deserialize_host_mdspan(is, obj);  // obj is handed over as-is
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(
+  const raft::device_resources& handle,
+  std::istream& is,
+  raft::device_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  // Device overload: deserialize into a host buffer first, then copy it to the device.
+  constexpr bool is_row_major = std::is_same_v<LayoutPolicy, raft::layout_c_contiguous>;
+  constexpr bool is_col_major = std::is_same_v<LayoutPolicy, raft::layout_f_contiguous>;
+  static_assert(is_row_major || is_col_major,
+                "The serializer only supports row-major and column-major layouts");
+  using dst_t      = raft::device_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  using host_acc_t = raft::host_accessor<typename dst_t::accessor_type::accessor_type>;
+  using staged_t   = raft::host_mdspan<ElementType, Extents, LayoutPolicy, host_acc_t>;
+
+  // Contiguous layout: size() == product of the extents, so the buffer holds every element.
+  std::vector<typename dst_t::value_type> staging(obj.size());
+  staged_t staged(staging.data(), obj.extents());
+  detail::numpy_serializer::deserialize_host_mdspan(is, staged);
+
+  cudaStream_t copy_stream = handle.get_stream();
+  raft::update_device(obj.data_handle(), staging.data(), obj.size(), copy_stream);
+  handle.sync_stream();  // synchronize while `staging` is still alive
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(  // host overload accepting a temporary mdspan
+  const raft::device_resources& handle,
+  std::istream& is,
+  raft::host_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>&& obj)
+{
+  deserialize_mdspan(handle, is, obj);  // named `obj` is an lvalue: picks the `&` overload
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(
+  const raft::device_resources& handle,
+  std::istream& is,
+  raft::managed_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>& obj)
+{
+  // Wraps the managed memory in a host mdspan for the host reader; `handle` is not used here.
+  using dst_t       = raft::managed_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
+  using host_acc_t  = raft::host_accessor<typename dst_t::accessor_type::accessor_type>;
+  using host_view_t = raft::host_mdspan<ElementType, Extents, LayoutPolicy, host_acc_t>;
+  host_view_t host_view(obj.data_handle(), obj.extents());
+  detail::numpy_serializer::deserialize_host_mdspan(is, host_view);
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(  // managed overload accepting a temporary mdspan
+  const raft::device_resources& handle,
+  std::istream& is,
+  raft::managed_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>&& obj)
+{
+  deserialize_mdspan(handle, is, obj);  // named `obj` is an lvalue: picks the `&` overload
+}
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+inline void deserialize_mdspan(  // device overload accepting a temporary mdspan
+  const raft::device_resources& handle,
+  std::istream& is,
+  raft::device_mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>&& obj)
+{
+  deserialize_mdspan(handle, is, obj);  // named `obj` is an lvalue: picks the `&` overload
+}
+
+template <typename T>  // T: type of the scalar being written
+inline void serialize_scalar(const raft::device_resources&, std::ostream& os, const T& value)
+{
+  detail::numpy_serializer::serialize_scalar(os, value);  // resources argument is not used
+}
+
+template <typename T>  // T: type of the scalar to read; must be given explicitly by the caller
+inline T deserialize_scalar(const raft::device_resources&, std::istream& is)
+{
+  return detail::numpy_serializer::deserialize_scalar<T>(is);  // resources argument is not used
+}
+
+}  // end namespace raft
diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh
index 6be994b80a..f17a26dc4b 100644
--- a/cpp/include/raft/distance/detail/canberra.cuh
+++ b/cpp/include/raft/distance/detail/canberra.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -73,19 +73,15 @@ static void canberraImpl(const DataT* x,
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    const auto add  = raft::myAbs(x) + raft::myAbs(y);
+    const auto diff = raft::abs(x - y);
+    const auto add  = raft::abs(x) + raft::abs(y);
     // deal with potential for 0 in denominator by
     // forcing 1/0 instead
     acc += ((add != 0) * diff / (add + (add == 0)));
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = raft::void_op();
 
   if (isRowMajor) {
     auto canberraRowMajor = pairwiseDistanceMatKernel<false,
diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh
index 1ac10f269e..43b36e7921 100644
--- a/cpp/include/raft/distance/detail/chebyshev.cuh
+++ b/cpp/include/raft/distance/detail/chebyshev.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #pragma once
+#include <raft/core/operators.hpp>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
 
 namespace raft {
@@ -72,16 +73,12 @@ static void chebyshevImpl(const DataT* x,
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    acc             = raft::myMax(acc, diff);
+    const auto diff = raft::abs(x - y);
+    acc             = raft::max(acc, diff);
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = raft::void_op();
 
   if (isRowMajor) {
     auto chebyshevRowMajor = pairwiseDistanceMatKernel<false,
diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh
index 2b77d280fe..f7fe3678e6 100644
--- a/cpp/include/raft/distance/detail/correlation.cuh
+++ b/cpp/include/raft/distance/detail/correlation.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,7 +125,7 @@ static void correlationImpl(const DataT* x,
         auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
         auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
 
-        acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
+        acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom));
       }
     }
   };
@@ -262,8 +262,8 @@ void correlationImpl(int m,
                          true,
                          stream,
                          false,
-                         raft::Nop<InType>(),
-                         raft::Sum<InType>());
+                         raft::identity_op(),
+                         raft::add_op());
     raft::linalg::reduce(norm_row_vec,
                          pB,
                          k,
@@ -273,8 +273,8 @@ void correlationImpl(int m,
                          true,
                          stream,
                          false,
-                         raft::Nop<InType>(),
-                         raft::Sum<InType>());
+                         raft::identity_op(),
+                         raft::add_op());
 
     sq_norm_col_vec += (m + n);
     sq_norm_row_vec = sq_norm_col_vec + m;
@@ -290,8 +290,8 @@ void correlationImpl(int m,
                          true,
                          stream,
                          false,
-                         raft::Nop<InType>(),
-                         raft::Sum<InType>());
+                         raft::identity_op(),
+                         raft::add_op());
     sq_norm_col_vec += m;
     sq_norm_row_vec = sq_norm_col_vec;
     raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream);
diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh
index f06051962f..46a694aa51 100644
--- a/cpp/include/raft/distance/detail/cosine.cuh
+++ b/cpp/include/raft/distance/detail/cosine.cuh
@@ -19,6 +19,7 @@
 #include <raft/distance/detail/pairwise_distance_base.cuh>
 #include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace distance {
@@ -229,8 +230,6 @@ void cosineAlgo1(Index_ m,
                  cudaStream_t stream,
                  bool isRowMajor)
 {
-  auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); };
-
   // raft distance support inputs as float/double and output as uint8_t/float/double.
   static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
                 "OutType can be uint8_t, float, double,"
@@ -248,10 +247,13 @@ void cosineAlgo1(Index_ m,
   InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(
+      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
+    raft::linalg::rowNorm(
+      row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(
+      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
   }
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh
index 5ea74fa884..1a2db63f5c 100644
--- a/cpp/include/raft/distance/detail/euclidean.cuh
+++ b/cpp/include/raft/distance/detail/euclidean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <raft/distance/detail/pairwise_distance_base.cuh>
 #include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace distance {
@@ -33,7 +34,7 @@ struct L2ExpandedOp {
   __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
   {
     AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
-    return sqrt ? raft::mySqrt(outVal) : outVal;
+    return sqrt ? raft::sqrt(outVal) : outVal;
   }
 
   __device__ AccT operator()(DataT aData) const noexcept { return aData; }
@@ -129,7 +130,7 @@ void euclideanExpImpl(const DataT* x,
         for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
           for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-            acc[i][j] = raft::mySqrt(acc[i][j]);
+            acc[i][j] = raft::sqrt(acc[i][j]);
           }
         }
       }
@@ -247,8 +248,6 @@ void euclideanAlgo1(Index_ m,
                     cudaStream_t stream,
                     bool isRowMajor)
 {
-  auto norm_op = [] __device__(InType in) { return in; };
-
   // raft distance support inputs as float/double and output as uint8_t/float/double.
   static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
                 "OutType can be uint8_t, float, double,"
@@ -266,10 +265,13 @@ void euclideanAlgo1(Index_ m,
   InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(
+      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+    raft::linalg::rowNorm(
+      row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(
+      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
   }
 
   if (isRowMajor) {
@@ -348,7 +350,7 @@ void euclideanUnExpImpl(const DataT* x,
       for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
         for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-          acc[i][j] = raft::mySqrt(acc[i][j]);
+          acc[i][j] = raft::sqrt(acc[i][j]);
         }
       }
     }
diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
index e8c2648c2e..447359ffe6 100644
--- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -175,7 +175,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
 #pragma unroll
         for (int j = 0; j < P::AccColsPerTh; ++j) {
           auto acc_ij = acc[i][j];
-          acc[i][j]   = acc_ij > DataT{0} ? raft::mySqrt(acc_ij) : DataT{0};
+          acc[i][j]   = acc_ij > DataT{0} ? raft::sqrt(acc_ij) : DataT{0};
         }
       }
     }
@@ -298,8 +298,6 @@ void fusedL2NNImpl(OutT* min,
     RAFT_CUDA_TRY(cudaGetLastError());
   }
 
-  auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; };
-
   constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
   if (sqrt) {
     auto fusedL2NNSqrt = fusedL2NNkernel<DataT,
@@ -310,11 +308,23 @@ void fusedL2NNImpl(OutT* min,
                                          ReduceOpT,
                                          KVPReduceOpT,
                                          decltype(core_lambda),
-                                         decltype(fin_op)>;
+                                         raft::identity_op>;
     dim3 grid          = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
 
-    fusedL2NNSqrt<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
+    fusedL2NNSqrt<<<grid, blk, shmemSize, stream>>>(min,
+                                                    x,
+                                                    y,
+                                                    xn,
+                                                    yn,
+                                                    m,
+                                                    n,
+                                                    k,
+                                                    maxVal,
+                                                    workspace,
+                                                    redOp,
+                                                    pairRedOp,
+                                                    core_lambda,
+                                                    raft::identity_op{});
   } else {
     auto fusedL2NN = fusedL2NNkernel<DataT,
                                      OutT,
@@ -324,10 +334,22 @@ void fusedL2NNImpl(OutT* min,
                                      ReduceOpT,
                                      KVPReduceOpT,
                                      decltype(core_lambda),
-                                     decltype(fin_op)>;
+                                     raft::identity_op>;
     dim3 grid      = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
-    fusedL2NN<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
+    fusedL2NN<<<grid, blk, shmemSize, stream>>>(min,
+                                                x,
+                                                y,
+                                                xn,
+                                                yn,
+                                                m,
+                                                n,
+                                                k,
+                                                maxVal,
+                                                workspace,
+                                                redOp,
+                                                pairRedOp,
+                                                core_lambda,
+                                                raft::identity_op{});
   }
 
   RAFT_CUDA_TRY(cudaGetLastError());
diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh
index 31854fd1d6..13507fe84f 100644
--- a/cpp/include/raft/distance/detail/hellinger.cuh
+++ b/cpp/include/raft/distance/detail/hellinger.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 #include <raft/distance/detail/pairwise_distance_base.cuh>
 #include <raft/linalg/unary_op.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace distance {
@@ -78,14 +79,10 @@ static void hellingerImpl(const DataT* x,
 
   dim3 blk(KPolicy::Nthreads);
 
-  auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); };
   // First sqrt x and y
-  raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT*)x, x, m * k, unaryOp_lambda, stream);
-
+  raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
   if (x != y) {
-    raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT*)y, y, n * k, unaryOp_lambda, stream);
+    raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)y, y, n * k, raft::sqrt_op{}, stream);
   }
 
   // Accumulation operation lambda
@@ -108,7 +105,7 @@ static void hellingerImpl(const DataT* x,
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
         const auto finalVal  = (1 - acc[i][j]);
         const auto rectifier = (!signbit(finalVal));
-        acc[i][j]            = raft::mySqrt(rectifier * finalVal);
+        acc[i][j]            = raft::sqrt(rectifier * finalVal);
       }
     }
   };
@@ -145,11 +142,9 @@ static void hellingerImpl(const DataT* x,
   }
 
   // Revert sqrt of x and y
-  raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT*)x, x, m * k, unaryOp_lambda, stream);
+  raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
   if (x != y) {
-    raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT*)y, y, n * k, unaryOp_lambda, stream);
+    raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)y, y, n * k, raft::sqrt_op{}, stream);
   }
 
   RAFT_CUDA_TRY(cudaGetLastError());
diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh
index 92ee071cf5..f96da01b87 100644
--- a/cpp/include/raft/distance/detail/jensen_shannon.cuh
+++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -78,11 +78,11 @@ static void jensenShannonImpl(const DataT* x,
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const DataT m     = 0.5f * (x + y);
     const bool m_zero = (m == 0);
-    const auto logM   = (!m_zero) * raft::myLog(m + m_zero);
+    const auto logM   = (!m_zero) * raft::log(m + m_zero);
 
     const bool x_zero = (x == 0);
     const bool y_zero = (y == 0);
-    acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero)));
+    acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero)));
   };
 
   // epilogue operation lambda for final value calculation
@@ -95,7 +95,7 @@ static void jensenShannonImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = raft::mySqrt(0.5 * acc[i][j]);
+        acc[i][j] = raft::sqrt(0.5 * acc[i][j]);
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh
index 4c0c4b6ace..7ebeaf4de9 100644
--- a/cpp/include/raft/distance/detail/kl_divergence.cuh
+++ b/cpp/include/raft/distance/detail/kl_divergence.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,10 +81,10 @@ static void klDivergenceImpl(const DataT* x,
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     if (isRowMajor) {
       const bool x_zero = (x == 0);
-      acc += x * (raft::myLog(x + x_zero) - y);
+      acc += x * (raft::log(x + x_zero) - y);
     } else {
       const bool y_zero = (y == 0);
-      acc += y * (raft::myLog(y + y_zero) - x);
+      acc += y * (raft::log(y + y_zero) - x);
     }
   };
 
@@ -92,23 +92,23 @@ static void klDivergenceImpl(const DataT* x,
     if (isRowMajor) {
       const bool x_zero = (x == 0);
       const bool y_zero = (y == 0);
-      acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero));
+      acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero));
     } else {
       const bool y_zero = (y == 0);
       const bool x_zero = (x == 0);
-      acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero));
+      acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero));
     }
   };
 
   auto unaryOp_lambda = [] __device__(DataT input) {
     const bool x_zero = (input == 0);
-    return (!x_zero) * raft::myLog(input + x_zero);
+    return (!x_zero) * raft::log(input + x_zero);
   };
 
   auto unaryOp_lambda_reverse = [] __device__(DataT input) {
     // reverse previous log (x) back to x using (e ^ log(x))
     const bool x_zero = (input == 0);
-    return (!x_zero) * raft::myExp(input);
+    return (!x_zero) * raft::exp(input);
   };
 
   // epilogue operation lambda for final value calculation
diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh
index 6372019fd3..bf10651b60 100644
--- a/cpp/include/raft/distance/detail/l1.cuh
+++ b/cpp/include/raft/distance/detail/l1.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -71,16 +71,12 @@ static void l1Impl(const DataT* x,
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
+    const auto diff = raft::abs(x - y);
     acc += diff;
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = raft::void_op();
 
   if (isRowMajor) {
     auto l1RowMajor = pairwiseDistanceMatKernel<false,
diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh
index d3d0979d0d..42af8cd281 100644
--- a/cpp/include/raft/distance/detail/minkowski.cuh
+++ b/cpp/include/raft/distance/detail/minkowski.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,8 +74,8 @@ void minkowskiUnExpImpl(const DataT* x,
 
   // Accumulation operation lambda
   auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::L1Op<DataT>()(x - y);
-    acc += raft::myPow(diff, p);
+    const auto diff = raft::abs(x - y);
+    acc += raft::pow(diff, p);
   };
 
   // epilogue operation lambda for final value calculation
@@ -89,7 +89,7 @@ void minkowskiUnExpImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = raft::myPow(acc[i][j], one_over_p);
+        acc[i][j] = raft::pow(acc[i][j], one_over_p);
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index 26536d13cd..445b4bac52 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #pragma once
+#include <raft/core/operators.hpp>
 #include <raft/linalg/contractions.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/util/cuda_utils.cuh>
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
index ea9ed77fb5..8dcccfc14f 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
+++ b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
@@ -66,7 +66,7 @@ struct PairwiseDistanceGemm {
   /// Warp-level tile size (concept: GemmShape)
   // This code section describes the size of MMA op
   using InstructionShape =
-    cutlass::gemm::GemmShape<16, 8, 8>;  // <- MMA Op tile M = 16, N = 8, K = 8
+    cutlass::gemm::GemmShape<16, 8, 4>;  // <- MMA Op tile M = 16, N = 8, K = 4
 
   /// Operation performed by GEMM
   using Operator = cutlass::arch::OpMultiplyAddFastF32;
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 6e3f97b45c..93a5ce7f1a 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,21 +18,21 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/detail/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <raft/core/device_mdspan.hpp>
 
+namespace raft {
+namespace distance {
+
 /**
- * @defgroup pairwise_distance pairwise distance prims
+ * @defgroup pairwise_distance pointer-based pairwise distance prims
  * @{
  */
 
-namespace raft {
-namespace distance {
-
 /**
  * @brief Evaluate pairwise distances with the user epilogue lamba allowed
  * @tparam DistanceType which distance to evaluate
@@ -219,58 +219,6 @@ void distance(const InType* x,
     x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg);
 }
 
-/**
- * @brief Evaluate pairwise distances for the simple use case.
- *
- * Note: Only contiguous row- or column-major layouts supported currently.
- *
- * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points (size n*k)
- * @param y second set of points (size m*k)
- * @param dist output distance matrix (size n*m)
- * @param metric_arg metric argument (used for Minkowski distance)
- */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename layout = raft::layout_c_contiguous,
-          typename Index_ = int>
-void distance(raft::handle_t const& handle,
-              raft::device_matrix_view<InType, Index_, layout> const x,
-              raft::device_matrix_view<InType, Index_, layout> const y,
-              raft::device_matrix_view<OutType, Index_, layout> dist,
-              InType metric_arg = 2.0f)
-{
-  RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal.");
-  RAFT_EXPECTS(dist.extent(0) == x.extent(0),
-               "Number of rows in output must be equal to "
-               "number of rows in X");
-  RAFT_EXPECTS(dist.extent(1) == y.extent(0),
-               "Number of columns in output must be equal to "
-               "number of rows in Y");
-
-  RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous.");
-  RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous.");
-
-  constexpr auto is_rowmajor = std::is_same_v<layout, layout_c_contiguous>;
-
-  distance<distanceType, InType, AccType, OutType, Index_>(x.data_handle(),
-                                                           y.data_handle(),
-                                                           dist.data_handle(),
-                                                           x.extent(0),
-                                                           y.extent(0),
-                                                           x.extent(1),
-                                                           handle.get_stream(),
-                                                           is_rowmajor,
-                                                           metric_arg);
-}
-
 /**
  * @brief Convenience wrapper around 'distance' prim to convert runtime metric
  * into compile time for the purpose of dispatch
@@ -290,7 +238,7 @@ void distance(raft::handle_t const& handle,
  * @param metric_arg metric argument (used for Minkowski distance)
  */
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const raft::handle_t& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        const Type* x,
                        const Type* y,
                        Type* dist,
@@ -385,7 +333,7 @@ void pairwise_distance(const raft::handle_t& handle,
  * @param metric_arg metric argument (used for Minkowski distance)
  */
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const raft::handle_t& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        const Type* x,
                        const Type* y,
                        Type* dist,
@@ -401,6 +349,85 @@ void pairwise_distance(const raft::handle_t& handle,
     handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg);
 }
 
+/** @} */
+
+/**
+ * \defgroup distance_mdspan Pairwise distance functions
+ * @{
+ */
+
+/**
+ * @brief Evaluate pairwise distances for the simple use case.
+ *
+ * Note: Only contiguous row- or column-major layouts are currently supported.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ * #include <raft/core/device_mdarray.hpp>
+ * #include <raft/random/make_blobs.cuh>
+ * #include <raft/distance/distance.cuh>
+ *
+ * raft::device_resources handle;
+ * int n_samples = 5000;
+ * int n_features = 50;
+ *
+ * auto input = raft::make_device_matrix<float>(handle, n_samples, n_features);
+ * auto labels = raft::make_device_vector<int>(handle, n_samples);
+ * auto output = raft::make_device_matrix<float>(handle, n_samples, n_samples);
+ *
+ * raft::random::make_blobs(handle, input.view(), labels.view());
+ * auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ * raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
+ * @endcode
+ *
+ * @tparam distanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam Index_ Index type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points (size n*k)
+ * @param y second set of points (size m*k)
+ * @param dist output distance matrix (size n*m)
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename layout = raft::layout_c_contiguous,
+          typename Index_ = int>
+void distance(raft::device_resources const& handle,
+              raft::device_matrix_view<InType, Index_, layout> const x,
+              raft::device_matrix_view<InType, Index_, layout> const y,
+              raft::device_matrix_view<OutType, Index_, layout> dist,
+              InType metric_arg = 2.0f)
+{
+  RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal.");
+  RAFT_EXPECTS(dist.extent(0) == x.extent(0),
+               "Number of rows in output must be equal to "
+               "number of rows in X");
+  RAFT_EXPECTS(dist.extent(1) == y.extent(0),
+               "Number of columns in output must be equal to "
+               "number of rows in Y");
+
+  RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous.");
+  RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous.");
+
+  constexpr auto is_rowmajor = std::is_same_v<layout, layout_c_contiguous>;
+
+  distance<distanceType, InType, AccType, OutType, Index_>(x.data_handle(),
+                                                           y.data_handle(),
+                                                           dist.data_handle(),
+                                                           x.extent(0),
+                                                           y.extent(0),
+                                                           x.extent(1),
+                                                           handle.get_stream(),
+                                                           is_rowmajor,
+                                                           metric_arg);
+}
+
 /**
  * @brief Convenience wrapper around 'distance' prim to convert runtime metric
  * into compile time for the purpose of dispatch
@@ -414,7 +441,7 @@ void pairwise_distance(const raft::handle_t& handle,
  * @param metric_arg metric argument (used for Minkowski distance)
  */
 template <typename Type, typename layout = layout_c_contiguous, typename Index_ = int>
-void pairwise_distance(raft::handle_t const& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        device_matrix_view<Type, Index_, layout> const x,
                        device_matrix_view<Type, Index_, layout> const y,
                        device_matrix_view<Type, Index_, layout> dist,
@@ -449,9 +476,9 @@ void pairwise_distance(raft::handle_t const& handle,
                     metric_arg);
 }
 
+/** @} */
+
 };  // namespace distance
 };  // namespace raft
 
-/** @} */
-
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index ef51a54622..e832bcb020 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 
 #include <cub/cub.cuh>
 #include <limits>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/detail/fused_l2_nn.cuh>
 #include <raft/linalg/contractions.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -30,6 +30,10 @@
 
 namespace raft {
 namespace distance {
+/**
+ * \defgroup fused_l2_nn Fused 1-nearest neighbors
+ * @{
+ */
 
 template <typename LabelT, typename DataT>
 using KVPMinReduce = detail::KVPMinReduceImpl<LabelT, DataT>;
@@ -40,15 +44,22 @@ using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl<LabelT, DataT>
 template <typename LabelT, typename DataT>
 using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
 
+/** @} */
+
 /**
  * Initialize array using init value from reduction op
  */
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+void initialize(
+  raft::device_resources const& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
 {
   detail::initialize<DataT, OutT, IdxT, ReduceOpT>(min, m, maxVal, redOp, handle.get_stream());
 }
 
+/**
+ * \ingroup fused_l2_nn
+ * @{
+ */
 /**
  * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call.
  *
@@ -211,6 +222,8 @@ void fusedL2NNMinReduce(OutT* min,
     min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
 }
 
+/** @} */
+
 }  // namespace distance
 }  // namespace raft
 
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index 4525af49d2..caa68061db 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,4 +21,4 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh
index 0af1c70b91..64d8b4bfae 100644
--- a/cpp/include/raft/label/detail/classlabels.cuh
+++ b/cpp/include/raft/label/detail/classlabels.cuh
@@ -18,6 +18,7 @@
 
 #include <cub/cub.cuh>
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -194,8 +195,7 @@ void make_monotonic(
 template <typename Type>
 void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false)
 {
-  make_monotonic<Type>(
-    out, in, N, stream, [] __device__(Type val) { return false; }, zero_based);
+  make_monotonic<Type>(out, in, N, stream, raft::const_op(false), zero_based);
 }
 
 };  // namespace detail
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 37956fe762..608c63e1a9 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,6 @@
 #ifndef __ADD_H
 #define __ADD_H
 
-/**
- * @defgroup arithmetic Dense matrix arithmetic
- * @{
- */
-
 #pragma once
 
 #include "detail/add.cuh"
@@ -32,8 +27,6 @@
 namespace raft {
 namespace linalg {
 
-using detail::adds_scalar;
-
 /**
  * @ingroup arithmetic
  * @brief Elementwise scalar add operation on the input buffer
@@ -94,7 +87,7 @@ void addDevScalar(
 }
 
 /**
- * @defgroup add Addition Arithmetic
+ * @defgroup add_dense Addition Arithmetic
  * @{
  */
 
@@ -102,7 +95,7 @@ void addDevScalar(
  * @brief Elementwise add operation
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in1    First Input
  * @param[in] in2    Second Input
  * @param[out] out    Output
@@ -111,7 +104,7 @@ template <typename InType,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void add(const raft::handle_t& handle, InType in1, InType in2, OutType out)
+void add(raft::device_resources const& handle, InType in1, InType in2, OutType out)
 {
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
@@ -142,7 +135,7 @@ void add(const raft::handle_t& handle, InType in1, InType in2, OutType out)
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[in] scalar    raft::device_scalar_view
  * @param[in] out    Output
@@ -152,7 +145,7 @@ template <typename InType,
           typename ScalarIdxType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void add_scalar(const raft::handle_t& handle,
+void add_scalar(raft::device_resources const& handle,
                 InType in,
                 OutType out,
                 raft::device_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
@@ -184,7 +177,7 @@ void add_scalar(const raft::handle_t& handle,
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[in] scalar    raft::host_scalar_view
  * @param[in] out    Output
@@ -194,7 +187,7 @@ template <typename InType,
           typename ScalarIdxType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void add_scalar(const raft::handle_t& handle,
+void add_scalar(raft::device_resources const& handle,
                 const InType in,
                 OutType out,
                 raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
@@ -226,6 +219,4 @@ void add_scalar(const raft::handle_t& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-/** @} */
-
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/axpy.cuh b/cpp/include/raft/linalg/axpy.cuh
index 88b065c8b0..9b3af73234 100644
--- a/cpp/include/raft/linalg/axpy.cuh
+++ b/cpp/include/raft/linalg/axpy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ namespace raft::linalg {
  * @param [in] stream
  */
 template <typename T, bool DevicePointerMode = false>
-void axpy(const raft::handle_t& handle,
+void axpy(raft::device_resources const& handle,
           const int n,
           const T* alpha,
           const T* x,
@@ -54,7 +54,7 @@ void axpy(const raft::handle_t& handle,
 }
 
 /**
- * @defgroup axpy axpy
+ * @defgroup axpy axpy routine
  * @{
  */
 
@@ -62,7 +62,7 @@ void axpy(const raft::handle_t& handle,
  * @brief axpy function
  *  It computes the following equation: y = alpha * x + y
  *
- * @param [in] handle raft::handle_t
+ * @param [in] handle raft::device_resources
  * @param [in] alpha raft::device_scalar_view
  * @param [in] x Input vector
  * @param [inout] y Output vector
@@ -72,7 +72,7 @@ template <typename ElementType,
           typename InLayoutPolicy,
           typename OutLayoutPolicy,
           typename ScalarIdxType>
-void axpy(const raft::handle_t& handle,
+void axpy(raft::device_resources const& handle,
           raft::device_scalar_view<const ElementType, ScalarIdxType> alpha,
           raft::device_vector_view<const ElementType, IndexType, InLayoutPolicy> x,
           raft::device_vector_view<ElementType, IndexType, OutLayoutPolicy> y)
@@ -92,7 +92,7 @@ void axpy(const raft::handle_t& handle,
 /**
  * @brief axpy function
  *  It computes the following equation: y = alpha * x + y
- * @param [in] handle raft::handle_t
+ * @param [in] handle raft::device_resources
  * @param [in] alpha raft::device_scalar_view
  * @param [in] x Input vector
  * @param [inout] y Output vector
@@ -102,7 +102,7 @@ template <typename ElementType,
           typename InLayoutPolicy,
           typename OutLayoutPolicy,
           typename ScalarIdxType>
-void axpy(const raft::handle_t& handle,
+void axpy(raft::device_resources const& handle,
           raft::host_scalar_view<const ElementType, ScalarIdxType> alpha,
           raft::device_vector_view<const ElementType, IndexType, InLayoutPolicy> x,
           raft::device_vector_view<ElementType, IndexType, OutLayoutPolicy> y)
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index 693ef961c2..966e84965d 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 #include "detail/binary_op.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/input_validation.hpp>
 
@@ -65,7 +65,7 @@ void binaryOp(
  * @tparam InType Input Type raft::device_mdspan
  * @tparam Lambda the device-lambda performing the actual operation
  * @tparam OutType Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in1 First input
  * @param[in] in2 Second input
  * @param[out] out Output
@@ -78,7 +78,7 @@ template <typename InType,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void binary_op(const raft::handle_t& handle, InType in1, InType in2, OutType out, Lambda op)
+void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op)
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
   RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous");
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index af8d12d873..e10f43653b 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -121,7 +121,7 @@ namespace linalg {
  *    conditioned systems. Negative values mean no regularizaton.
  */
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t& handle,
+void choleskyRank1Update(raft::device_resources const& handle,
                          math_t* L,
                          int n,
                          int ld,
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index e9e5a99f46..674be207d8 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,8 @@
 #include "detail/coalesced_reduction.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/operators.hpp>
 
 namespace raft {
 namespace linalg {
@@ -56,9 +57,9 @@ namespace linalg {
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReduction(OutType* dots,
                         const InType* data,
                         IdxType D,
@@ -66,9 +67,9 @@ void coalescedReduction(OutType* dots,
                         OutType init,
                         cudaStream_t stream,
                         bool inplace           = false,
-                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                        ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op   = raft::Nop<OutType>())
+                        MainLambda main_op     = raft::identity_op(),
+                        ReduceLambda reduce_op = raft::add_op(),
+                        FinalLambda final_op   = raft::identity_op())
 {
   detail::coalescedReduction<InType, OutType, IdxType>(
     dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
@@ -100,7 +101,7 @@ void coalescedReduction(OutType* dots,
  * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
  * It must be a 'callable' supporting the following input and output:
  * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param[in] data Input of type raft::device_matrix_view
  * @param[out] dots Output of type raft::device_matrix_view
  * @param[in] init initial value to use for the reduction
@@ -113,17 +114,17 @@ template <typename InValueType,
           typename LayoutPolicy,
           typename OutValueType,
           typename IdxType,
-          typename MainLambda   = raft::Nop<InValueType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutValueType>,
-          typename FinalLambda  = raft::Nop<OutValueType>>
-void coalesced_reduction(const raft::handle_t& handle,
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
+void coalesced_reduction(raft::device_resources const& handle,
                          raft::device_matrix_view<const InValueType, IdxType, LayoutPolicy> data,
                          raft::device_vector_view<OutValueType, IdxType> dots,
                          OutValueType init,
                          bool inplace           = false,
-                         MainLambda main_op     = raft::Nop<InValueType, IdxType>(),
-                         ReduceLambda reduce_op = raft::Sum<OutValueType>(),
-                         FinalLambda final_op   = raft::Nop<OutValueType>())
+                         MainLambda main_op     = raft::identity_op(),
+                         ReduceLambda reduce_op = raft::add_op(),
+                         FinalLambda final_op   = raft::identity_op())
 {
   if constexpr (std::is_same_v<LayoutPolicy, raft::row_major>) {
     RAFT_EXPECTS(static_cast<IdxType>(dots.size()) == data.extent(0),
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 34966ebbc2..bf9b2bd1d8 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -16,14 +16,11 @@
 
 #pragma once
 
-#include "functional.cuh"
-
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#include <thrust/functional.h>
-
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -31,13 +28,13 @@ namespace detail {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
+  raft::linalg::unaryOp(out, in, len, raft::add_const_op<InT>(scalar), stream);
 }
 
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::add_op(), stream);
 }
 
 template <class InT, typename IdxType, typename OutT = InT>
diff --git a/cpp/include/raft/linalg/detail/axpy.cuh b/cpp/include/raft/linalg/detail/axpy.cuh
index f3e1a177c8..5747e840c4 100644
--- a/cpp/include/raft/linalg/detail/axpy.cuh
+++ b/cpp/include/raft/linalg/detail/axpy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,12 +20,12 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft::linalg::detail {
 
 template <typename T, bool DevicePointerMode = false>
-void axpy(const raft::handle_t& handle,
+void axpy(raft::device_resources const& handle,
           const int n,
           const T* alpha,
           const T* x,
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
index 47937815bd..afa9155753 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include "cublas_wrappers.hpp"
 #include "cusolver_wrappers.hpp"
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/binary_op.cuh>
 
 namespace raft {
@@ -26,7 +26,7 @@ namespace linalg {
 namespace detail {
 
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t& handle,
+void choleskyRank1Update(raft::device_resources const& handle,
                          math_t* L,
                          int n,
                          int ld,
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index 63351f5475..238e17fa56 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -18,6 +18,7 @@
 
 #include <cub/cub.cuh>
 #include <raft/common/nvtx.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <rmm/device_uvector.hpp>
 
@@ -71,9 +72,9 @@ template <typename Policy,
           typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionThin(OutType* dots,
                             const InType* data,
                             IdxType D,
@@ -81,9 +82,9 @@ void coalescedReductionThin(OutType* dots,
                             OutType init,
                             cudaStream_t stream,
                             bool inplace           = false,
-                            MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                            ReduceLambda reduce_op = raft::Sum<OutType>(),
-                            FinalLambda final_op   = raft::Nop<OutType>())
+                            MainLambda main_op     = raft::identity_op(),
+                            ReduceLambda reduce_op = raft::add_op(),
+                            FinalLambda final_op   = raft::identity_op())
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock);
@@ -97,9 +98,9 @@ void coalescedReductionThin(OutType* dots,
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionThinDispatcher(OutType* dots,
                                       const InType* data,
                                       IdxType D,
@@ -107,9 +108,9 @@ void coalescedReductionThinDispatcher(OutType* dots,
                                       OutType init,
                                       cudaStream_t stream,
                                       bool inplace           = false,
-                                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                                      ReduceLambda reduce_op = raft::Sum<OutType>(),
-                                      FinalLambda final_op   = raft::Nop<OutType>())
+                                      MainLambda main_op     = raft::identity_op(),
+                                      ReduceLambda reduce_op = raft::add_op(),
+                                      FinalLambda final_op   = raft::identity_op())
 {
   if (D <= IdxType(2)) {
     coalescedReductionThin<ReductionThinPolicy<2, 64>>(
@@ -168,9 +169,9 @@ template <int TPB,
           typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionMedium(OutType* dots,
                               const InType* data,
                               IdxType D,
@@ -178,9 +179,9 @@ void coalescedReductionMedium(OutType* dots,
                               OutType init,
                               cudaStream_t stream,
                               bool inplace           = false,
-                              MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                              ReduceLambda reduce_op = raft::Sum<OutType>(),
-                              FinalLambda final_op   = raft::Nop<OutType>())
+                              MainLambda main_op     = raft::identity_op(),
+                              ReduceLambda reduce_op = raft::add_op(),
+                              FinalLambda final_op   = raft::identity_op())
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("coalescedReductionMedium<%d>", TPB);
   coalescedReductionMediumKernel<TPB>
@@ -191,9 +192,9 @@ void coalescedReductionMedium(OutType* dots,
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionMediumDispatcher(OutType* dots,
                                         const InType* data,
                                         IdxType D,
@@ -201,9 +202,9 @@ void coalescedReductionMediumDispatcher(OutType* dots,
                                         OutType init,
                                         cudaStream_t stream,
                                         bool inplace           = false,
-                                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                                        ReduceLambda reduce_op = raft::Sum<OutType>(),
-                                        FinalLambda final_op   = raft::Nop<OutType>())
+                                        MainLambda main_op     = raft::identity_op(),
+                                        ReduceLambda reduce_op = raft::add_op(),
+                                        FinalLambda final_op   = raft::identity_op())
 {
   // Note: for now, this kernel is only used when D > 256. If this changes in the future, use
   // smaller block sizes when relevant.
@@ -251,9 +252,9 @@ template <typename ThickPolicy,
           typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionThick(OutType* dots,
                              const InType* data,
                              IdxType D,
@@ -261,9 +262,9 @@ void coalescedReductionThick(OutType* dots,
                              OutType init,
                              cudaStream_t stream,
                              bool inplace           = false,
-                             MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                             ReduceLambda reduce_op = raft::Sum<OutType>(),
-                             FinalLambda final_op   = raft::Nop<OutType>())
+                             MainLambda main_op     = raft::identity_op(),
+                             ReduceLambda reduce_op = raft::add_op(),
+                             FinalLambda final_op   = raft::identity_op())
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "coalescedReductionThick<%d,%d>", ThickPolicy::ThreadsPerBlock, ThickPolicy::BlocksPerRow);
@@ -291,7 +292,7 @@ void coalescedReductionThick(OutType* dots,
                                      init,
                                      stream,
                                      inplace,
-                                     raft::Nop<OutType, IdxType>(),
+                                     raft::identity_op(),
                                      reduce_op,
                                      final_op);
 }
@@ -299,9 +300,9 @@ void coalescedReductionThick(OutType* dots,
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReductionThickDispatcher(OutType* dots,
                                        const InType* data,
                                        IdxType D,
@@ -309,9 +310,9 @@ void coalescedReductionThickDispatcher(OutType* dots,
                                        OutType init,
                                        cudaStream_t stream,
                                        bool inplace           = false,
-                                       MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                                       FinalLambda final_op   = raft::Nop<OutType>())
+                                       MainLambda main_op     = raft::identity_op(),
+                                       ReduceLambda reduce_op = raft::add_op(),
+                                       FinalLambda final_op   = raft::identity_op())
 {
   // Note: multiple elements per thread to take advantage of the sequential reduction and loop
   // unrolling
@@ -330,9 +331,9 @@ void coalescedReductionThickDispatcher(OutType* dots,
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void coalescedReduction(OutType* dots,
                         const InType* data,
                         IdxType D,
@@ -340,9 +341,9 @@ void coalescedReduction(OutType* dots,
                         OutType init,
                         cudaStream_t stream,
                         bool inplace           = false,
-                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                        ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op   = raft::Nop<OutType>())
+                        MainLambda main_op     = raft::identity_op(),
+                        ReduceLambda reduce_op = raft::add_op(),
+                        FinalLambda final_op   = raft::identity_op())
 {
   /* The primitive selects one of three implementations based on heuristics:
    *  - Thin: very efficient when D is small and/or N is large
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index 5d83f88e71..e247f39bc7 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/divide.cuh b/cpp/include/raft/linalg/detail/divide.cuh
index 333cd3e83c..eef1d19d6e 100644
--- a/cpp/include/raft/linalg/detail/divide.cuh
+++ b/cpp/include/raft/linalg/detail/divide.cuh
@@ -16,9 +16,8 @@
 
 #pragma once
 
-#include "functional.cuh"
-
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
 
 namespace raft {
@@ -28,7 +27,7 @@ namespace detail {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void divideScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, divides_scalar<InT, OutT>(scalar), stream);
+  raft::linalg::unaryOp(out, in, len, raft::div_const_op<InT>(scalar), stream);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh
index d48b42fc57..94493efb24 100644
--- a/cpp/include/raft/linalg/detail/eig.cuh
+++ b/cpp/include/raft/linalg/detail/eig.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include "cusolver_wrappers.hpp"
 #include <cuda_runtime_api.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/matrix/matrix.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
@@ -29,7 +29,7 @@ namespace linalg {
 namespace detail {
 
 template <typename math_t>
-void eigDC_legacy(const raft::handle_t& handle,
+void eigDC_legacy(raft::device_resources const& handle,
                   const math_t* in,
                   std::size_t n_rows,
                   std::size_t n_cols,
@@ -74,7 +74,7 @@ void eigDC_legacy(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void eigDC(const raft::handle_t& handle,
+void eigDC(raft::device_resources const& handle,
            const math_t* in,
            std::size_t n_rows,
            std::size_t n_cols,
@@ -137,7 +137,7 @@ void eigDC(const raft::handle_t& handle,
 enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
 
 template <typename math_t>
-void eigSelDC(const raft::handle_t& handle,
+void eigSelDC(raft::device_resources const& handle,
               math_t* in,
               std::size_t n_rows,
               std::size_t n_cols,
@@ -228,7 +228,7 @@ void eigSelDC(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void eigJacobi(const raft::handle_t& handle,
+void eigJacobi(raft::device_resources const& handle,
                const math_t* in,
                std::size_t n_rows,
                std::size_t n_cols,
diff --git a/cpp/include/raft/linalg/detail/eltwise.cuh b/cpp/include/raft/linalg/detail/eltwise.cuh
index 019f86a779..25b4ca0499 100644
--- a/cpp/include/raft/linalg/detail/eltwise.cuh
+++ b/cpp/include/raft/linalg/detail/eltwise.cuh
@@ -16,13 +16,10 @@
 
 #pragma once
 
-#include "functional.cuh"
-
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/unary_op.cuh>
 
-#include <thrust/functional.h>
-
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -30,48 +27,48 @@ namespace detail {
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar), stream);
+  raft::linalg::unaryOp(out, in, len, raft::add_const_op<InType>(scalar), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, multiplies_scalar<InType, OutType>(scalar), stream);
+  raft::linalg::unaryOp(out, in, len, raft::mul_const_op<InType>(scalar), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseAdd(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, thrust::plus<InType>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::add_op(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseSub(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, thrust::minus<InType>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::sub_op(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseMultiply(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, thrust::multiplies<InType>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::mul_op(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivide(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, thrust::divides<InType>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::div_op(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivideCheckZero(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(out, in1, in2, len, divides_check_zero<InType, OutType>(), stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::div_checkzero_op(), stream);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh
deleted file mode 100644
index 067b1565e0..0000000000
--- a/cpp/include/raft/linalg/detail/functional.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/functional.h>
-
-namespace raft {
-namespace linalg {
-namespace detail {
-
-template <typename ArgType, typename ReturnType = ArgType>
-struct divides_scalar {
- public:
-  divides_scalar(ArgType scalar) : scalar_(scalar) {}
-
-  __host__ __device__ inline ReturnType operator()(ArgType in) { return in / scalar_; }
-
- private:
-  ArgType scalar_;
-};
-
-template <typename ArgType, typename ReturnType = ArgType>
-struct adds_scalar {
- public:
-  adds_scalar(ArgType scalar) : scalar_(scalar) {}
-
-  __host__ __device__ inline ReturnType operator()(ArgType in) { return in + scalar_; }
-
- private:
-  ArgType scalar_;
-};
-
-template <typename ArgType, typename ReturnType = ArgType>
-struct multiplies_scalar {
- public:
-  multiplies_scalar(ArgType scalar) : scalar_(scalar) {}
-
-  __host__ __device__ inline ReturnType operator()(ArgType in) { return in * scalar_; }
-
- private:
-  ArgType scalar_;
-};
-
-template <typename ArgType, typename ReturnType = ArgType>
-struct divides_check_zero {
- public:
-  __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b)
-  {
-    return (b == static_cast<ArgType>(0)) ? 0.0 : a / b;
-  }
-};
-
-}  // namespace detail
-}  // namespace linalg
-}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index baa066984b..ba9496c3b9 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace linalg {
@@ -49,7 +49,7 @@ namespace detail {
  * @param [in] stream
  */
 template <typename math_t, bool DevicePointerMode = false>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const bool trans_a,
           const bool trans_b,
           const int m,
@@ -103,7 +103,7 @@ void gemm(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const math_t* a,
           int n_rows_a,
           int n_cols_a,
@@ -130,7 +130,7 @@ void gemm(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const math_t* a,
           int n_rows_a,
           int n_cols_a,
@@ -149,7 +149,7 @@ void gemm(const raft::handle_t& handle,
 }
 
 template <typename T, bool DevicePointerMode = false>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           T* z,
           T* x,
           T* y,
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp
index 38fcdcd82e..b3e001a851 100644
--- a/cpp/include/raft/linalg/detail/gemv.hpp
+++ b/cpp/include/raft/linalg/detail/gemv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,14 +20,14 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace linalg {
 namespace detail {
 
 template <typename math_t, bool DevicePointerMode = false>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const bool trans_a,
           const int m,
           const int n,
@@ -59,7 +59,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows,
           const int n_cols,
@@ -76,7 +76,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -91,7 +91,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -107,7 +107,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -126,7 +126,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh
index 5a3c595512..8c0cfeba28 100644
--- a/cpp/include/raft/linalg/detail/lanczos.cuh
+++ b/cpp/include/raft/linalg/detail/lanczos.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
 #include <curand.h>
 
 #include "cublas_wrappers.hpp"
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/spectral/detail/lapack.hpp>
 #include <raft/spectral/detail/warn_dbg.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
@@ -82,7 +82,7 @@ inline curandStatus_t curandGenerateNormalX(
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(handle_t const& handle,
+int performLanczosIteration(raft::device_resources const& handle,
                             spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
                             index_type_t* iter,
                             index_type_t maxIter,
@@ -540,7 +540,7 @@ static int francisQRIteration(index_type_t n,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(handle_t const& handle,
+static int lanczosRestart(raft::device_resources const& handle,
                           index_type_t n,
                           index_type_t iter,
                           index_type_t iter_new,
@@ -743,7 +743,7 @@ static int lanczosRestart(handle_t const& handle,
  */
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -984,7 +984,7 @@ int computeSmallestEigenvectors(
 
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -1087,7 +1087,7 @@ int computeSmallestEigenvectors(
  */
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -1331,7 +1331,7 @@ int computeLargestEigenvectors(
 
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh
index 1273956b21..207bcefc32 100644
--- a/cpp/include/raft/linalg/detail/lstsq.cuh
+++ b/cpp/include/raft/linalg/detail/lstsq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -104,7 +104,7 @@ struct DivideByNonZero {
 
   operator()(const math_t a, const math_t b) const
   {
-    return raft::myAbs<math_t>(b) >= eps ? a / b : a;
+    return raft::abs<math_t>(b) >= eps ? a / b : a;
   }
 };
 
@@ -117,7 +117,7 @@ struct DivideByNonZero {
  *             so it's not guaranteed to stay unmodified.
  */
 template <typename math_t>
-void lstsqSvdQR(const raft::handle_t& handle,
+void lstsqSvdQR(raft::device_resources const& handle,
                 math_t* A,
                 const int n_rows,
                 const int n_cols,
@@ -177,7 +177,7 @@ void lstsqSvdQR(const raft::handle_t& handle,
  *             so it's not guaranteed to stay unmodified.
  */
 template <typename math_t>
-void lstsqSvdJacobi(const raft::handle_t& handle,
+void lstsqSvdJacobi(raft::device_resources const& handle,
                     math_t* A,
                     const int n_rows,
                     const int n_cols,
@@ -248,7 +248,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle,
  *  (`w = (A^T A)^-1  A^T b`)
  */
 template <typename math_t>
-void lstsqEig(const raft::handle_t& handle,
+void lstsqEig(raft::device_resources const& handle,
               const math_t* A,
               const int n_rows,
               const int n_cols,
@@ -352,7 +352,7 @@ void lstsqEig(const raft::handle_t& handle,
  *            Warning: the content of this vector is modified by the cuSOLVER routines.
  */
 template <typename math_t>
-void lstsqQR(const raft::handle_t& handle,
+void lstsqQR(raft::device_resources const& handle,
              math_t* A,
              const int n_rows,
              const int n_cols,
diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index add003eb52..e0b473bdd4 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cub/cub.cuh>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
index 7ef9ca1c43..70bb2df4f5 100644
--- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cub/cub.cuh>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
diff --git a/cpp/include/raft/linalg/detail/multiply.cuh b/cpp/include/raft/linalg/detail/multiply.cuh
index f1a8548bfa..84b832d875 100644
--- a/cpp/include/raft/linalg/detail/multiply.cuh
+++ b/cpp/include/raft/linalg/detail/multiply.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
 
 namespace raft {
@@ -26,8 +27,7 @@ template <typename math_t, typename IdxType = int>
 void multiplyScalar(
   math_t* out, const math_t* in, const math_t scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream);
+  raft::linalg::unaryOp(out, in, len, raft::mul_const_op<math_t>{scalar}, stream);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh
index f2f08233d5..ed7e360848 100644
--- a/cpp/include/raft/linalg/detail/norm.cuh
+++ b/cpp/include/raft/linalg/detail/norm.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/norm_types.hpp>
 #include <raft/linalg/reduce.cuh>
 
@@ -44,8 +45,8 @@ void rowNormCaller(Type* dots,
                                                 true,
                                                 stream,
                                                 false,
-                                                raft::L1Op<Type, IdxType>(),
-                                                raft::Sum<Type>(),
+                                                raft::abs_op(),
+                                                raft::add_op(),
                                                 fin_op);
       break;
     case L2Norm:
@@ -58,8 +59,8 @@ void rowNormCaller(Type* dots,
                                                 true,
                                                 stream,
                                                 false,
-                                                raft::L2Op<Type>(),
-                                                raft::Sum<Type>(),
+                                                raft::sq_op(),
+                                                raft::add_op(),
                                                 fin_op);
       break;
     case LinfNorm:
@@ -72,8 +73,8 @@ void rowNormCaller(Type* dots,
                                                 true,
                                                 stream,
                                                 false,
-                                                raft::L1Op<Type>(),
-                                                raft::Max<Type>(),
+                                                raft::abs_op(),
+                                                raft::max_op(),
                                                 fin_op);
       break;
     default: THROW("Unsupported norm type: %d", type);
@@ -101,8 +102,8 @@ void colNormCaller(Type* dots,
                                                 false,
                                                 stream,
                                                 false,
-                                                raft::L1Op<Type>(),
-                                                raft::Sum<Type>(),
+                                                raft::abs_op(),
+                                                raft::add_op(),
                                                 fin_op);
       break;
     case L2Norm:
@@ -115,8 +116,8 @@ void colNormCaller(Type* dots,
                                                 false,
                                                 stream,
                                                 false,
-                                                raft::L2Op<Type>(),
-                                                raft::Sum<Type>(),
+                                                raft::sq_op(),
+                                                raft::add_op(),
                                                 fin_op);
       break;
     case LinfNorm:
@@ -129,8 +130,8 @@ void colNormCaller(Type* dots,
                                                 false,
                                                 stream,
                                                 false,
-                                                raft::L1Op<Type>(),
-                                                raft::Max<Type>(),
+                                                raft::abs_op(),
+                                                raft::max_op(),
                                                 fin_op);
       break;
     default: THROW("Unsupported norm type: %d", type);
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 74e9c3e1aa..4cba028d87 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ namespace detail {
  */
 template <typename math_t>
 void qrGetQ_inplace(
-  const raft::handle_t& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream)
+  raft::device_resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream)
 {
   RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols.");
   cusolverDnHandle_t cusolver = handle.get_cusolver_dn_handle();
@@ -83,7 +83,7 @@ void qrGetQ_inplace(
 }
 
 template <typename math_t>
-void qrGetQ(const raft::handle_t& handle,
+void qrGetQ(raft::device_resources const& handle,
             const math_t* M,
             math_t* Q,
             int n_rows,
@@ -95,7 +95,7 @@ void qrGetQ(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void qrGetQR(const raft::handle_t& handle,
+void qrGetQR(raft::device_resources const& handle,
              math_t* M,
              math_t* Q,
              math_t* R,
diff --git a/cpp/include/raft/linalg/detail/reduce.cuh b/cpp/include/raft/linalg/detail/reduce.cuh
index 3022973b43..721ca8179f 100644
--- a/cpp/include/raft/linalg/detail/reduce.cuh
+++ b/cpp/include/raft/linalg/detail/reduce.cuh
@@ -16,9 +16,9 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/coalesced_reduction.cuh>
 #include <raft/linalg/strided_reduction.cuh>
-#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace linalg {
@@ -27,9 +27,9 @@ namespace detail {
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void reduce(OutType* dots,
             const InType* data,
             IdxType D,
@@ -39,9 +39,9 @@ void reduce(OutType* dots,
             bool alongRows,
             cudaStream_t stream,
             bool inplace           = false,
-            MainLambda main_op     = raft::Nop<InType, IdxType>(),
-            ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op   = raft::Nop<OutType>())
+            MainLambda main_op     = raft::identity_op(),
+            ReduceLambda reduce_op = raft::add_op(),
+            FinalLambda final_op   = raft::identity_op())
 {
   if (rowMajor && alongRows) {
     raft::linalg::coalescedReduction<InType, OutType, IdxType>(
diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
index 450fb415e2..a85e04acca 100644
--- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
@@ -29,12 +29,12 @@ namespace detail {
 ///@todo: specialize this to support shared-mem based atomics
 
 template <typename T, typename KeyIteratorT, typename IdxType>
-__global__ void reduce_cols_by_key_kernel(
+__global__ void reduce_cols_by_key_direct_kernel(
   const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
 
-  IdxType idx = blockIdx.x * blockDim.x + threadIdx.x;
+  IdxType idx = static_cast<IdxType>(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= (nrows * ncols)) return;
   ///@todo: yikes! use fast-int-div
   IdxType colId = idx % ncols;
@@ -43,6 +43,38 @@ __global__ void reduce_cols_by_key_kernel(
   raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]);
 }
 
+template <typename T, typename KeyIteratorT, typename IdxType>
+__global__ void reduce_cols_by_key_cached_kernel(
+  const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
+{
+  typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
+  extern __shared__ char smem[];  // dynamic shared memory; its byte size is chosen at launch
+  T* out_cache = reinterpret_cast<T*>(smem);  // block-local sums, indexed as rowId * nkeys + key
+
+  // Zero the nrows * nkeys block-local accumulators, spreading the work over the block's threads.
+  for (IdxType idx = threadIdx.x; idx < nrows * nkeys; idx += blockDim.x) {
+    out_cache[idx] = T{0};
+  }
+  __syncthreads();  // every accumulator must be zeroed before any thread starts adding to it
+
+  // Accumulate in shared memory: data[idx] (row idx / ncols, column idx % ncols) goes to its key.
+  for (IdxType idx = static_cast<IdxType>(blockIdx.x) * blockDim.x + threadIdx.x;
+       idx < nrows * ncols;
+       idx += blockDim.x * static_cast<IdxType>(gridDim.x)) {  // step = threads in the grid
+    IdxType colId = idx % ncols;
+    IdxType rowId = idx / ncols;
+    KeyType key   = keys[colId];
+    raft::myAtomicAdd(out_cache + rowId * nkeys + key, data[idx]);
+  }
+
+  // After the whole block has finished accumulating, add its partial sums to the global output.
+  __syncthreads();
+  for (IdxType idx = threadIdx.x; idx < nrows * nkeys; idx += blockDim.x) {
+    T val = out_cache[idx];
+    if (val != T{0}) { raft::myAtomicAdd(out + idx, val); }  // no atomic for a zero sum
+  }
+}
+
 /**
  * @brief Computes the sum-reduction of matrix columns for each given key
  * @tparam T the input data type (as well as the output reduced matrix)
@@ -60,6 +92,7 @@ __global__ void reduce_cols_by_key_kernel(
  * @param ncols number of columns in the input data
  * @param nkeys number of unique keys in the keys array
  * @param stream cuda stream to launch the kernel onto
+ * @param reset_sums Whether to reset the output sums to zero before reducing
  */
 template <typename T, typename KeyIteratorT, typename IdxType = int>
 void reduce_cols_by_key(const T* data,
@@ -68,16 +101,42 @@ void reduce_cols_by_key(const T* data,
                         IdxType nrows,
                         IdxType ncols,
                         IdxType nkeys,
-                        cudaStream_t stream)
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream));
-  constexpr int TPB = 256;
-  int nblks         = (int)raft::ceildiv<IdxType>(nrows * ncols, TPB);
-  reduce_cols_by_key_kernel<<<nblks, TPB, 0, stream>>>(data, keys, out, nrows, ncols, nkeys);
+  RAFT_EXPECTS(static_cast<size_t>(nrows) * static_cast<size_t>(ncols) <=
+                 static_cast<size_t>(std::numeric_limits<IdxType>::max()),
+               "Index type too small to represent indices in the input array.");
+  RAFT_EXPECTS(static_cast<size_t>(nrows) * static_cast<size_t>(nkeys) <=
+                 static_cast<size_t>(std::numeric_limits<IdxType>::max()),
+               "Index type too small to represent indices in the output array.");
+
+  // Memset the output to zero to use atomics-based reduction.
+  if (reset_sums) { RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream)); }
+
+  // The cached version is used when the cache fits in shared memory and the number of input
+  // elements is above a threshold (the cached version is slightly slower for small input arrays,
+  // and orders of magnitude faster for large input arrays).
+  size_t cache_size = static_cast<size_t>(nrows * nkeys) * sizeof(T);
+  if (cache_size <= 49152ull && nrows * ncols >= IdxType{8192}) {
+    constexpr int TPB = 256;
+    int n_sm          = raft::getMultiProcessorCount();
+    int target_nblks  = 4 * n_sm;
+    int max_nblks     = raft::ceildiv<IdxType>(nrows * ncols, TPB);
+    int nblks         = std::min(target_nblks, max_nblks);
+    reduce_cols_by_key_cached_kernel<<<nblks, TPB, cache_size, stream>>>(
+      data, keys, out, nrows, ncols, nkeys);
+  } else {
+    constexpr int TPB = 256;
+    int nblks         = raft::ceildiv<IdxType>(nrows * ncols, TPB);
+    reduce_cols_by_key_direct_kernel<<<nblks, TPB, 0, stream>>>(
+      data, keys, out, nrows, ncols, nkeys);
+  }
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
+
 };  // end namespace detail
 };  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh
index f96598d9e6..a66a23179b 100644
--- a/cpp/include/raft/linalg/detail/rsvd.cuh
+++ b/cpp/include/raft/linalg/detail/rsvd.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ namespace detail {
  * @param stream cuda stream
  */
 template <typename math_t>
-void rsvdFixedRank(const raft::handle_t& handle,
+void rsvdFixedRank(raft::device_resources const& handle,
                    math_t* M,
                    int n_rows,
                    int n_cols,
@@ -371,7 +371,7 @@ void rsvdFixedRank(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void rsvdPerc(const raft::handle_t& handle,
+void rsvdPerc(raft::device_resources const& handle,
               math_t* M,
               int n_rows,
               int n_cols,
diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index d72bd54a32..0e516b4750 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -18,6 +18,7 @@
 
 #include "unary_op.cuh"
 #include <cub/cub.cuh>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <type_traits>
@@ -107,9 +108,9 @@ __global__ void stridedReductionKernel(OutType* dots,
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void stridedReduction(OutType* dots,
                       const InType* data,
                       IdxType D,
@@ -117,15 +118,13 @@ void stridedReduction(OutType* dots,
                       OutType init,
                       cudaStream_t stream,
                       bool inplace           = false,
-                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                      ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op   = raft::Nop<OutType>())
+                      MainLambda main_op     = raft::identity_op(),
+                      ReduceLambda reduce_op = raft::add_op(),
+                      FinalLambda final_op   = raft::identity_op())
 {
   ///@todo: this extra should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
-  if (!inplace)
-    raft::linalg::unaryOp(
-      dots, dots, D, [init] __device__(OutType a) { return init; }, stream);
+  if (!inplace) raft::linalg::unaryOp(dots, dots, D, raft::const_op(init), stream);
 
   // Arbitrary numbers for now, probably need to tune
   const dim3 thrds(32, 16);
@@ -137,7 +136,7 @@ void stridedReduction(OutType* dots,
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
-  if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
+  if constexpr (std::is_same<ReduceLambda, raft::add_op>::value &&
                 std::is_same<InType, OutType>::value)
     stridedSummationKernel<InType>
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
@@ -148,7 +147,7 @@ void stridedReduction(OutType* dots,
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
   // Perform final op on output data
-  if (!std::is_same<FinalLambda, raft::Nop<OutType>>::value)
+  if (!std::is_same<FinalLambda, raft::identity_op>::value)
     raft::linalg::unaryOp(dots, dots, D, final_op, stream);
 }
 
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index ae0f09d2fe..6df09df8ed 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -27,15 +28,13 @@ namespace detail {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  auto op = [scalar] __device__(InT in) { return OutT(in - scalar); };
-  raft::linalg::unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
+  raft::linalg::unaryOp(out, in, len, raft::sub_const_op<InT>(scalar), stream);
 }
 
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  auto op = [] __device__(InT a, InT b) { return OutT(a - b); };
-  raft::linalg::binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::sub_op(), stream);
 }
 
 template <class math_t, typename IdxType>
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 90a7ddec1f..4850744f51 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 #include <raft/linalg/transpose.cuh>
 
 #include <raft/common/nvtx.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/matrix/math.cuh>
 #include <raft/matrix/matrix.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -36,7 +36,7 @@ namespace linalg {
 namespace detail {
 
 template <typename T>
-void svdQR(const raft::handle_t& handle,
+void svdQR(raft::device_resources const& handle,
            T* in,
            int n_rows,
            int n_cols,
@@ -101,14 +101,14 @@ void svdQR(const raft::handle_t& handle,
          "This usually occurs when some of the features do not vary enough.");
 }
 
-template <typename T>
-void svdEig(const raft::handle_t& handle,
-            T* in,
-            int n_rows,
-            int n_cols,
-            T* S,
-            T* U,
-            T* V,
+template <typename math_t, typename idx_t>
+void svdEig(raft::device_resources const& handle,
+            math_t* in,
+            idx_t n_rows,
+            idx_t n_cols,
+            math_t* S,
+            math_t* U,
+            math_t* V,
             bool gen_left_vec,
             cudaStream_t stream)
 {
@@ -117,11 +117,11 @@ void svdEig(const raft::handle_t& handle,
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH       = handle.get_cublas_handle();
 
-  int len = n_cols * n_cols;
-  rmm::device_uvector<T> in_cross_mult(len, stream);
+  auto len = n_cols * n_cols;
+  rmm::device_uvector<math_t> in_cross_mult(len, stream);
 
-  T alpha = T(1);
-  T beta  = T(0);
+  math_t alpha = math_t(1);
+  math_t beta  = math_t(0);
   raft::linalg::gemm(handle,
                      in,
                      n_rows,
@@ -139,7 +139,7 @@ void svdEig(const raft::handle_t& handle,
   raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
 
   raft::matrix::colReverse(V, n_cols, n_cols, stream);
-  raft::matrix::rowReverse(S, n_cols, 1, stream);
+  raft::matrix::rowReverse(S, n_cols, idx_t(1), stream);
 
   raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true);
 
@@ -162,7 +162,7 @@ void svdEig(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void svdJacobi(const raft::handle_t& handle,
+void svdJacobi(raft::device_resources const& handle,
                math_t* in,
                int n_rows,
                int n_cols,
@@ -232,7 +232,7 @@ void svdJacobi(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void svdReconstruction(const raft::handle_t& handle,
+void svdReconstruction(raft::device_resources const& handle,
                        math_t* U,
                        math_t* S,
                        math_t* V,
@@ -263,7 +263,7 @@ void svdReconstruction(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t& handle,
+bool evaluateSVDByL2Norm(raft::device_resources const& handle,
                          math_t* A_d,
                          math_t* U,
                          math_t* S_vec,
diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh
index ef5551ea7e..9e7b236fed 100644
--- a/cpp/include/raft/linalg/detail/transpose.cuh
+++ b/cpp/include/raft/linalg/detail/transpose.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include "cublas_wrappers.hpp"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <rmm/exec_policy.hpp>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -29,7 +29,7 @@ namespace linalg {
 namespace detail {
 
 template <typename math_t>
-void transpose(const raft::handle_t& handle,
+void transpose(raft::device_resources const& handle,
                math_t* in,
                math_t* out,
                int n_rows,
@@ -82,7 +82,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream)
 
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
 void transpose_row_major_impl(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
@@ -108,7 +108,7 @@ void transpose_row_major_impl(
 
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
 void transpose_col_major_impl(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index 53b083045e..0b18e6175c 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,13 +21,12 @@
 #include "detail/divide.cuh"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
 namespace linalg {
 
-using detail::divides_scalar;
-
 /**
  * @defgroup ScalarOps Scalar operations on the input buffer
  * @tparam OutT output data-type upon which the math operation will be performed
@@ -57,7 +56,7 @@ void divideScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[in] scalar    raft::host_scalar_view
  * @param[out] out    Output
@@ -67,7 +66,7 @@ template <typename InType,
           typename ScalarIdxType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void divide_scalar(const raft::handle_t& handle,
+void divide_scalar(raft::device_resources const& handle,
                    InType in,
                    OutType out,
                    raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
diff --git a/cpp/include/raft/linalg/dot.cuh b/cpp/include/raft/linalg/dot.cuh
index 48577650bc..917188d695 100644
--- a/cpp/include/raft/linalg/dot.cuh
+++ b/cpp/include/raft/linalg/dot.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,13 +21,19 @@
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 
 namespace raft::linalg {
+
+/**
+ * @defgroup dot BLAS dot routine
+ * @{
+ */
+
 /**
  * @brief Computes the dot product of two vectors.
- * @param[in] handle   raft::handle_t
+ * @param[in] handle   raft::device_resources
  * @param[in] x        First input vector
  * @param[in] y        Second input vector
  * @param[out] out     The output dot product between the x and y vectors.
@@ -37,7 +43,7 @@ template <typename ElementType,
           typename ScalarIndexType,
           typename LayoutPolicy1,
           typename LayoutPolicy2>
-void dot(const raft::handle_t& handle,
+void dot(raft::device_resources const& handle,
          raft::device_vector_view<const ElementType, IndexType, LayoutPolicy1> x,
          raft::device_vector_view<const ElementType, IndexType, LayoutPolicy2> y,
          raft::device_scalar_view<ElementType, ScalarIndexType> out)
@@ -57,7 +63,7 @@ void dot(const raft::handle_t& handle,
 
 /**
  * @brief Computes the dot product of two vectors.
- * @param[in] handle   raft::handle_t
+ * @param[in] handle   raft::device_resources
  * @param[in] x        First input vector
  * @param[in] y        Second input vector
  * @param[out] out     The output dot product between the x and y vectors.
@@ -67,7 +73,7 @@ template <typename ElementType,
           typename ScalarIndexType,
           typename LayoutPolicy1,
           typename LayoutPolicy2>
-void dot(const raft::handle_t& handle,
+void dot(raft::device_resources const& handle,
          raft::device_vector_view<const ElementType, IndexType, LayoutPolicy1> x,
          raft::device_vector_view<const ElementType, IndexType, LayoutPolicy2> y,
          raft::host_scalar_view<ElementType, ScalarIndexType> out)
@@ -84,5 +90,8 @@ void dot(const raft::handle_t& handle,
                                     out.data_handle(),
                                     handle.get_stream()));
 }
+
+/** @} */  // end of group dot
+
 }  // namespace raft::linalg
 #endif
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 2ad222d42d..03e94a10b1 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,11 +25,6 @@
 namespace raft {
 namespace linalg {
 
-/**
- * @defgroup eig Eigen Decomposition Methods
- * @{
- */
-
 /**
  * @brief eig decomp with divide and conquer method for the column-major
  * symmetric matrices
@@ -43,7 +38,7 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void eigDC(const raft::handle_t& handle,
+void eigDC(raft::device_resources const& handle,
            const math_t* in,
            std::size_t n_rows,
            std::size_t n_cols,
@@ -73,7 +68,7 @@ using detail::OVERWRITE_INPUT;
  * @param stream cuda stream
  */
 template <typename math_t>
-void eigSelDC(const raft::handle_t& handle,
+void eigSelDC(raft::device_resources const& handle,
               math_t* in,
               std::size_t n_rows,
               std::size_t n_cols,
@@ -102,7 +97,7 @@ void eigSelDC(const raft::handle_t& handle,
  * accuracy.
  */
 template <typename math_t>
-void eigJacobi(const raft::handle_t& handle,
+void eigJacobi(raft::device_resources const& handle,
                const math_t* in,
                std::size_t n_rows,
                std::size_t n_cols,
@@ -115,19 +110,24 @@ void eigJacobi(const raft::handle_t& handle,
   detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
 }
 
+/**
+ * @defgroup eig Eigen Decomposition Methods
+ * @{
+ */
+
 /**
  * @brief eig decomp with divide and conquer method for the column-major
  * symmetric matrices
  * @tparam ValueType the data-type of input and output
  * @tparam IntegerType Integer used for addressing
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and
  * vectors)
  * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view
  * @param[out] eig_vals: eigen values output of type raft::device_vector_view
  */
 template <typename ValueType, typename IndexType>
-void eig_dc(const raft::handle_t& handle,
+void eig_dc(raft::device_resources const& handle,
             raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
             raft::device_matrix_view<ValueType, IndexType, raft::col_major> eig_vectors,
             raft::device_vector_view<ValueType, IndexType> eig_vals)
@@ -149,7 +149,7 @@ void eig_dc(const raft::handle_t& handle,
  *        for the column-major symmetric matrices
  * @tparam ValueType the data-type of input and output
  * @tparam IntegerType Integer used for addressing
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and
  * vectors)
  * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view
@@ -158,7 +158,7 @@ void eig_dc(const raft::handle_t& handle,
  * @param[in] memUsage: the memory selection for eig vector output
  */
 template <typename ValueType, typename IndexType>
-void eig_dc_selective(const raft::handle_t& handle,
+void eig_dc_selective(raft::device_resources const& handle,
                       raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
                       raft::device_matrix_view<ValueType, IndexType, raft::col_major> eig_vectors,
                       raft::device_vector_view<ValueType, IndexType> eig_vals,
@@ -185,7 +185,7 @@ void eig_dc_selective(const raft::handle_t& handle,
  * column-major symmetric matrices (in parameter)
  * @tparam ValueType the data-type of input and output
  * @tparam IntegerType Integer used for addressing
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and
  * vectors)
  * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view
@@ -196,7 +196,7 @@ void eig_dc_selective(const raft::handle_t& handle,
  * accuracy.
  */
 template <typename ValueType, typename IndexType>
-void eig_jacobi(const raft::handle_t& handle,
+void eig_jacobi(raft::device_resources const& handle,
                 raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
                 raft::device_matrix_view<ValueType, IndexType, raft::col_major> eig_vectors,
                 raft::device_vector_view<ValueType, IndexType> eig_vals,
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index dbc06a4af3..2e6c1a4ab5 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -23,8 +23,6 @@
 namespace raft {
 namespace linalg {
 
-using detail::adds_scalar;
-
 /**
  * @defgroup ScalarOps Scalar operations on the input buffer
  * @tparam InType data-type upon which the math operation will be performed
@@ -42,8 +40,6 @@ void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaS
   detail::scalarAdd(out, in, scalar, len, stream);
 }
 
-using detail::multiplies_scalar;
-
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
 {
@@ -90,8 +86,6 @@ void eltwiseDivide(
   detail::eltwiseDivide(out, in1, in2, len, stream);
 }
 
-using detail::divides_check_zero;
-
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivideCheckZero(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index f2354da6c6..d5dc5ffab5 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,7 +52,7 @@ namespace linalg {
  * @param [in] stream
  */
 template <typename math_t, bool DevicePointerMode = false>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const bool trans_a,
           const bool trans_b,
           const int m,
@@ -91,7 +91,7 @@ void gemm(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const math_t* a,
           int n_rows_a,
           int n_cols_a,
@@ -126,7 +126,7 @@ void gemm(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           const math_t* a,
           int n_rows_a,
           int n_cols_a,
@@ -161,7 +161,7 @@ void gemm(const raft::handle_t& handle,
  * @param beta scalar
  */
 template <typename T>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           T* z,
           T* x,
           T* y,
@@ -213,7 +213,7 @@ template <typename ValueType,
           typename                = std::enable_if_t<std::disjunction_v<
             std::is_same<ScalarViewType, raft::host_scalar_view<ValueType, ScalarIdxType>>,
             std::is_same<ScalarViewType, raft::device_scalar_view<ValueType, ScalarIdxType>>>>>
-void gemm(const raft::handle_t& handle,
+void gemm(raft::device_resources const& handle,
           raft::device_matrix_view<ValueType, IndexType, LayoutPolicyX> x,
           raft::device_matrix_view<ValueType, IndexType, LayoutPolicyY> y,
           raft::device_matrix_view<ValueType, IndexType, LayoutPolicyZ> z,
diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh
index 8132a742f8..96846003f6 100644
--- a/cpp/include/raft/linalg/gemv.cuh
+++ b/cpp/include/raft/linalg/gemv.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ namespace linalg {
  * @param [in] stream
  */
 template <typename math_t, bool DevicePointerMode = false>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const bool trans_a,
           const int m,
           const int n,
@@ -69,7 +69,7 @@ void gemv(const raft::handle_t& handle,
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows,
           const int n_cols,
@@ -103,7 +103,7 @@ void gemv(const raft::handle_t& handle,
  * @param stream stream on which this function is run
  */
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -133,7 +133,7 @@ void gemv(const raft::handle_t& handle,
  * @param stream stream on which this function is run
  */
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -165,7 +165,7 @@ void gemv(const raft::handle_t& handle,
  * @param stream stream on which this function is run
  */
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -199,7 +199,7 @@ void gemv(const raft::handle_t& handle,
  *
  */
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           const math_t* A,
           const int n_rows_a,
           const int n_cols_a,
@@ -246,7 +246,7 @@ template <typename ValueType,
           typename                = std::enable_if_t<std::disjunction_v<
             std::is_same<ScalarViewType, raft::host_scalar_view<ValueType, ScalarIdxType>>,
             std::is_same<ScalarViewType, raft::device_scalar_view<ValueType, ScalarIdxType>>>>>
-void gemv(const raft::handle_t& handle,
+void gemv(raft::device_resources const& handle,
           raft::device_matrix_view<const ValueType, IndexType, LayoutPolicy> A,
           raft::device_vector_view<const ValueType, IndexType> x,
           raft::device_vector_view<ValueType, IndexType> y,
diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh
index 7654812886..b36a9eba96 100644
--- a/cpp/include/raft/linalg/lstsq.cuh
+++ b/cpp/include/raft/linalg/lstsq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/lstsq.cuh>
 namespace raft {
 namespace linalg {
@@ -37,7 +37,7 @@ namespace linalg {
  * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
-void lstsqSvdQR(const raft::handle_t& handle,
+void lstsqSvdQR(raft::device_resources const& handle,
                 math_t* A,
                 const int n_rows,
                 const int n_cols,
@@ -62,7 +62,7 @@ void lstsqSvdQR(const raft::handle_t& handle,
  * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
-void lstsqSvdJacobi(const raft::handle_t& handle,
+void lstsqSvdJacobi(raft::device_resources const& handle,
                     math_t* A,
                     const int n_rows,
                     const int n_cols,
@@ -78,7 +78,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle,
  *  (`w = (A^T A)^-1  A^T b`)
  */
 template <typename math_t>
-void lstsqEig(const raft::handle_t& handle,
+void lstsqEig(raft::device_resources const& handle,
               const math_t* A,
               const int n_rows,
               const int n_cols,
@@ -104,7 +104,7 @@ void lstsqEig(const raft::handle_t& handle,
  * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
-void lstsqQR(const raft::handle_t& handle,
+void lstsqQR(raft::device_resources const& handle,
              math_t* A,
              const int n_rows,
              const int n_cols,
@@ -125,7 +125,7 @@ void lstsqQR(const raft::handle_t& handle,
  * Via SVD decomposition of `A = U S Vt`.
  *
  * @tparam ValueType the data-type of input/output
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[inout] A input raft::device_matrix_view
  *            Warning: the content of this matrix is modified.
  * @param[inout] b input target raft::device_vector_view
@@ -133,7 +133,7 @@ void lstsqQR(const raft::handle_t& handle,
  * @param[out] w output coefficient raft::device_vector_view
  */
 template <typename ValueType, typename IndexType>
-void lstsq_svd_qr(const raft::handle_t& handle,
+void lstsq_svd_qr(raft::device_resources const& handle,
                   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> A,
                   raft::device_vector_view<const ValueType, IndexType> b,
                   raft::device_vector_view<ValueType, IndexType> w)
@@ -155,7 +155,7 @@ void lstsq_svd_qr(const raft::handle_t& handle,
  *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations.
  *
  * @tparam ValueType the data-type of input/output
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[inout] A input raft::device_matrix_view
  *            Warning: the content of this matrix is modified.
  * @param[inout] b input target raft::device_vector_view
@@ -163,7 +163,7 @@ void lstsq_svd_qr(const raft::handle_t& handle,
  * @param[out] w output coefficient raft::device_vector_view
  */
 template <typename ValueType, typename IndexType>
-void lstsq_svd_jacobi(const raft::handle_t& handle,
+void lstsq_svd_jacobi(raft::device_resources const& handle,
                       raft::device_matrix_view<const ValueType, IndexType, raft::col_major> A,
                       raft::device_vector_view<const ValueType, IndexType> b,
                       raft::device_vector_view<ValueType, IndexType> w)
@@ -186,7 +186,7 @@ void lstsq_svd_jacobi(const raft::handle_t& handle,
  *  (`w = (A^T A)^-1  A^T b`)
  *
  * @tparam ValueType the data-type of input/output
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[inout] A input raft::device_matrix_view
  *            Warning: the content of this matrix is modified by the cuSOLVER routines.
  * @param[inout] b input target raft::device_vector_view
@@ -194,7 +194,7 @@ void lstsq_svd_jacobi(const raft::handle_t& handle,
  * @param[out] w output coefficient raft::device_vector_view
  */
 template <typename ValueType, typename IndexType>
-void lstsq_eig(const raft::handle_t& handle,
+void lstsq_eig(raft::device_resources const& handle,
                raft::device_matrix_view<const ValueType, IndexType, raft::col_major> A,
                raft::device_vector_view<const ValueType, IndexType> b,
                raft::device_vector_view<ValueType, IndexType> w)
@@ -217,7 +217,7 @@ void lstsq_eig(const raft::handle_t& handle,
  *  (triangular system of equations `Rw = Q^T b`)
  *
  * @tparam ValueType the data-type of input/output
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[inout] A input raft::device_matrix_view
  *            Warning: the content of this matrix is modified.
  * @param[inout] b input target raft::device_vector_view
@@ -225,7 +225,7 @@ void lstsq_eig(const raft::handle_t& handle,
  * @param[out] w output coefficient raft::device_vector_view
  */
 template <typename ValueType, typename IndexType>
-void lstsq_qr(const raft::handle_t& handle,
+void lstsq_qr(raft::device_resources const& handle,
               raft::device_matrix_view<const ValueType, IndexType, raft::col_major> A,
               raft::device_vector_view<const ValueType, IndexType> b,
               raft::device_vector_view<ValueType, IndexType> w)
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh
index ad35cc5880..2b9e6c80a0 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,9 @@
 #include "detail/map.cuh"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/input_validation.hpp>
+#include <thrust/tabulate.h>
 
 namespace raft {
 namespace linalg {
@@ -65,7 +67,7 @@ void map_k(
  * @tparam TPB threads-per-block in the final kernel launched
  * @tparam OutType data-type of result of type raft::device_mdspan
  * @tparam Args additional parameters
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input of type raft::device_mdspan
  * @param[out] out the output of the map operation of type raft::device_mdspan
  * @param[in] map the device-lambda
@@ -78,7 +80,7 @@ template <typename InType,
           typename... Args,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void map(const raft::handle_t& handle, InType in, OutType out, MapOp map, Args... args)
+void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args)
 {
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
@@ -96,9 +98,43 @@ void map(const raft::handle_t& handle, InType in, OutType out, MapOp map, Args..
   }
 }
 
+/**
+ * @brief Apply a unary operation to each element offset and store the result in the output array
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/device_mdarray.hpp>
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/core/operators.hpp>
+ *  #include <raft/linalg/map.cuh>
+ *  ...
+ *  raft::device_resources handle;
+ *  auto squares = raft::make_device_vector<int>(handle, n);
+ *  raft::linalg::map_offset(handle, squares.view(), raft::sq_op());
+ * @endcode
+ *
+ * @tparam OutType Output mdspan type
+ * @tparam MapOp   The unary operation type with signature `OutT func(const IdxT& idx);`
+ * @param[in]  handle The raft handle
+ * @param[out] out    Output array
+ * @param[in]  op     The unary operation
+ */
+template <typename OutType,
+          typename MapOp,
+          typename = raft::enable_if_output_device_mdspan<OutType>>
+void map_offset(raft::device_resources const& handle, OutType out, MapOp op)
+{
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
+
+  // thrust::tabulate writes op(i) to out[i], where i is the flat offset into the
+  // contiguous output buffer.
+  thrust::tabulate(
+    handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op);
+}
+
 /** @} */  // end of map
 
 }  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh
index 180ed128a1..b89f3bdd54 100644
--- a/cpp/include/raft/linalg/map_reduce.cuh
+++ b/cpp/include/raft/linalg/map_reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,11 +24,6 @@
 
 namespace raft::linalg {
 
-/**
- * @defgroup map_reduce Map-Reduce ops
- * @{
- */
-
 /**
  * @brief CUDA version of map and then generic reduction operation
  * @tparam Type data-type upon which the math operation will be performed
@@ -67,6 +62,10 @@ void mapReduce(OutType* out,
     out, len, neutral, map, op, stream, in, args...);
 }
 
+/**
+ * @defgroup map_reduce Map-Reduce ops
+ * @{
+ */
 /**
  * @brief CUDA version of map and then generic reduction operation
  * @tparam InValueType the data-type of the input
@@ -76,7 +75,7 @@ void mapReduce(OutType* out,
  * @tparam OutValueType the data-type of the output
  * @tparam ScalarIdxType index type of scalar
  * @tparam Args additional parameters
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input of type raft::device_vector_view
  * @param[in] neutral The neutral element of the reduction operation. For example:
  *    0 for sum, 1 for multiply, +Inf for Min, -Inf for Max
@@ -92,7 +91,7 @@ template <typename InValueType,
           typename OutValueType,
           typename ScalarIdxType,
           typename... Args>
-void map_reduce(const raft::handle_t& handle,
+void map_reduce(raft::device_resources const& handle,
                 raft::device_vector_view<const InValueType, IndexType> in,
                 raft::device_scalar_view<OutValueType, ScalarIdxType> out,
                 OutValueType neutral,
diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh
index 57bc0cf21f..fa24ea28b7 100644
--- a/cpp/include/raft/linalg/matrix_vector.cuh
+++ b/cpp/include/raft/linalg/matrix_vector.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,11 @@
 
 namespace raft::linalg {
 
+/**
+ * @defgroup matrix_vector Matrix-Vector Operations
+ * @{
+ */
+
 /**
  * @brief multiply each row or column of matrix with vector, skipping zeros in vector
  * @param [in] handle: raft handle for managing library resources
@@ -32,7 +37,7 @@ namespace raft::linalg {
  * the rows of the matrix or columns using enum class raft::linalg::Apply
  */
 template <typename math_t, typename idx_t, typename layout_t>
-void binary_mult_skip_zero(const raft::handle_t& handle,
+void binary_mult_skip_zero(raft::device_resources const& handle,
                            raft::device_matrix_view<math_t, idx_t, layout_t> data,
                            raft::device_vector_view<const math_t, idx_t> vec,
                            Apply apply)
@@ -65,7 +70,7 @@ void binary_mult_skip_zero(const raft::handle_t& handle,
  * the rows of the matrix or columns using enum class raft::linalg::Apply
  */
 template <typename math_t, typename idx_t, typename layout_t>
-void binary_div(const raft::handle_t& handle,
+void binary_div(raft::device_resources const& handle,
                 raft::device_matrix_view<math_t, idx_t, layout_t> data,
                 raft::device_vector_view<const math_t, idx_t> vec,
                 Apply apply)
@@ -100,7 +105,7 @@ void binary_div(const raft::handle_t& handle,
  * value if false
  */
 template <typename math_t, typename idx_t, typename layout_t>
-void binary_div_skip_zero(const raft::handle_t& handle,
+void binary_div_skip_zero(raft::device_resources const& handle,
                           raft::device_matrix_view<math_t, idx_t, layout_t> data,
                           raft::device_vector_view<const math_t, idx_t> vec,
                           Apply apply,
@@ -135,7 +140,7 @@ void binary_div_skip_zero(const raft::handle_t& handle,
  * the rows of the matrix or columns using enum class raft::linalg::Apply
  */
 template <typename math_t, typename idx_t, typename layout_t>
-void binary_add(const raft::handle_t& handle,
+void binary_add(raft::device_resources const& handle,
                 raft::device_matrix_view<math_t, idx_t, layout_t> data,
                 raft::device_vector_view<const math_t, idx_t> vec,
                 Apply apply)
@@ -168,7 +173,7 @@ void binary_add(const raft::handle_t& handle,
  * the rows of the matrix or columns using enum class raft::linalg::Apply
  */
 template <typename math_t, typename idx_t, typename layout_t>
-void binary_sub(const raft::handle_t& handle,
+void binary_sub(raft::device_resources const& handle,
                 raft::device_matrix_view<math_t, idx_t, layout_t> data,
                 raft::device_vector_view<const math_t, idx_t> vec,
                 Apply apply)
@@ -191,4 +196,7 @@ void binary_sub(const raft::handle_t& handle,
                                         bcast_along_rows,
                                         handle.get_stream());
 }
+
+/** @} */  // end of matrix_vector
+
 }  // namespace raft::linalg
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 8b5163a714..59b2ca5ee5 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -122,7 +122,7 @@ void matrixVectorOp(MatT* out,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] matrix input raft::matrix_view
  * @param[in] vec vector raft::vector_view
  * @param[out] out output raft::matrix_view
@@ -135,7 +135,7 @@ template <typename MatValueType,
           typename LayoutPolicy,
           typename Lambda,
           typename IndexType>
-void matrix_vector_op(const raft::handle_t& handle,
+void matrix_vector_op(raft::device_resources const& handle,
                       raft::device_matrix_view<const MatValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const VecValueType, IndexType> vec,
                       raft::device_matrix_view<MatValueType, IndexType, LayoutPolicy> out,
@@ -182,7 +182,7 @@ void matrix_vector_op(const raft::handle_t& handle,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param matrix input raft::matrix_view
  * @param vec1 the first vector raft::vector_view
  * @param vec2 the second vector raft::vector_view
@@ -197,7 +197,7 @@ template <typename MatValueType,
           typename LayoutPolicy,
           typename Lambda,
           typename IndexType>
-void matrix_vector_op(const raft::handle_t& handle,
+void matrix_vector_op(raft::device_resources const& handle,
                       raft::device_matrix_view<const MatValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const Vec1ValueType, IndexType> vec1,
                       raft::device_vector_view<const Vec2ValueType, IndexType> vec2,
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index a3360ae35a..62f4896d01 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,14 +53,14 @@ void meanSquaredError(
  * @tparam IndexType Input/Output index type
  * @tparam OutValueType Output data-type
  * @tparam TPB threads-per-block
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] A input raft::device_vector_view
  * @param[in] B input raft::device_vector_view
  * @param[out] out the output mean squared error value of type raft::device_scalar_view
  * @param[in] weight weight to apply to every term in the mean squared error calculation
  */
 template <typename InValueType, typename IndexType, typename OutValueType>
-void mean_squared_error(const raft::handle_t& handle,
+void mean_squared_error(raft::device_resources const& handle,
                         raft::device_vector_view<const InValueType, IndexType> A,
                         raft::device_vector_view<const InValueType, IndexType> B,
                         raft::device_scalar_view<OutValueType, IndexType> out,
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 119cf667d1..574b88c63d 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ void multiplyScalar(out_t* out, const in_t* in, in_t scalar, IdxType len, cudaSt
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input buffer
  * @param[out] out the output buffer
  * @param[in] scalar the scalar used in the operations
@@ -68,7 +68,7 @@ template <typename InType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void multiply_scalar(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   InType in,
   OutType out,
   raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh
index 9abfd3bdb0..8bc6720b4e 100644
--- a/cpp/include/raft/linalg/norm.cuh
+++ b/cpp/include/raft/linalg/norm.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "linalg_types.hpp"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/norm_types.hpp>
 #include <raft/util/input_validation.hpp>
 
@@ -47,7 +48,7 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+template <typename Type, typename IdxType = int, typename Lambda = raft::identity_op>
 void rowNorm(Type* dots,
              const Type* data,
              IdxType D,
@@ -55,7 +56,7 @@ void rowNorm(Type* dots,
              NormType type,
              bool rowMajor,
              cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
+             Lambda fin_op = raft::identity_op())
 {
   detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
 }
@@ -74,7 +75,7 @@ void rowNorm(Type* dots,
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+template <typename Type, typename IdxType = int, typename Lambda = raft::identity_op>
 void colNorm(Type* dots,
              const Type* data,
              IdxType D,
@@ -82,18 +83,23 @@ void colNorm(Type* dots,
              NormType type,
              bool rowMajor,
              cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
+             Lambda fin_op = raft::identity_op())
 {
   detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
 }
 
+/**
+ * @defgroup norm Row- or Col-norm computation
+ * @{
+ */
+
 /**
  * @brief Compute norm of the input matrix and perform fin_op
  * @tparam ElementType Input/Output data type
  * @tparam LayoutPolicy the layout of input (raft::row_major or raft::col_major)
  * @tparam IdxType Integer type used to for addressing
  * @tparam Lambda device final lambda
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input raft::device_matrix_view
  * @param[out] out the output raft::device_vector_view
  * @param[in] type the type of norm to be applied
@@ -104,13 +110,13 @@ void colNorm(Type* dots,
 template <typename ElementType,
           typename LayoutPolicy,
           typename IndexType,
-          typename Lambda = raft::Nop<ElementType, IndexType>>
-void norm(const raft::handle_t& handle,
+          typename Lambda = raft::identity_op>
+void norm(raft::device_resources const& handle,
           raft::device_matrix_view<const ElementType, IndexType, LayoutPolicy> in,
           raft::device_vector_view<ElementType, IndexType> out,
           NormType type,
           Apply apply,
-          Lambda fin_op = raft::Nop<ElementType, IndexType>())
+          Lambda fin_op = raft::identity_op())
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
 
@@ -142,6 +148,8 @@ void norm(const raft::handle_t& handle,
   }
 }
 
+/** @} */
+
 };  // end namespace linalg
 };  // end namespace raft
 
diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh
index 4bdf697581..027ebb16e8 100644
--- a/cpp/include/raft/linalg/normalize.cuh
+++ b/cpp/include/raft/linalg/normalize.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,11 +18,17 @@
 
 #include "detail/normalize.cuh"
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/norm_types.hpp>
 
 namespace raft {
 namespace linalg {
 
+/**
+ * @addtogroup norm
+ * @{
+ */
+
 /**
  * @brief Divide rows by their norm defined by main_op, reduce_op and fin_op
  *
@@ -31,7 +37,7 @@ namespace linalg {
  * @tparam MainLambda Type of main_op
  * @tparam ReduceLambda Type of reduce_op
  * @tparam FinalLambda Type of fin_op
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input raft::device_matrix_view
  * @param[out] out the output raft::device_matrix_view
  * @param[in] init Initialization value, i.e identity element for the reduction operation
@@ -46,7 +52,7 @@ template <typename ElementType,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-void row_normalize(const raft::handle_t& handle,
+void row_normalize(raft::device_resources const& handle,
                    raft::device_matrix_view<const ElementType, IndexType, row_major> in,
                    raft::device_matrix_view<ElementType, IndexType, row_major> out,
                    ElementType init,
@@ -79,14 +85,14 @@ void row_normalize(const raft::handle_t& handle,
  *
  * @tparam ElementType Input/Output data type
  * @tparam IndexType Integer type used to for addressing
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in the input raft::device_matrix_view
  * @param[out] out the output raft::device_matrix_view
  * @param[in] norm_type the type of norm to be applied
  * @param[in] eps If the norm is below eps, the row is considered zero and no division is applied
  */
 template <typename ElementType, typename IndexType>
-void row_normalize(const raft::handle_t& handle,
+void row_normalize(raft::device_resources const& handle,
                    raft::device_matrix_view<const ElementType, IndexType, row_major> in,
                    raft::device_matrix_view<ElementType, IndexType, row_major> out,
                    NormType norm_type,
@@ -94,38 +100,22 @@ void row_normalize(const raft::handle_t& handle,
 {
   switch (norm_type) {
     case L1Norm:
-      row_normalize(handle,
-                    in,
-                    out,
-                    ElementType(0),
-                    raft::L1Op<ElementType>(),
-                    raft::Sum<ElementType>(),
-                    raft::Nop<ElementType>(),
-                    eps);
+      row_normalize(
+        handle, in, out, ElementType(0), raft::abs_op(), raft::add_op(), raft::identity_op(), eps);
       break;
     case L2Norm:
-      row_normalize(handle,
-                    in,
-                    out,
-                    ElementType(0),
-                    raft::L2Op<ElementType>(),
-                    raft::Sum<ElementType>(),
-                    raft::SqrtOp<ElementType>(),
-                    eps);
+      row_normalize(
+        handle, in, out, ElementType(0), raft::sq_op(), raft::add_op(), raft::sqrt_op(), eps);
       break;
     case LinfNorm:
-      row_normalize(handle,
-                    in,
-                    out,
-                    ElementType(0),
-                    raft::L1Op<ElementType>(),
-                    raft::Max<ElementType>(),
-                    raft::Nop<ElementType>(),
-                    eps);
+      row_normalize(
+        handle, in, out, ElementType(0), raft::abs_op(), raft::max_op(), raft::identity_op(), eps);
       break;
     default: THROW("Unsupported norm type: %d", norm_type);
   }
 }
 
+/** @} */
+
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index acd226b71d..1fdfcb3780 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,9 +19,9 @@
 #pragma once
 
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/unary_op.cuh>
-#include <raft/util/cuda_utils.cuh>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -41,8 +41,7 @@ namespace linalg {
 template <typename in_t, typename out_t = in_t, typename IdxType = int>
 void powerScalar(out_t* out, const in_t* in, const in_t scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(in_t in) { return raft::myPow(in, scalar); }, stream);
+  raft::linalg::unaryOp(out, in, len, raft::pow_const_op<in_t>(scalar), stream);
 }
 /** @} */
 
@@ -61,8 +60,7 @@ void powerScalar(out_t* out, const in_t* in, const in_t scalar, IdxType len, cud
 template <typename in_t, typename out_t = in_t, typename IdxType = int>
 void power(out_t* out, const in_t* in1, const in_t* in2, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::binaryOp(
-    out, in1, in2, len, [] __device__(in_t a, in_t b) { return raft::myPow(a, b); }, stream);
+  raft::linalg::binaryOp(out, in1, in2, len, raft::pow_op(), stream);
 }
 /** @} */
 
@@ -75,7 +73,7 @@ void power(out_t* out, const in_t* in1, const in_t* in2, IdxType len, cudaStream
  * @brief Elementwise power operation on the input buffers
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in1    First Input
  * @param[in] in2    Second Input
  * @param[out] out    Output
@@ -84,7 +82,7 @@ template <typename InType,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void power(const raft::handle_t& handle, InType in1, InType in2, OutType out)
+void power(raft::device_resources const& handle, InType in1, InType in2, OutType out)
 {
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
@@ -115,7 +113,7 @@ void power(const raft::handle_t& handle, InType in1, InType in2, OutType out)
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[out] out    Output
  * @param[in] scalar    raft::host_scalar_view
@@ -126,7 +124,7 @@ template <typename InType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void power_scalar(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   InType in,
   OutType out,
   const raft::host_scalar_view<const typename InType::value_type, ScalarIdxType> scalar)
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index 7e6e14e680..8e58af63c1 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,11 +23,6 @@
 namespace raft {
 namespace linalg {
 
-/**
- * @defgroup QRdecomp QR decomposition
- * @{
- */
-
 /**
  * @brief compute QR decomp and return only Q matrix
  * @param handle: raft handle
@@ -38,7 +33,7 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQ(const raft::handle_t& handle,
+void qrGetQ(raft::device_resources const& handle,
             const math_t* M,
             math_t* Q,
             int n_rows,
@@ -59,7 +54,7 @@ void qrGetQ(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQR(const raft::handle_t& handle,
+void qrGetQR(raft::device_resources const& handle,
              math_t* M,
              math_t* Q,
              math_t* R,
@@ -70,14 +65,19 @@ void qrGetQR(const raft::handle_t& handle,
   detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream);
 }
 
+/**
+ * @defgroup qr QR Decomposition
+ * @{
+ */
+
 /**
  * @brief Compute the QR decomposition of matrix M and return only the Q matrix.
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M Input raft::device_matrix_view
  * @param[out] Q Output raft::device_matrix_view
  */
 template <typename ElementType, typename IndexType>
-void qr_get_q(const raft::handle_t& handle,
+void qr_get_q(raft::device_resources const& handle,
               raft::device_matrix_view<const ElementType, IndexType, raft::col_major> M,
               raft::device_matrix_view<ElementType, IndexType, raft::col_major> Q)
 {
@@ -88,13 +88,13 @@ void qr_get_q(const raft::handle_t& handle,
 
 /**
  * @brief Compute the QR decomposition of matrix M and return both the Q and R matrices.
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M Input raft::device_matrix_view
  * @param[in] Q Output raft::device_matrix_view
  * @param[out] R Output raft::device_matrix_view
  */
 template <typename ElementType, typename IndexType>
-void qr_get_qr(const raft::handle_t& handle,
+void qr_get_qr(raft::device_resources const& handle,
                raft::device_matrix_view<const ElementType, IndexType, raft::col_major> M,
                raft::device_matrix_view<ElementType, IndexType, raft::col_major> Q,
                raft::device_matrix_view<ElementType, IndexType, raft::col_major> R)
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index 5579acf355..ae5457c44f 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "linalg_types.hpp"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -59,9 +60,9 @@ namespace linalg {
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void reduce(OutType* dots,
             const InType* data,
             IdxType D,
@@ -71,9 +72,9 @@ void reduce(OutType* dots,
             bool alongRows,
             cudaStream_t stream,
             bool inplace           = false,
-            MainLambda main_op     = raft::Nop<InType, IdxType>(),
-            ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op   = raft::Nop<OutType>())
+            MainLambda main_op     = raft::identity_op(),
+            ReduceLambda reduce_op = raft::add_op(),
+            FinalLambda final_op   = raft::identity_op())
 {
   detail::reduce<InType, OutType, IdxType>(
     dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op);
@@ -104,7 +105,7 @@ void reduce(OutType* dots,
  * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
  * It must be a 'callable' supporting the following input and output:
  * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] data Input of type raft::device_matrix_view
  * @param[out] dots Output of type raft::device_matrix_view
  * @param[in] init initial value to use for the reduction
@@ -118,18 +119,18 @@ template <typename InElementType,
           typename LayoutPolicy,
           typename OutElementType = InElementType,
           typename IdxType        = std::uint32_t,
-          typename MainLambda     = raft::Nop<InElementType, IdxType>,
-          typename ReduceLambda   = raft::Sum<OutElementType>,
-          typename FinalLambda    = raft::Nop<OutElementType>>
-void reduce(const raft::handle_t& handle,
+          typename MainLambda     = raft::identity_op,
+          typename ReduceLambda   = raft::add_op,
+          typename FinalLambda    = raft::identity_op>
+void reduce(raft::device_resources const& handle,
             raft::device_matrix_view<const InElementType, IdxType, LayoutPolicy> data,
             raft::device_vector_view<OutElementType, IdxType> dots,
             OutElementType init,
             Apply apply,
             bool inplace           = false,
-            MainLambda main_op     = raft::Nop<InElementType, IdxType>(),
-            ReduceLambda reduce_op = raft::Sum<OutElementType>(),
-            FinalLambda final_op   = raft::Nop<OutElementType>())
+            MainLambda main_op     = raft::identity_op(),
+            ReduceLambda reduce_op = raft::add_op(),
+            FinalLambda final_op   = raft::identity_op())
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(data), "Input must be contiguous");
 
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 436fce26fd..2b744d8134 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 #include "detail/reduce_cols_by_key.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace linalg {
@@ -43,6 +43,7 @@ namespace linalg {
  * @param ncols number of columns in the input data
  * @param nkeys number of unique keys in the keys array
  * @param stream cuda stream to launch the kernel onto
+ * @param reset_sums  Whether to reset the output sums to zero before reducing
  */
 template <typename T, typename KeyIteratorT, typename IdxType = int>
 void reduce_cols_by_key(const T* data,
@@ -51,9 +52,10 @@ void reduce_cols_by_key(const T* data,
                         IdxType nrows,
                         IdxType ncols,
                         IdxType nkeys,
-                        cudaStream_t stream)
+                        cudaStream_t stream,
+                        bool reset_sums = true)
 {
-  detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
+  detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream, reset_sums);
 }
 
 /**
@@ -67,7 +69,7 @@ void reduce_cols_by_key(const T* data,
  * @tparam ElementType the input data type (as well as the output reduced matrix)
  * @tparam KeyType data type of the keys
  * @tparam IndexType indexing arithmetic type
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] data the input data (dim = nrows x ncols). This is assumed to be in
  * row-major layout of type raft::device_matrix_view
  * @param[in] keys keys raft::device_vector_view (len = ncols). It is assumed that each key in this
@@ -76,18 +78,26 @@ void reduce_cols_by_key(const T* data,
  * monotonically increasing keys array.
  * @param[out] out the output reduced raft::device_matrix_view along columns (dim = nrows x nkeys).
  * This will be assumed to be in row-major layout
- * @param[in] nkeys number of unique keys in the keys array
+ * @param[in] nkeys Number of unique keys in the keys array. If 0 (the default), it is inferred
+ * from the number of columns of out
+ * @param[in] reset_sums  Whether to reset the output sums to zero before reducing
  */
 template <typename ElementType, typename KeyType = ElementType, typename IndexType = std::uint32_t>
 void reduce_cols_by_key(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ElementType, IndexType, raft::row_major> data,
   raft::device_vector_view<const KeyType, IndexType> keys,
   raft::device_matrix_view<ElementType, IndexType, raft::row_major> out,
-  IndexType nkeys)
+  IndexType nkeys = 0,
+  bool reset_sums = true)
 {
-  RAFT_EXPECTS(out.extent(0) == data.extent(0) && out.extent(1) == nkeys,
-               "Output is not of size nrows * nkeys");
+  if (nkeys > 0) {
+    RAFT_EXPECTS(out.extent(1) == nkeys, "Output doesn't have nkeys columns");
+  } else {
+    nkeys = out.extent(1);
+  }
+  RAFT_EXPECTS(out.extent(0) == data.extent(0),
+               "Output doesn't have the same number of rows as input");
   RAFT_EXPECTS(keys.extent(0) == data.extent(1), "Keys is not of size ncols");
 
   reduce_cols_by_key(data.data_handle(),
@@ -96,7 +106,8 @@ void reduce_cols_by_key(
                      data.extent(0),
                      data.extent(1),
                      nkeys,
-                     handle.get_stream());
+                     handle.get_stream(),
+                     reset_sums);
 }
 
 /** @} */  // end of group reduce_cols_by_key
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 1dabd92087..484b60238b 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 #include "detail/reduce_rows_by_key.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace linalg {
@@ -136,7 +136,7 @@ void reduce_rows_by_key(const DataIteratorT d_A,
  * @tparam KeyType data-type of keys
  * @tparam WeightType data-type of weights
  * @tparam IndexType index type
- * @param[in]  handle      raft::handle_t
+ * @param[in]  handle      raft::device_resources
  * @param[in]  d_A         Input raft::device_mdspan (ncols * nrows)
  * @param[in]  d_keys      Keys for each row raft::device_vector_view (1 x nrows)
  * @param[out] d_sums      Row sums by key raft::device_matrix_view (ncols x d_keys)
@@ -148,7 +148,7 @@ void reduce_rows_by_key(const DataIteratorT d_A,
  */
 template <typename ElementType, typename KeyType, typename WeightType, typename IndexType>
 void reduce_rows_by_key(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ElementType, IndexType, raft::row_major> d_A,
   raft::device_vector_view<const KeyType, IndexType> d_keys,
   raft::device_matrix_view<ElementType, IndexType, raft::row_major> d_sums,
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index 6f0315642b..eb94547f13 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void rsvdFixedRank(const raft::handle_t& handle,
+void rsvdFixedRank(raft::device_resources const& handle,
                    math_t* M,
                    int n_rows,
                    int n_cols,
@@ -104,7 +104,7 @@ void rsvdFixedRank(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void rsvdPerc(const raft::handle_t& handle,
+void rsvdPerc(raft::device_resources const& handle,
               math_t* M,
               int n_rows,
               int n_cols,
@@ -154,7 +154,7 @@ void rsvdPerc(const raft::handle_t& handle,
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] p no. of upsamples
@@ -164,7 +164,7 @@ void rsvdPerc(const raft::handle_t& handle,
  * raft::col_major
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void rsvd_fixed_rank(const raft::handle_t& handle,
+void rsvd_fixed_rank(raft::device_resources const& handle,
                      raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
                      raft::device_vector_view<ValueType, IndexType> S_vec,
                      IndexType p,
@@ -228,7 +228,7 @@ void rsvd_fixed_rank(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] p no. of upsamples
@@ -239,7 +239,7 @@ void rsvd_fixed_rank(Args... args)
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
 void rsvd_fixed_rank_symmetric(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
   raft::device_vector_view<ValueType, IndexType> S_vec,
   IndexType p,
@@ -303,7 +303,7 @@ void rsvd_fixed_rank_symmetric(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] p no. of upsamples
@@ -315,7 +315,7 @@ void rsvd_fixed_rank_symmetric(Args... args)
  * raft::col_major
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void rsvd_fixed_rank_jacobi(const raft::handle_t& handle,
+void rsvd_fixed_rank_jacobi(raft::device_resources const& handle,
                             raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
                             raft::device_vector_view<ValueType, IndexType> S_vec,
                             IndexType p,
@@ -381,7 +381,7 @@ void rsvd_fixed_rank_jacobi(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] p no. of upsamples
@@ -394,7 +394,7 @@ void rsvd_fixed_rank_jacobi(Args... args)
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
 void rsvd_fixed_rank_symmetric_jacobi(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
   raft::device_vector_view<ValueType, IndexType> S_vec,
   IndexType p,
@@ -460,7 +460,7 @@ void rsvd_fixed_rank_symmetric_jacobi(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] PC_perc percentage of singular values to be computed
@@ -471,7 +471,7 @@ void rsvd_fixed_rank_symmetric_jacobi(Args... args)
  * raft::col_major
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void rsvd_perc(const raft::handle_t& handle,
+void rsvd_perc(raft::device_resources const& handle,
                raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
                raft::device_vector_view<ValueType, IndexType> S_vec,
                ValueType PC_perc,
@@ -536,7 +536,7 @@ void rsvd_perc(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] PC_perc percentage of singular values to be computed
@@ -547,7 +547,7 @@ void rsvd_perc(Args... args)
  * raft::col_major
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void rsvd_perc_symmetric(const raft::handle_t& handle,
+void rsvd_perc_symmetric(raft::device_resources const& handle,
                          raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
                          raft::device_vector_view<ValueType, IndexType> S_vec,
                          ValueType PC_perc,
@@ -612,7 +612,7 @@ void rsvd_perc_symmetric(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] PC_perc percentage of singular values to be computed
@@ -625,7 +625,7 @@ void rsvd_perc_symmetric(Args... args)
  * raft::col_major
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void rsvd_perc_jacobi(const raft::handle_t& handle,
+void rsvd_perc_jacobi(raft::device_resources const& handle,
                       raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
                       raft::device_vector_view<ValueType, IndexType> S_vec,
                       ValueType PC_perc,
@@ -692,7 +692,7 @@ void rsvd_perc_jacobi(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S_vec singular values raft::device_vector_view of shape (K)
  * @param[in] PC_perc percentage of singular values to be computed
@@ -706,7 +706,7 @@ void rsvd_perc_jacobi(Args... args)
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
 void rsvd_perc_symmetric_jacobi(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> M,
   raft::device_vector_view<ValueType, IndexType> S_vec,
   ValueType PC_perc,
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 2951285c3a..55e661897d 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,8 +19,8 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
-#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace linalg {
@@ -38,8 +38,7 @@ namespace linalg {
 template <typename in_t, typename out_t = in_t, typename IdxType = int>
 void sqrt(out_t* out, const in_t* in, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(
-    out, in, len, [] __device__(in_t in) { return raft::mySqrt(in); }, stream);
+  raft::linalg::unaryOp(out, in, len, raft::sqrt_op{}, stream);
 }
 /** @} */
 
@@ -52,7 +51,7 @@ void sqrt(out_t* out, const in_t* in, IdxType len, cudaStream_t stream)
  * @brief Elementwise sqrt operation
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in     Input
  * @param[out] out    Output
  */
@@ -60,7 +59,7 @@ template <typename InType,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void sqrt(const raft::handle_t& handle, InType in, OutType out)
+void sqrt(raft::device_resources const& handle, InType in, OutType out)
 {
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index 0aa4aecef5..d282a2e1fa 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "detail/strided_reduction.cuh"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/handle.hpp>
 
 #include <type_traits>
@@ -59,9 +60,9 @@ namespace linalg {
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void stridedReduction(OutType* dots,
                       const InType* data,
                       IdxType D,
@@ -69,9 +70,9 @@ void stridedReduction(OutType* dots,
                       OutType init,
                       cudaStream_t stream,
                       bool inplace           = false,
-                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                      ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op   = raft::Nop<OutType>())
+                      MainLambda main_op     = raft::identity_op(),
+                      ReduceLambda reduce_op = raft::add_op(),
+                      FinalLambda final_op   = raft::identity_op())
 {
   // Only compile for types supported by myAtomicReduce, but don't make the compilation fail in
   // other cases, because coalescedReduction supports arbitrary types.
@@ -111,7 +112,7 @@ void stridedReduction(OutType* dots,
  * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
  * It must be a 'callable' supporting the following input and output:
  * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] data Input of type raft::device_matrix_view
  * @param[out] dots Output of type raft::device_matrix_view
  * @param[in] init initial value to use for the reduction
@@ -124,17 +125,17 @@ template <typename InValueType,
           typename LayoutPolicy,
           typename OutValueType,
           typename IndexType,
-          typename MainLambda   = raft::Nop<InValueType>,
-          typename ReduceLambda = raft::Sum<OutValueType>,
-          typename FinalLambda  = raft::Nop<OutValueType>>
-void strided_reduction(const raft::handle_t& handle,
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
+void strided_reduction(raft::device_resources const& handle,
                        raft::device_matrix_view<const InValueType, IndexType, LayoutPolicy> data,
                        raft::device_vector_view<OutValueType, IndexType> dots,
                        OutValueType init,
                        bool inplace           = false,
-                       MainLambda main_op     = raft::Nop<InValueType>(),
-                       ReduceLambda reduce_op = raft::Sum<OutValueType>(),
-                       FinalLambda final_op   = raft::Nop<OutValueType>())
+                       MainLambda main_op     = raft::identity_op(),
+                       ReduceLambda reduce_op = raft::add_op(),
+                       FinalLambda final_op   = raft::identity_op())
 {
   if constexpr (std::is_same_v<LayoutPolicy, raft::row_major>) {
     RAFT_EXPECTS(static_cast<IndexType>(dots.size()) == data.extent(1),
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index e6f2fa8724..da995b7a2a 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -97,7 +97,7 @@ void subtractDevScalar(math_t* outDev,
  * @brief Elementwise subtraction operation on the input buffers
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
- * @param handle raft::handle_t
+ * @param handle raft::device_resources
  * @param[in] in1    First Input
  * @param[in] in2    Second Input
  * @param[out] out    Output
@@ -106,7 +106,7 @@ template <typename InType,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void subtract(const raft::handle_t& handle, InType in1, InType in2, OutType out)
+void subtract(raft::device_resources const& handle, InType in1, InType in2, OutType out)
 {
   using in_value_t  = typename InType::value_type;
   using out_value_t = typename OutType::value_type;
@@ -137,7 +137,7 @@ void subtract(const raft::handle_t& handle, InType in1, InType in2, OutType out)
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[out] out    Output
  * @param[in] scalar    raft::device_scalar_view
@@ -148,7 +148,7 @@ template <typename InType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void subtract_scalar(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   InType in,
   OutType out,
   raft::device_scalar_view<const typename InType::element_type, ScalarIdxType> scalar)
@@ -182,7 +182,7 @@ void subtract_scalar(
  * @tparam InType    Input Type raft::device_mdspan
  * @tparam OutType   Output Type raft::device_mdspan
  * @tparam ScalarIdxType Index Type of scalar
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in    Input
  * @param[out] out    Output
  * @param[in] scalar    raft::host_scalar_view
@@ -193,7 +193,7 @@ template <typename InType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void subtract_scalar(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   InType in,
   OutType out,
   raft::host_scalar_view<const typename InType::element_type, ScalarIdxType> scalar)
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 7be1b9d63c..eb51093240 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename T>
-void svdQR(const raft::handle_t& handle,
+void svdQR(raft::device_resources const& handle,
            T* in,
            int n_rows,
            int n_cols,
@@ -66,14 +66,14 @@ void svdQR(const raft::handle_t& handle,
                 stream);
 }
 
-template <typename T>
-void svdEig(const raft::handle_t& handle,
-            T* in,
-            int n_rows,
-            int n_cols,
-            T* S,
-            T* U,
-            T* V,
+template <typename math_t, typename idx_t>
+void svdEig(raft::device_resources const& handle,
+            math_t* in,
+            idx_t n_rows,
+            idx_t n_cols,
+            math_t* S,
+            math_t* U,
+            math_t* V,
             bool gen_left_vec,
             cudaStream_t stream)
 {
@@ -98,7 +98,7 @@ void svdEig(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdJacobi(const raft::handle_t& handle,
+void svdJacobi(raft::device_resources const& handle,
                math_t* in,
                int n_rows,
                int n_cols,
@@ -139,7 +139,7 @@ void svdJacobi(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdReconstruction(const raft::handle_t& handle,
+void svdReconstruction(raft::device_resources const& handle,
                        math_t* U,
                        math_t* S,
                        math_t* V,
@@ -167,7 +167,7 @@ void svdReconstruction(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t& handle,
+bool evaluateSVDByL2Norm(raft::device_resources const& handle,
                          math_t* A_d,
                          math_t* U,
                          math_t* S_vec,
@@ -195,7 +195,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle,
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] sing_vals singular values raft::device_vector_view of shape (K)
  * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout
@@ -204,7 +204,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle,
  * layout raft::col_major and dimensions (n, n)
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
-void svd_qr(const raft::handle_t& handle,
+void svd_qr(raft::device_resources const& handle,
             raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
             raft::device_vector_view<ValueType, IndexType> sing_vals,
             UType&& U_in,
@@ -258,7 +258,7 @@ void svd_qr(Args... args)
  * U_in
  * @tparam VType std::optional<raft::device_matrix_view<ValueType, IndexType, raft::col_major>> @c
  * V_in
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] sing_vals singular values raft::device_vector_view of shape (K)
  * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout
@@ -268,7 +268,7 @@ void svd_qr(Args... args)
  */
 template <typename ValueType, typename IndexType, typename UType, typename VType>
 void svd_qr_transpose_right_vec(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
   raft::device_vector_view<ValueType, IndexType> sing_vals,
   UType&& U_in,
@@ -316,7 +316,7 @@ void svd_qr_transpose_right_vec(Args... args)
 /**
  * @brief singular value decomposition (SVD) on a column major
  * matrix using Eigen decomposition. A square symmetric covariance matrix is constructed for the SVD
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N)
  * @param[out] S singular values raft::device_vector_view of shape (K)
  * @param[out] V right singular values of raft::device_matrix_view with layout
@@ -326,7 +326,7 @@ void svd_qr_transpose_right_vec(Args... args)
  */
 template <typename ValueType, typename IndexType>
 void svd_eig(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const ValueType, IndexType, raft::col_major> in,
   raft::device_vector_view<ValueType, IndexType> S,
   raft::device_matrix_view<ValueType, IndexType, raft::col_major> V,
@@ -352,7 +352,7 @@ void svd_eig(
 /**
  * @brief reconstruct a matrix use left and right singular vectors and
  * singular values
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] U left singular values of raft::device_matrix_view with layout
  * raft::col_major and dimensions (m, k)
  * @param[in] S singular values raft::device_vector_view of shape (k, k)
@@ -361,7 +361,7 @@ void svd_eig(
  * @param[out] out output raft::device_matrix_view with layout raft::col_major of shape (m, n)
  */
 template <typename ValueType, typename IndexType>
-void svd_reconstruction(const raft::handle_t& handle,
+void svd_reconstruction(raft::device_resources const& handle,
                         raft::device_matrix_view<const ValueType, IndexType, raft::col_major> U,
                         raft::device_vector_view<const ValueType, IndexType> S,
                         raft::device_matrix_view<const ValueType, IndexType, raft::col_major> V,
diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh
index 10e91a0313..aa3859bc23 100644
--- a/cpp/include/raft/linalg/ternary_op.cuh
+++ b/cpp/include/raft/linalg/ternary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 #include "detail/ternary_op.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -63,7 +63,7 @@ void ternaryOp(out_t* out,
  * @tparam InType Input Type raft::device_mdspan
  * @tparam Lambda the device-lambda performing the actual operation
  * @tparam OutType Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
+ * @param[in] handle raft::device_resources
  * @param[in] in1 First input
  * @param[in] in2 Second input
  * @param[in] in3 Third input
@@ -78,7 +78,7 @@ template <typename InType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void ternary_op(
-  const raft::handle_t& handle, InType in1, InType in2, InType in3, OutType out, Lambda op)
+  raft::device_resources const& handle, InType in1, InType in2, InType in3, OutType out, Lambda op)
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
   RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous");
diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh
index e765ea7925..a0f418b4f7 100644
--- a/cpp/include/raft/linalg/transpose.cuh
+++ b/cpp/include/raft/linalg/transpose.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ namespace linalg {
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(const raft::handle_t& handle,
+void transpose(raft::device_resources const& handle,
                math_t* in,
                math_t* out,
                int n_rows,
@@ -56,6 +56,11 @@ void transpose(math_t* inout, int n, cudaStream_t stream)
   detail::transpose(inout, n, stream);
 }
 
+/**
+ * @defgroup transpose Matrix transpose
+ * @{
+ */
+
 /**
  * @brief Transpose a matrix. The output has same layout policy as the input.
  *
@@ -71,7 +76,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream)
  * @param[out] out    Output matirx, storage is pre-allocated by caller.
  */
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
-auto transpose(handle_t const& handle,
+auto transpose(raft::device_resources const& handle,
                raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
                raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
   -> std::enable_if_t<std::is_floating_point_v<T>, void>
@@ -94,6 +99,9 @@ auto transpose(handle_t const& handle,
     }
   }
 }
+
+/** @} */  // end of group transpose
+
 };  // end namespace linalg
 };  // end namespace raft
 
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index a90bda06d5..ce102adfd2 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 #include "detail/unary_op.cuh"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/input_validation.hpp>
 
 namespace raft {
@@ -30,17 +30,16 @@ namespace linalg {
 /**
  * @brief perform element-wise unary operation in the input array
  * @tparam InType input data-type
- * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam Lambda Device lambda performing the actual operation, with the signature
+ *         `OutType func(const InType& val);`
  * @tparam OutType output data-type
  * @tparam IdxType Integer type used to for addressing
  * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in the input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
- * @note Lambda must be a functor with the following signature:
- *       `OutType func(const InType& val);`
+ * @param[out] out    Output array [on device], dim = [len]
+ * @param[in]  in     Input array [on device], dim = [len]
+ * @param[in]  len    Number of elements in the input array
+ * @param[in]  op     Device lambda
+ * @param[in]  stream cuda stream where to launch work
  */
 template <typename InType,
           typename Lambda,
@@ -58,15 +57,15 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_
  * Compared to `unaryOp()`, this method does not do any reads from any inputs
  *
  * @tparam OutType output data-type
- * @tparam Lambda  the device-lambda performing the actual operation
+ * @tparam Lambda  Device lambda performing the actual operation, with the signature
+ *                 `void func(OutType* outLocationOffset, IdxType idx);`
+ *                 where outLocationOffset will be out + idx.
  * @tparam IdxType Integer type used to for addressing
  * @tparam TPB     threads-per-block in the final kernel launched
  *
- * @param[out] out    the output array [on device] [len = len]
- * @param[in]  len    number of elements in the input array
- * @param[in]  op     the device-lambda which must be of the form:
- *                    `void func(OutType* outLocationOffset, IdxType idx);`
- *                    where outLocationOffset will be out + idx.
+ * @param[out] out    Output array [on device], dim = [len]
+ * @param[in]  len    Number of elements in the output array
+ * @param[in]  op     Device lambda
  * @param[in]  stream cuda stream where to launch work
  */
 template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
@@ -81,23 +80,22 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
  */
 
 /**
- * @brief perform element-wise binary operation on the input arrays
+ * @brief Perform an element-wise unary operation into the output array
  * @tparam InType Input Type raft::device_mdspan
- * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam Lambda Device lambda performing the actual operation, with the signature
+ *                `out_value_t func(const in_value_t& val);`
  * @tparam OutType Output Type raft::device_mdspan
- * @param[in] handle raft::handle_t
- * @param[in] in Input
- * @param[out] out Output
- * @param[in] op the device-lambda
- * @note Lambda must be a functor with the following signature:
- *       `InType func(const InType& val);`
+ * @param[in]  handle The raft handle
+ * @param[in]  in     Input
+ * @param[out] out    Output
+ * @param[in]  op     Device lambda
  */
 template <typename InType,
           typename Lambda,
           typename OutType,
           typename = raft::enable_if_input_device_mdspan<InType>,
           typename = raft::enable_if_output_device_mdspan<OutType>>
-void unary_op(const raft::handle_t& handle, InType in, OutType out, Lambda op)
+void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op)
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
   RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
@@ -116,29 +114,32 @@ void unary_op(const raft::handle_t& handle, InType in, OutType out, Lambda op)
 }
 
 /**
- * @brief perform element-wise binary operation on the input arrays
- * This function does not read from the input
- * @tparam InType Input Type raft::device_mdspan
- * @tparam Lambda the device-lambda performing the actual operation
- * @param[in] handle raft::handle_t
- * @param[inout] in Input/Output
- * @param[in] op the device-lambda
- * @note Lambda must be a functor with the following signature:
- *       `InType func(const InType& val);`
+ * @brief Perform an element-wise unary operation on the input index into the output array
+ *
+ * @note This operation is deprecated. Please use map_offset in `raft/linalg/map.cuh` instead.
+ *
+ * @tparam OutType Output Type raft::device_mdspan
+ * @tparam Lambda  Device lambda performing the actual operation, with the signature
+ *                 `void func(out_value_t* out_location, index_t idx);`
+ * @param[in]  handle The raft handle
+ * @param[out] out    Output
+ * @param[in]  op     Device lambda
  */
-template <typename InType, typename Lambda, typename = raft::enable_if_output_device_mdspan<InType>>
-void write_only_unary_op(const raft::handle_t& handle, InType in, Lambda op)
+template <typename OutType,
+          typename Lambda,
+          typename = raft::enable_if_output_device_mdspan<OutType>>
+void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op)
 {
-  RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
 
-  using in_value_t = typename InType::value_type;
+  using out_value_t = typename OutType::value_type;
 
-  if (in.size() <= std::numeric_limits<std::uint32_t>::max()) {
-    writeOnlyUnaryOp<in_value_t, Lambda, std::uint32_t>(
-      in.data_handle(), in.size(), op, handle.get_stream());
+  if (out.size() <= std::numeric_limits<std::uint32_t>::max()) {
+    writeOnlyUnaryOp<out_value_t, Lambda, std::uint32_t>(
+      out.data_handle(), out.size(), op, handle.get_stream());
   } else {
-    writeOnlyUnaryOp<in_value_t, Lambda, std::uint64_t>(
-      in.data_handle(), in.size(), op, handle.get_stream());
+    writeOnlyUnaryOp<out_value_t, Lambda, std::uint64_t>(
+      out.data_handle(), out.size(), op, handle.get_stream());
   }
 }
 
@@ -147,4 +148,4 @@ void write_only_unary_op(const raft::handle_t& handle, InType in, Lambda op)
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh
index e6736b14de..433c161079 100644
--- a/cpp/include/raft/matrix/argmax.cuh
+++ b/cpp/include/raft/matrix/argmax.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup argmax Argmax operation
+ * @{
+ */
+
 /**
  * @brief Argmax: find the col idx with maximum value for each row
  * @param[in] handle: raft handle
@@ -28,7 +33,7 @@ namespace raft::matrix {
  * @param[out] out: output vector of size n_rows
  */
 template <typename math_t, typename idx_t, typename matrix_idx_t>
-void argmax(const raft::handle_t& handle,
+void argmax(raft::device_resources const& handle,
             raft::device_matrix_view<const math_t, matrix_idx_t, row_major> in,
             raft::device_vector_view<idx_t, matrix_idx_t> out)
 {
@@ -37,4 +42,7 @@ void argmax(const raft::handle_t& handle,
   detail::argmax(
     in.data_handle(), in.extent(1), in.extent(0), out.data_handle(), handle.get_stream());
 }
+
+/** @} */  // end of group argmax
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/argmin.cuh b/cpp/include/raft/matrix/argmin.cuh
index e8cf763f70..31ef0c1c1b 100644
--- a/cpp/include/raft/matrix/argmin.cuh
+++ b/cpp/include/raft/matrix/argmin.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup argmin Argmin operation
+ * @{
+ */
+
 /**
  * @brief Argmin: find the col idx with minimum value for each row
  * @param[in] handle: raft handle
@@ -28,7 +33,7 @@ namespace raft::matrix {
  * @param[out] out: output vector of size n_rows
  */
 template <typename math_t, typename idx_t, typename matrix_idx_t>
-void argmin(const raft::handle_t& handle,
+void argmin(raft::device_resources const& handle,
             raft::device_matrix_view<const math_t, matrix_idx_t, row_major> in,
             raft::device_vector_view<idx_t, matrix_idx_t> out)
 {
@@ -37,4 +42,7 @@ void argmin(const raft::handle_t& handle,
   detail::argmin(
     in.data_handle(), in.extent(1), in.extent(0), out.data_handle(), handle.get_stream());
 }
+
+/** @} */  // end of group argmin
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index 5f9b3ab848..a4daf097e5 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,6 +52,11 @@ void sort_cols_per_row(const InType* in,
     in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys);
 }
 
+/**
+ * @defgroup col_wise_sort Sort columns within each row
+ * @{
+ */
+
 /**
  * @brief sort columns within each row of row-major input matrix and return sorted indexes
  * modelled as key-value sort with key being input matrix and value being index of values
@@ -66,7 +71,7 @@ void sort_cols_per_row(const InType* in,
  * @param[out] sorted_keys_opt: std::optional, output matrix for sorted keys (input)
  */
 template <typename in_t, typename out_t, typename matrix_idx_t, typename sorted_keys_t>
-void sort_cols_per_row(const raft::handle_t& handle,
+void sort_cols_per_row(raft::device_resources const& handle,
                        raft::device_matrix_view<const in_t, matrix_idx_t, raft::row_major> in,
                        raft::device_matrix_view<out_t, matrix_idx_t, raft::row_major> out,
                        sorted_keys_t&& sorted_keys_opt)
@@ -126,6 +131,8 @@ void sort_cols_per_row(Args... args)
   sort_cols_per_row(std::forward<Args>(args)..., std::nullopt);
 }
 
+/** @} */  // end of group col_wise_sort
+
 };  // end namespace raft::matrix
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh
index 5f1d16485c..42d2562e5e 100644
--- a/cpp/include/raft/matrix/copy.cuh
+++ b/cpp/include/raft/matrix/copy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_copy Matrix copy operations
+ * @{
+ */
+
 /**
  * @brief Copy selected rows of the input matrix into contiguous space.
  *
@@ -34,7 +39,7 @@ namespace raft::matrix {
  * @param[in] indices of the rows to be copied
  */
 template <typename m_t, typename idx_t, typename layout>
-void copy_rows(const raft::handle_t& handle,
+void copy_rows(raft::device_resources const& handle,
                raft::device_matrix_view<const m_t, idx_t, layout> in,
                raft::device_matrix_view<m_t, idx_t, layout> out,
                raft::device_vector_view<idx_t, idx_t> indices)
@@ -60,7 +65,7 @@ void copy_rows(const raft::handle_t& handle,
  * @param[out] out: output matrix
  */
 template <typename m_t, typename matrix_idx_t>
-void copy(const raft::handle_t& handle,
+void copy(raft::device_resources const& handle,
           raft::device_matrix_view<const m_t, matrix_idx_t, col_major> in,
           raft::device_matrix_view<m_t, matrix_idx_t, col_major> out)
 {
@@ -79,7 +84,7 @@ void copy(const raft::handle_t& handle,
  * @param out: output matrix
  */
 template <typename m_t, typename idx_t>
-void trunc_zero_origin(const raft::handle_t& handle,
+void trunc_zero_origin(raft::device_resources const& handle,
                        raft::device_matrix_view<const m_t, idx_t, col_major> in,
                        raft::device_matrix_view<m_t, idx_t, col_major> out)
 {
@@ -94,4 +99,6 @@ void trunc_zero_origin(const raft::handle_t& handle,
                                       handle.get_stream());
 }
 
+/** @} */  // end of group matrix_copy
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index 3738afba5d..f6dc60bf85 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,40 +16,64 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
+#include <raft/cudart_utils.h>
+
 namespace raft {
 namespace matrix {
 namespace detail {
 
-// gatherKernel conditionally copies rows from the source matrix 'in' into the destination matrix
-// 'out' according to a map (or a transformed map)
-template <typename MatrixIteratorT,
+/** Tiling policy for the gather kernel.
+ *
+ * The output matrix is considered as a flattened array, an approach that provides much better
+ * performance than 1 row per block when D is small. Additionally, each thread works on multiple
+ * output elements using an unrolled loop (approx. 30% faster than working on a single element)
+ */
+template <int tpb, int wpt>
+struct gather_policy {
+  static constexpr int n_threads       = tpb;
+  static constexpr int work_per_thread = wpt;
+  static constexpr int stride          = tpb * wpt;
+};
+
+/** Conditionally copies rows from the source matrix 'in' into the destination matrix
+ * 'out' according to a map (or a transformed map) */
+template <typename Policy,
+          typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
-          int TPB,
           typename PredicateOp,
           typename MapTransformOp,
-          typename IndexT = int>
-__global__ void gatherKernel(const MatrixIteratorT in,
-                             IndexT D,
-                             IndexT N,
-                             MapIteratorT map,
-                             StencilIteratorT stencil,
-                             MatrixIteratorT out,
-                             PredicateOp pred_op,
-                             MapTransformOp transform_op)
+          typename OutputIteratorT,
+          typename IndexT>
+__global__ void gather_kernel(const InputIteratorT in,
+                              IndexT D,
+                              IndexT len,
+                              const MapIteratorT map,
+                              StencilIteratorT stencil,
+                              OutputIteratorT out,
+                              PredicateOp pred_op,
+                              MapTransformOp transform_op)
 {
   typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
   typedef typename std::iterator_traits<StencilIteratorT>::value_type StencilValueT;
 
-  IndexT outRowStart        = blockIdx.x * D;
-  MapValueT map_val         = map[blockIdx.x];
-  StencilValueT stencil_val = stencil[blockIdx.x];
+#pragma unroll
+  for (IndexT wid = 0; wid < Policy::work_per_thread; wid++) {
+    IndexT tid = threadIdx.x + (Policy::work_per_thread * static_cast<IndexT>(blockIdx.x) + wid) *
+                                 Policy::n_threads;
+    if (tid < len) {
+      IndexT i_dst = tid / D;
+      IndexT j     = tid % D;
+
+      MapValueT map_val         = map[i_dst];
+      StencilValueT stencil_val = stencil[i_dst];
 
-  bool predicate = pred_op(stencil_val);
-  if (predicate) {
-    IndexT inRowStart = transform_op(map_val) * D;
-    for (int i = threadIdx.x; i < D; i += TPB) {
-      out[outRowStart + i] = in[inRowStart + i];
+      bool predicate = pred_op(stencil_val);
+      if (predicate) {
+        IndexT i_src = transform_op(map_val);
+        out[tid]     = in[i_src * D + j];
+      }
     }
   }
 }
@@ -58,7 +82,7 @@ __global__ void gatherKernel(const MatrixIteratorT in,
  * @brief  gather conditionally copies rows from a source matrix into a destination matrix according
  * to a transformed map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
+ * @tparam InputIteratorT       Random-access iterator type, for reading input matrix (may be a
  * simple pointer type).
  * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
  * pointer type).
@@ -67,7 +91,10 @@ __global__ void gatherKernel(const MatrixIteratorT in,
  * @tparam UnaryPredicateOp     Unary lambda expression or operator type, UnaryPredicateOp's result
  * type must be convertible to bool type.
  * @tparam MapTransformOp       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to IndexT (= int) type.
+ * type must be convertible to IndexT.
+ * @tparam OutputIteratorT      Random-access iterator type, for writing output matrix (may be a
+ * simple pointer type).
+ * @tparam IndexT               Index type.
  *
  * @param  in           Pointer to the input matrix (assumed to be row-major)
  * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
@@ -81,18 +108,20 @@ __global__ void gatherKernel(const MatrixIteratorT in,
  * @param  transform_op The transformation operation, transforms the map values to IndexT
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT,
+template <typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
           typename UnaryPredicateOp,
-          typename MapTransformOp>
-void gatherImpl(const MatrixIteratorT in,
-                int D,
-                int N,
-                MapIteratorT map,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gatherImpl(const InputIteratorT in,
+                IndexT D,
+                IndexT N,
+                const MapIteratorT map,
                 StencilIteratorT stencil,
-                int map_length,
-                MatrixIteratorT out,
+                IndexT map_length,
+                OutputIteratorT out,
                 UnaryPredicateOp pred_op,
                 MapTransformOp transform_op,
                 cudaStream_t stream)
@@ -100,9 +129,6 @@ void gatherImpl(const MatrixIteratorT in,
   // skip in case of 0 length input
   if (map_length <= 0 || N <= 0 || D <= 0) return;
 
-  // signed integer type for indexing or global offsets
-  typedef int IndexT;
-
   // map value type
   typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
 
@@ -119,38 +145,26 @@ void gatherImpl(const MatrixIteratorT in,
   static_assert((std::is_convertible<PredicateOpReturnT, bool>::value),
                 "UnaryPredicateOp's result type must be convertible to bool type");
 
-  if (D <= 32) {
-    gatherKernel<MatrixIteratorT,
-                 MapIteratorT,
-                 StencilIteratorT,
-                 32,
-                 UnaryPredicateOp,
-                 MapTransformOp>
-      <<<map_length, 32, 0, stream>>>(in, D, N, map, stencil, out, pred_op, transform_op);
-  } else if (D <= 64) {
-    gatherKernel<MatrixIteratorT,
-                 MapIteratorT,
-                 StencilIteratorT,
-                 64,
-                 UnaryPredicateOp,
-                 MapTransformOp>
-      <<<map_length, 64, 0, stream>>>(in, D, N, map, stencil, out, pred_op, transform_op);
-  } else if (D <= 128) {
-    gatherKernel<MatrixIteratorT,
-                 MapIteratorT,
-                 StencilIteratorT,
-                 128,
-                 UnaryPredicateOp,
-                 MapTransformOp>
-      <<<map_length, 128, 0, stream>>>(in, D, N, map, stencil, out, pred_op, transform_op);
+  IndexT len        = map_length * D;
+  constexpr int TPB = 128;
+  const int n_sm    = raft::getMultiProcessorCount();
+  // The following empirical heuristics enforce that we keep a good balance between having enough
+  // blocks and enough work per thread.
+  if (len < static_cast<IndexT>(32 * TPB * n_sm)) {
+    using Policy    = gather_policy<TPB, 1>;
+    IndexT n_blocks = raft::ceildiv(map_length * D, static_cast<IndexT>(Policy::stride));
+    gather_kernel<Policy><<<n_blocks, Policy::n_threads, 0, stream>>>(
+      in, D, len, map, stencil, out, pred_op, transform_op);
+  } else if (len < static_cast<IndexT>(32 * 4 * TPB * n_sm)) {
+    using Policy    = gather_policy<TPB, 4>;
+    IndexT n_blocks = raft::ceildiv(map_length * D, static_cast<IndexT>(Policy::stride));
+    gather_kernel<Policy><<<n_blocks, Policy::n_threads, 0, stream>>>(
+      in, D, len, map, stencil, out, pred_op, transform_op);
   } else {
-    gatherKernel<MatrixIteratorT,
-                 MapIteratorT,
-                 StencilIteratorT,
-                 256,
-                 UnaryPredicateOp,
-                 MapTransformOp>
-      <<<map_length, 256, 0, stream>>>(in, D, N, map, stencil, out, pred_op, transform_op);
+    using Policy    = gather_policy<TPB, 8>;
+    IndexT n_blocks = raft::ceildiv(map_length * D, static_cast<IndexT>(Policy::stride));
+    gather_kernel<Policy><<<n_blocks, Policy::n_threads, 0, stream>>>(
+      in, D, len, map, stencil, out, pred_op, transform_op);
   }
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
@@ -158,10 +172,13 @@ void gatherImpl(const MatrixIteratorT in,
 /**
  * @brief  gather copies rows from a source matrix into a destination matrix according to a map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
+ * @tparam InputIteratorT       Random-access iterator type, for reading input matrix (may be a
  * simple pointer type).
  * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
  * pointer type).
+ * @tparam OutputIteratorT      Random-access iterator type, for writing output matrix (may be a
+ * simple pointer type).
+ * @tparam IndexT               Index type.
  *
  * @param  in           Pointer to the input matrix (assumed to be row-major)
  * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
@@ -172,39 +189,33 @@ void gatherImpl(const MatrixIteratorT in,
  * @param  out          Pointer to the output matrix (assumed to be row-major)
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT, typename MapIteratorT>
-void gather(const MatrixIteratorT in,
-            int D,
-            int N,
-            MapIteratorT map,
-            int map_length,
-            MatrixIteratorT out,
+template <typename InputIteratorT, typename MapIteratorT, typename OutputIteratorT, typename IndexT>
+void gather(const InputIteratorT in,
+            IndexT D,
+            IndexT N,
+            const MapIteratorT map,
+            IndexT map_length,
+            OutputIteratorT out,
             cudaStream_t stream)
 {
   typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
   gatherImpl(
-    in,
-    D,
-    N,
-    map,
-    map,
-    map_length,
-    out,
-    [] __device__(MapValueT val) { return true; },
-    [] __device__(MapValueT val) { return val; },
-    stream);
+    in, D, N, map, map, map_length, out, raft::const_op(true), raft::identity_op(), stream);
 }
 
 /**
  * @brief  gather copies rows from a source matrix into a destination matrix according to a
  * transformed map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
+ * @tparam InputIteratorT       Random-access iterator type, for reading input matrix (may be a
  * simple pointer type).
  * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
  * pointer type).
  * @tparam MapTransformOp       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to IndexT (= int) type.
+ * type must be convertible to IndexT.
+ * @tparam OutputIteratorT      Random-access iterator type, for writing output matrix (may be a
+ * simple pointer type).
+ * @tparam IndexT               Index type.
  *
  * @param  in           Pointer to the input matrix (assumed to be row-major)
  * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
@@ -216,35 +227,29 @@ void gather(const MatrixIteratorT in,
  * @param  transform_op The transformation operation, transforms the map values to IndexT
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT, typename MapIteratorT, typename MapTransformOp>
-void gather(const MatrixIteratorT in,
-            int D,
-            int N,
-            MapIteratorT map,
-            int map_length,
-            MatrixIteratorT out,
+template <typename InputIteratorT,
+          typename MapIteratorT,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather(const InputIteratorT in,
+            IndexT D,
+            IndexT N,
+            const MapIteratorT map,
+            IndexT map_length,
+            OutputIteratorT out,
             MapTransformOp transform_op,
             cudaStream_t stream)
 {
   typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
-  gatherImpl(
-    in,
-    D,
-    N,
-    map,
-    map,
-    map_length,
-    out,
-    [] __device__(MapValueT val) { return true; },
-    transform_op,
-    stream);
+  gatherImpl(in, D, N, map, map, map_length, out, raft::const_op(true), transform_op, stream);
 }
 
 /**
  * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
  * according to a map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
+ * @tparam InputIteratorT       Random-access iterator type, for reading input matrix (may be a
  * simple pointer type).
  * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
  * pointer type).
@@ -252,6 +257,9 @@ void gather(const MatrixIteratorT in,
  * simple pointer type).
  * @tparam UnaryPredicateOp     Unary lambda expression or operator type, UnaryPredicateOp's result
  * type must be convertible to bool type.
+ * @tparam OutputIteratorT      Random-access iterator type, for writing output matrix (may be a
+ * simple pointer type).
+ * @tparam IndexT               Index type.
  *
  * @param  in           Pointer to the input matrix (assumed to be row-major)
  * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
@@ -264,39 +272,31 @@ void gather(const MatrixIteratorT in,
  * @param  pred_op      Predicate to apply to the stencil values
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT,
+template <typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
-          typename UnaryPredicateOp>
-void gather_if(const MatrixIteratorT in,
-               int D,
-               int N,
-               MapIteratorT map,
+          typename UnaryPredicateOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather_if(const InputIteratorT in,
+               IndexT D,
+               IndexT N,
+               const MapIteratorT map,
                StencilIteratorT stencil,
-               int map_length,
-               MatrixIteratorT out,
+               IndexT map_length,
+               OutputIteratorT out,
                UnaryPredicateOp pred_op,
                cudaStream_t stream)
 {
   typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
-  gatherImpl(
-    in,
-    D,
-    N,
-    map,
-    stencil,
-    map_length,
-    out,
-    pred_op,
-    [] __device__(MapValueT val) { return val; },
-    stream);
+  gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, raft::identity_op(), stream);
 }
 
 /**
  * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
  * according to a transformed map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
+ * @tparam InputIteratorT       Random-access iterator type, for reading input matrix (may be a
  * simple pointer type).
  * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
  * pointer type).
@@ -305,7 +305,10 @@ void gather_if(const MatrixIteratorT in,
  * @tparam UnaryPredicateOp     Unary lambda expression or operator type, UnaryPredicateOp's result
  * type must be convertible to bool type.
  * @tparam MapTransformOp       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to IndexT (= int) type.
+ * type must be convertible to IndexT type.
+ * @tparam OutputIteratorT      Random-access iterator type, for writing output matrix (may be a
+ * simple pointer type).
+ * @tparam IndexT               Index type.
  *
  * @param  in           Pointer to the input matrix (assumed to be row-major)
  * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
@@ -319,18 +322,20 @@ void gather_if(const MatrixIteratorT in,
  * @param  transform_op The transformation operation, transforms the map values to IndexT
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT,
+template <typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
           typename UnaryPredicateOp,
-          typename MapTransformOp>
-void gather_if(const MatrixIteratorT in,
-               int D,
-               int N,
-               MapIteratorT map,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather_if(const InputIteratorT in,
+               IndexT D,
+               IndexT N,
+               const MapIteratorT map,
                StencilIteratorT stencil,
-               int map_length,
-               MatrixIteratorT out,
+               IndexT map_length,
+               OutputIteratorT out,
                UnaryPredicateOp pred_op,
                MapTransformOp transform_op,
                cudaStream_t stream)
diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh
index 605726bea6..ef8f0e88c1 100644
--- a/cpp/include/raft/matrix/detail/linewise_op.cuh
+++ b/cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -796,7 +796,8 @@ struct MatrixLinewiseOp {
                   "layout for in and out must be either padded row or col major");
 
     // also statically assert padded matrix alignment == 2^i*VecBytes
-    assert(raft::Pow2<VecBytes>::areSameAlignOffsets(in, out));
+    RAFT_EXPECTS(raft::Pow2<VecBytes>::areSameAlignOffsets(in.data_handle(), out.data_handle()),
+                 "The matrix views in and out do not have correct alignment");
 
     if (alongLines)
       return matrixLinewiseVecRowsSpan<Type,
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
index 64c85a03a5..96398e9c74 100644
--- a/cpp/include/raft/matrix/detail/math.cuh
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,10 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <cub/cub.cuh>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/linalg/matrix_vector_op.cuh>
@@ -86,10 +87,10 @@ void seqRoot(math_t* in,
         if (a < math_t(0)) {
           return math_t(0);
         } else {
-          return sqrt(a * scalar);
+          return raft::sqrt(a * scalar);
         }
       } else {
-        return sqrt(a * scalar);
+        return raft::sqrt(a * scalar);
       }
     },
     stream);
@@ -188,21 +189,19 @@ void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
 template <typename math_t>
 void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
 {
-  raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
+  raft::linalg::unaryOp(out, in, len, raft::const_op(scalar), stream);
 }
 
 template <typename math_t, typename IdxType = int>
 void ratio(
-  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
+  raft::device_resources const& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
 {
   auto d_src  = src;
   auto d_dest = dest;
 
   rmm::device_scalar<math_t> d_sum(stream);
   auto* d_sum_ptr = d_sum.data();
-  auto no_op      = [] __device__(math_t in) { return in; };
-  raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
+  raft::linalg::mapThenSumReduce(d_sum_ptr, len, raft::identity_op{}, stream, src);
   raft::linalg::unaryOp(
     d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream);
 }
@@ -217,15 +216,7 @@ void matrixVectorBinaryMult(Type* data,
                             cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a * b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, raft::mul_op(), stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
@@ -264,15 +255,7 @@ void matrixVectorBinaryDiv(Type* data,
                            cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a / b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, raft::div_op(), stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
@@ -295,7 +278,7 @@ void matrixVectorBinaryDivSkipZero(Type* data,
       rowMajor,
       bcastAlongRows,
       [] __device__(Type a, Type b) {
-        if (raft::myAbs(b) < Type(1e-10))
+        if (raft::abs(b) < Type(1e-10))
           return Type(0);
         else
           return a / b;
@@ -311,7 +294,7 @@ void matrixVectorBinaryDivSkipZero(Type* data,
       rowMajor,
       bcastAlongRows,
       [] __device__(Type a, Type b) {
-        if (raft::myAbs(b) < Type(1e-10))
+        if (raft::abs(b) < Type(1e-10))
           return a;
         else
           return a / b;
@@ -330,15 +313,7 @@ void matrixVectorBinaryAdd(Type* data,
                            cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, raft::add_op(), stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
@@ -351,15 +326,7 @@ void matrixVectorBinarySub(Type* data,
                            cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, raft::sub_op(), stream);
 }
 
 // Computes an argmin/argmax column-wise in a DxN matrix
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index 17a40be5d6..ef3a873d90 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 #include <cstddef>
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/util/cudart_utils.hpp>
 
@@ -299,7 +299,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
 }
 
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStream_t stream)
+m_t getL2Norm(raft::device_resources const& handle, const m_t* in, idx_t size, cudaStream_t stream)
 {
   cublasHandle_t cublasH = handle.get_cublas_handle();
   m_t normval            = 0;
diff --git a/cpp/include/raft/matrix/detail/print.hpp b/cpp/include/raft/matrix/detail/print.hpp
index fc3d14861c..814c6a0b4b 100644
--- a/cpp/include/raft/matrix/detail/print.hpp
+++ b/cpp/include/raft/matrix/detail/print.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 #include <cstddef>
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/util/cudart_utils.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/matrix/detail/select_k.cuh
similarity index 59%
rename from cpp/include/raft/spatial/knn/detail/topk.cuh
rename to cpp/include/raft/matrix/detail/select_k.cuh
index f4dcb53088..ac1ba3dfa3 100644
--- a/cpp/include/raft/spatial/knn/detail/topk.cuh
+++ b/cpp/include/raft/matrix/detail/select_k.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,34 +16,34 @@
 
 #pragma once
 
-#include "topk/radix_topk.cuh"
-#include "topk/warpsort_topk.cuh"
+#include "select_radix.cuh"
+#include "select_warpsort.cuh"
 
 #include <raft/core/nvtx.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-namespace raft::spatial::knn::detail {
+namespace raft::matrix::detail {
 
 /**
  * Select k smallest or largest key/values from each row in the input data.
  *
- * If you think of the input data `in_keys` as a row-major matrix with len columns and
- * batch_size rows, then this function selects k smallest/largest values in each row and fills
- * in the row-major matrix `out` of size (batch_size, k).
+ * If you think of the input data `in_val` as a row-major matrix with `len` columns and
+ * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
+ * in the row-major matrix `out_val` of size (batch_size, k).
  *
  * @tparam T
  *   the type of the keys (what is being compared).
  * @tparam IdxT
  *   the index type (what is being selected together with the keys).
  *
- * @param[in] in
+ * @param[in] in_val
  *   contiguous device array of inputs of size (len * batch_size);
  *   these are compared and selected.
  * @param[in] in_idx
  *   contiguous device array of inputs of size (len * batch_size);
- *   typically, these are indices of the corresponding in_keys.
+ *   typically, these are indices of the corresponding in_val.
  * @param batch_size
  *   number of input rows, i.e. the batch size.
  * @param len
@@ -51,12 +51,12 @@ namespace raft::spatial::knn::detail {
  *   Invariant: len >= k.
  * @param k
  *   the number of outputs to select in each input row.
- * @param[out] out
+ * @param[out] out_val
  *   contiguous device array of outputs of size (k * batch_size);
- *   the k smallest/largest values from each row of the `in_keys`.
+ *   the k smallest/largest values from each row of the `in_val`.
  * @param[out] out_idx
  *   contiguous device array of outputs of size (k * batch_size);
- *   the payload selected together with `out`.
+ *   the payload selected together with `out_val`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
  * @param stream
@@ -64,28 +64,28 @@ namespace raft::spatial::knn::detail {
  *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT>
-void select_topk(const T* in,
-                 const IdxT* in_idx,
-                 size_t batch_size,
-                 size_t len,
-                 int k,
-                 T* out,
-                 IdxT* out_idx,
-                 bool select_min,
-                 rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr = nullptr)
+void select_k(const T* in_val,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out_val,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
+    "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
   // TODO (achirkin): investigate the trade-off for a wider variety of inputs.
   const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128;
-  if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) {
-    topk::warp_sort_topk<T, IdxT>(
-      in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr);
+  if (k <= select::warpsort::kMaxCapacity && !radix_faster) {
+    select::warpsort::select_k<T, IdxT>(
+      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
   } else {
-    topk::radix_topk<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
-      in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr);
+    select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
+      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
   }
 }
 
-}  // namespace raft::spatial::knn::detail
+}  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
similarity index 87%
rename from cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
rename to cpp/include/raft/matrix/detail/select_radix.cuh
index 9c0f20b706..de19e63a4c 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/core/cudart_utils.hpp>
+#include <raft/core/detail/macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/pow2_utils.cuh>
@@ -27,29 +28,29 @@
 #include <cub/block/block_store.cuh>
 #include <cub/block/radix_rank_sort_operations.cuh>
 
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-namespace raft::spatial::knn::detail::topk {
+namespace raft::matrix::detail::select::radix {
 
 constexpr int ITEM_PER_THREAD      = 32;
 constexpr int VECTORIZED_READ_SIZE = 16;
 
 template <int BitsPerPass>
-__host__ __device__ constexpr int calc_num_buckets()
+_RAFT_HOST_DEVICE constexpr int calc_num_buckets()
 {
   return 1 << BitsPerPass;
 }
 
 template <typename T, int BitsPerPass>
-__host__ __device__ constexpr int calc_num_passes()
+_RAFT_HOST_DEVICE constexpr int calc_num_passes()
 {
   return ceildiv<int>(sizeof(T) * 8, BitsPerPass);
 }
 
 // Minimum reasonable block size for the given radix size.
 template <int BitsPerPass>
-__host__ __device__ constexpr int calc_min_block_size()
+_RAFT_HOST_DEVICE constexpr int calc_min_block_size()
 {
   return 1 << std::max<int>(BitsPerPass - 4, Pow2<WarpSize>::Log2 + 1);
 }
@@ -62,7 +63,7 @@ __host__ __device__ constexpr int calc_min_block_size()
  * NB: Use pass=-1 for calc_mask().
  */
 template <typename T, int BitsPerPass>
-__device__ constexpr int calc_start_bit(int pass)
+_RAFT_DEVICE constexpr int calc_start_bit(int pass)
 {
   int start_bit = static_cast<int>(sizeof(T) * 8) - (pass + 1) * BitsPerPass;
   if (start_bit < 0) { start_bit = 0; }
@@ -70,7 +71,7 @@ __device__ constexpr int calc_start_bit(int pass)
 }
 
 template <typename T, int BitsPerPass>
-__device__ constexpr unsigned calc_mask(int pass)
+_RAFT_DEVICE constexpr unsigned calc_mask(int pass)
 {
   static_assert(BitsPerPass <= 31);
   int num_bits = calc_start_bit<T, BitsPerPass>(pass - 1) - calc_start_bit<T, BitsPerPass>(pass);
@@ -82,7 +83,7 @@ __device__ constexpr unsigned calc_mask(int pass)
  * as of integers.
  */
 template <typename T>
-__device__ typename cub::Traits<T>::UnsignedBits twiddle_in(T key, bool greater)
+_RAFT_DEVICE typename cub::Traits<T>::UnsignedBits twiddle_in(T key, bool greater)
 {
   auto bits = reinterpret_cast<typename cub::Traits<T>::UnsignedBits&>(key);
   bits      = cub::Traits<T>::TwiddleIn(bits);
@@ -91,7 +92,7 @@ __device__ typename cub::Traits<T>::UnsignedBits twiddle_in(T key, bool greater)
 }
 
 template <typename T, int BitsPerPass>
-__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
+_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
 {
   static_assert(BitsPerPass <= sizeof(int) * 8 - 1);  // so return type can be int
   return (twiddle_in(x, greater) >> start_bit) & mask;
@@ -112,7 +113,7 @@ __device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
  * @param f the lambda taking two arguments (T x, IdxT idx)
  */
 template <typename T, typename IdxT, typename Func>
-__device__ void vectorized_process(const T* in, IdxT len, Func f)
+_RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f)
 {
   const IdxT stride = blockDim.x * gridDim.x;
   const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
@@ -167,18 +168,18 @@ struct Counter {
  * (see steps 4-1 in `radix_kernel` description).
  */
 template <typename T, typename IdxT, int BitsPerPass>
-__device__ void filter_and_histogram(const T* in_buf,
-                                     const IdxT* in_idx_buf,
-                                     T* out_buf,
-                                     IdxT* out_idx_buf,
-                                     T* out,
-                                     IdxT* out_idx,
-                                     IdxT len,
-                                     Counter<T, IdxT>* counter,
-                                     IdxT* histogram,
-                                     bool greater,
-                                     int pass,
-                                     int k)
+_RAFT_DEVICE void filter_and_histogram(const T* in_buf,
+                                       const IdxT* in_idx_buf,
+                                       T* out_buf,
+                                       IdxT* out_idx_buf,
+                                       T* out,
+                                       IdxT* out_idx,
+                                       IdxT len,
+                                       Counter<T, IdxT>* counter,
+                                       IdxT* histogram,
+                                       bool greater,
+                                       int pass,
+                                       int k)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   __shared__ IdxT histogram_smem[num_buckets];
@@ -260,10 +261,10 @@ __device__ void filter_and_histogram(const T* in_buf,
  * (step 2 in `radix_kernel` description)
  */
 template <typename IdxT, int BitsPerPass, int BlockSize>
-__device__ void scan(volatile IdxT* histogram,
-                     const int start,
-                     const int num_buckets,
-                     const IdxT current)
+_RAFT_DEVICE void scan(volatile IdxT* histogram,
+                       const int start,
+                       const int num_buckets,
+                       const IdxT current)
 {
   typedef cub::BlockScan<IdxT, BlockSize> BlockScan;
   __shared__ typename BlockScan::TempStorage temp_storage;
@@ -284,7 +285,7 @@ __device__ void scan(volatile IdxT* histogram,
  *  (steps 2-3 in `radix_kernel` description)
  */
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
-__device__ void choose_bucket(Counter<T, IdxT>* counter, IdxT* histogram, const IdxT k)
+_RAFT_DEVICE void choose_bucket(Counter<T, IdxT>* counter, IdxT* histogram, const IdxT k)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   int index                 = threadIdx.x;
@@ -547,21 +548,21 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len)
  *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
-void radix_topk(const T* in,
-                const IdxT* in_idx,
-                size_t batch_size,
-                size_t len,
-                int k,
-                T* out,
-                IdxT* out_idx,
-                bool select_min,
-                rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr = nullptr)
+void select_k(const T* in,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr)
 {
   // reduce the block size if the input length is too small.
   if constexpr (BlockSize > calc_min_block_size<BitsPerPass>()) {
     if (BlockSize * ITEM_PER_THREAD > len) {
-      return radix_topk<T, IdxT, BitsPerPass, BlockSize / 2>(
-        in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+      return select_k<T, IdxT, BitsPerPass, BlockSize / 2>(
+        in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr);
     }
   }
@@ -573,23 +574,33 @@ void radix_topk(const T* in,
   dim3 blocks           = get_optimal_grid_size<T, IdxT, BitsPerPass, BlockSize>(batch_size, len);
   size_t max_chunk_size = blocks.y;
 
-  auto pool_guard = raft::get_pool_memory_resource(
-    mr,
-    max_chunk_size * (sizeof(Counter<T, IdxT>)            // counters
-                      + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
-                      + sizeof(T) * 2                     // T bufs
-                      ));
+  size_t req_aux = max_chunk_size * (sizeof(Counter<T, IdxT>) + num_buckets * sizeof(IdxT));
+  size_t req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT));
+  size_t mem_req = req_aux + req_buf;
+  size_t mem_free, mem_total;
+  RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total));
+  std::optional<rmm::mr::managed_memory_resource> managed_memory;
+  rmm::mr::device_memory_resource* mr_buf = nullptr;
+  if (mem_req > mem_free) {
+    // if there's not enough memory for buffers on the device, resort to the managed memory.
+    mem_req = req_aux;
+    managed_memory.emplace();
+    mr_buf = &managed_memory.value();
+  }
+
+  auto pool_guard = raft::get_pool_memory_resource(mr, mem_req);
   if (pool_guard) {
-    RAFT_LOG_DEBUG("radix_topk: using pool memory resource with initial size %zu bytes",
+    RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes",
                    pool_guard->pool_size());
   }
+  if (mr_buf == nullptr) { mr_buf = mr; }
 
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
-  rmm::device_uvector<IdxT> histograms(num_buckets * max_chunk_size, stream, mr);
-  rmm::device_uvector<T> buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<T> buf2(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf2(len * max_chunk_size, stream, mr);
+  rmm::device_uvector<IdxT> histograms(max_chunk_size * num_buckets, stream, mr);
+  rmm::device_uvector<T> buf1(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector<IdxT> idx_buf1(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector<T> buf2(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector<IdxT> idx_buf2(max_chunk_size * len, stream, mr_buf);
 
   for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) {
     blocks.y = std::min(max_chunk_size, batch_size - offset);
@@ -646,4 +657,4 @@ void radix_topk(const T* in,
   }
 }
 
-}  // namespace raft::spatial::knn::detail::topk
+}  // namespace raft::matrix::detail::select::radix
diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
similarity index 71%
rename from cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
rename to cpp/include/raft/matrix/detail/select_warpsort.cuh
index cbe9f36e97..d362b73792 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,11 @@
 
 #pragma once
 
-#include "bitonic_sort.cuh"
-
+#include <raft/core/detail/macros.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/util/bitonic_sort.cuh>
 #include <raft/util/cuda_utils.cuh>
+#include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
 #include <algorithm>
@@ -31,12 +32,12 @@
 
 /*
   Three APIs of different scopes are provided:
-    1. host function: warp_sort_topk()
+    1. host function: select_k()
     2. block-wide API: class block_sort
     3. warp-wide API: several implementations of warp_sort_*
 
 
-  1. warp_sort_topk()
+  1. select_k()
     (see the docstring)
 
   2. class block_sort
@@ -74,7 +75,7 @@
     These two classes can be regarded as fixed size priority queue for a warp.
     Usage is similar to class block_sort. No shared memory is needed.
 
-    The host function (warp_sort_topk) uses a heuristic to choose between these two classes for
+    The host function (select_k) uses a heuristic to choose between these two classes for
     sorting, warp_sort_immediate being chosen when the number of inputs per warp is somewhat small
     (see the usage of LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing).
 
@@ -94,7 +95,7 @@
       }
  */
 
-namespace raft::spatial::knn::detail::topk {
+namespace raft::matrix::detail::select::warpsort {
 
 static constexpr int kMaxCapacity = 256;
 
@@ -102,18 +103,12 @@ namespace {
 
 /** Whether 'left` should indeed be on the left w.r.t. `right`. */
 template <bool Ascending, typename T>
-__device__ __forceinline__ auto is_ordered(T left, T right) -> bool
+_RAFT_DEVICE _RAFT_FORCEINLINE auto is_ordered(T left, T right) -> bool
 {
   if constexpr (Ascending) { return left < right; }
   if constexpr (!Ascending) { return left > right; }
 }
 
-constexpr auto calc_capacity(int k) -> int
-{
-  int capacity = isPo2(k) ? k : (1 << (log2(k) + 1));
-  return capacity;
-}
-
 }  // namespace
 
 /**
@@ -134,7 +129,7 @@ constexpr auto calc_capacity(int k) -> int
  */
 template <int Capacity, bool Ascending, typename T, typename IdxT>
 class warp_sort {
-  static_assert(isPo2(Capacity));
+  static_assert(is_a_power_of_two(Capacity));
   static_assert(std::is_default_constructible_v<IdxT>);
 
  public:
@@ -148,13 +143,16 @@ class warp_sort {
   /** The number of elements to select. */
   const int k;
 
+  /** Extra memory required per-block for keeping the state (shared or global); none here. */
+  constexpr static auto mem_required(uint32_t block_size) -> size_t { return 0; }
+
   /**
    * Construct the warp_sort empty queue.
    *
    * @param k
    *   number of elements to select.
    */
-  __device__ warp_sort(int k) : k(k)
+  _RAFT_DEVICE warp_sort(int k) : k(k)
   {
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
@@ -182,7 +180,7 @@ class warp_sort {
    *    It serves as a conditional; when `false` the function does nothing.
    *    We need it to ensure threads within a full warp don't diverge calling `bitonic::merge()`.
    */
-  __device__ void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true)
+  _RAFT_DEVICE void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true)
   {
     if (do_merge) {
       int idx = Pow2<kWarpWidth>::mod(laneId()) ^ Pow2<kWarpWidth>::Mask;
@@ -198,7 +196,7 @@ class warp_sort {
       }
     }
     if (kWarpWidth < WarpSize || do_merge) {
-      topk::bitonic<kMaxArrLen>(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
+      util::bitonic<kMaxArrLen>(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
     }
   }
 
@@ -211,14 +209,23 @@ class warp_sort {
    * @param[out] out_idx
    *   device pointer to a contiguous array, unique per-subwarp of size `kWarpWidth`
    *    (length: k <= kWarpWidth * kMaxArrLen).
+   * @param valF (optional) postprocess values (T -> OutT)
+   * @param idxF (optional) postprocess indices (IdxT -> OutIdxT)
    */
-  __device__ void store(T* out, IdxT* out_idx) const
+  template <typename OutT,
+            typename OutIdxT,
+            typename ValF = identity_op,
+            typename IdxF = identity_op>
+  _RAFT_DEVICE void store(OutT* out,
+                          OutIdxT* out_idx,
+                          ValF valF = raft::identity_op{},
+                          IdxF idxF = raft::identity_op{}) const
   {
     int idx = Pow2<kWarpWidth>::mod(laneId());
 #pragma unroll kMaxArrLen
     for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) {
-      out[idx]     = val_arr_[i];
-      out_idx[idx] = idx_arr_[i];
+      out[idx]     = valF(val_arr_[i]);
+      out_idx[idx] = idxF(idx_arr_[i]);
     }
   }
 
@@ -245,8 +252,8 @@ class warp_sort {
    *   the associated indices of the elements in the same format as `keys_in`.
    */
   template <int PerThreadSizeIn>
-  __device__ __forceinline__ void merge_in(const T* __restrict__ keys_in,
-                                           const IdxT* __restrict__ ids_in)
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_in(const T* __restrict__ keys_in,
+                                               const IdxT* __restrict__ ids_in)
   {
 #pragma unroll
     for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) {
@@ -257,7 +264,7 @@ class warp_sort {
         idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i];
       }
     }
-    topk::bitonic<kMaxArrLen>(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
+    util::bitonic<kMaxArrLen>(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
   }
 };
 
@@ -275,8 +282,9 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
   using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
   using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
   using warp_sort<Capacity, Ascending, T, IdxT>::k;
+  using warp_sort<Capacity, Ascending, T, IdxT>::mem_required;
 
-  __device__ warp_sort_filtered(int k, T limit)
+  explicit _RAFT_DEVICE warp_sort_filtered(int k, T limit = kDummy)
     : warp_sort<Capacity, Ascending, T, IdxT>(k), buf_len_(0), k_th_(limit)
   {
 #pragma unroll
@@ -286,12 +294,14 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
     }
   }
 
-  __device__ __forceinline__ explicit warp_sort_filtered(int k)
-    : warp_sort_filtered<Capacity, Ascending, T, IdxT>(k, kDummy)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k,
+                                                            uint8_t* = nullptr,
+                                                            T limit  = kDummy)
   {
+    return warp_sort_filtered<Capacity, Ascending, T, IdxT>{k, limit};
   }
 
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // comparing for k_th should reduce the total amount of updates:
     // `false` means the input value is surely not in the top-k values.
@@ -309,22 +319,22 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
     if (do_add) { add_to_buf_(val, idx); }
   }
 
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (any(buf_len_ != 0)) { merge_buf_(); }
   }
 
  private:
-  __device__ __forceinline__ void set_k_th_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
   {
     // NB on using srcLane: it's ok if it is outside the warp size / width;
     //                      the modulo op will be done inside the __shfl_sync.
     k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
   }
 
-  __device__ __forceinline__ void merge_buf_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
   {
-    topk::bitonic<kMaxBufLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+    util::bitonic<kMaxBufLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
     this->merge_in<kMaxBufLen>(val_buf_, idx_buf_);
     buf_len_ = 0;
     set_k_th_();  // contains warp sync
@@ -334,7 +344,7 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
     }
   }
 
-  __device__ __forceinline__ void add_to_buf_(T val, IdxT idx)
+  _RAFT_DEVICE _RAFT_FORCEINLINE void add_to_buf_(T val, IdxT idx)
   {
     // NB: the loop is used here to ensure the constant indexing,
     //     to not force the buffers spill into the local memory.
@@ -373,8 +383,9 @@ class warp_sort_distributed : public warp_sort<Capacity, Ascending, T, IdxT> {
   using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
   using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
   using warp_sort<Capacity, Ascending, T, IdxT>::k;
+  using warp_sort<Capacity, Ascending, T, IdxT>::mem_required;
 
-  __device__ warp_sort_distributed(int k, T limit)
+  explicit _RAFT_DEVICE warp_sort_distributed(int k, T limit = kDummy)
     : warp_sort<Capacity, Ascending, T, IdxT>(k),
       buf_val_(kDummy),
       buf_idx_(IdxT{}),
@@ -383,12 +394,14 @@ class warp_sort_distributed : public warp_sort<Capacity, Ascending, T, IdxT> {
   {
   }
 
-  __device__ __forceinline__ explicit warp_sort_distributed(int k)
-    : warp_sort_distributed<Capacity, Ascending, T, IdxT>(k, kDummy)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k,
+                                                            uint8_t* = nullptr,
+                                                            T limit  = kDummy)
   {
+    return warp_sort_distributed<Capacity, Ascending, T, IdxT>{k, limit};
   }
 
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // mask tells which lanes in the warp have valid items to be added
     uint32_t mask = ballot(is_ordered<Ascending>(val, k_th_));
@@ -428,7 +441,7 @@ class warp_sort_distributed : public warp_sort<Capacity, Ascending, T, IdxT> {
     }
   }
 
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (buf_len_ != 0) {
       merge_buf_();
@@ -437,16 +450,16 @@ class warp_sort_distributed : public warp_sort<Capacity, Ascending, T, IdxT> {
   }
 
  private:
-  __device__ __forceinline__ void set_k_th_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
   {
     // NB on using srcLane: it's ok if it is outside the warp size / width;
     //                      the modulo op will be done inside the __shfl_sync.
     k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
   }
 
-  __device__ __forceinline__ void merge_buf_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
   {
-    topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_);
+    util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_);
     this->merge_in<1>(&buf_val_, &buf_idx_);
     set_k_th_();  // contains warp sync
     buf_val_ = kDummy;
@@ -463,6 +476,117 @@ class warp_sort_distributed : public warp_sort<Capacity, Ascending, T, IdxT> {
   T k_th_;
 };
 
+/**
+ * The same as `warp_sort_distributed`, but keeps the temporary value and index buffers in the
+ * given external pointers (normally shared memory: `mem_required(block_size)` bytes per block).
+ */
+template <int Capacity, bool Ascending, typename T, typename IdxT>
+class warp_sort_distributed_ext : public warp_sort<Capacity, Ascending, T, IdxT> {
+ public:
+  using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
+  using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
+  using warp_sort<Capacity, Ascending, T, IdxT>::k;
+
+  constexpr static auto mem_required(uint32_t block_size) -> size_t
+  {
+    return (sizeof(T) + sizeof(IdxT)) * block_size;  // one T + one IdxT slot per thread
+  }
+
+  _RAFT_DEVICE warp_sort_distributed_ext(int k, T* val_buf, IdxT* idx_buf, T limit = kDummy)
+    : warp_sort<Capacity, Ascending, T, IdxT>(k),
+      val_buf_(val_buf),  // external tmp buffers; indexed by lane id / buffer position below
+      idx_buf_(idx_buf),
+      buf_len_(0),
+      k_th_(limit)  // initial threshold that add() compares incoming values against
+  {
+    val_buf_[laneId()] = kDummy;  // every lane clears its own slot of the value buffer
+  }
+
+  _RAFT_DEVICE static auto init_blockwide(int k, uint8_t* shmem, T limit = kDummy)
+  {
+    T* val_buf    = nullptr;
+    IdxT* idx_buf = nullptr;
+    if constexpr (alignof(T) >= alignof(IdxT)) {  // the more-aligned type goes first
+      val_buf = reinterpret_cast<T*>(shmem);
+      idx_buf = reinterpret_cast<IdxT*>(val_buf + blockDim.x);  // right after blockDim.x values
+    } else {
+      idx_buf = reinterpret_cast<IdxT*>(shmem);
+      val_buf = reinterpret_cast<T*>(idx_buf + blockDim.x);  // right after blockDim.x indices
+    }
+    auto warp_offset = Pow2<WarpSize>::roundDown(threadIdx.x);
+    val_buf += warp_offset;  // presumably moves to this warp's own WarpSize-slot segment
+    idx_buf += warp_offset;
+    return warp_sort_distributed_ext<Capacity, Ascending, T, IdxT>{k, val_buf, idx_buf, limit};
+  }
+
+  _RAFT_DEVICE void add(T val, IdxT idx)
+  {
+    bool do_add = is_ordered<Ascending>(val, k_th_);  // val is ordered strictly before k_th_
+    // mask tells which lanes in the warp have valid items to be added
+    uint32_t mask = ballot(do_add);
+    if (mask == 0) { return; }  // no lane has a candidate
+    // where to put the element in the tmp buffer
+    int dst_ix = buf_len_ + __popc(mask & ((1u << laneId()) - 1u));
+    // put all elements, which fit into the current tmp buffer
+    if (do_add && dst_ix < WarpSize) {
+      val_buf_[dst_ix] = val;
+      idx_buf_[dst_ix] = idx;
+      do_add           = false;  // stored; nothing left to save after the merge
+    }
+    // Total number of elements to be added
+    buf_len_ += __popc(mask);
+    // If the buffer is still not full, we can return
+    if (buf_len_ < WarpSize) { return; }
+    // Otherwise, merge the warp tmp buffer into the queue
+    merge_buf_();  // implies warp sync
+    buf_len_ -= WarpSize;  // what remains is the number of items that did not fit
+    // save the inputs that couldn't fit before the merge
+    if (do_add) {
+      dst_ix -= WarpSize;
+      val_buf_[dst_ix] = val;
+      idx_buf_[dst_ix] = idx;
+    }
+  }
+
+  _RAFT_DEVICE void done()
+  {
+    if (buf_len_ != 0) {  // flush the partially filled tmp buffer
+      merge_buf_();
+      buf_len_ = 0;
+    }
+    __syncthreads();  // block-wide barrier, reached with or without a merge
+  }
+
+ private:
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
+  {
+    // NB on using srcLane: it's ok if it is outside the warp size / width;
+    //                      the modulo op will be done inside the __shfl_sync.
+    k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
+  }
+
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
+  {
+    __syncwarp();  // make sure the threads are aware of the data written by others
+    T buf_val          = val_buf_[laneId()];
+    IdxT buf_idx       = idx_buf_[laneId()];
+    val_buf_[laneId()] = kDummy;  // clear the slot once its content is held locally
+    util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val, buf_idx);
+    this->merge_in<1>(&buf_val, &buf_idx);  // one buffered element per lane
+    set_k_th_();  // contains warp sync
+  }
+
+  using warp_sort<Capacity, Ascending, T, IdxT>::kMaxArrLen;
+  using warp_sort<Capacity, Ascending, T, IdxT>::val_arr_;
+  using warp_sort<Capacity, Ascending, T, IdxT>::idx_arr_;
+
+  T* val_buf_;     // external tmp buffer of values (see init_blockwide)
+  IdxT* idx_buf_;  // external tmp buffer of indices, filled in step with val_buf_
+  uint32_t buf_len_;  // 0 <= buf_len_ < WarpSize
+
+  T k_th_;  // threshold used by add(); refreshed by set_k_th_() after every merge
+};
+
 /**
  * This version of warp_sort adds every input element into the intermediate sorting
  * buffer, and thus does the sorting step every `Capacity` input elements.
@@ -475,8 +599,10 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
   using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
   using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
   using warp_sort<Capacity, Ascending, T, IdxT>::k;
+  using warp_sort<Capacity, Ascending, T, IdxT>::mem_required;
 
-  __device__ warp_sort_immediate(int k) : warp_sort<Capacity, Ascending, T, IdxT>(k), buf_len_(0)
+  explicit _RAFT_DEVICE warp_sort_immediate(int k)
+    : warp_sort<Capacity, Ascending, T, IdxT>(k), buf_len_(0)
   {
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
@@ -485,7 +611,12 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
     }
   }
 
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, uint8_t* = nullptr)
+  {
+    return warp_sort_immediate<Capacity, Ascending, T, IdxT>{k};  // buffer argument is ignored
+  }
+
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // NB: the loop is used here to ensure the constant indexing,
     //     to not force the buffers spill into the local memory.
@@ -499,7 +630,7 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
 
     ++buf_len_;
     if (buf_len_ == kMaxArrLen) {
-      topk::bitonic<kMaxArrLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+      util::bitonic<kMaxArrLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
       this->merge_in<kMaxArrLen>(val_buf_, idx_buf_);
 #pragma unroll
       for (int i = 0; i < kMaxArrLen; i++) {
@@ -509,10 +640,10 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
     }
   }
 
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (buf_len_ != 0) {
-      topk::bitonic<kMaxArrLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+      util::bitonic<kMaxArrLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
       this->merge_in<kMaxArrLen>(val_buf_, idx_buf_);
     }
   }
@@ -544,15 +675,11 @@ class block_sort {
   using queue_t = WarpSortWarpWide<Capacity, Ascending, T, IdxT>;
 
   template <typename... Args>
-  __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...)
+  _RAFT_DEVICE block_sort(int k, Args... args) : queue_(queue_t::init_blockwide(k, args...))
   {
-    val_smem_             = reinterpret_cast<T*>(smem_buf);
-    const int num_of_warp = subwarp_align::div(blockDim.x);
-    idx_smem_             = reinterpret_cast<IdxT*>(
-      smem_buf + Pow2<256>::roundUp(ceildiv(num_of_warp, 2) * sizeof(T) * k));
   }
 
-  __device__ void add(T val, IdxT idx) { queue_.add(val, idx); }
+  _RAFT_DEVICE void add(T val, IdxT idx) { queue_.add(val, idx); }
 
   /**
    * At the point of calling this function, the warp-level queues consumed all input
@@ -560,22 +687,26 @@ class block_sort {
    *
    * Here we tree-merge the results using the shared memory and block sync.
    */
-  __device__ void done()
+  _RAFT_DEVICE void done(uint8_t* smem_buf)
   {
     queue_.done();
 
+    int nwarps    = subwarp_align::div(blockDim.x);
+    auto val_smem = reinterpret_cast<T*>(smem_buf);
+    auto idx_smem = reinterpret_cast<IdxT*>(
+      smem_buf + Pow2<256>::roundUp(ceildiv(nwarps, 2) * sizeof(T) * queue_.k));
+
     const int warp_id = subwarp_align::div(threadIdx.x);
     // NB: there is no need for the second __synchthreads between .load_sorted and .store:
     //     we shift the pointers every iteration, such that individual warps either access the same
     //     locations or do not overlap with any of the other warps. The access patterns within warps
     //     are different for the two functions, but .load_sorted implies warp sync at the end, so
     //     there is no need for __syncwarp either.
-    for (int shift_mask = ~0, nwarps = subwarp_align::div(blockDim.x), split = (nwarps + 1) >> 1;
-         nwarps > 1;
+    for (int shift_mask = ~0, split = (nwarps + 1) >> 1; nwarps > 1;
          nwarps = split, split = (nwarps + 1) >> 1) {
       if (warp_id < nwarps && warp_id >= split) {
         int dst_warp_shift = (warp_id - (split & shift_mask)) * queue_.k;
-        queue_.store(val_smem_ + dst_warp_shift, idx_smem_ + dst_warp_shift);
+        queue_.store(val_smem + dst_warp_shift, idx_smem + dst_warp_shift);
       }
       __syncthreads();
 
@@ -585,22 +716,27 @@ class block_sort {
         // The last argument serves as a condition for loading
         //  -- to make sure threads within a full warp do not diverge on `bitonic::merge()`
         queue_.load_sorted(
-          val_smem_ + src_warp_shift, idx_smem_ + src_warp_shift, warp_id < nwarps - split);
+          val_smem + src_warp_shift, idx_smem + src_warp_shift, warp_id < nwarps - split);
       }
     }
   }
 
   /** Save the content by the pointer location. */
-  __device__ void store(T* out, IdxT* out_idx) const
+  template <typename OutT,
+            typename OutIdxT,
+            typename ValF = identity_op,
+            typename IdxF = identity_op>
+  _RAFT_DEVICE void store(OutT* out,
+                          OutIdxT* out_idx,
+                          ValF valF = raft::identity_op{},
+                          IdxF idxF = raft::identity_op{}) const
   {
-    if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx); }
+    if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, valF, idxF); }
   }
 
  private:
   using subwarp_align = Pow2<queue_t::kWarpWidth>;
   queue_t queue_;
-  T* val_smem_;
-  IdxT* idx_smem_;
 };
 
 /**
@@ -618,7 +754,10 @@ __launch_bounds__(256) __global__
   void block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx)
 {
   extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
-  block_sort<WarpSortClass, Capacity, Ascending, T, IdxT> queue(k, smem_buf_bytes);
+  using bq_t         = block_sort<WarpSortClass, Capacity, Ascending, T, IdxT>;
+  uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
+  bq_t queue(k, warp_smem);
+
   in += blockIdx.y * len;
   if (in_idx != nullptr) { in_idx += blockIdx.y * len; }
 
@@ -629,7 +768,7 @@ __launch_bounds__(256) __global__
               (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
   }
 
-  queue.done();
+  queue.done(smem_buf_bytes);
   const int block_id = blockIdx.x + gridDim.x * blockIdx.y;
   queue.store(out + block_id * k, out_idx + block_id * k);
 }
@@ -656,7 +795,7 @@ struct launch_setup {
                                   int* min_grid_size,
                                   int block_size_limit = 0)
   {
-    const int capacity = calc_capacity(k);
+    const int capacity = bound_by_power_of_two(k);
     if constexpr (Capacity > 1) {
       if (capacity < Capacity) {
         return launch_setup<WarpSortClass, T, IdxT, Capacity / 2>::calc_optimal_params(
@@ -689,7 +828,7 @@ struct launch_setup {
                      IdxT* out_idx,
                      rmm::cuda_stream_view stream)
   {
-    const int capacity = calc_capacity(k);
+    const int capacity = bound_by_power_of_two(k);
     if constexpr (Capacity > 1) {
       if (capacity < Capacity) {
         return launch_setup<WarpSortClass, T, IdxT, Capacity / 2>::kernel(k,
@@ -740,6 +879,18 @@ struct LaunchThreshold<warp_sort_filtered> {
   static constexpr int len_factor_for_single_block = 32;
 };
 
+template <>
+struct LaunchThreshold<warp_sort_distributed> {  // presumably used by calc_launch_parameter
+  static constexpr int len_factor_for_multi_block  = 2;
+  static constexpr int len_factor_for_single_block = 32;  // same as for warp_sort_filtered
+};
+
+template <>
+struct LaunchThreshold<warp_sort_distributed_ext> {  // presumably used by calc_launch_parameter
+  static constexpr int len_factor_for_multi_block  = 2;
+  static constexpr int len_factor_for_single_block = 32;  // same as for warp_sort_filtered
+};
+
 template <>
 struct LaunchThreshold<warp_sort_immediate> {
   static constexpr int len_factor_for_choosing     = 4;
@@ -751,7 +902,7 @@ template <template <int, bool, typename, typename> class WarpSortClass, typename
 void calc_launch_parameter(
   size_t batch_size, size_t len, int k, int* p_num_of_block, int* p_num_of_warp)
 {
-  const int capacity               = calc_capacity(k);
+  const int capacity               = bound_by_power_of_two(k);
   const int capacity_per_full_warp = std::max(capacity, WarpSize);
   int block_size                   = 0;
   int min_grid_size                = 0;
@@ -825,30 +976,30 @@ void calc_launch_parameter(
 }
 
 template <template <int, bool, typename, typename> class WarpSortClass, typename T, typename IdxT>
-void warp_sort_topk_(int num_of_block,
-                     int num_of_warp,
-                     const T* in,
-                     const IdxT* in_idx,
-                     size_t batch_size,
-                     size_t len,
-                     int k,
-                     T* out,
-                     IdxT* out_idx,
-                     bool select_min,
-                     rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr = nullptr)
+void select_k_(int num_of_block,
+               int num_of_warp,
+               const T* in,
+               const IdxT* in_idx,
+               size_t batch_size,
+               size_t len,
+               int k,
+               T* out,
+               IdxT* out_idx,
+               bool select_min,
+               rmm::cuda_stream_view stream,
+               rmm::mr::device_memory_resource* mr = nullptr)
 {
   auto pool_guard = raft::get_pool_memory_resource(
     mr, num_of_block * k * batch_size * 2 * std::max(sizeof(T), sizeof(IdxT)));
   if (pool_guard) {
-    RAFT_LOG_DEBUG("warp_sort_topk: using pool memory resource with initial size %zu bytes",
+    RAFT_LOG_DEBUG("warpsort::select_k: using pool memory resource with initial size %zu bytes",
                    pool_guard->pool_size());
   }
 
   rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);
   rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream, mr);
 
-  int capacity   = calc_capacity(k);
+  int capacity   = bound_by_power_of_two(k);
   int warp_width = std::min(capacity, WarpSize);
 
   T* result_val    = (num_of_block == 1) ? out : tmp_val.data();
@@ -856,6 +1007,8 @@ void warp_sort_topk_(int num_of_block,
   int block_dim    = num_of_warp * warp_width;
   int smem_size    = calc_smem_size_for_block_wide<T, IdxT>(num_of_warp, k);
 
+  smem_size = std::max<int>(smem_size, WarpSortClass<1, true, T, IdxT>::mem_required(block_dim));
+
   launch_setup<WarpSortClass, T, IdxT>::kernel(k,
                                                select_min,
                                                batch_size,
@@ -886,6 +1039,36 @@ void warp_sort_topk_(int num_of_block,
   }
 }
 
+template <typename T, typename IdxT, template <int, bool, typename, typename> class WarpSortClass>
+void select_k_impl(const T* in,
+                   const IdxT* in_idx,
+                   size_t batch_size,
+                   size_t len,
+                   int k,
+                   T* out,
+                   IdxT* out_idx,
+                   bool select_min,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr = nullptr)  // forwarded as is
+{
+  int num_of_block = 0;  // both are outputs of calc_launch_parameter for this WarpSortClass
+  int num_of_warp  = 0;
+  calc_launch_parameter<WarpSortClass, T, IdxT>(batch_size, len, k, &num_of_block, &num_of_warp);
+  // Unlike select_k, no heuristic is applied: the caller-chosen WarpSortClass is used as is.
+  select_k_<WarpSortClass, T, IdxT>(num_of_block,
+                                    num_of_warp,
+                                    in,
+                                    in_idx,
+                                    batch_size,
+                                    len,
+                                    k,
+                                    out,
+                                    out_idx,
+                                    select_min,
+                                    stream,
+                                    mr);
+}
+
 /**
  * Select k smallest or largest key/values from each row in the input data.
  *
@@ -924,23 +1107,23 @@ void warp_sort_topk_(int num_of_block,
  *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT>
-void warp_sort_topk(const T* in,
-                    const IdxT* in_idx,
-                    size_t batch_size,
-                    size_t len,
-                    int k,
-                    T* out,
-                    IdxT* out_idx,
-                    bool select_min,
-                    rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr = nullptr)
+void select_k(const T* in,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
          "The `len` (%zu) does not fit the indexing type",
          len);
 
-  int capacity     = calc_capacity(k);
+  int capacity     = bound_by_power_of_two(k);
   int num_of_block = 0;
   int num_of_warp  = 0;
   calc_launch_parameter<warp_sort_immediate, T, IdxT>(
@@ -948,34 +1131,34 @@ void warp_sort_topk(const T* in,
   int len_per_thread = len / (num_of_block * num_of_warp * std::min(capacity, WarpSize));
 
   if (len_per_thread <= LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing) {
-    warp_sort_topk_<warp_sort_immediate, T, IdxT>(num_of_block,
-                                                  num_of_warp,
-                                                  in,
-                                                  in_idx,
-                                                  batch_size,
-                                                  len,
-                                                  k,
-                                                  out,
-                                                  out_idx,
-                                                  select_min,
-                                                  stream,
-                                                  mr);
+    select_k_<warp_sort_immediate, T, IdxT>(num_of_block,
+                                            num_of_warp,
+                                            in,
+                                            in_idx,
+                                            batch_size,
+                                            len,
+                                            k,
+                                            out,
+                                            out_idx,
+                                            select_min,
+                                            stream,
+                                            mr);
   } else {
     calc_launch_parameter<warp_sort_filtered, T, IdxT>(
       batch_size, len, k, &num_of_block, &num_of_warp);
-    warp_sort_topk_<warp_sort_filtered, T, IdxT>(num_of_block,
-                                                 num_of_warp,
-                                                 in,
-                                                 in_idx,
-                                                 batch_size,
-                                                 len,
-                                                 k,
-                                                 out,
-                                                 out_idx,
-                                                 select_min,
-                                                 stream,
-                                                 mr);
+    select_k_<warp_sort_filtered, T, IdxT>(num_of_block,
+                                           num_of_warp,
+                                           in,
+                                           in_idx,
+                                           batch_size,
+                                           len,
+                                           k,
+                                           out,
+                                           out_idx,
+                                           select_min,
+                                           stream,
+                                           mr);
   }
 }
 
-}  // namespace raft::spatial::knn::detail::topk
+}  // namespace raft::matrix::detail::select::warpsort
diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh
index d83c932fcd..22147e9f34 100644
--- a/cpp/include/raft/matrix/diagonal.cuh
+++ b/cpp/include/raft/matrix/diagonal.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_diagonal Matrix diagonal operations
+ * @{
+ */
+
 /**
  * @brief Initialize a diagonal matrix with a vector
  * @param[in] handle: raft handle
@@ -29,7 +34,7 @@ namespace raft::matrix {
  * @param[out] matrix: matrix of size n_rows x n_cols
  */
 template <typename m_t, typename idx_t, typename layout>
-void set_diagonal(const raft::handle_t& handle,
+void set_diagonal(raft::device_resources const& handle,
                   raft::device_vector_view<const m_t, idx_t> vec,
                   raft::device_matrix_view<m_t, idx_t, layout> matrix)
 {
@@ -50,7 +55,7 @@ void set_diagonal(const raft::handle_t& handle,
  * @param[out] vec: vector of length k = min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t, typename layout>
-void get_diagonal(const raft::handle_t& handle,
+void get_diagonal(raft::device_resources const& handle,
                   raft::device_matrix_view<const m_t, idx_t, layout> matrix,
                   raft::device_vector_view<m_t, idx_t> vec)
 {
@@ -69,11 +74,14 @@ void get_diagonal(const raft::handle_t& handle,
  * @param[inout] inout: square input matrix with size len x len
  */
 template <typename m_t, typename idx_t, typename layout>
-void invert_diagonal(const raft::handle_t& handle,
+void invert_diagonal(raft::device_resources const& handle,
                      raft::device_matrix_view<m_t, idx_t, layout> inout)
 {
   // TODO: Use get_diagonal for this to support rectangular
   RAFT_EXPECTS(inout.extent(0) == inout.extent(1), "Matrix must be square.");
   detail::getDiagonalInverseMatrix(inout.data_handle(), inout.extent(0), handle.get_stream());
 }
+
+/** @} */  // end of group matrix_diagonal
+
 }  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh
index 12b0b94fa5..7710789bfe 100644
--- a/cpp/include/raft/matrix/gather.cuh
+++ b/cpp/include/raft/matrix/gather.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,138 +17,80 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/matrix/detail/gather.cuh>
+#include <raft/util/itertools.hpp>
 
 namespace raft::matrix {
 
 /**
- * @brief  gather copies rows from a source matrix into a destination matrix according to a map.
+ * @defgroup matrix_gather Matrix gather operations
+ * @{
+ */
+
+/**
+ * @brief Copies rows from a source matrix into a destination matrix according to a map.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
- * simple pointer type).
- * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
- * pointer type).
+ * For each output row, read the index in the input matrix from the map and copy the row.
  *
- * @param  in           Pointer to the input matrix (assumed to be row-major)
- * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
- * storage is the number of columns
- * @param  N            Second dimension
- * @param  map          Pointer to the input sequence of gather locations
- * @param  map_length   The length of 'map' and 'stencil'
- * @param  out          Pointer to the output matrix (assumed to be row-major)
+ * @tparam InputIteratorT  Input iterator type, for the input matrix (may be a pointer type).
+ * @tparam MapIteratorT    Input iterator type, for the map (may be a pointer type).
+ * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type).
+ * @tparam IndexT          Index type.
+ *
+ * @param  in           Input matrix, dim = [N, D] (row-major)
+ * @param  D            Number of columns of the input/output matrices
+ * @param  N            Number of rows of the input matrix
+ * @param  map          Map of row indices to gather, dim = [map_length]
+ * @param  map_length   The length of 'map', number of rows of the output matrix
+ * @param  out          Output matrix, dim = [map_length, D] (row-major)
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT, typename MapIteratorT>
-void gather(const MatrixIteratorT in,
-            int D,
-            int N,
-            MapIteratorT map,
-            int map_length,
-            MatrixIteratorT out,
+template <typename InputIteratorT, typename MapIteratorT, typename OutputIteratorT, typename IndexT>
+void gather(const InputIteratorT in,
+            IndexT D,
+            IndexT N,
+            const MapIteratorT map,
+            IndexT map_length,
+            OutputIteratorT out,
             cudaStream_t stream)
 {
   detail::gather(in, D, N, map, map_length, out, stream);
 }
 
 /**
- * @brief  gather copies rows from a source matrix into a destination matrix according to a map.
+ * @brief Copies rows from a source matrix into a destination matrix according to a transformed map.
  *
- * @tparam matrix_t      Matrix element type
- * @tparam map_t         Map vector type
- * @tparam idx_t integer type used for indexing
- * @param[in] handle            raft handle for managing resources
- * @param[in]  in           Input matrix (assumed to be row-major)
- * @param[in]  map          Vector of gather locations
- * @param[out]  out         Output matrix (assumed to be row-major)
- */
-template <typename matrix_t, typename map_t, typename idx_t>
-void gather(const raft::handle_t& handle,
-            raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
-            raft::device_vector_view<const map_t, idx_t> map,
-            raft::device_matrix_view<matrix_t, idx_t, row_major> out)
-{
-  RAFT_EXPECTS(out.extent(0) == map.extent(0),
-               "Number of rows in output matrix must equal the size of the map vector");
-  RAFT_EXPECTS(out.extent(1) == in.extent(1),
-               "Number of columns in input and output matrices must be equal.");
-
-  raft::matrix::detail::gather(
-    const_cast<matrix_t*>(in.data_handle()),  // TODO: There's a better way to handle this
-    static_cast<int>(in.extent(1)),
-    static_cast<int>(in.extent(0)),
-    map.data_handle(),
-    static_cast<int>(map.extent(0)),
-    out.data_handle(),
-    handle.get_stream());
-}
-
-/**
- * @brief  gather copies rows from a source matrix into a destination matrix according to a
- * transformed map.
+ * For each output row, read the index in the input matrix from the map, apply a transformation to
+ * this input index and copy the row.
  *
- * @tparam matrix_t     Matrix type
- * @tparam map_t        Map vector type
- * @tparam map_xform_t       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to idx_t (= int) type.
- * @tparam idx_t integer type for indexing
- * @param[in] handle        raft handle for managing resources
- * @param[in]  in           Input matrix (assumed to be row-major)
- * @param[in]  map          Input vector of gather locations
- * @param[out]  out         Output matrix (assumed to be row-major)
- * @param[in]  transform_op The transformation operation, transforms the map values to idx_t
- */
-template <typename matrix_t, typename map_t, typename map_xform_t, typename idx_t>
-void gather(const raft::handle_t& handle,
-            raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
-            raft::device_vector_view<const map_t, idx_t> map,
-            raft::device_matrix_view<const matrix_t, idx_t, row_major> out,
-            map_xform_t transform_op)
-{
-  RAFT_EXPECTS(out.extent(0) == map.extent(0),
-               "Number of rows in output matrix must equal the size of the map vector");
-  RAFT_EXPECTS(out.extent(1) == in.extent(1),
-               "Number of columns in input and output matrices must be equal.");
-
-  detail::gather(
-    const_cast<matrix_t*>(in.data_handle()),  // TODO: There's a better way to handle this
-    static_cast<int>(in.extent(1)),
-    static_cast<int>(in.extent(0)),
-    map,
-    static_cast<int>(map.extent(0)),
-    out.data_handle(),
-    transform_op,
-    handle.get_stream());
-}
-
-/**
- * @brief  gather copies rows from a source matrix into a destination matrix according to a
- * transformed map.
- *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
- * simple pointer type).
- * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
- * pointer type).
- * @tparam MapTransformOp       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to IndexT (= int) type.
+ * @tparam InputIteratorT  Input iterator type, for the input matrix (may be a pointer type).
+ * @tparam MapIteratorT    Input iterator type, for the map (may be a pointer type).
+ * @tparam MapTransformOp  Unary lambda expression or operator type. MapTransformOp's result type
+ *                         must be convertible to IndexT.
+ * @tparam OutputIteratorT Output iterator type, for the output matrix (may be a pointer type).
+ * @tparam IndexT          Index type.
  *
- * @param  in           Pointer to the input matrix (assumed to be row-major)
- * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
- * storage is the number of columns
- * @param  N            Second dimension
- * @param  map          Pointer to the input sequence of gather locations
- * @param  map_length   The length of 'map' and 'stencil'
- * @param  out          Pointer to the output matrix (assumed to be row-major)
- * @param  transform_op The transformation operation, transforms the map values to IndexT
+ * @param  in           Input matrix, dim = [N, D] (row-major)
+ * @param  D            Number of columns of the input/output matrices
+ * @param  N            Number of rows of the input matrix
+ * @param  map          Map of row indices to gather, dim = [map_length]
+ * @param  map_length   The length of 'map', number of rows of the output matrix
+ * @param  out          Output matrix, dim = [map_length, D] (row-major)
+ * @param  transform_op Transformation to apply to map values
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT, typename MapIteratorT, typename MapTransformOp>
-void gather(const MatrixIteratorT in,
-            int D,
-            int N,
-            MapIteratorT map,
-            int map_length,
-            MatrixIteratorT out,
+template <typename InputIteratorT,
+          typename MapIteratorT,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather(const InputIteratorT in,
+            IndexT D,
+            IndexT N,
+            const MapIteratorT map,
+            IndexT map_length,
+            OutputIteratorT out,
             MapTransformOp transform_op,
             cudaStream_t stream)
 {
@@ -156,40 +98,42 @@ void gather(const MatrixIteratorT in,
 }
 
 /**
- * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
- * according to a map.
+ * @brief Conditionally copies rows from a source matrix into a destination matrix.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
- * simple pointer type).
- * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
- * pointer type).
- * @tparam StencilIteratorT     Random-access iterator type, for reading input stencil (may be a
- * simple pointer type).
- * @tparam UnaryPredicateOp     Unary lambda expression or operator type, UnaryPredicateOp's result
- * type must be convertible to bool type.
+ * For each output row, read the index in the input matrix from the map, read a stencil value, apply
+ * a predicate to the stencil value, and if true, copy the row.
  *
- * @param  in           Pointer to the input matrix (assumed to be row-major)
- * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
- * storage is the number of columns
- * @param  N            Second dimension
- * @param  map          Pointer to the input sequence of gather locations
- * @param  stencil      Pointer to the input sequence of stencil or predicate values
- * @param  map_length   The length of 'map' and 'stencil'
- * @param  out          Pointer to the output matrix (assumed to be row-major)
+ * @tparam InputIteratorT   Input iterator type, for the input matrix (may be a pointer type).
+ * @tparam MapIteratorT     Input iterator type, for the map (may be a pointer type).
+ * @tparam StencilIteratorT Input iterator type, for the stencil (may be a pointer type).
+ * @tparam UnaryPredicateOp Unary lambda expression or operator type. UnaryPredicateOp's result type
+ *                          must be convertible to bool type.
+ * @tparam OutputIteratorT  Output iterator type, for the output matrix (may be a pointer type).
+ * @tparam IndexT           Index type.
+ *
+ * @param  in           Input matrix, dim = [N, D] (row-major)
+ * @param  D            Number of columns of the input/output matrices
+ * @param  N            Number of rows of the input matrix
+ * @param  map          Map of row indices to gather, dim = [map_length]
+ * @param  stencil      Sequence of stencil values, dim = [map_length]
+ * @param  map_length   The length of 'map' and 'stencil', number of rows of the output matrix
+ * @param  out          Output matrix, dim = [map_length, D] (row-major)
  * @param  pred_op      Predicate to apply to the stencil values
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT,
+template <typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
-          typename UnaryPredicateOp>
-void gather_if(const MatrixIteratorT in,
-               int D,
-               int N,
-               MapIteratorT map,
+          typename UnaryPredicateOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather_if(const InputIteratorT in,
+               IndexT D,
+               IndexT N,
+               const MapIteratorT map,
                StencilIteratorT stencil,
-               int map_length,
-               MatrixIteratorT out,
+               IndexT map_length,
+               OutputIteratorT out,
                UnaryPredicateOp pred_op,
                cudaStream_t stream)
 {
@@ -197,91 +141,47 @@ void gather_if(const MatrixIteratorT in,
 }
 
 /**
- * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
- * according to a map.
+ * @brief Conditionally copies rows according to a transformed map.
  *
- * @tparam matrix_t      Matrix value type
- * @tparam map_t         Map vector type
- * @tparam stencil_t     Stencil vector type
- * @tparam unary_pred_t     Unary lambda expression or operator type, unary_pred_t's result
- * type must be convertible to bool type.
- * @tparam idx_t integer type for indexing
- * @param[in] handle        raft handle for managing resources
- * @param[in]  in           Input matrix (assumed to be row-major)
- * @param[in]  map          Input vector of gather locations
- * @param[in]  stencil      Input vector of stencil or predicate values
- * @param[out]  out         Output matrix (assumed to be row-major)
- * @param[in]  pred_op      Predicate to apply to the stencil values
- */
-template <typename matrix_t,
-          typename map_t,
-          typename stencil_t,
-          typename unary_pred_t,
-          typename idx_t>
-void gather_if(const raft::handle_t& handle,
-               raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
-               raft::device_matrix_view<matrix_t, idx_t, row_major> out,
-               raft::device_vector_view<const map_t, idx_t> map,
-               raft::device_vector_view<const stencil_t, idx_t> stencil,
-               unary_pred_t pred_op)
-{
-  RAFT_EXPECTS(out.extent(0) == map.extent(0),
-               "Number of rows in output matrix must equal the size of the map vector");
-  RAFT_EXPECTS(out.extent(1) == in.extent(1),
-               "Number of columns in input and output matrices must be equal.");
-  RAFT_EXPECTS(map.extent(0) == stencil.extent(0),
-               "Number of elements in stencil must equal number of elements in map");
-
-  detail::gather_if(const_cast<matrix_t*>(in.data_handle()),
-                    out.extent(1),
-                    out.extent(0),
-                    map.data_handle(),
-                    stencil.data_handle(),
-                    map.extent(0),
-                    out.data_handle(),
-                    pred_op,
-                    handle.get_stream());
-}
-
-/**
- * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
- * according to a transformed map.
+ * For each output row, read the index in the input matrix from the map, read a stencil value,
+ * apply a predicate to the stencil value, and if true, apply a transformation to the input index
+ * and copy the row.
  *
- * @tparam MatrixIteratorT      Random-access iterator type, for reading input matrix (may be a
- * simple pointer type).
- * @tparam MapIteratorT         Random-access iterator type, for reading input map (may be a simple
- * pointer type).
- * @tparam StencilIteratorT     Random-access iterator type, for reading input stencil (may be a
- * simple pointer type).
- * @tparam UnaryPredicateOp     Unary lambda expression or operator type, UnaryPredicateOp's result
- * type must be convertible to bool type.
- * @tparam MapTransformOp       Unary lambda expression or operator type, MapTransformOp's result
- * type must be convertible to IndexT (= int) type.
+ * @tparam InputIteratorT   Input iterator type, for the input matrix (may be a pointer type).
+ * @tparam MapIteratorT     Input iterator type, for the map (may be a pointer type).
+ * @tparam MapTransformOp   Unary lambda expression or operator type. MapTransformOp's result type
+ *                          must be convertible to IndexT.
+ * @tparam StencilIteratorT Input iterator type, for the stencil (may be a pointer type).
+ * @tparam UnaryPredicateOp Unary lambda expression or operator type. UnaryPredicateOp's result type
+ *                          must be convertible to bool type.
+ * @tparam OutputIteratorT  Output iterator type, for the output matrix (may be a pointer type).
+ * @tparam IndexT           Index type.
  *
- * @param  in           Pointer to the input matrix (assumed to be row-major)
- * @param  D            Leading dimension of the input matrix 'in', which in-case of row-major
- * storage is the number of columns
- * @param  N            Second dimension
- * @param  map          Pointer to the input sequence of gather locations
- * @param  stencil      Pointer to the input sequence of stencil or predicate values
- * @param  map_length   The length of 'map' and 'stencil'
- * @param  out          Pointer to the output matrix (assumed to be row-major)
+ * @param  in           Input matrix, dim = [N, D] (row-major)
+ * @param  D            Number of columns of the input/output matrices
+ * @param  N            Number of rows of the input matrix
+ * @param  map          Map of row indices to gather, dim = [map_length]
+ * @param  stencil      Sequence of stencil values, dim = [map_length]
+ * @param  map_length   The length of 'map' and 'stencil', number of rows of the output matrix
+ * @param  out          Output matrix, dim = [map_length, D] (row-major)
  * @param  pred_op      Predicate to apply to the stencil values
- * @param  transform_op The transformation operation, transforms the map values to IndexT
+ * @param  transform_op Transformation to apply to map values
  * @param  stream       CUDA stream to launch kernels within
  */
-template <typename MatrixIteratorT,
+template <typename InputIteratorT,
           typename MapIteratorT,
           typename StencilIteratorT,
           typename UnaryPredicateOp,
-          typename MapTransformOp>
-void gather_if(const MatrixIteratorT in,
-               int D,
-               int N,
-               MapIteratorT map,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IndexT>
+void gather_if(const InputIteratorT in,
+               IndexT D,
+               IndexT N,
+               const MapIteratorT map,
                StencilIteratorT stencil,
-               int map_length,
-               MatrixIteratorT out,
+               IndexT map_length,
+               OutputIteratorT out,
                UnaryPredicateOp pred_op,
                MapTransformOp transform_op,
                cudaStream_t stream)
@@ -290,38 +190,84 @@ void gather_if(const MatrixIteratorT in,
 }
 
 /**
- * @brief  gather_if conditionally copies rows from a source matrix into a destination matrix
- * according to a transformed map.
+ * @brief Copies rows from a source matrix into a destination matrix according to a transformed map.
  *
- * @tparam matrix_t      Matrix value type, for reading input matrix
- * @tparam map_t         Vector value type for map
- * @tparam stencil_t     Vector value type for stencil
- * @tparam unary_pred_t     Unary lambda expression or operator type, unary_pred_t's result
- * type must be convertible to bool type.
- * @tparam map_xform_t       Unary lambda expression or operator type, map_xform_t's result
- * type must be convertible to idx_t (= int) type.
- * @tparam idx_t integer type for indexing
- * @param[in] handle        raft handle for managing resources
- * @param[in]  in           Input matrix (assumed to be row-major)
- * @param[in]  map          Vector of gather locations
- * @param[in]  stencil      Vector of stencil or predicate values
- * @param[out]  out          Output matrix (assumed to be row-major)
- * @param[in]  pred_op      Predicate to apply to the stencil values
- * @param[in]  transform_op The transformation operation, transforms the map values to idx_t
+ * For each output row, read the index in the input matrix from the map, apply a transformation to
+ * this input index if specified, and copy the row.
+ *
+ * @tparam matrix_t    Matrix element type
+ * @tparam map_t       Integer type of map elements
+ * @tparam idx_t       Integer type used for indexing
+ * @tparam map_xform_t Unary lambda expression or operator type. map_xform_t's result type must
+ *                     be convertible to idx_t.
+ * @param[in]  handle        raft handle for managing resources
+ * @param[in]  in            Input matrix, dim = [N, D] (row-major)
+ * @param[in]  map           Map of row indices to gather, dim = [map_length]
+ * @param[out] out           Output matrix, dim = [map_length, D] (row-major)
+ * @param[in]  transform_op  (optional) Transformation to apply to map values
+ */
+template <typename matrix_t,
+          typename map_t,
+          typename idx_t,
+          typename map_xform_t = raft::identity_op>
+void gather(const raft::device_resources& handle,
+            raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
+            raft::device_vector_view<const map_t, idx_t> map,
+            raft::device_matrix_view<matrix_t, idx_t, row_major> out,
+            map_xform_t transform_op = raft::identity_op())
+{
+  RAFT_EXPECTS(out.extent(0) == map.extent(0),
+               "Number of rows in output matrix must equal the size of the map vector");
+  RAFT_EXPECTS(out.extent(1) == in.extent(1),
+               "Number of columns in input and output matrices must be equal.");
+
+  detail::gather(
+    const_cast<matrix_t*>(in.data_handle()),  // TODO: There's a better way to handle this
+    in.extent(1),
+    in.extent(0),
+    map.data_handle(),
+    map.extent(0),
+    out.data_handle(),
+    transform_op,
+    handle.get_stream());
+}
+
+/**
+ * @brief Conditionally copies rows according to a transformed map.
+ *
+ * For each output row, read the index in the input matrix from the map, read a stencil value,
+ * apply a predicate to the stencil value, and if true, apply a transformation to the input index
+ * if specified, and copy the row.
+ *
+ * @tparam matrix_t     Matrix element type
+ * @tparam map_t        Integer type of map elements
+ * @tparam stencil_t    Value type for stencil (input type for the pred_op)
+ * @tparam unary_pred_t Unary lambda expression or operator type. unary_pred_t's result
+ *                      type must be convertible to bool type.
+ * @tparam map_xform_t  Unary lambda expression or operator type. map_xform_t's result type must
+ *                      be convertible to idx_t.
+ * @tparam idx_t        Integer type used for indexing
+ * @param[in]  handle        raft handle for managing resources
+ * @param[in]  in            Input matrix, dim = [N, D] (row-major)
+ * @param[in]  map           Map of row indices to gather, dim = [map_length]
+ * @param[in]  stencil       Vector of stencil values, dim = [map_length]
+ * @param[out] out           Output matrix, dim = [map_length, D] (row-major)
+ * @param[in]  pred_op       Predicate to apply to the stencil values
+ * @param[in]  transform_op  (optional) Transformation to apply to map values
  */
 template <typename matrix_t,
           typename map_t,
           typename stencil_t,
           typename unary_pred_t,
-          typename map_xform_t,
-          typename idx_t>
-void gather_if(const raft::handle_t& handle,
+          typename idx_t,
+          typename map_xform_t = raft::identity_op>
+void gather_if(const raft::device_resources& handle,
                raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
                raft::device_matrix_view<matrix_t, idx_t, row_major> out,
-               raft::device_vector_view<const map_t> map,
-               raft::device_vector_view<const stencil_t> stencil,
+               raft::device_vector_view<const map_t, idx_t> map,
+               raft::device_vector_view<const stencil_t, idx_t> stencil,
                unary_pred_t pred_op,
-               map_xform_t transform_op)
+               map_xform_t transform_op = raft::identity_op())
 {
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
                "Number of rows in output matrix must equal the size of the map vector");
@@ -342,4 +288,6 @@ void gather_if(const raft::handle_t& handle,
                     handle.get_stream());
 }
 
+/** @} */  // end of group matrix_gather
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh
index caee2555a9..f597bbd1c6 100644
--- a/cpp/include/raft/matrix/init.cuh
+++ b/cpp/include/raft/matrix/init.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,12 @@
 #include <raft/matrix/matrix.cuh>
 
 namespace raft::matrix {
+
+/**
+ * @defgroup matrix_init Matrix initialization operations
+ * @{
+ */
+
 /**
  * @brief set values to scalar in matrix
  * @tparam math_t data-type upon which the math operation will be performed
@@ -33,7 +39,7 @@ namespace raft::matrix {
  * @param[in] scalar scalar value to fill matrix elements
  */
 template <typename math_t, typename extents, typename layout>
-void fill(const raft::handle_t& handle,
+void fill(raft::device_resources const& handle,
           raft::device_mdspan<const math_t, extents, layout> in,
           raft::device_mdspan<math_t, extents, layout> out,
           raft::host_scalar_view<math_t> scalar)
@@ -55,7 +61,7 @@ void fill(const raft::handle_t& handle,
  * @param[in] scalar scalar value to fill matrix elements
  */
 template <typename math_t, typename extents, typename layout>
-void fill(const raft::handle_t& handle,
+void fill(raft::device_resources const& handle,
           raft::device_mdspan<math_t, extents, layout> inout,
           math_t scalar)
 {
@@ -63,4 +69,7 @@ void fill(const raft::handle_t& handle,
   detail::setValue(
     inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream());
 }
+
+/** @} */  // end of group matrix_init
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh
index 77f70239ea..33de112a35 100644
--- a/cpp/include/raft/matrix/linewise_op.cuh
+++ b/cpp/include/raft/matrix/linewise_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup linewise_op Matrix Linewise Operations
+ * @{
+ */
+
 /**
  * Run a function over matrix lines (rows or columns) with a variable number
  * row-vectors or column-vectors.
@@ -57,7 +62,7 @@ template <typename m_t,
           typename Lambda,
           typename... vec_t,
           typename = raft::enable_if_device_mdspan<vec_t...>>
-void linewise_op(const raft::handle_t& handle,
+void linewise_op(raft::device_resources const& handle,
                  raft::device_matrix_view<const m_t, idx_t, layout> in,
                  raft::device_matrix_view<m_t, idx_t, layout> out,
                  const bool alongLines,
@@ -92,7 +97,7 @@ template <typename m_t,
           typename Lambda,
           typename... vec_t,
           typename = raft::enable_if_device_mdspan<vec_t...>>
-void linewise_op(const raft::handle_t& handle,
+void linewise_op(raft::device_resources const& handle,
                  raft::device_aligned_matrix_view<const m_t, idx_t, layout> in,
                  raft::device_aligned_matrix_view<m_t, idx_t, layout> out,
                  const bool alongLines,
@@ -115,4 +120,6 @@ void linewise_op(const raft::handle_t& handle,
     out, in, lineLen, nLines, alongLines, op, handle.get_stream(), vecs.data_handle()...);
 }
 
+/** @} */  // end of group linewise_op
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index fd5ddf2df3..7afb9572be 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -295,7 +295,7 @@ void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_
  */
 template <typename math_t, typename IdxType = int>
 void ratio(
-  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
+  raft::device_resources const& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
 {
   detail::ratio(handle, src, dest, len, stream);
 }
diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh
index cd6c4fa219..0780e41275 100644
--- a/cpp/include/raft/matrix/matrix.cuh
+++ b/cpp/include/raft/matrix/matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -88,7 +88,7 @@ void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stre
  * @param[out] out: output matrix
  */
 template <typename m_t, typename idx_t = int, typename matrix_idx_t>
-void copy(const raft::handle_t& handle,
+void copy(raft::device_resources const& handle,
           raft::device_matrix_view<const m_t, matrix_idx_t, col_major> in,
           raft::device_matrix_view<m_t, matrix_idx_t, col_major> out)
 {
@@ -252,7 +252,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
+m_t getL2Norm(raft::device_resources const& handle, m_t* in, idx_t size, cudaStream_t stream)
 {
   return detail::getL2Norm(handle, in, size, stream);
 }
diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh
index deb3657905..eb94a19669 100644
--- a/cpp/include/raft/matrix/norm.cuh
+++ b/cpp/include/raft/matrix/norm.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_norm Matrix Norm Operations
+ * @{
+ */
+
 /**
  * @brief Get the L2/F-norm of a matrix
  * @param[in] handle: raft handle
@@ -28,8 +33,11 @@ namespace raft::matrix {
  * @returns matrix l2 norm
  */
 template <typename m_t, typename idx_t>
-m_t l2_norm(const raft::handle_t& handle, raft::device_mdspan<const m_t, idx_t> in)
+m_t l2_norm(raft::device_resources const& handle, raft::device_mdspan<const m_t, idx_t> in)
 {
   return detail::getL2Norm(handle, in.data_handle(), in.size(), handle.get_stream());
 }
+
+/** @} */  // end of group matrix_norm
+
 }  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/power.cuh b/cpp/include/raft/matrix/power.cuh
index 4e2b3b7d72..c7c3757193 100644
--- a/cpp/include/raft/matrix/power.cuh
+++ b/cpp/include/raft/matrix/power.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_power Matrix Power Operations
+ * @{
+ */
+
 /**
  * @brief Power of every element in the input matrix
  * @tparam math_t type of matrix elements
@@ -32,7 +37,7 @@ namespace raft::matrix {
  * @param[in] scalar: every element is multiplied with scalar.
  */
 template <typename math_t, typename idx_t, typename layout>
-void weighted_power(const raft::handle_t& handle,
+void weighted_power(raft::device_resources const& handle,
                     raft::device_matrix_view<const math_t, idx_t, layout> in,
                     raft::device_matrix_view<math_t, idx_t, layout> out,
                     math_t scalar)
@@ -51,7 +56,7 @@ void weighted_power(const raft::handle_t& handle,
  * @param[in] scalar: every element is multiplied with scalar.
  */
 template <typename math_t, typename idx_t, typename layout>
-void weighted_power(const raft::handle_t& handle,
+void weighted_power(raft::device_resources const& handle,
                     raft::device_matrix_view<math_t, idx_t, layout> inout,
                     math_t scalar)
 {
@@ -67,7 +72,8 @@ void weighted_power(const raft::handle_t& handle,
  * @param[inout] inout: input matrix and also the result is stored
  */
 template <typename math_t, typename idx_t, typename layout>
-void power(const raft::handle_t& handle, raft::device_matrix_view<math_t, idx_t, layout> inout)
+void power(raft::device_resources const& handle,
+           raft::device_matrix_view<math_t, idx_t, layout> inout)
 {
   detail::power<math_t>(inout.data_handle(), inout.size(), handle.get_stream());
 }
@@ -83,7 +89,7 @@ void power(const raft::handle_t& handle, raft::device_matrix_view<math_t, idx_t,
  * @{
  */
 template <typename math_t, typename idx_t, typename layout>
-void power(const raft::handle_t& handle,
+void power(raft::device_resources const& handle,
            raft::device_matrix_view<const math_t, idx_t, layout> in,
            raft::device_matrix_view<math_t, idx_t, layout> out)
 {
@@ -91,4 +97,6 @@ void power(const raft::handle_t& handle,
   detail::power<math_t>(in.data_handle(), out.data_handle(), in.size(), handle.get_stream());
 }
 
+/** @} */  // end group matrix_power
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/print.cuh b/cpp/include/raft/matrix/print.cuh
index 4d3a8ca938..6a4bfbdd01 100644
--- a/cpp/include/raft/matrix/print.cuh
+++ b/cpp/include/raft/matrix/print.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_print Matrix print operations
+ * @{
+ */
+
 /**
  * @brief Prints the data stored in GPU memory
  * @tparam m_t type of matrix elements
@@ -33,7 +38,7 @@ namespace raft::matrix {
  * @param[in] separators: horizontal and vertical separator characters
  */
 template <typename m_t, typename idx_t>
-void print(const raft::handle_t& handle,
+void print(raft::device_resources const& handle,
            raft::device_matrix_view<const m_t, idx_t, col_major> in,
            print_separators& separators)
 {
@@ -44,4 +49,6 @@ void print(const raft::handle_t& handle,
                 separators.vertical,
                 handle.get_stream());
 }
+
+/** @} */  // end group matrix_print
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/ratio.cuh b/cpp/include/raft/matrix/ratio.cuh
index 7895ea972f..cd96d1ffbc 100644
--- a/cpp/include/raft/matrix/ratio.cuh
+++ b/cpp/include/raft/matrix/ratio.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_ratio Matrix ratio operations
+ * @{
+ */
+
 /**
  * @brief ratio of every element over sum of input vector is calculated
  * @tparam math_t data-type upon which the math operation will be performed
@@ -31,7 +36,7 @@ namespace raft::matrix {
  * @param[out] dest: output matrix. The result is stored in the dest matrix
  */
 template <typename math_t, typename idx_t, typename layout>
-void ratio(const raft::handle_t& handle,
+void ratio(raft::device_resources const& handle,
            raft::device_matrix_view<const math_t, idx_t, layout> src,
            raft::device_matrix_view<math_t, idx_t, layout> dest)
 {
@@ -48,9 +53,13 @@ void ratio(const raft::handle_t& handle,
  * @param[inout] inout: input matrix
  */
 template <typename math_t, typename idx_t, typename layout>
-void ratio(const raft::handle_t& handle, raft::device_matrix_view<math_t, idx_t, layout> inout)
+void ratio(raft::device_resources const& handle,
+           raft::device_matrix_view<math_t, idx_t, layout> inout)
 {
   detail::ratio(
     handle, inout.data_handle(), inout.data_handle(), inout.size(), handle.get_stream());
 }
+
+/** @} */  // end group matrix_ratio
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/reciprocal.cuh b/cpp/include/raft/matrix/reciprocal.cuh
index c41ecfb999..aa2c48e143 100644
--- a/cpp/include/raft/matrix/reciprocal.cuh
+++ b/cpp/include/raft/matrix/reciprocal.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_reciprocal Matrix Reciprocal Operations
+ * @{
+ */
+
 /**
  * @brief Reciprocal of every element in the input matrix
  * @tparam math_t data-type upon which the math operation will be performed
@@ -35,7 +40,7 @@ namespace raft::matrix {
  * @{
  */
 template <typename math_t, typename idx_t, typename layout>
-void reciprocal(const raft::handle_t& handle,
+void reciprocal(raft::device_resources const& handle,
                 raft::device_matrix_view<const math_t, idx_t, layout> in,
                 raft::device_matrix_view<math_t, idx_t, layout> out,
                 raft::host_scalar_view<math_t> scalar,
@@ -65,7 +70,7 @@ void reciprocal(const raft::handle_t& handle,
  * @{
  */
 template <typename math_t, typename idx_t, typename layout>
-void reciprocal(const raft::handle_t& handle,
+void reciprocal(raft::device_resources const& handle,
                 raft::device_matrix_view<math_t, idx_t, layout> inout,
                 raft::host_scalar_view<math_t> scalar,
                 bool setzero = false,
@@ -78,4 +83,7 @@ void reciprocal(const raft::handle_t& handle,
                              setzero,
                              thres);
 }
+
+/** @} */  // end group matrix_reciprocal
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh
index e00a240577..3aaec56fee 100644
--- a/cpp/include/raft/matrix/reverse.cuh
+++ b/cpp/include/raft/matrix/reverse.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_reverse Matrix reverse
+ * @{
+ */
+
 /**
  * @brief Reverse the columns of a matrix in place (i.e. first column and
  * last column are swapped)
@@ -29,7 +34,8 @@ namespace raft::matrix {
  * @param[inout] inout: input and output matrix
  */
 template <typename m_t, typename idx_t, typename layout_t>
-void col_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx_t, layout_t> inout)
+void col_reverse(raft::device_resources const& handle,
+                 raft::device_matrix_view<m_t, idx_t, layout_t> inout)
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
   if (raft::is_col_major(inout)) {
@@ -46,7 +52,8 @@ void col_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx
  * @param[inout] inout: input and output matrix
  */
 template <typename m_t, typename idx_t, typename layout_t>
-void row_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx_t, layout_t> inout)
+void row_reverse(raft::device_resources const& handle,
+                 raft::device_matrix_view<m_t, idx_t, layout_t> inout)
 {
   RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
   if (raft::is_col_major(inout)) {
@@ -55,4 +62,6 @@ void row_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx
     detail::colReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream());
   }
 }
+/** @} */  // end group matrix_reverse
+
 }  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
new file mode 100644
index 0000000000..9a1a14fd73
--- /dev/null
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/select_k.cuh"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/nvtx.hpp>
+
+#include <optional>
+
+namespace raft::matrix {
+
+/**
+ * @defgroup select_k Batched-select k smallest or largest key/values
+ * @{
+ */
+
+/**
+ * Select k smallest or largest key/values from each row in the input data.
+ *
+ * If you think of the input data `in_val` as a row-major matrix with `len` columns and
+ * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
+ * in the row-major matrix `out_val` of size (batch_size, k).
+ *
+ * Example usage
+ * @code{.cpp}
+ *   using namespace raft;
+ *   // get a 2D row-major array of values to search through
+ *   auto in_values = {... input device_matrix_view<const float, size_t, row_major> ...};
+ *   // prepare output arrays
+ *   auto out_extents = make_extents<size_t>(in_values.extent(0), k);
+ *   auto out_values  = make_device_mdarray<float>(handle, out_extents);
+ *   auto out_indices = make_device_mdarray<size_t>(handle, out_extents);
+ *   // search `k` smallest values in each row
+ *   matrix::select_k<float, size_t>(
+ *     handle, in_values, std::nullopt, out_values.view(), out_indices.view(), true);
+ * @endcode
+ *
+ * @tparam T
+ *   the type of the keys (what is being compared).
+ * @tparam IdxT
+ *   the index type (what is being selected together with the keys).
+ *
+ * @param[in] handle raft resources; its stream (`get_stream()`) is passed to the implementation.
+ * @param[in] in_val
+ *   input values [batch_size, len];
+ *   these are compared and selected.
+ * @param[in] in_idx
+ *   optional input payload [batch_size, len];
+ *   typically, these are indices of the corresponding `in_val`.
+ *   If `in_idx` is `std::nullopt`, a contiguous array `0...len-1` is implied.
+ * @param[out] out_val
+ *   output values [batch_size, k];
+ *   the k smallest/largest values from each row of `in_val` (k is `out_val.extent(1)`).
+ * @param[out] out_idx
+ *   output payload (e.g. indices) [batch_size, k];
+ *   the payload selected together with `out_val`.
+ * @param[in] select_min
+ *   whether to select k smallest (true) or largest (false) keys.
+ */
+template <typename T, typename IdxT>
+void select_k(const device_resources& handle,
+              raft::device_matrix_view<const T, size_t, row_major> in_val,
+              std::optional<raft::device_matrix_view<const IdxT, size_t, row_major>> in_idx,
+              raft::device_matrix_view<T, size_t, row_major> out_val,
+              raft::device_matrix_view<IdxT, size_t, row_major> out_idx,
+              bool select_min)
+{
+  RAFT_EXPECTS(out_val.extent(1) <= size_t(std::numeric_limits<int>::max()),
+               "output k must fit the int type.");  // k is narrowed to int below
+  auto batch_size = in_val.extent(0);               // rows of in_val
+  auto len        = in_val.extent(1);               // columns of in_val
+  auto k          = int(out_val.extent(1));         // narrowing guarded by the check above
+  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
+  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
+  if (in_idx.has_value()) {  // an explicit payload must match the shape of in_val
+    RAFT_EXPECTS(batch_size == in_idx->extent(0), "batch sizes must be equal");
+    RAFT_EXPECTS(len == in_idx->extent(1), "value and index input lengths must be equal");
+  }
+  RAFT_EXPECTS(size_t(k) == out_idx.extent(1), "value and index output lengths must be equal");
+  return detail::select_k<T, IdxT>(in_val.data_handle(),
+                                   in_idx.has_value() ? in_idx->data_handle() : nullptr,
+                                   batch_size,
+                                   len,
+                                   k,
+                                   out_val.data_handle(),
+                                   out_idx.data_handle(),
+                                   select_min,
+                                   handle.get_stream());
+}
+
+/** @} */  // end of group select_k
+
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/sign_flip.cuh b/cpp/include/raft/matrix/sign_flip.cuh
index 01f8829c85..d069c55880 100644
--- a/cpp/include/raft/matrix/sign_flip.cuh
+++ b/cpp/include/raft/matrix/sign_flip.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_sign_flip Matrix sign flip operations
+ * @{
+ */
+
 /**
  * @brief sign flip stabilizes the sign of col major eigen vectors.
  * The sign is flipped if the column has negative |max|.
@@ -31,9 +36,11 @@ namespace raft::matrix {
  * @param[inout] inout: input matrix. Result also stored in this parameter
  */
 template <typename math_t, typename idx_t>
-void sign_flip(const raft::handle_t& handle,
+void sign_flip(raft::device_resources const& handle,
                raft::device_matrix_view<math_t, idx_t, col_major> inout)
 {
   detail::signFlip(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
 }
+
+/** @} */  // end group matrix_sign_flip
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh
index eda2853c78..bb92b2b86f 100644
--- a/cpp/include/raft/matrix/slice.cuh
+++ b/cpp/include/raft/matrix/slice.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_slice Matrix slicing
+ * @{
+ */
+
 template <typename idx_t>
 struct slice_coordinates {
   idx_t row1;  ///< row coordinate of the top-left point of the wanted area (0-based)
@@ -45,7 +50,7 @@ struct slice_coordinates {
  * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice(handle, in, out, {0, 1, 4, 3});
  */
 template <typename m_t, typename idx_t>
-void slice(const raft::handle_t& handle,
+void slice(raft::device_resources const& handle,
            raft::device_matrix_view<const m_t, idx_t, col_major> in,
            raft::device_matrix_view<m_t, idx_t, col_major> out,
            slice_coordinates<idx_t> coords)
@@ -68,4 +73,7 @@ void slice(const raft::handle_t& handle,
                       coords.col2,
                       handle.get_stream());
 }
+
+/** @} */  // end group matrix_slice
+
 }  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/sqrt.cuh b/cpp/include/raft/matrix/sqrt.cuh
index 302167480e..9729f9b3d5 100644
--- a/cpp/include/raft/matrix/sqrt.cuh
+++ b/cpp/include/raft/matrix/sqrt.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_sqrt Matrix Square Root
+ * @{
+ */
+
 /**
  * @brief Square root of every element in the input matrix
  * @tparam math_t data-type upon which the math operation will be performed
@@ -33,7 +38,7 @@ namespace raft::matrix {
  * @param[out] out: output matrix. The result is stored in the out matrix
  */
 template <typename math_t, typename idx_t, typename layout>
-void sqrt(const raft::handle_t& handle,
+void sqrt(raft::device_resources const& handle,
           raft::device_matrix_view<const math_t, idx_t, layout> in,
           raft::device_matrix_view<math_t, idx_t, layout> out)
 {
@@ -50,7 +55,8 @@ void sqrt(const raft::handle_t& handle,
  * @param[inout] inout: input matrix with in-place results
  */
 template <typename math_t, typename idx_t, typename layout>
-void sqrt(const raft::handle_t& handle, raft::device_matrix_view<math_t, idx_t, layout> inout)
+void sqrt(raft::device_resources const& handle,
+          raft::device_matrix_view<math_t, idx_t, layout> inout)
 {
   detail::seqRoot(inout.data_handle(), inout.size(), handle.get_stream());
 }
@@ -67,7 +73,7 @@ void sqrt(const raft::handle_t& handle, raft::device_matrix_view<math_t, idx_t,
  * @param[in] set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename idx_t, typename layout>
-void weighted_sqrt(const raft::handle_t& handle,
+void weighted_sqrt(raft::device_resources const& handle,
                    raft::device_matrix_view<const math_t, idx_t, layout> in,
                    raft::device_matrix_view<math_t, idx_t, layout> out,
                    raft::host_scalar_view<math_t> scalar,
@@ -93,7 +99,7 @@ void weighted_sqrt(const raft::handle_t& handle,
  * @param[in] set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename idx_t, typename layout>
-void weighted_sqrt(const raft::handle_t& handle,
+void weighted_sqrt(raft::device_resources const& handle,
                    raft::device_matrix_view<math_t, idx_t, layout> inout,
                    raft::host_scalar_view<math_t> scalar,
                    bool set_neg_zero = false)
@@ -102,4 +108,6 @@ void weighted_sqrt(const raft::handle_t& handle,
     inout.data_handle(), *(scalar.data_handle()), inout.size(), handle.get_stream(), set_neg_zero);
 }
 
+/** @} */  // end group matrix_sqrt
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/threshold.cuh b/cpp/include/raft/matrix/threshold.cuh
index 7540ceb3c6..7dfb264d34 100644
--- a/cpp/include/raft/matrix/threshold.cuh
+++ b/cpp/include/raft/matrix/threshold.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_threshold Matrix thresholding
+ * @{
+ */
+
 /**
  * @brief sets the small values to zero based on a defined threshold
  * @tparam math_t data-type upon which the math operation will be performed
@@ -32,7 +37,7 @@ namespace raft::matrix {
  * @param[in] thres threshold to set values to zero
  */
 template <typename math_t, typename idx_t, typename layout>
-void zero_small_values(const raft::handle_t& handle,
+void zero_small_values(raft::device_resources const& handle,
                        raft::device_matrix_view<const math_t, idx_t, layout> in,
                        raft::device_matrix_view<math_t, idx_t, layout> out,
                        math_t thres = 1e-15)
@@ -52,10 +57,13 @@ void zero_small_values(const raft::handle_t& handle,
  * @param thres: threshold
  */
 template <typename math_t, typename idx_t, typename layout>
-void zero_small_values(const raft::handle_t& handle,
+void zero_small_values(raft::device_resources const& handle,
                        raft::device_matrix_view<math_t, idx_t, layout> inout,
                        math_t thres = 1e-15)
 {
   detail::setSmallValuesZero(inout.data_handle(), inout.size(), handle.get_stream(), thres);
 }
+
+/** @} */  // end group matrix_threshold
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh
index fad3dd77af..3c60cc362f 100644
--- a/cpp/include/raft/matrix/triangular.cuh
+++ b/cpp/include/raft/matrix/triangular.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,11 @@
 
 namespace raft::matrix {
 
+/**
+ * @defgroup matrix_triangular Extract Matrix Triangles
+ * @{
+ */
+
 /**
  * @brief Copy the upper triangular part of a matrix to another
  * @param[in] handle: raft handle
@@ -28,7 +33,7 @@ namespace raft::matrix {
  * @param[out] dst: output matrix with a size of kxk, k = min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t>
-void upper_triangular(const raft::handle_t& handle,
+void upper_triangular(raft::device_resources const& handle,
                       raft::device_matrix_view<const m_t, idx_t, col_major> src,
                       raft::device_matrix_view<m_t, idx_t, col_major> dst)
 {
@@ -38,4 +43,7 @@ void upper_triangular(const raft::handle_t& handle,
   detail::copyUpperTriangular(
     src.data_handle(), dst.data_handle(), src.extent(0), src.extent(1), handle.get_stream());
 }
+
+/** @} */  // end group matrix_triangular
+
 }  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ann_types.hpp b/cpp/include/raft/neighbors/ann_types.hpp
index 5c6fd52be9..5bf2062f2f 100644
--- a/cpp/include/raft/neighbors/ann_types.hpp
+++ b/cpp/include/raft/neighbors/ann_types.hpp
@@ -20,6 +20,11 @@
 
 namespace raft::neighbors::ann {
 
+/**
+ * @defgroup ann_types Approximate Nearest Neighbors Types
+ * @{
+ */
+
 /** The base for approximate KNN index structures. */
 struct index {
 };
@@ -44,4 +49,6 @@ struct index_params {
 struct search_params {
 };
 
+/** @} */  // end group ann_types
+
 };  // namespace raft::neighbors::ann
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
index 28ff8491b6..619c57a35a 100644
--- a/cpp/include/raft/neighbors/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,26 +20,31 @@
 
 #include <cstdint>
 
-#include "ball_cover_types.hpp"
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/ball_cover_types.hpp>
 #include <raft/spatial/knn/detail/ball_cover.cuh>
 #include <raft/spatial/knn/detail/ball_cover/common.cuh>
 #include <thrust/transform.h>
 
 namespace raft::neighbors::ball_cover {
 
+/**
+ * @defgroup random_ball_cover Random Ball Cover algorithm
+ * @{
+ */
+
 /**
  * Builds and populates a previously unbuilt BallCoverIndex
  *
  * Usage example:
  * @code{.cpp}
  *
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/ball_cover.cuh>
  *  #include <raft/distance/distance_types.hpp>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  auto metric = raft::distance::DistanceType::L2Expanded;
  *  BallCoverIndex index(handle, X, metric);
@@ -55,7 +60,7 @@ namespace raft::neighbors::ball_cover {
  * @param[inout] index an empty (and not previous built) instance of BallCoverIndex
  */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void build_index(const raft::handle_t& handle,
+void build_index(raft::device_resources const& handle,
                  BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index)
 {
   ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
@@ -73,6 +78,8 @@ void build_index(const raft::handle_t& handle,
   index.set_index_trained();
 }
 
+/** @} */  // end group random_ball_cover
+
 /**
  * Performs a faster exact knn in metric spaces using the triangle
  * inequality with a number of landmark points to reduce the
@@ -102,7 +109,7 @@ void build_index(const raft::handle_t& handle,
  *               looking in the closest landmark.
  */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void all_knn_query(const raft::handle_t& handle,
+void all_knn_query(raft::device_resources const& handle,
                    BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
                    int_t k,
                    idx_t* inds,
@@ -139,6 +146,11 @@ void all_knn_query(const raft::handle_t& handle,
   index.set_index_trained();
 }
 
+/**
+ * @ingroup random_ball_cover
+ * @{
+ */
+
 /**
  * Performs a faster exact knn in metric spaces using the triangle
  * inequality with a number of landmark points to reduce the
@@ -151,12 +163,12 @@ void all_knn_query(const raft::handle_t& handle,
  * Usage example:
  * @code{.cpp}
  *
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/ball_cover.cuh>
  *  #include <raft/distance/distance_types.hpp>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  auto metric = raft::distance::DistanceType::L2Expanded;
  *
@@ -190,7 +202,7 @@ void all_knn_query(const raft::handle_t& handle,
  *               looking in the closest landmark.
  */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void all_knn_query(const raft::handle_t& handle,
+void all_knn_query(raft::device_resources const& handle,
                    BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
                    raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
                    raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
@@ -212,6 +224,8 @@ void all_knn_query(const raft::handle_t& handle,
     handle, index, k, inds.data_handle(), dists.data_handle(), perform_post_filtering, weight);
 }
 
+/** @} */
+
 /**
  * Performs a faster exact knn in metric spaces using the triangle
  * inequality with a number of landmark points to reduce the
@@ -242,7 +256,7 @@ void all_knn_query(const raft::handle_t& handle,
  * @param[in] n_query_pts number of query points
  */
 template <typename idx_t, typename value_t, typename int_t>
-void knn_query(const raft::handle_t& handle,
+void knn_query(raft::device_resources const& handle,
                const BallCoverIndex<idx_t, value_t, int_t>& index,
                int_t k,
                const value_t* query,
@@ -281,6 +295,11 @@ void knn_query(const raft::handle_t& handle,
   }
 }
 
+/**
+ * @ingroup random_ball_cover
+ * @{
+ */
+
 /**
  * Performs a faster exact knn in metric spaces using the triangle
  * inequality with a number of landmark points to reduce the
@@ -292,12 +311,12 @@ void knn_query(const raft::handle_t& handle,
  * Usage example:
  * @code{.cpp}
  *
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/ball_cover.cuh>
  *  #include <raft/distance/distance_types.hpp>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  auto metric = raft::distance::DistanceType::L2Expanded;
  *
@@ -333,7 +352,7 @@ void knn_query(const raft::handle_t& handle,
  *               looking in the closest landmark.
  */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void knn_query(const raft::handle_t& handle,
+void knn_query(raft::device_resources const& handle,
                const BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
                raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,
                raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
@@ -365,6 +384,8 @@ void knn_query(const raft::handle_t& handle,
             weight);
 }
 
+/** @} */
+
 // TODO: implement functions for:
 //  4. rbc_eps_neigh() - given a populated index, perform query against different query array
 //  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
diff --git a/cpp/include/raft/neighbors/ball_cover_types.hpp b/cpp/include/raft/neighbors/ball_cover_types.hpp
index f6e49ab5c4..8cab1469fc 100644
--- a/cpp/include/raft/neighbors/ball_cover_types.hpp
+++ b/cpp/include/raft/neighbors/ball_cover_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,12 +19,17 @@
 #include <cstdint>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft::neighbors::ball_cover {
 
+/**
+ * @ingroup random_ball_cover
+ * @{
+ */
+
 /**
  * Stores raw index data points, sampled landmarks, the 1-nns of index points
  * to their closest landmarks, and the ball radii of each landmark. This
@@ -40,7 +45,7 @@ template <typename value_idx,
           typename matrix_idx = std::uint32_t>
 class BallCoverIndex {
  public:
-  explicit BallCoverIndex(const raft::handle_t& handle_,
+  explicit BallCoverIndex(raft::device_resources const& handle_,
                           const value_t* X_,
                           value_int m_,
                           value_int n_,
@@ -66,7 +71,7 @@ class BallCoverIndex {
   {
   }
 
-  explicit BallCoverIndex(const raft::handle_t& handle_,
+  explicit BallCoverIndex(raft::device_resources const& handle_,
                           raft::device_matrix_view<const value_t, matrix_idx, row_major> X_,
                           raft::distance::DistanceType metric_)
     : handle(handle_),
@@ -134,7 +139,7 @@ class BallCoverIndex {
   // This should only be set by internal functions
   void set_index_trained() { index_trained = true; }
 
-  const raft::handle_t& handle;
+  raft::device_resources const& handle;
 
   value_int m;
   value_int n;
@@ -158,4 +163,7 @@ class BallCoverIndex {
  protected:
   bool index_trained;
 };
+
+/** @} */
+
 }  // namespace raft::neighbors::ball_cover
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh
index 96cd5f11c5..ac9d14ce17 100644
--- a/cpp/include/raft/neighbors/brute_force.cuh
+++ b/cpp/include/raft/neighbors/brute_force.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,11 @@
 
 namespace raft::neighbors::brute_force {
 
+/**
+ * @defgroup brute_force_knn Brute-force K-Nearest Neighbors
+ * @{
+ */
+
 /**
  * @brief Performs a k-select across several (contiguous) row-partitioned index/distance
  * matrices formatted like the following:
@@ -49,11 +54,11 @@ namespace raft::neighbors::brute_force {
  *
  * Usage example:
  * @code{.cpp}
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/brute_force.cuh>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  compute multiple knn graphs and aggregate row-wise
  *  (see detailed description above)
@@ -74,7 +79,7 @@ namespace raft::neighbors::brute_force {
  */
 template <typename value_t, typename idx_t>
 inline void knn_merge_parts(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, row_major> in_keys,
   raft::device_matrix_view<const idx_t, idx_t, row_major> in_values,
   raft::device_matrix_view<value_t, idx_t, row_major> out_keys,
@@ -111,12 +116,12 @@ inline void knn_merge_parts(
  *
  * Usage example:
  * @code{.cpp}
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/brute_force.cuh>
  *  #include <raft/distance/distance_types.hpp>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  int k = 10;
  *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
@@ -141,7 +146,7 @@ template <typename idx_t,
           typename matrix_idx,
           typename index_layout,
           typename search_layout>
-void knn(raft::handle_t const& handle,
+void knn(raft::device_resources const& handle,
          std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index,
          raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,
          raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
@@ -202,12 +207,12 @@ void knn(raft::handle_t const& handle,
  *
  * Usage example:
  * @code{.cpp}
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/neighbors/brute_force.cuh>
  *  #include <raft/distance/distance_types.hpp>
  *  using namespace raft::neighbors;
  *
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
  *  brute_force::fused_l2_knn(handle, index, search, indices, distances, metric);
@@ -225,7 +230,7 @@ void knn(raft::handle_t const& handle,
  * @param[in] metric type of distance computation to perform (must be a variant of L2)
  */
 template <typename value_t, typename idx_t, typename idx_layout, typename query_layout>
-void fused_l2_knn(const raft::handle_t& handle,
+void fused_l2_knn(raft::device_resources const& handle,
                   raft::device_matrix_view<const value_t, idx_t, idx_layout> index,
                   raft::device_matrix_view<const value_t, idx_t, query_layout> query,
                   raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,
@@ -269,4 +274,6 @@ void fused_l2_knn(const raft::handle_t& handle,
                                          metric);
 }
 
+/** @} */  // end group brute_force_knn
+
 }  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh
index 2dfaa5a5cb..b264643584 100644
--- a/cpp/include/raft/neighbors/detail/refine.cuh
+++ b/cpp/include/raft/neighbors/detail/refine.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
@@ -43,9 +43,9 @@ void check_input(extents_t dataset,
   auto n_queries = queries.extent(0);
   auto k         = distances.extent(1);
 
-  RAFT_EXPECTS(k <= raft::spatial::knn::detail::topk::kMaxCapacity,
+  RAFT_EXPECTS(k <= raft::matrix::detail::select::warpsort::kMaxCapacity,
                "k must be lest than topk::kMaxCapacity (%d).",
-               raft::spatial::knn::detail::topk::kMaxCapacity);
+               raft::matrix::detail::select::warpsort::kMaxCapacity);
 
   RAFT_EXPECTS(indices.extent(0) == n_queries && distances.extent(0) == n_queries &&
                  candidates.extent(0) == n_queries,
@@ -72,7 +72,7 @@ void check_input(extents_t dataset,
  * See raft::neighbors::refine for docs.
  */
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
-void refine_device(raft::handle_t const& handle,
+void refine_device(raft::device_resources const& handle,
                    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,
                    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,
                    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
@@ -141,7 +141,7 @@ typedef struct {
   float distance;
 } struct_for_refinement;
 
-int _postprocessing_qsort_compare(const void* v1, const void* v2)
+inline int _postprocessing_qsort_compare(const void* v1, const void* v2)
 {
   // sort in ascending order
   if (((struct_for_refinement*)v1)->distance > ((struct_for_refinement*)v2)->distance) {
diff --git a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
index 114216fc50..7db5ef6877 100644
--- a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
 
 namespace raft::neighbors::epsilon_neighborhood {
@@ -59,6 +59,11 @@ void epsUnexpL2SqNeighborhood(bool* adj,
     adj, vd, x, y, m, n, k, eps, stream);
 }
 
+/**
+ * @defgroup epsilon_neighbors Epsilon Neighborhood Operations
+ * @{
+ */
+
 /**
  * @brief Computes epsilon neighborhood for the L2-Squared distance metric and given ball size.
  * The epsilon neighbors is represented by a dense boolean adjacency matrix of size m * n and
@@ -67,10 +72,10 @@ void epsUnexpL2SqNeighborhood(bool* adj,
  *
  * @code{.cpp}
  *  #include <raft/neighbors/epsilon_neighborhood.cuh>
- *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_resources.hpp>
  *  #include <raft/core/device_mdarray.hpp>
  *  using namespace raft::neighbors;
- *  raft::handle_t handle;
+ *  raft::device_resources handle;
  *  ...
  *  auto adj = raft::make_device_matrix<bool>(handle, m * n);
  *  auto vd = raft::make_device_vector<int>(handle, m+1);
@@ -92,7 +97,7 @@ void epsUnexpL2SqNeighborhood(bool* adj,
  *                    squared as we compute L2-squared distance in this method)
  */
 template <typename value_t, typename idx_t, typename matrix_idx_t>
-void eps_neighbors_l2sq(const raft::handle_t& handle,
+void eps_neighbors_l2sq(raft::device_resources const& handle,
                         raft::device_matrix_view<const value_t, matrix_idx_t, row_major> x,
                         raft::device_matrix_view<const value_t, matrix_idx_t, row_major> y,
                         raft::device_matrix_view<bool, matrix_idx_t, row_major> adj,
@@ -110,6 +115,8 @@ void eps_neighbors_l2sq(const raft::handle_t& handle,
                                            handle.get_stream());
 }
 
+/** @} */  // end group epsilon_neighbors
+
 }  // namespace raft::neighbors::epsilon_neighborhood
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh
index 5317f406e1..f18611b9f1 100644
--- a/cpp/include/raft/neighbors/ivf_flat.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "ivf_flat_types.hpp"
+#include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/spatial/knn/detail/ivf_flat_build.cuh>
 #include <raft/spatial/knn/detail/ivf_flat_search.cuh>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <raft/core/device_mdspan.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -61,13 +61,20 @@ namespace raft::neighbors::ivf_flat {
  * @return the constructed ivf-flat index
  */
 template <typename T, typename IdxT>
-auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<T, IdxT>
+auto build(raft::device_resources const& handle,
+           const index_params& params,
+           const T* dataset,
+           IdxT n_rows,
+           uint32_t dim) -> index<T, IdxT>
 {
   return raft::spatial::knn::ivf_flat::detail::build(handle, params, dataset, n_rows, dim);
 }
 
+/**
+ * @defgroup ivf_flat IVF Flat Algorithm
+ * @{
+ */
+
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -101,7 +108,7 @@ auto build(
  * @return the constructed ivf-flat index
  */
 template <typename value_t, typename idx_t>
-auto build(const handle_t& handle,
+auto build(raft::device_resources const& handle,
            raft::device_matrix_view<const value_t, idx_t, row_major> dataset,
            const index_params& params) -> index<value_t, idx_t>
 {
@@ -112,6 +119,8 @@ auto build(const handle_t& handle,
                                                      static_cast<idx_t>(dataset.extent(1)));
 }
 
+/** @} */
+
 /**
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
@@ -145,7 +154,7 @@ auto build(const handle_t& handle,
  * @return the constructed extended ivf-flat index
  */
 template <typename T, typename IdxT>
-auto extend(const handle_t& handle,
+auto extend(raft::device_resources const& handle,
             const index<T, IdxT>& orig_index,
             const T* new_vectors,
             const IdxT* new_indices,
@@ -155,6 +164,11 @@ auto extend(const handle_t& handle,
     handle, orig_index, new_vectors, new_indices, n_rows);
 }
 
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
 /**
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
@@ -189,7 +203,7 @@ auto extend(const handle_t& handle,
  * @return the constructed extended ivf-flat index
  */
 template <typename value_t, typename idx_t>
-auto extend(const handle_t& handle,
+auto extend(raft::device_resources const& handle,
             const index<value_t, idx_t>& orig_index,
             raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
             std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
@@ -203,6 +217,8 @@ auto extend(const handle_t& handle,
     new_vectors.extent(0));
 }
 
+/** @} */
+
 /**
  * @brief Extend the index in-place with the new data.
  *
@@ -230,7 +246,7 @@ auto extend(const handle_t& handle,
  * @param[in] n_rows the number of samples
  */
 template <typename T, typename IdxT>
-void extend(const handle_t& handle,
+void extend(raft::device_resources const& handle,
             index<T, IdxT>* index,
             const T* new_vectors,
             const IdxT* new_indices,
@@ -239,6 +255,11 @@ void extend(const handle_t& handle,
   *index = extend(handle, *index, new_vectors, new_indices, n_rows);
 }
 
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
 /**
  * @brief Extend the index in-place with the new data.
  *
@@ -267,7 +288,7 @@ void extend(const handle_t& handle,
  *    here to imply a continuous range `[0...n_rows)`.
  */
 template <typename value_t, typename idx_t>
-void extend(const handle_t& handle,
+void extend(raft::device_resources const& handle,
             index<value_t, idx_t>* index,
             raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
             std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
@@ -279,6 +300,8 @@ void extend(const handle_t& handle,
                   static_cast<idx_t>(new_vectors.extent(0)));
 }
 
+/** @} */
+
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -322,7 +345,7 @@ void extend(const handle_t& handle,
  * enough memory pool here to avoid memory allocations within search).
  */
 template <typename T, typename IdxT>
-void search(const handle_t& handle,
+void search(raft::device_resources const& handle,
             const search_params& params,
             const index<T, IdxT>& index,
             const T* queries,
@@ -336,6 +359,11 @@ void search(const handle_t& handle,
     handle, params, index, queries, n_queries, k, neighbors, distances, mr);
 }
 
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -372,7 +400,7 @@ void search(const handle_t& handle,
  * @param[in] k the number of neighbors to find for each query.
  */
 template <typename value_t, typename idx_t, typename int_t>
-void search(const handle_t& handle,
+void search(raft::device_resources const& handle,
             const index<value_t, idx_t>& index,
             raft::device_matrix_view<const value_t, idx_t, row_major> queries,
             raft::device_matrix_view<idx_t, idx_t, row_major> neighbors,
@@ -402,4 +430,6 @@ void search(const handle_t& handle,
                 nullptr);
 }
 
+/** @} */
+
 }  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 44b88a0b23..d234822a23 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,10 @@
 #include <type_traits>
 
 namespace raft::neighbors::ivf_flat {
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
 
 /** Size of the interleaved group (see `index::data` description). */
 constexpr static uint32_t kIndexGroupSize = 32;
@@ -219,7 +223,7 @@ struct index : ann::index {
   ~index()                          = default;
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle,
+  index(raft::device_resources const& handle,
         raft::distance::DistanceType metric,
         uint32_t n_lists,
         bool adaptive_centers,
@@ -239,7 +243,7 @@ struct index : ann::index {
   }
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle, const index_params& params, uint32_t dim)
+  index(raft::device_resources const& handle, const index_params& params, uint32_t dim)
     : index(handle, params.metric, params.n_lists, params.adaptive_centers, dim)
   {
   }
@@ -248,14 +252,21 @@ struct index : ann::index {
    * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount
    * of data.
    */
-  void allocate(const handle_t& handle, IdxT index_size, bool allocate_center_norms)
+  void allocate(raft::device_resources const& handle, IdxT index_size)
   {
     data_    = make_device_mdarray<T>(handle, make_extents<IdxT>(index_size, dim()));
     indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
-    center_norms_ =
-      allocate_center_norms
-        ? std::optional(make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists())))
-        : std::nullopt;
+
+    switch (metric_) {
+      case raft::distance::DistanceType::L2Expanded:
+      case raft::distance::DistanceType::L2SqrtExpanded:
+      case raft::distance::DistanceType::L2Unexpanded:
+      case raft::distance::DistanceType::L2SqrtUnexpanded:
+        center_norms_ = make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists()));
+        break;
+      default: center_norms_ = std::nullopt;
+    }
+
     check_consistency();
   }
 
@@ -301,4 +312,6 @@ struct index : ann::index {
   }
 };
 
+/** @} */
+
 }  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 5b2035fadf..287f0bc5f4 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,17 +16,22 @@
 
 #pragma once
 
-#include "ivf_pq_types.hpp"
+#include <raft/neighbors/ivf_pq_types.hpp>
 #include <raft/spatial/knn/detail/ivf_pq_build.cuh>
 #include <raft/spatial/knn/detail/ivf_pq_search.cuh>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
 namespace raft::neighbors::ivf_pq {
 
+/**
+ * @defgroup ivf_pq IVF PQ Algorithm
+ * @{
+ */
+
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -60,9 +65,11 @@ namespace raft::neighbors::ivf_pq {
  * @return the constructed ivf-pq index
  */
 template <typename T, typename IdxT = uint32_t>
-inline auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<IdxT>
+inline auto build(raft::device_resources const& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  IdxT n_rows,
+                  uint32_t dim) -> index<IdxT>
 {
   return raft::spatial::knn::ivf_pq::detail::build(handle, params, dataset, n_rows, dim);
 }
@@ -100,7 +107,7 @@ inline auto build(
  * @return the constructed extended ivf-pq index
  */
 template <typename T, typename IdxT>
-inline auto extend(const handle_t& handle,
+inline auto extend(raft::device_resources const& handle,
                    const index<IdxT>& orig_index,
                    const T* new_vectors,
                    const IdxT* new_indices,
@@ -125,7 +132,7 @@ inline auto extend(const handle_t& handle,
  * @param n_rows the number of samples
  */
 template <typename T, typename IdxT>
-inline void extend(const handle_t& handle,
+inline void extend(raft::device_resources const& handle,
                    index<IdxT>* index,
                    const T* new_vectors,
                    const IdxT* new_indices,
@@ -177,7 +184,7 @@ inline void extend(const handle_t& handle,
  *           memory pool here to avoid memory allocations within search).
  */
 template <typename T, typename IdxT>
-inline void search(const handle_t& handle,
+inline void search(raft::device_resources const& handle,
                    const search_params& params,
                    const index<IdxT>& index,
                    const T* queries,
@@ -191,4 +198,6 @@ inline void search(const handle_t& handle,
     handle, params, index, queries, n_queries, k, neighbors, distances, mr);
 }
 
+/** @} */  // end group ivf_pq
+
 }  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
index afb3eb6cd6..a6f71877f3 100644
--- a/cpp/include/raft/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,11 @@
 
 namespace raft::neighbors::ivf_pq {
 
+/**
+ * @ingroup ivf_pq
+ * @{
+ */
+
 /** A type for specifying how PQ codebooks are created. */
 enum class codebook_gen {  // NOLINT
   PER_SUBSPACE = 0,        // NOLINT
@@ -258,7 +263,7 @@ struct index : ann::index {
   ~index()                          = default;
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle,
+  index(raft::device_resources const& handle,
         raft::distance::DistanceType metric,
         codebook_gen codebook_kind,
         uint32_t n_lists,
@@ -290,7 +295,7 @@ struct index : ann::index {
   }
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle,
+  index(raft::device_resources const& handle,
         const index_params& params,
         uint32_t dim,
         uint32_t n_nonempty_lists = 0)
@@ -309,10 +314,18 @@ struct index : ann::index {
    * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount
    * of data.
    */
-  void allocate(const handle_t& handle, IdxT index_size)
+  void allocate(raft::device_resources const& handle, IdxT index_size)
   {
-    pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
-    indices_    = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
+    try {
+      pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
+      indices_    = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
+    } catch (std::bad_alloc& e) {
+      RAFT_FAIL(
+        "ivf-pq: failed to allocate a big enough index to hold all data (size: %zu). "
+        "Allocator exception: %s",
+        size_t(index_size),
+        e.what());
+    }
     if (index_size > 0) {
       thrust::fill_n(
         handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord);
@@ -429,7 +442,7 @@ struct index : ann::index {
 
   /** A helper function to determine the extents of an array enough to hold a given amount of data.
    */
-  auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents
+  auto make_pq_dataset_extents(IdxT n_rows) const -> pq_dataset_extents
   {
     // how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk
     auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits();
@@ -497,4 +510,6 @@ struct index : ann::index {
   }
 };
 
+/** @} */
+
 }  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine.cuh
index 7b6708f18c..4243d7e723 100644
--- a/cpp/include/raft/neighbors/refine.cuh
+++ b/cpp/include/raft/neighbors/refine.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/matrix/matrix.cuh>
 #include <raft/neighbors/detail/refine.cuh>
@@ -25,6 +25,11 @@
 
 namespace raft::neighbors {
 
+/**
+ * @defgroup ann_refine Approximate Nearest Neighbors Refinement
+ * @{
+ */
+
 /**
  * @brief Refine nearest neighbor search.
  *
@@ -63,7 +68,7 @@ namespace raft::neighbors {
  * @param[in] metric distance metric to use. Euclidean (L2) is used by default
  */
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
-void refine(raft::handle_t const& handle,
+void refine(raft::device_resources const& handle,
             raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,
             raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,
             raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
@@ -85,7 +90,7 @@ void refine(raft::handle_t const& handle,
  * @param[in] metric distance metric to use. Euclidean (L2) is used by default
  */
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
-void refine(raft::handle_t const& handle,
+void refine(raft::device_resources const& handle,
             raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,
             raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,
             raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
@@ -95,4 +100,6 @@ void refine(raft::handle_t const& handle,
 {
   detail::refine_host(dataset, queries, neighbor_candidates, indices, distances, metric);
 }
+
+/** @} */  // end group ann_refine
 }  // namespace raft::neighbors
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index 0511bbbf6c..d6cdae1e68 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/neighbors/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
index f20d1adc35..d44cb7064f 100644
--- a/cpp/include/raft/neighbors/specializations/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,11 +26,11 @@ extern template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
 extern template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
 
 extern template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
 
 extern template void knn_query<std::int64_t, float, std::uint32_t>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
   const float* query,
@@ -41,7 +41,7 @@ extern template void knn_query<std::int64_t, float, std::uint32_t>(
   float weight);
 
 extern template void all_knn_query<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
   std::int64_t* inds,
diff --git a/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp b/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
index 31df566b3f..c558ab8b56 100644
--- a/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
+++ b/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace knn {
 namespace detail {
 
 extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 2>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
@@ -38,7 +38,7 @@ extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 2>
   std::uint32_t* dists_counter);
 
 extern template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 2>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
@@ -52,7 +52,7 @@ extern template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 2>
   std::uint32_t* post_dists_counter);
 
 extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
@@ -66,7 +66,7 @@ extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>
   std::uint32_t* dists_counter);
 
 extern template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 3>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
diff --git a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh
index 768a8739c3..ca5e4ac761 100644
--- a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,15 +50,15 @@ RAFT_INST_ALL_OUT_T(fp8u_t)
 #undef RAFT_INST_ALL_IDX_T
 #undef RAFT_INST_ALL_OUT_T
 
-#define RAFT_INST(T, IdxT)                                   \
-  extern template void search<T, IdxT>(const handle_t&,      \
-                                       const search_params&, \
-                                       const index<IdxT>&,   \
-                                       const T*,             \
-                                       uint32_t,             \
-                                       uint32_t,             \
-                                       IdxT*,                \
-                                       float*,               \
+#define RAFT_INST(T, IdxT)                                            \
+  extern template void search<T, IdxT>(raft::device_resources const&, \
+                                       const search_params&,          \
+                                       const index<IdxT>&,            \
+                                       const T*,                      \
+                                       uint32_t,                      \
+                                       uint32_t,                      \
+                                       IdxT*,                         \
+                                       float*,                        \
                                        rmm::mr::device_memory_resource*);
 
 RAFT_INST(float, int64_t);
diff --git a/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp b/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp
deleted file mode 100644
index 2bce997e18..0000000000
--- a/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/ivf_pq_types.hpp>
-
-namespace raft::neighbors ::ivf_pq {
-
-#define RAFT_INST_SEARCH(T, IdxT)   \
-  void search(const handle_t&,      \
-              const search_params&, \
-              const index<IdxT>&,   \
-              const T*,             \
-              uint32_t,             \
-              uint32_t,             \
-              IdxT*,                \
-              float*,               \
-              rmm::mr::device_memory_resource*);
-
-RAFT_INST_SEARCH(float, uint64_t);
-RAFT_INST_SEARCH(int8_t, uint64_t);
-RAFT_INST_SEARCH(uint8_t, uint64_t);
-
-#undef RAFT_INST_SEARCH
-
-// We define overloads for build and extend with void return type. This is used in the Cython
-// wrappers, where exception handling is not compatible with return type that has nontrivial
-// constructor.
-#define RAFT_INST_BUILD_EXTEND(T, IdxT)      \
-  auto build(const handle_t& handle,         \
-             const index_params& params,     \
-             const T* dataset,               \
-             IdxT n_rows,                    \
-             uint32_t dim)                   \
-    ->index<IdxT>;                           \
-                                             \
-  auto extend(const handle_t& handle,        \
-              const index<IdxT>& orig_index, \
-              const T* new_vectors,          \
-              const IdxT* new_indices,       \
-              IdxT n_rows)                   \
-    ->index<IdxT>;                           \
-                                             \
-  void build(const handle_t& handle,         \
-             const index_params& params,     \
-             const T* dataset,               \
-             IdxT n_rows,                    \
-             uint32_t dim,                   \
-             index<IdxT>* idx);              \
-                                             \
-  void extend(const handle_t& handle,        \
-              index<IdxT>* idx,              \
-              const T* new_vectors,          \
-              const IdxT* new_indices,       \
-              IdxT n_rows);
-
-RAFT_INST_BUILD_EXTEND(float, uint64_t)
-RAFT_INST_BUILD_EXTEND(int8_t, uint64_t)
-RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t)
-
-#undef RAFT_INST_BUILD_EXTEND
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/specializations/knn.cuh b/cpp/include/raft/neighbors/specializations/knn.cuh
index bbbbf67d71..b1cfa278d6 100644
--- a/cpp/include/raft/neighbors/specializations/knn.cuh
+++ b/cpp/include/raft/neighbors/specializations/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 namespace raft {
 namespace spatial {
 namespace knn {
-extern template void brute_force_knn<long, float, int>(raft::handle_t const& handle,
+extern template void brute_force_knn<long, float, int>(raft::device_resources const& handle,
                                                        std::vector<float*>& input,
                                                        std::vector<int>& sizes,
                                                        int D,
@@ -36,22 +36,23 @@ extern template void brute_force_knn<long, float, int>(raft::handle_t const& han
                                                        distance::DistanceType metric,
                                                        float metric_arg);
 
-extern template void brute_force_knn<long, float, unsigned int>(raft::handle_t const& handle,
-                                                                std::vector<float*>& input,
-                                                                std::vector<unsigned int>& sizes,
-                                                                unsigned int D,
-                                                                float* search_items,
-                                                                unsigned int n,
-                                                                long* res_I,
-                                                                float* res_D,
-                                                                unsigned int k,
-                                                                bool rowMajorIndex,
-                                                                bool rowMajorQuery,
-                                                                std::vector<long>* translations,
-                                                                distance::DistanceType metric,
-                                                                float metric_arg);
+extern template void brute_force_knn<long, float, unsigned int>(
+  raft::device_resources const& handle,
+  std::vector<float*>& input,
+  std::vector<unsigned int>& sizes,
+  unsigned int D,
+  float* search_items,
+  unsigned int n,
+  long* res_I,
+  float* res_D,
+  unsigned int k,
+  bool rowMajorIndex,
+  bool rowMajorQuery,
+  std::vector<long>* translations,
+  distance::DistanceType metric,
+  float metric_arg);
 
-extern template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
+extern template void brute_force_knn<uint32_t, float, int>(raft::device_resources const& handle,
                                                            std::vector<float*>& input,
                                                            std::vector<int>& sizes,
                                                            int D,
@@ -67,7 +68,7 @@ extern template void brute_force_knn<uint32_t, float, int>(raft::handle_t const&
                                                            float metric_arg);
 
 extern template void brute_force_knn<uint32_t, float, unsigned int>(
-  raft::handle_t const& handle,
+  raft::device_resources const& handle,
   std::vector<float*>& input,
   std::vector<unsigned int>& sizes,
   unsigned int D,
diff --git a/cpp/include/raft/neighbors/specializations/refine.cuh b/cpp/include/raft/neighbors/specializations/refine.cuh
new file mode 100644
index 0000000000..71e83a26f3
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/refine.cuh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+#ifdef RAFT_INST
+#undef RAFT_INST
+#endif
+// Declares the extern refine<IdxT, T, float, uint64_t> instantiations (device and host views).
+#define RAFT_INST(T, IdxT)                                                         \
+  extern template void refine<IdxT, T, float, uint64_t>(                           \
+    raft::device_resources const& handle,                                          \
+    raft::device_matrix_view<const T, uint64_t, row_major> dataset,                \
+    raft::device_matrix_view<const T, uint64_t, row_major> queries,                \
+    raft::device_matrix_view<const IdxT, uint64_t, row_major> neighbor_candidates, \
+    raft::device_matrix_view<IdxT, uint64_t, row_major> indices,                   \
+    raft::device_matrix_view<float, uint64_t, row_major> distances,                \
+    distance::DistanceType metric);                                                \
+                                                                                   \
+  extern template void refine<IdxT, T, float, uint64_t>(                           \
+    raft::device_resources const& handle,                                          \
+    raft::host_matrix_view<const T, uint64_t, row_major> dataset,                  \
+    raft::host_matrix_view<const T, uint64_t, row_major> queries,                  \
+    raft::host_matrix_view<const IdxT, uint64_t, row_major> neighbor_candidates,   \
+    raft::host_matrix_view<IdxT, uint64_t, row_major> indices,                     \
+    raft::host_matrix_view<float, uint64_t, row_major> distances,                  \
+    distance::DistanceType metric);
+// Declared for float, uint8_t and int8_t data, each with uint64_t neighbor indices.
+RAFT_INST(float, uint64_t);
+RAFT_INST(uint8_t, uint64_t);
+RAFT_INST(int8_t, uint64_t);
+
+#undef RAFT_INST
+}  // namespace raft::neighbors
diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh
index 212245a9bf..b37dabb366 100644
--- a/cpp/include/raft/random/detail/make_blobs.cuh
+++ b/cpp/include/raft/random/detail/make_blobs.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,8 @@
 #pragma once
 
 #include "permute.cuh"
-#include <raft/linalg/unary_op.cuh>
+#include <raft/core/handle.hpp>
+#include <raft/linalg/map.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/random/rng_device.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -39,16 +40,16 @@ void generate_labels(IdxT* labels,
                      raft::random::RngState& r,
                      cudaStream_t stream)
 {
+  raft::handle_t handle(stream);
   IdxT a, b;
   raft::random::affine_transform_params(r, n_clusters, a, b);
-  auto op = [=] __device__(IdxT * ptr, IdxT idx) {
-    if (shuffle) { idx = IdxT((a * int64_t(idx)) + b); }
+  auto op = [=] __device__(IdxT idx) {
+    if (shuffle) { idx = static_cast<IdxT>((a * int64_t(idx)) + b); }
     idx %= n_clusters;
-    // in the unlikely case of n_clusters > n_rows, make sure that the writes
-    // do not go out-of-bounds
-    if (idx < n_rows) { *ptr = idx; }
+    return idx;
   };
-  raft::linalg::writeOnlyUnaryOp<IdxT, decltype(op), IdxT>(labels, n_rows, op, stream);
+  auto labels_view = raft::make_device_vector_view<IdxT, IdxT>(labels, n_rows);
+  linalg::map_offset(handle, labels_view, op);
 }
 
 template <typename DataT, typename IdxT>
@@ -156,8 +157,10 @@ void generate_data(DataT* out,
                    const DataT cluster_std_scalar,
                    raft::random::RngState& rng_state)
 {
-  IdxT items   = n_rows * n_cols;
-  IdxT nBlocks = (items + 127) / 128;
+  constexpr IdxT block_size = 128;
+  IdxT items                = n_rows * n_cols;
+  // Choose a grid size so that each thread can write two output values.
+  IdxT nBlocks = ceildiv<IdxT>(items, 2 * block_size);
   // parentheses needed here for kernel, otherwise macro interprets the arguments
   // of triple chevron notation as macro arguments
   RAFT_CALL_RNG_FUNC(rng_state,
diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh
index f06e20d4a6..01d97d496d 100644
--- a/cpp/include/raft/random/detail/make_regression.cuh
+++ b/cpp/include/raft/random/detail/make_regression.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 
 #include <algorithm>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/init.cuh>
@@ -44,15 +44,15 @@ static __global__ void _singular_profile_kernel(DataT* out, IdxT n, DataT tail_s
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n) {
     DataT sval     = static_cast<DataT>(tid) / rank;
-    DataT low_rank = ((DataT)1.0 - tail_strength) * raft::myExp(-sval * sval);
-    DataT tail     = tail_strength * raft::myExp((DataT)-0.1 * sval);
+    DataT low_rank = ((DataT)1.0 - tail_strength) * raft::exp(-sval * sval);
+    DataT tail     = tail_strength * raft::exp((DataT)-0.1 * sval);
     out[tid]       = low_rank + tail;
   }
 }
 
 /* Internal auxiliary function to generate a low-rank matrix */
 template <typename DataT, typename IdxT>
-static void _make_low_rank_matrix(const raft::handle_t& handle,
+static void _make_low_rank_matrix(raft::device_resources const& handle,
                                   DataT* out,
                                   IdxT n_rows,
                                   IdxT n_cols,
@@ -143,7 +143,7 @@ static __global__ void _gather2d_kernel(
 }
 
 template <typename DataT, typename IdxT>
-void make_regression_caller(const raft::handle_t& handle,
+void make_regression_caller(raft::device_resources const& handle,
                             DataT* out,
                             DataT* values,
                             IdxT n_rows,
@@ -158,7 +158,7 @@ void make_regression_caller(const raft::handle_t& handle,
                             DataT noise                      = (DataT)0.0,
                             bool shuffle                     = true,
                             uint64_t seed                    = 0ULL,
-                            raft::random::GeneratorType type = raft::random::GenPhilox)
+                            raft::random::GeneratorType type = raft::random::GenPC)
 {
   n_informative = std::min(n_informative, n_cols);
 
diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
index 5bed71f2f4..8b77608e62 100644
--- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
+++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #include <memory>
 #include <optional>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
@@ -139,13 +139,15 @@ class multi_variable_gaussian_impl {
   int *info, Lwork, info_h;
   syevjInfo_t syevj_params = NULL;
   curandGenerator_t gen;
-  const raft::handle_t& handle;
+  raft::device_resources const& handle;
   cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
   bool deinitilized      = false;
 
  public:  // functions
   multi_variable_gaussian_impl() = delete;
-  multi_variable_gaussian_impl(const raft::handle_t& handle, const int dim, Decomposer method)
+  multi_variable_gaussian_impl(raft::device_resources const& handle,
+                               const int dim,
+                               Decomposer method)
     : handle(handle), dim(dim), method(method)
   {
     auto cusolverHandle = handle.get_cusolver_dn_handle();
@@ -297,7 +299,7 @@ class multi_variable_gaussian_setup_token;
 
 template <typename ValueType>
 multi_variable_gaussian_setup_token<ValueType> build_multi_variable_gaussian_token_impl(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   rmm::mr::device_memory_resource& mem_resource,
   const int dim,
   const multi_variable_gaussian_decomposition_method method);
@@ -313,7 +315,7 @@ template <typename ValueType>
 class multi_variable_gaussian_setup_token {
   template <typename T>
   friend multi_variable_gaussian_setup_token<T> build_multi_variable_gaussian_token_impl(
-    const raft::handle_t& handle,
+    raft::device_resources const& handle,
     rmm::mr::device_memory_resource& mem_resource,
     const int dim,
     const multi_variable_gaussian_decomposition_method method);
@@ -340,7 +342,7 @@ class multi_variable_gaussian_setup_token {
 
   // Constructor, only for use by friend functions.
   // Hiding this will let us change the implementation in the future.
-  multi_variable_gaussian_setup_token(const raft::handle_t& handle,
+  multi_variable_gaussian_setup_token(raft::device_resources const& handle,
                                       rmm::mr::device_memory_resource& mem_resource,
                                       const int dim,
                                       const multi_variable_gaussian_decomposition_method method)
@@ -397,7 +399,7 @@ class multi_variable_gaussian_setup_token {
 
  private:
   std::unique_ptr<multi_variable_gaussian_impl<ValueType>> impl_;
-  const raft::handle_t& handle_;
+  raft::device_resources const& handle_;
   rmm::mr::device_memory_resource& mem_resource_;
   int dim_ = 0;
 
@@ -412,7 +414,7 @@ class multi_variable_gaussian_setup_token {
 
 template <typename ValueType>
 multi_variable_gaussian_setup_token<ValueType> build_multi_variable_gaussian_token_impl(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   rmm::mr::device_memory_resource& mem_resource,
   const int dim,
   const multi_variable_gaussian_decomposition_method method)
@@ -432,7 +434,7 @@ void compute_multi_variable_gaussian_impl(
 
 template <typename ValueType>
 void compute_multi_variable_gaussian_impl(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   rmm::mr::device_memory_resource& mem_resource,
   std::optional<raft::device_vector_view<const ValueType, int>> x,
   raft::device_matrix_view<ValueType, int, raft::col_major> P,
@@ -444,5 +446,41 @@ void compute_multi_variable_gaussian_impl(
   compute_multi_variable_gaussian_impl(token, x, P, X);
 }
 
+template <typename T>
+class multi_variable_gaussian : public detail::multi_variable_gaussian_impl<T> {
+ public:
+  // using Decomposer = typename detail::multi_variable_gaussian_impl<T>::Decomposer;
+  // using detail::multi_variable_gaussian_impl<T>::Decomposer::chol_decomp;
+  // using detail::multi_variable_gaussian_impl<T>::Decomposer::jacobi;
+  // using detail::multi_variable_gaussian_impl<T>::Decomposer::qr;
+  // Thin wrapper: every member below forwards to detail::multi_variable_gaussian_impl<T>.
+  multi_variable_gaussian() = delete;
+  multi_variable_gaussian(raft::device_resources const& handle,
+                          const int dim,
+                          typename detail::multi_variable_gaussian_impl<T>::Decomposer method)
+    : detail::multi_variable_gaussian_impl<T>{handle, dim, method}
+  {
+  }
+
+  std::size_t get_workspace_size()
+  {
+    return detail::multi_variable_gaussian_impl<T>::get_workspace_size();
+  }
+
+  void set_workspace(T* workarea)
+  {
+    detail::multi_variable_gaussian_impl<T>::set_workspace(workarea);
+  }
+
+  void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0)
+  {
+    detail::multi_variable_gaussian_impl<T>::give_gaussian(nPoints, P, X, x);
+  }
+
+  void deinit() { detail::multi_variable_gaussian_impl<T>::deinit(); }
+  // NOTE(review): the destructor also calls deinit(); presumably the impl tolerates repeat calls.
+  ~multi_variable_gaussian() { deinit(); }
+};  // end of multi_variable_gaussian
+
 };  // end of namespace detail
 };  // end of namespace raft::random
diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
index 5ce7e909ee..b5e0610405 100644
--- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include "rmat_rectangular_generator_types.cuh"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
@@ -206,7 +206,7 @@ void rmat_rectangular_gen_caller(IdxT* out,
  * @param[in]  c_scale 2^c_scale represents the number of destination nodes
  */
 template <typename IdxT, typename ProbT>
-void rmat_rectangular_gen_impl(const raft::handle_t& handle,
+void rmat_rectangular_gen_impl(raft::device_resources const& handle,
                                raft::random::RngState& r,
                                raft::device_vector_view<const ProbT, IdxT> theta,
                                raft::random::detail::rmat_rectangular_gen_output<IdxT> output,
@@ -259,7 +259,7 @@ void rmat_rectangular_gen_impl(const raft::handle_t& handle,
  * `theta` parameter.
  */
 template <typename IdxT, typename ProbT>
-void rmat_rectangular_gen_impl(const raft::handle_t& handle,
+void rmat_rectangular_gen_impl(raft::device_resources const& handle,
                                raft::random::RngState& r,
                                raft::random::detail::rmat_rectangular_gen_output<IdxT> output,
                                ProbT a,
diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh
index ef13138beb..7f994fb07f 100644
--- a/cpp/include/raft/random/detail/rng_device.cuh
+++ b/cpp/include/raft/random/detail/rng_device.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,10 +143,10 @@ DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type
 {
   constexpr Type twoPi  = Type(2.0) * Type(3.141592654);
   constexpr Type minus2 = -Type(2.0);
-  Type R                = raft::mySqrt(minus2 * raft::myLog(val1));
+  Type R                = raft::sqrt(minus2 * raft::log(val1));
   Type theta            = twoPi * val2;
   Type s, c;
-  raft::mySinCos(theta, s, c);
+  raft::sincos(theta, &s, &c);
   val1 = R * c * sigma1 + mu1;
   val2 = R * s * sigma2 + mu2;
 }
@@ -323,7 +323,7 @@ DI void custom_next(
     gen.next(res);
   } while (res == OutType(0.0));
 
-  *val = params.mu - params.beta * raft::myLog(-raft::myLog(res));
+  *val = params.mu - params.beta * raft::log(-raft::log(res));
 }
 
 template <typename GenType, typename OutType, typename LenType>
@@ -340,8 +340,8 @@ DI void custom_next(GenType& gen,
 
   gen.next(res2);
   box_muller_transform<OutType>(res1, res2, params.sigma, params.mu);
-  *val       = raft::myExp(res1);
-  *(val + 1) = raft::myExp(res2);
+  *val       = raft::exp(res1);
+  *(val + 1) = raft::exp(res2);
 }
 
 template <typename GenType, typename OutType, typename LenType>
@@ -358,7 +358,7 @@ DI void custom_next(GenType& gen,
   } while (res == OutType(0.0));
 
   constexpr OutType one = (OutType)1.0;
-  *val                  = params.mu - params.scale * raft::myLog(one / res - one);
+  *val                  = params.mu - params.scale * raft::log(one / res - one);
 }
 
 template <typename GenType, typename OutType, typename LenType>
@@ -371,7 +371,7 @@ DI void custom_next(GenType& gen,
   OutType res;
   gen.next(res);
   constexpr OutType one = (OutType)1.0;
-  *val                  = -raft::myLog(one - res) / params.lambda;
+  *val                  = -raft::log(one - res) / params.lambda;
 }
 
 template <typename GenType, typename OutType, typename LenType>
@@ -386,7 +386,7 @@ DI void custom_next(GenType& gen,
 
   constexpr OutType one = (OutType)1.0;
   constexpr OutType two = (OutType)2.0;
-  *val                  = raft::mySqrt(-two * raft::myLog(one - res)) * params.sigma;
+  *val                  = raft::sqrt(-two * raft::log(one - res)) * params.sigma;
 }
 
 template <typename GenType, typename OutType, typename LenType>
@@ -409,9 +409,9 @@ DI void custom_next(GenType& gen,
   // The <= comparison here means, number of samples going in `if` branch are more by 1 than `else`
   // branch. However it does not matter as for 0.5 both branches evaluate to same result.
   if (res <= oneHalf) {
-    out = params.mu + params.scale * raft::myLog(two * res);
+    out = params.mu + params.scale * raft::log(two * res);
   } else {
-    out = params.mu - params.scale * raft::myLog(two * (one - res));
+    out = params.mu - params.scale * raft::log(two * (one - res));
   }
   *val = out;
 }
@@ -424,7 +424,7 @@ DI void custom_next(
   gen.next(res);
   params.inIdxPtr[idx]  = idx;
   constexpr OutType one = (OutType)1.0;
-  auto exp              = -raft::myLog(one - res);
+  auto exp              = -raft::log(one - res);
   if (params.wts != nullptr) {
     *val = exp / params.wts[idx];
   } else {
@@ -681,6 +681,40 @@ __global__ void rngKernel(DeviceState<GenType> rng_state,
   return;
 }
 
+template <typename GenType, typename OutType, typename WeightType, typename IdxType>
+__global__ void sample_with_replacement_kernel(DeviceState<GenType> rng_state,
+                                               OutType* out,
+                                               const WeightType* weights_csum,
+                                               IdxType sampledLen,
+                                               IdxType len)
+{
+  // Each thread with tid < sampledLen draws one value and writes one index to out[tid].
+  // todo(lsugy): warp-collaborative binary search
+  IdxType tid = threadIdx.x + static_cast<IdxType>(blockIdx.x) * blockDim.x;
+  GenType gen(rng_state, static_cast<uint64_t>(tid));
+
+  if (tid < sampledLen) {
+    WeightType val_01;
+    gen.next(val_01);
+    WeightType val_search = val_01 * weights_csum[len - 1];  // scale by the last cumulative sum
+
+    // Binary search of the first index for which the cumulative sum of weights is greater than
+    // or equal to the scaled value
+    IdxType idx_start = 0;
+    IdxType idx_end   = len;
+    while (idx_end > idx_start) {
+      IdxType idx_middle    = idx_start + (idx_end - idx_start) / 2;  // overflow-safe midpoint
+      WeightType val_middle = weights_csum[idx_middle];
+      if (val_search <= val_middle) {
+        idx_end = idx_middle;
+      } else {
+        idx_start = idx_middle + 1;
+      }
+    }
+    out[tid] = static_cast<OutType>(min(idx_start, len - 1));  // clamp if no entry matched
+  }
+}
+
 /**
  * This kernel is deprecated and should be removed in a future release
  */
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 5aecbfcaa2..cd465e634a 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -234,6 +234,50 @@ void laplace(
   RAFT_CALL_RNG_FUNC(rng_state, call_rng_kernel<1>, rng_state, stream, ptr, len, params);
 }
 
+template <typename GenType, typename OutType, typename WeightType, typename IdxType>
+void call_sample_with_replacement_kernel(DeviceState<GenType> const& dev_state,
+                                         RngState& rng_state,
+                                         cudaStream_t stream,
+                                         OutType* out,
+                                         const WeightType* weights_csum,
+                                         IdxType sampledLen,
+                                         IdxType len)
+{
+  IdxType n_threads = 256;  // threads per block
+  IdxType n_blocks  = raft::ceildiv(sampledLen, n_threads);  // enough blocks to cover sampledLen
+  sample_with_replacement_kernel<<<n_blocks, n_threads, 0, stream>>>(
+    dev_state, out, weights_csum, sampledLen, len);  // NOTE(review): 0 blocks if sampledLen == 0
+  rng_state.advance(uint64_t(n_blocks) * n_threads, 1);  // first argument: launched thread count
+}
+
+template <typename OutType, typename WeightType, typename IndexType = OutType>
+std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
+                                                       OutType* ptr,
+                                                       const WeightType* weights,
+                                                       IndexType sampledLen,
+                                                       IndexType len,
+                                                       cudaStream_t stream)
+{
+  // Weighted sampling with replacement. Step 1: inclusive scan of the weights into weights_csum
+  size_t temp_storage_bytes = 0;
+  rmm::device_uvector<WeightType> weights_csum(len, stream);
+  cub::DeviceScan::InclusiveSum(
+    nullptr, temp_storage_bytes, weights, weights_csum.data(), len, stream);  // size query only
+  rmm::device_uvector<uint8_t> temp_storage(temp_storage_bytes, stream);
+  cub::DeviceScan::InclusiveSum(
+    temp_storage.data(), temp_storage_bytes, weights, weights_csum.data(), len, stream);
+  // NOTE(review): the cudaError_t returned by both InclusiveSum calls is discarded
+  // Step 2: write sampledLen sampled indices to ptr via the sampling kernel
+  RAFT_CALL_RNG_FUNC(rng_state,
+                     call_sample_with_replacement_kernel,
+                     rng_state,
+                     stream,
+                     ptr,
+                     weights_csum.data(),
+                     sampledLen,
+                     len);
+}
+
 template <typename DataT, typename WeightsT, typename IdxT = int>
 void sampleWithoutReplacement(RngState& rng_state,
                               DataT* out,
diff --git a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
index f9b55dd9d0..362c844fb3 100644
--- a/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
+++ b/cpp/include/raft/random/detail/rng_impl_deprecated.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 #include "rng_device.cuh"
 
 #include <curand_kernel.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/detail/cub_wrappers.cuh>
@@ -259,7 +259,7 @@ class RngImpl {
 
   template <typename DataT, typename WeightsT, typename IdxT = int>
   METHOD_DEPR(sampleWithoutReplacement)
-  void sampleWithoutReplacement(const raft::handle_t& handle,
+  void sampleWithoutReplacement(raft::device_resources const& handle,
                                 DataT* out,
                                 IdxT* outIdx,
                                 const DataT* in,
diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh
index 82c940b471..7aa0362f6d 100644
--- a/cpp/include/raft/random/make_blobs.cuh
+++ b/cpp/include/raft/random/make_blobs.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,7 +74,7 @@ void make_blobs(DataT* out,
                 DataT center_box_min           = (DataT)-10.0,
                 DataT center_box_max           = (DataT)10.0,
                 uint64_t seed                  = 0ULL,
-                GeneratorType type             = GenPhilox)
+                GeneratorType type             = GenPC)
 {
   detail::make_blobs_caller(out,
                             labels,
@@ -93,6 +93,11 @@ void make_blobs(DataT* out,
                             type);
 }
 
+/**
+ * @defgroup make_blobs Generate Isotropic Gaussian Clusters
+ * @{
+ */
+
 /**
  * @brief GPU-equivalent of sklearn.datasets.make_blobs
  *
@@ -124,7 +129,7 @@ void make_blobs(DataT* out,
  */
 template <typename DataT, typename IdxT, typename layout>
 void make_blobs(
-  raft::handle_t const& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<DataT, IdxT, layout> out,
   raft::device_vector_view<IdxT, IdxT> labels,
   IdxT n_clusters                                                        = 5,
@@ -135,7 +140,7 @@ void make_blobs(
   DataT center_box_min                                                   = (DataT)-10.0,
   DataT center_box_max                                                   = (DataT)10.0,
   uint64_t seed                                                          = 0ULL,
-  GeneratorType type                                                     = GenPhilox)
+  GeneratorType type                                                     = GenPC)
 {
   if (centers.has_value()) {
     RAFT_EXPECTS(centers.value().extent(0) == (IdxT)n_clusters,
@@ -173,6 +178,9 @@ void make_blobs(
                             seed,
                             type);
 }
+
+/** @} */  // end group make_blobs
+
 }  // end namespace raft::random
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/random/make_regression.cuh b/cpp/include/raft/random/make_regression.cuh
index c575ea987c..df7dea3156 100644
--- a/cpp/include/raft/random/make_regression.cuh
+++ b/cpp/include/raft/random/make_regression.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ namespace raft::random {
  * @param[in]   type            Random generator type
  */
 template <typename DataT, typename IdxT>
-void make_regression(const raft::handle_t& handle,
+void make_regression(raft::device_resources const& handle,
                      DataT* out,
                      DataT* values,
                      IdxT n_rows,
@@ -82,7 +82,7 @@ void make_regression(const raft::handle_t& handle,
                      DataT noise         = (DataT)0.0,
                      bool shuffle        = true,
                      uint64_t seed       = 0ULL,
-                     GeneratorType type  = GenPhilox)
+                     GeneratorType type  = GenPC)
 {
   detail::make_regression_caller(handle,
                                  out,
@@ -102,6 +102,11 @@ void make_regression(const raft::handle_t& handle,
                                  type);
 }
 
+/**
+ * @defgroup make_regression Generate Dataset for Regression Model
+ * @{
+ */
+
 /**
  * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at:
  * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html
@@ -133,7 +138,7 @@ void make_regression(const raft::handle_t& handle,
  * @param[in]   type            Random generator type
  */
 template <typename DataT, typename IdxT>
-void make_regression(const raft::handle_t& handle,
+void make_regression(raft::device_resources const& handle,
                      raft::device_matrix_view<DataT, IdxT, raft::row_major> out,
                      raft::device_matrix_view<DataT, IdxT, raft::row_major> values,
                      IdxT n_informative,
@@ -144,7 +149,7 @@ void make_regression(const raft::handle_t& handle,
                      DataT noise         = DataT{},
                      bool shuffle        = true,
                      uint64_t seed       = 0ULL,
-                     GeneratorType type  = GenPhilox)
+                     GeneratorType type  = GenPC)
 {
   const auto n_samples = out.extent(0);
   assert(values.extent(0) == n_samples);
@@ -177,6 +182,8 @@ void make_regression(const raft::handle_t& handle,
                                  type);
 }
 
+/** @} */  // end group make_regression
+
 }  // namespace raft::random
 
 #endif
diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh
index 6bee323007..91a7695f2c 100644
--- a/cpp/include/raft/random/multi_variable_gaussian.cuh
+++ b/cpp/include/raft/random/multi_variable_gaussian.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,61 +24,28 @@
 
 namespace raft::random {
 
-template <typename T>
-class multi_variable_gaussian : public detail::multi_variable_gaussian_impl<T> {
- public:
-  // using Decomposer = typename detail::multi_variable_gaussian_impl<T>::Decomposer;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::chol_decomp;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::jacobi;
-  // using detail::multi_variable_gaussian_impl<T>::Decomposer::qr;
-
-  multi_variable_gaussian() = delete;
-  multi_variable_gaussian(const raft::handle_t& handle,
-                          const int dim,
-                          typename detail::multi_variable_gaussian_impl<T>::Decomposer method)
-    : detail::multi_variable_gaussian_impl<T>{handle, dim, method}
-  {
-  }
-
-  std::size_t get_workspace_size()
-  {
-    return detail::multi_variable_gaussian_impl<T>::get_workspace_size();
-  }
-
-  void set_workspace(T* workarea)
-  {
-    detail::multi_variable_gaussian_impl<T>::set_workspace(workarea);
-  }
-
-  void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0)
-  {
-    detail::multi_variable_gaussian_impl<T>::give_gaussian(nPoints, P, X, x);
-  }
-
-  void deinit() { detail::multi_variable_gaussian_impl<T>::deinit(); }
-
-  ~multi_variable_gaussian() { deinit(); }
-};  // end of multi_variable_gaussian
+/**
+ * \defgroup multi_variable_gaussian Compute multi-variable Gaussian
+ * @{
+ */
 
 template <typename ValueType>
-void compute_multi_variable_gaussian(
-  const raft::handle_t& handle,
-  rmm::mr::device_memory_resource& mem_resource,
-  std::optional<raft::device_vector_view<const ValueType, int>> x,
-  raft::device_matrix_view<ValueType, int, raft::col_major> P,
-  raft::device_matrix_view<ValueType, int, raft::col_major> X,
-  const multi_variable_gaussian_decomposition_method method)
+void multi_variable_gaussian(raft::device_resources const& handle,
+                             rmm::mr::device_memory_resource& mem_resource,
+                             std::optional<raft::device_vector_view<const ValueType, int>> x,
+                             raft::device_matrix_view<ValueType, int, raft::col_major> P,
+                             raft::device_matrix_view<ValueType, int, raft::col_major> X,
+                             const multi_variable_gaussian_decomposition_method method)
 {
   detail::compute_multi_variable_gaussian_impl(handle, mem_resource, x, P, X, method);
 }
 
 template <typename ValueType>
-void compute_multi_variable_gaussian(
-  const raft::handle_t& handle,
-  std::optional<raft::device_vector_view<const ValueType, int>> x,
-  raft::device_matrix_view<ValueType, int, raft::col_major> P,
-  raft::device_matrix_view<ValueType, int, raft::col_major> X,
-  const multi_variable_gaussian_decomposition_method method)
+void multi_variable_gaussian(raft::device_resources const& handle,
+                             std::optional<raft::device_vector_view<const ValueType, int>> x,
+                             raft::device_matrix_view<ValueType, int, raft::col_major> P,
+                             raft::device_matrix_view<ValueType, int, raft::col_major> X,
+                             const multi_variable_gaussian_decomposition_method method)
 {
   rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource();
   RAFT_EXPECTS(mem_resource_ptr != nullptr,
@@ -88,6 +55,8 @@ void compute_multi_variable_gaussian(
   detail::compute_multi_variable_gaussian_impl(handle, *mem_resource_ptr, x, P, X, method);
 }
 
+/** @} */  // end group multi_variable_gaussian
+
 };  // end of namespace raft::random
 
 #endif
diff --git a/cpp/include/raft/random/permute.cuh b/cpp/include/raft/random/permute.cuh
index 17b103fab6..f84b603549 100644
--- a/cpp/include/raft/random/permute.cuh
+++ b/cpp/include/raft/random/permute.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,11 +23,45 @@
 
 #include <optional>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <type_traits>
 
 namespace raft::random {
 
+namespace permute_impl {
+// Type trait: perms_out_view<T, ...>::type is the device_vector_view type selected by T.
+template <typename T, typename InputOutputValueType, typename IdxType, typename Layout>
+struct perms_out_view {  // primary template: defines no `type` member
+};
+
+template <typename InputOutputValueType, typename IdxType, typename Layout>
+struct perms_out_view<std::nullopt_t, InputOutputValueType, IdxType, Layout> {
+  // T is std::nullopt_t: permsOut won't have a value anyway, so any integral
+  // value type would do; IdxType is used as the element type here.
+  using type = raft::device_vector_view<IdxType, IdxType>;
+};
+
+template <typename PermutationIndexType,
+          typename InputOutputValueType,
+          typename IdxType,
+          typename Layout>
+struct perms_out_view<std::optional<raft::device_vector_view<PermutationIndexType, IdxType>>,
+                      InputOutputValueType,
+                      IdxType,
+                      Layout> {
+  using type = raft::device_vector_view<PermutationIndexType, IdxType>;  // the wrapped view type
+};
+
+template <typename T, typename InputOutputValueType, typename IdxType, typename Layout>
+using perms_out_view_t = typename perms_out_view<T, InputOutputValueType, IdxType, Layout>::type;
+
+}  // namespace permute_impl
+
+/**
+ * \defgroup permute Permutation
+ * @{
+ */
+
 /**
  * @brief Randomly permute the rows of the input matrix.
  *
@@ -61,7 +95,7 @@ namespace raft::random {
  *   then we recommend Knuth Shuffle.
  */
 template <typename InputOutputValueType, typename IntType, typename IdxType, typename Layout>
-void permute(const raft::handle_t& handle,
+void permute(raft::device_resources const& handle,
              raft::device_matrix_view<const InputOutputValueType, IdxType, Layout> in,
              std::optional<raft::device_vector_view<IntType, IdxType>> permsOut,
              std::optional<raft::device_matrix_view<InputOutputValueType, IdxType, Layout>> out)
@@ -99,35 +133,6 @@ void permute(const raft::handle_t& handle,
   }
 }
 
-namespace permute_impl {
-
-template <typename T, typename InputOutputValueType, typename IdxType, typename Layout>
-struct perms_out_view {
-};
-
-template <typename InputOutputValueType, typename IdxType, typename Layout>
-struct perms_out_view<std::nullopt_t, InputOutputValueType, IdxType, Layout> {
-  // permsOut won't have a value anyway,
-  // so we can pick any integral value type we want.
-  using type = raft::device_vector_view<IdxType, IdxType>;
-};
-
-template <typename PermutationIndexType,
-          typename InputOutputValueType,
-          typename IdxType,
-          typename Layout>
-struct perms_out_view<std::optional<raft::device_vector_view<PermutationIndexType, IdxType>>,
-                      InputOutputValueType,
-                      IdxType,
-                      Layout> {
-  using type = raft::device_vector_view<PermutationIndexType, IdxType>;
-};
-
-template <typename T, typename InputOutputValueType, typename IdxType, typename Layout>
-using perms_out_view_t = typename perms_out_view<T, InputOutputValueType, IdxType, Layout>::type;
-
-}  // namespace permute_impl
-
 /**
  * @brief Overload of `permute` that compiles if users pass in `std::nullopt`
  *   for either or both of `permsOut` and `out`.
@@ -137,7 +142,7 @@ template <typename InputOutputValueType,
           typename Layout,
           typename PermsOutType,
           typename OutType>
-void permute(const raft::handle_t& handle,
+void permute(raft::device_resources const& handle,
              raft::device_matrix_view<const InputOutputValueType, IdxType, Layout> in,
              PermsOutType&& permsOut,
              OutType&& out)
@@ -160,6 +165,8 @@ void permute(const raft::handle_t& handle,
   permute(handle, in, permsOut_arg, out_arg);
 }
 
+/** @} */  // end group permute
+
 /**
  * @brief Legacy overload of `permute` that takes raw arrays instead of mdspan.
  *
diff --git a/cpp/include/raft/random/random_types.hpp b/cpp/include/raft/random/random_types.hpp
index 96b55a4727..cd15c2f838 100644
--- a/cpp/include/raft/random/random_types.hpp
+++ b/cpp/include/raft/random/random_types.hpp
@@ -19,9 +19,14 @@
 namespace raft::random {
 
 /**
- * @brief Matrix decomposition method for `compute_multi_variable_gaussian` to use.
+ * \ingroup multi_variable_gaussian
+ * @{
+ */
+
+/**
+ * @brief Matrix decomposition method for `multi_variable_gaussian` to use.
  *
- * `compute_multi_variable_gaussian` can use any of the following methods.
+ * `multi_variable_gaussian` can use any of the following methods.
  *
  * - `CHOLESKY`: Uses Cholesky decomposition on the normal equations.
  *   This may be faster than the other two methods, but less accurate.
@@ -36,4 +41,6 @@ namespace raft::random {
  */
 enum class multi_variable_gaussian_decomposition_method { CHOLESKY, JACOBI, QR };
 
+/** @} */  // end group multi_variable_gaussian
+
 };  // end of namespace raft::random
diff --git a/cpp/include/raft/random/rmat_rectangular_generator.cuh b/cpp/include/raft/random/rmat_rectangular_generator.cuh
index cedcca1711..d578794d31 100644
--- a/cpp/include/raft/random/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/rmat_rectangular_generator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,11 @@
 
 namespace raft::random {
 
+/**
+ * @defgroup rmat RMAT Rectangular Generator
+ * @{
+ */
+
 /**
  * @brief Generate a bipartite RMAT graph for a rectangular adjacency matrix.
  *
@@ -73,7 +78,7 @@ namespace raft::random {
  */
 template <typename IdxT, typename ProbT>
 void rmat_rectangular_gen(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::random::RngState& r,
   raft::device_vector_view<const ProbT, IdxT> theta,
   raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out,
@@ -97,7 +102,7 @@ void rmat_rectangular_gen(
  * @pre `out_src.extent(0) == out_dst.extent(0)` is `true`
  */
 template <typename IdxT, typename ProbT>
-void rmat_rectangular_gen(const raft::handle_t& handle,
+void rmat_rectangular_gen(raft::device_resources const& handle,
                           raft::random::RngState& r,
                           raft::device_vector_view<const ProbT, IdxT> theta,
                           raft::device_vector_view<IdxT, IdxT> out_src,
@@ -120,7 +125,7 @@ void rmat_rectangular_gen(const raft::handle_t& handle,
  */
 template <typename IdxT, typename ProbT>
 void rmat_rectangular_gen(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::random::RngState& r,
   raft::device_vector_view<const ProbT, IdxT> theta,
   raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out,
@@ -147,7 +152,7 @@ void rmat_rectangular_gen(
  */
 template <typename IdxT, typename ProbT>
 void rmat_rectangular_gen(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::random::RngState& r,
   raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out,
   raft::device_vector_view<IdxT, IdxT> out_src,
@@ -174,7 +179,7 @@ void rmat_rectangular_gen(
  * @pre `out_src.extent(0) == out_dst.extent(0)` is `true`
  */
 template <typename IdxT, typename ProbT>
-void rmat_rectangular_gen(const raft::handle_t& handle,
+void rmat_rectangular_gen(raft::device_resources const& handle,
                           raft::random::RngState& r,
                           raft::device_vector_view<IdxT, IdxT> out_src,
                           raft::device_vector_view<IdxT, IdxT> out_dst,
@@ -199,7 +204,7 @@ void rmat_rectangular_gen(const raft::handle_t& handle,
  */
 template <typename IdxT, typename ProbT>
 void rmat_rectangular_gen(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::random::RngState& r,
   raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out,
   ProbT a,
@@ -212,6 +217,8 @@ void rmat_rectangular_gen(
   detail::rmat_rectangular_gen_impl(handle, r, output, a, b, c, r_scale, c_scale);
 }
 
+/** @} */  // end group rmat
+
 /**
  * @brief Legacy overload of `rmat_rectangular_gen`
  *   taking raw arrays instead of mdspan.
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 95bfe24a68..d03975d0db 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 #include <cassert>
 #include <optional>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <type_traits>
 #include <variant>
 
@@ -41,7 +41,7 @@ namespace raft::random {
  * @param[in] end end of the range
  */
 template <typename OutputValueType, typename IndexType>
-void uniform(const raft::handle_t& handle,
+void uniform(raft::device_resources const& handle,
              RngState& rng_state,
              raft::device_vector_view<OutputValueType, IndexType> out,
              OutputValueType start,
@@ -63,7 +63,7 @@ void uniform(const raft::handle_t& handle,
  * @param[in] end end of the range
  */
 template <typename OutType, typename LenType = int>
-void uniform(const raft::handle_t& handle,
+void uniform(raft::device_resources const& handle,
              RngState& rng_state,
              OutType* ptr,
              LenType len,
@@ -86,7 +86,7 @@ void uniform(const raft::handle_t& handle,
  * @param[in] end end of the range
  */
 template <typename OutputValueType, typename IndexType>
-void uniformInt(const raft::handle_t& handle,
+void uniformInt(raft::device_resources const& handle,
                 RngState& rng_state,
                 raft::device_vector_view<OutputValueType, IndexType> out,
                 OutputValueType start,
@@ -114,7 +114,7 @@ void uniformInt(const raft::handle_t& handle,
  * @param[in] end end of the range
  */
 template <typename OutType, typename LenType = int>
-void uniformInt(const raft::handle_t& handle,
+void uniformInt(raft::device_resources const& handle,
                 RngState& rng_state,
                 OutType* ptr,
                 LenType len,
@@ -138,7 +138,7 @@ void uniformInt(const raft::handle_t& handle,
  * @param[in] sigma std-dev of the distribution
  */
 template <typename OutputValueType, typename IndexType>
-void normal(const raft::handle_t& handle,
+void normal(raft::device_resources const& handle,
             RngState& rng_state,
             raft::device_vector_view<OutputValueType, IndexType> out,
             OutputValueType mu,
@@ -160,7 +160,7 @@ void normal(const raft::handle_t& handle,
  * @param[in] sigma std-dev of the distribution
  */
 template <typename OutType, typename LenType = int>
-void normal(const raft::handle_t& handle,
+void normal(raft::device_resources const& handle,
             RngState& rng_state,
             OutType* ptr,
             LenType len,
@@ -183,7 +183,7 @@ void normal(const raft::handle_t& handle,
  * @param[in] sigma standard deviation of the distribution
  */
 template <typename OutputValueType, typename IndexType>
-void normalInt(const raft::handle_t& handle,
+void normalInt(raft::device_resources const& handle,
                RngState& rng_state,
                raft::device_vector_view<OutputValueType, IndexType> out,
                OutputValueType mu,
@@ -212,7 +212,7 @@ void normalInt(const raft::handle_t& handle,
  * @param[in] sigma std-dev of the distribution
  */
 template <typename IntType, typename LenType = int>
-void normalInt(const raft::handle_t& handle,
+void normalInt(raft::device_resources const& handle,
                RngState& rng_state,
                IntType* ptr,
                LenType len,
@@ -244,7 +244,7 @@ void normalInt(const raft::handle_t& handle,
  */
 template <typename OutputValueType, typename IndexType>
 void normalTable(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   RngState& rng_state,
   raft::device_vector_view<const OutputValueType, IndexType> mu_vec,
   std::variant<raft::device_vector_view<const OutputValueType, IndexType>, OutputValueType> sigma,
@@ -307,7 +307,7 @@ void normalTable(
  * @param[in] sigma scalar sigma to be used if 'sigma_vec' is nullptr
  */
 template <typename OutType, typename LenType = int>
-void normalTable(const raft::handle_t& handle,
+void normalTable(raft::device_resources const& handle,
                  RngState& rng_state,
                  OutType* ptr,
                  LenType n_rows,
@@ -332,7 +332,7 @@ void normalTable(const raft::handle_t& handle,
  * @param[out] out the output vector
  */
 template <typename OutputValueType, typename IndexType>
-void fill(const raft::handle_t& handle,
+void fill(raft::device_resources const& handle,
           RngState& rng_state,
           OutputValueType val,
           raft::device_vector_view<OutputValueType, IndexType> out)
@@ -352,7 +352,8 @@ void fill(const raft::handle_t& handle,
  * @param[in] val value to be filled
  */
 template <typename OutType, typename LenType = int>
-void fill(const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenType len, OutType val)
+void fill(
+  raft::device_resources const& handle, RngState& rng_state, OutType* ptr, LenType len, OutType val)
 {
   detail::fill(rng_state, ptr, len, val, handle.get_stream());
 }
@@ -371,7 +372,7 @@ void fill(const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenTy
  * @param[in] prob coin-toss probability for heads
  */
 template <typename OutputValueType, typename IndexType, typename Type>
-void bernoulli(const raft::handle_t& handle,
+void bernoulli(raft::device_resources const& handle,
                RngState& rng_state,
                raft::device_vector_view<OutputValueType, IndexType> out,
                Type prob)
@@ -394,7 +395,7 @@ void bernoulli(const raft::handle_t& handle,
  */
 template <typename Type, typename OutType = bool, typename LenType = int>
 void bernoulli(
-  const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenType len, Type prob)
+  raft::device_resources const& handle, RngState& rng_state, OutType* ptr, LenType len, Type prob)
 {
   detail::bernoulli(rng_state, ptr, len, prob, handle.get_stream());
 }
@@ -412,7 +413,7 @@ void bernoulli(
  * @param[in] scale scaling factor
  */
 template <typename OutputValueType, typename IndexType>
-void scaled_bernoulli(const raft::handle_t& handle,
+void scaled_bernoulli(raft::device_resources const& handle,
                       RngState& rng_state,
                       raft::device_vector_view<OutputValueType, IndexType> out,
                       OutputValueType prob,
@@ -435,7 +436,7 @@ void scaled_bernoulli(const raft::handle_t& handle,
  * @param[in] scale scaling factor
  */
 template <typename OutType, typename LenType = int>
-void scaled_bernoulli(const raft::handle_t& handle,
+void scaled_bernoulli(raft::device_resources const& handle,
                       RngState& rng_state,
                       OutType* ptr,
                       LenType len,
@@ -459,7 +460,7 @@ void scaled_bernoulli(const raft::handle_t& handle,
  * @note https://en.wikipedia.org/wiki/Gumbel_distribution
  */
 template <typename OutputValueType, typename IndexType = int>
-void gumbel(const raft::handle_t& handle,
+void gumbel(raft::device_resources const& handle,
             RngState& rng_state,
             raft::device_vector_view<OutputValueType, IndexType> out,
             OutputValueType mu,
@@ -482,7 +483,7 @@ void gumbel(const raft::handle_t& handle,
  * @note https://en.wikipedia.org/wiki/Gumbel_distribution
  */
 template <typename OutType, typename LenType = int>
-void gumbel(const raft::handle_t& handle,
+void gumbel(raft::device_resources const& handle,
             RngState& rng_state,
             OutType* ptr,
             LenType len,
@@ -505,7 +506,7 @@ void gumbel(const raft::handle_t& handle,
  * @param[in] sigma standard deviation of the distribution
  */
 template <typename OutputValueType, typename IndexType>
-void lognormal(const raft::handle_t& handle,
+void lognormal(raft::device_resources const& handle,
                RngState& rng_state,
                raft::device_vector_view<OutputValueType, IndexType> out,
                OutputValueType mu,
@@ -527,7 +528,7 @@ void lognormal(const raft::handle_t& handle,
  * @param[in] sigma standard deviation of the distribution
  */
 template <typename OutType, typename LenType = int>
-void lognormal(const raft::handle_t& handle,
+void lognormal(raft::device_resources const& handle,
                RngState& rng_state,
                OutType* ptr,
                LenType len,
@@ -550,7 +551,7 @@ void lognormal(const raft::handle_t& handle,
  * @param[in] scale scale value
  */
 template <typename OutputValueType, typename IndexType = int>
-void logistic(const raft::handle_t& handle,
+void logistic(raft::device_resources const& handle,
               RngState& rng_state,
               raft::device_vector_view<OutputValueType, IndexType> out,
               OutputValueType mu,
@@ -572,7 +573,7 @@ void logistic(const raft::handle_t& handle,
  * @param[in] scale scale value
  */
 template <typename OutType, typename LenType = int>
-void logistic(const raft::handle_t& handle,
+void logistic(raft::device_resources const& handle,
               RngState& rng_state,
               OutType* ptr,
               LenType len,
@@ -594,7 +595,7 @@ void logistic(const raft::handle_t& handle,
  * @param[in] lambda the exponential distribution's lambda parameter
  */
 template <typename OutputValueType, typename IndexType>
-void exponential(const raft::handle_t& handle,
+void exponential(raft::device_resources const& handle,
                  RngState& rng_state,
                  raft::device_vector_view<OutputValueType, IndexType> out,
                  OutputValueType lambda)
@@ -614,8 +615,11 @@ void exponential(const raft::handle_t& handle,
  * @param[in] lambda the exponential distribution's lambda parameter
  */
 template <typename OutType, typename LenType = int>
-void exponential(
-  const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenType len, OutType lambda)
+void exponential(raft::device_resources const& handle,
+                 RngState& rng_state,
+                 OutType* ptr,
+                 LenType len,
+                 OutType lambda)
 {
   detail::exponential(rng_state, ptr, len, lambda, handle.get_stream());
 }
@@ -632,7 +636,7 @@ void exponential(
  * @param[in] sigma the distribution's sigma parameter
  */
 template <typename OutputValueType, typename IndexType>
-void rayleigh(const raft::handle_t& handle,
+void rayleigh(raft::device_resources const& handle,
               RngState& rng_state,
               raft::device_vector_view<OutputValueType, IndexType> out,
               OutputValueType sigma)
@@ -652,8 +656,11 @@ void rayleigh(const raft::handle_t& handle,
  * @param[in] sigma the distribution's sigma parameter
  */
 template <typename OutType, typename LenType = int>
-void rayleigh(
-  const raft::handle_t& handle, RngState& rng_state, OutType* ptr, LenType len, OutType sigma)
+void rayleigh(raft::device_resources const& handle,
+              RngState& rng_state,
+              OutType* ptr,
+              LenType len,
+              OutType sigma)
 {
   detail::rayleigh(rng_state, ptr, len, sigma, handle.get_stream());
 }
@@ -671,7 +678,7 @@ void rayleigh(
  * @param[in] scale the scale
  */
 template <typename OutputValueType, typename IndexType>
-void laplace(const raft::handle_t& handle,
+void laplace(raft::device_resources const& handle,
              RngState& rng_state,
              raft::device_vector_view<OutputValueType, IndexType> out,
              OutputValueType mu,
@@ -693,7 +700,7 @@ void laplace(const raft::handle_t& handle,
  * @param[in] scale the scale
  */
 template <typename OutType, typename LenType = int>
-void laplace(const raft::handle_t& handle,
+void laplace(raft::device_resources const& handle,
              RngState& rng_state,
              OutType* ptr,
              LenType len,
@@ -703,143 +710,46 @@ void laplace(const raft::handle_t& handle,
   detail::laplace(rng_state, ptr, len, mu, scale, handle.get_stream());
 }
 
-namespace sample_without_replacement_impl {
-template <typename T>
-struct weight_alias {
-};
-
-template <>
-struct weight_alias<std::nullopt_t> {
-  using type = double;
-};
-
-template <typename ElementType, typename IndexType>
-struct weight_alias<std::optional<raft::device_vector_view<ElementType, IndexType>>> {
-  using type = typename raft::device_vector_view<ElementType, IndexType>::value_type;
-};
-
-template <typename T>
-using weight_t = typename weight_alias<T>::type;
-}  // namespace sample_without_replacement_impl
-
-/**
- * \defgroup sample_without_replacement Sampling without Replacement
- * @{
- */
-
 /**
- * @brief Sample the input vector without replacement, optionally based on the
- * input weight vector for each element in the array.
- *
- * The implementation is based on the `one-pass sampling` algorithm described in
- * ["Accelerating weighted random sampling without
- * replacement,"](https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf)
- * a technical report by Kirill Mueller.
- *
- * If no input weight vector is provided, then input elements will be
- * sampled uniformly.  Otherwise, the elements sampled from the input
- * vector will always appear in increasing order of their weights as
- * computed using the exponential distribution. So, if you are
- * particular about the order (for e.g., array permutations), then
- * this might not be the right choice.
- *
- * @tparam DataT type of each element of the input array @c in
- * @tparam IdxT type of the dimensions of the arrays; output index type
- * @tparam WeightsVectorType std::optional<raft::device_vector_view<const weight_type, IdxT>> of
- * each elements of the weights array @c weights_opt
- * @tparam OutIndexVectorType std::optional<raft::device_vector_view<IdxT, IdxT>> of output indices
- * @c outIdx_opt
- *
- * @note Please do not specify template parameters explicitly,
- *   as the compiler can deduce them from the arguments.
+ * @brief Generate random integers, where the probability of i is weights[i]/sum(weights)
  *
- * @param[in] handle RAFT handle containing (among other resources)
- *   the CUDA stream on which to run.
- * @param[inout] rng_state Pseudorandom number generator state.
- * @param[in] in Input vector to be sampled.
- * @param[in] weights_opt std::optional weights vector.
- *        If not provided, uniform sampling will be used.
- * @param[out] out Vector of samples from the input vector.
- * @param[out] outIdx_opt std::optional vector of the indices
- *   sampled from the input array.
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/device_mdarray.hpp>
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/random/rng.cuh>
  *
- * @pre The number of samples `out.extent(0)`
- *   is less than or equal to the number of inputs `in.extent(0)`.
- *
- * @pre The number of weights `wts.extent(0)`
- *   equals the number of inputs `in.extent(0)`.
- */
-template <typename DataT, typename IdxT, typename WeightsVectorType, class OutIndexVectorType>
-void sample_without_replacement(const raft::handle_t& handle,
-                                RngState& rng_state,
-                                raft::device_vector_view<const DataT, IdxT> in,
-                                WeightsVectorType&& weights_opt,
-                                raft::device_vector_view<DataT, IdxT> out,
-                                OutIndexVectorType&& outIdx_opt)
-{
-  using weight_type = sample_without_replacement_impl::weight_t<
-    std::remove_const_t<std::remove_reference_t<WeightsVectorType>>>;
-
-  std::optional<raft::device_vector_view<const weight_type, IdxT>> wts =
-    std::forward<WeightsVectorType>(weights_opt);
-  std::optional<raft::device_vector_view<IdxT, IdxT>> outIdx =
-    std::forward<OutIndexVectorType>(outIdx_opt);
-
-  static_assert(std::is_integral<IdxT>::value, "IdxT must be an integral type.");
-  const IdxT sampledLen = out.extent(0);
-  const IdxT len        = in.extent(0);
-  RAFT_EXPECTS(sampledLen <= len,
-               "sampleWithoutReplacement: "
-               "sampledLen (out.extent(0)) must be <= len (in.extent(0))");
-  RAFT_EXPECTS(len == 0 || in.data_handle() != nullptr,
-               "sampleWithoutReplacement: "
-               "If in.extents(0) != 0, then in.data_handle() must be nonnull");
-  RAFT_EXPECTS(sampledLen == 0 || out.data_handle() != nullptr,
-               "sampleWithoutReplacement: "
-               "If out.extents(0) != 0, then out.data_handle() must be nonnull");
-
-  const bool outIdx_has_value = outIdx.has_value();
-  if (outIdx_has_value) {
-    RAFT_EXPECTS((*outIdx).extent(0) == sampledLen,
-                 "sampleWithoutReplacement: "
-                 "If outIdx is provided, its extent(0) must equal out.extent(0)");
-  }
-  IdxT* outIdx_ptr = outIdx_has_value ? (*outIdx).data_handle() : nullptr;
-
-  const bool wts_has_value = wts.has_value();
-  if (wts_has_value) {
-    RAFT_EXPECTS((*wts).extent(0) == len,
-                 "sampleWithoutReplacement: "
-                 "If wts is provided, its extent(0) must equal in.extent(0)");
-  }
-  const weight_type* wts_ptr = wts_has_value ? (*wts).data_handle() : nullptr;
-
-  detail::sampleWithoutReplacement(rng_state,
-                                   out.data_handle(),
-                                   outIdx_ptr,
-                                   in.data_handle(),
-                                   wts_ptr,
-                                   sampledLen,
-                                   len,
-                                   handle.get_stream());
-}
-
-/**
- * @brief Overload of `sample_without_replacement` to help the
- *   compiler find the above overload, in case users pass in
- *   `std::nullopt` for one or both of the optional arguments.
+ *  raft::device_resources handle;
+ *  ...
+ *  raft::random::RngState rng(seed);
+ *  auto indices = raft::make_device_vector<int>(handle, n_samples);
+ *  raft::random::discrete(handle, rng, indices.view(), weights);
+ * @endcode
  *
+ * @tparam OutType integer output type
+ * @tparam WeightType weight type
+ * @tparam IndexType data type used to represent length of the arrays
  *
- * Please see above for documentation of `sample_without_replacement`.
+ * @param[in] handle raft handle for resource management
+ * @param[in] rng_state random number generator state
+ * @param[out] out output array
+ * @param[in] weights weight array
  */
-template <typename... Args, typename = std::enable_if_t<sizeof...(Args) == 5>>
-void sample_without_replacement(Args... args)
+template <typename OutType, typename WeightType, typename IndexType>
+std::enable_if_t<std::is_integral_v<OutType>> discrete(
+  raft::device_resources const& handle,
+  RngState& rng_state,
+  raft::device_vector_view<OutType, IndexType> out,
+  raft::device_vector_view<const WeightType, IndexType> weights)
 {
-  sample_without_replacement(std::forward<Args>(args)..., std::nullopt);
+  detail::discrete(rng_state,
+                   out.data_handle(),
+                   weights.data_handle(),
+                   out.extent(0),
+                   weights.extent(0),
+                   handle.get_stream());
 }
 
-/** @} */
-
 /**
  * @brief Legacy version of @c sample_without_replacement (see above)
  *   that takes raw arrays instead of device mdspan.
@@ -860,7 +770,7 @@ void sample_without_replacement(Args... args)
  * @param[in] len input array length
  */
 template <typename DataT, typename WeightsT, typename IdxT = int>
-void sampleWithoutReplacement(const raft::handle_t& handle,
+void sampleWithoutReplacement(raft::device_resources const& handle,
                               RngState& rng_state,
                               DataT* out,
                               IdxT* outIdx,
@@ -1196,7 +1106,7 @@ class DEPR Rng : public detail::RngImpl {
    * @param stream cuda stream
    */
   template <typename DataT, typename WeightsT, typename IdxT = int>
-  void sampleWithoutReplacement(const raft::handle_t& handle,
+  void sampleWithoutReplacement(raft::device_resources const& handle,
                                 DataT* out,
                                 IdxT* outIdx,
                                 const DataT* in,
diff --git a/cpp/include/raft/random/sample_without_replacement.cuh b/cpp/include/raft/random/sample_without_replacement.cuh
new file mode 100644
index 0000000000..8998db98ae
--- /dev/null
+++ b/cpp/include/raft/random/sample_without_replacement.cuh
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/rng_impl.cuh"
+#include "rng_state.hpp"
+#include <cassert>
+#include <optional>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <type_traits>
+#include <variant>
+
+namespace raft::random {
+
+namespace sample_without_replacement_impl {
+template <typename T>
+struct weight_alias {
+};
+
+template <>
+struct weight_alias<std::nullopt_t> {
+  using type = double;
+};
+
+template <typename ElementType, typename IndexType>
+struct weight_alias<std::optional<raft::device_vector_view<ElementType, IndexType>>> {
+  using type = typename raft::device_vector_view<ElementType, IndexType>::value_type;
+};
+
+template <typename T>
+using weight_t = typename weight_alias<T>::type;
+}  // namespace sample_without_replacement_impl
+
+/**
+ * \defgroup sample_without_replacement Sampling without Replacement
+ * @{
+ */
+
+/**
+ * @brief Sample the input vector without replacement, optionally based on the
+ * input weight vector for each element in the array.
+ *
+ * The implementation is based on the `one-pass sampling` algorithm described in
+ * ["Accelerating weighted random sampling without
+ * replacement,"](https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf)
+ * a technical report by Kirill Mueller.
+ *
+ * If no input weight vector is provided, then input elements will be
+ * sampled uniformly.  Otherwise, the elements sampled from the input
+ * vector will always appear in increasing order of their weights as
+ * computed using the exponential distribution. So, if you are
+ * particular about the order (e.g., for array permutations), then
+ * this might not be the right choice.
+ *
+ * @tparam DataT type of each element of the input array @c in
+ * @tparam IdxT type of the dimensions of the arrays; output index type
+ * @tparam WeightsVectorType std::optional<raft::device_vector_view<const weight_type, IdxT>> of
+ * each element of the weights array @c weights_opt
+ * @tparam OutIndexVectorType std::optional<raft::device_vector_view<IdxT, IdxT>> of output indices
+ * @c outIdx_opt
+ *
+ * @note Please do not specify template parameters explicitly,
+ *   as the compiler can deduce them from the arguments.
+ *
+ * @param[in] handle RAFT handle containing (among other resources)
+ *   the CUDA stream on which to run.
+ * @param[inout] rng_state Pseudorandom number generator state.
+ * @param[in] in Input vector to be sampled.
+ * @param[in] weights_opt std::optional weights vector.
+ *        If not provided, uniform sampling will be used.
+ * @param[out] out Vector of samples from the input vector.
+ * @param[out] outIdx_opt std::optional vector of the indices
+ *   sampled from the input array.
+ *
+ * @pre The number of samples `out.extent(0)`
+ *   is less than or equal to the number of inputs `in.extent(0)`.
+ *
+ * @pre If weights are provided, the number of weights (extent 0 of `weights_opt`)
+ *   equals the number of inputs `in.extent(0)`.
+ */
+template <typename DataT, typename IdxT, typename WeightsVectorType, class OutIndexVectorType>
+void sample_without_replacement(raft::device_resources const& handle,
+                                RngState& rng_state,
+                                raft::device_vector_view<const DataT, IdxT> in,
+                                WeightsVectorType&& weights_opt,
+                                raft::device_vector_view<DataT, IdxT> out,
+                                OutIndexVectorType&& outIdx_opt)
+{
+  using weight_type = sample_without_replacement_impl::weight_t<
+    std::remove_const_t<std::remove_reference_t<WeightsVectorType>>>;
+
+  std::optional<raft::device_vector_view<const weight_type, IdxT>> wts =
+    std::forward<WeightsVectorType>(weights_opt);
+  std::optional<raft::device_vector_view<IdxT, IdxT>> outIdx =
+    std::forward<OutIndexVectorType>(outIdx_opt);
+
+  static_assert(std::is_integral<IdxT>::value, "IdxT must be an integral type.");
+  const IdxT sampledLen = out.extent(0);
+  const IdxT len        = in.extent(0);
+  RAFT_EXPECTS(sampledLen <= len,
+               "sampleWithoutReplacement: "
+               "sampledLen (out.extent(0)) must be <= len (in.extent(0))");
+  RAFT_EXPECTS(len == 0 || in.data_handle() != nullptr,
+               "sampleWithoutReplacement: "
+               "If in.extents(0) != 0, then in.data_handle() must be nonnull");
+  RAFT_EXPECTS(sampledLen == 0 || out.data_handle() != nullptr,
+               "sampleWithoutReplacement: "
+               "If out.extents(0) != 0, then out.data_handle() must be nonnull");
+
+  const bool outIdx_has_value = outIdx.has_value();
+  if (outIdx_has_value) {
+    RAFT_EXPECTS((*outIdx).extent(0) == sampledLen,
+                 "sampleWithoutReplacement: "
+                 "If outIdx is provided, its extent(0) must equal out.extent(0)");
+  }
+  IdxT* outIdx_ptr = outIdx_has_value ? (*outIdx).data_handle() : nullptr;
+
+  const bool wts_has_value = wts.has_value();
+  if (wts_has_value) {
+    RAFT_EXPECTS((*wts).extent(0) == len,
+                 "sampleWithoutReplacement: "
+                 "If wts is provided, its extent(0) must equal in.extent(0)");
+  }
+  const weight_type* wts_ptr = wts_has_value ? (*wts).data_handle() : nullptr;
+
+  detail::sampleWithoutReplacement(rng_state,
+                                   out.data_handle(),
+                                   outIdx_ptr,
+                                   in.data_handle(),
+                                   wts_ptr,
+                                   sampledLen,
+                                   len,
+                                   handle.get_stream());
+}
+
+/**
+ * @brief Five-argument overload of `sample_without_replacement` that
+ *   forwards its arguments to the overload above, appending
+ *   `std::nullopt` as the final (`outIdx_opt`) argument.
+ *
+ *
+ * Please see above for documentation of `sample_without_replacement`.
+ */
+template <typename... Args, typename = std::enable_if_t<sizeof...(Args) == 5>>
+void sample_without_replacement(Args... args)
+{
+  sample_without_replacement(std::forward<Args>(args)..., std::nullopt);
+}
+
+/** @} */
+
+}  // end namespace raft::random
\ No newline at end of file
diff --git a/cpp/include/raft/solver/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh
index cbfe12fd23..440e6901c6 100644
--- a/cpp/include/raft/solver/detail/lap_functions.cuh
+++ b/cpp/include/raft/solver/detail/lap_functions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  * Copyright 2020 KETAN DATE & RAKESH NAGI
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,7 +26,7 @@
 
 #include <raft/solver/linear_assignment_types.hpp>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/solver/detail/lap_kernels.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
@@ -98,7 +98,7 @@ inline void calculateRectangularDims(
 }
 
 template <typename vertex_t, typename weight_t>
-inline void initialReduction(raft::handle_t const& handle,
+inline void initialReduction(raft::device_resources const& handle,
                              weight_t const* d_costs,
                              Vertices<vertex_t, weight_t>& d_vertices_dev,
                              int SP,
@@ -125,7 +125,7 @@ inline void initialReduction(raft::handle_t const& handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline void computeInitialAssignments(raft::handle_t const& handle,
+inline void computeInitialAssignments(raft::device_resources const& handle,
                                       weight_t const* d_costs,
                                       Vertices<vertex_t, weight_t>& d_vertices,
                                       int SP,
@@ -164,7 +164,7 @@ inline void computeInitialAssignments(raft::handle_t const& handle,
 
 // Function for finding row cover on individual devices.
 template <typename vertex_t, typename weight_t>
-inline int computeRowCovers(raft::handle_t const& handle,
+inline int computeRowCovers(raft::device_resources const& handle,
                             Vertices<vertex_t, weight_t>& d_vertices,
                             VertexData<vertex_t>& d_row_data,
                             VertexData<vertex_t>& d_col_data,
@@ -198,7 +198,7 @@ inline int computeRowCovers(raft::handle_t const& handle,
 
 // Function for covering the zeros in uncovered rows and expanding the frontier.
 template <typename vertex_t, typename weight_t>
-inline void coverZeroAndExpand(raft::handle_t const& handle,
+inline void coverZeroAndExpand(raft::device_resources const& handle,
                                weight_t const* d_costs_dev,
                                vertex_t const* d_rows_csr_neighbors,
                                vertex_t const* d_rows_csr_ptrs,
@@ -230,7 +230,7 @@ inline void coverZeroAndExpand(raft::handle_t const& handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
+inline vertex_t zeroCoverIteration(raft::device_resources const& handle,
                                    weight_t const* d_costs_dev,
                                    Vertices<vertex_t, weight_t>& d_vertices_dev,
                                    VertexData<vertex_t>& d_row_data_dev,
@@ -310,7 +310,7 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
 // Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending
 // on the presence of uncovered zeros.
 template <typename vertex_t, typename weight_t>
-inline void executeZeroCover(raft::handle_t const& handle,
+inline void executeZeroCover(raft::device_resources const& handle,
                              weight_t const* d_costs_dev,
                              Vertices<vertex_t, weight_t>& d_vertices_dev,
                              VertexData<vertex_t>& d_row_data_dev,
@@ -329,7 +329,7 @@ inline void executeZeroCover(raft::handle_t const& handle,
 
 // Function for executing reverse pass of the maximum matching.
 template <typename vertex_t>
-inline void reversePass(raft::handle_t const& handle,
+inline void reversePass(raft::device_resources const& handle,
                         VertexData<vertex_t>& d_row_data_dev,
                         VertexData<vertex_t>& d_col_data_dev,
                         int SP,
@@ -385,7 +385,7 @@ inline void reversePass(raft::handle_t const& handle,
 
 // Function for executing augmentation pass of the maximum matching.
 template <typename vertex_t, typename weight_t>
-inline void augmentationPass(raft::handle_t const& handle,
+inline void augmentationPass(raft::device_resources const& handle,
                              Vertices<vertex_t, weight_t>& d_vertices_dev,
                              VertexData<vertex_t>& d_row_data_dev,
                              VertexData<vertex_t>& d_col_data_dev,
@@ -448,7 +448,7 @@ inline void augmentationPass(raft::handle_t const& handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline void dualUpdate(raft::handle_t const& handle,
+inline void dualUpdate(raft::device_resources const& handle,
                        Vertices<vertex_t, weight_t>& d_vertices_dev,
                        VertexData<vertex_t>& d_row_data_dev,
                        VertexData<vertex_t>& d_col_data_dev,
@@ -493,7 +493,7 @@ inline void dualUpdate(raft::handle_t const& handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValDual(raft::handle_t const& handle,
+inline void calcObjValDual(raft::device_resources const& handle,
                            weight_t* d_obj_val,
                            Vertices<vertex_t, weight_t>& d_vertices_dev,
                            int SP,
@@ -513,7 +513,7 @@ inline void calcObjValDual(raft::handle_t const& handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValPrimal(raft::handle_t const& handle,
+inline void calcObjValPrimal(raft::device_resources const& handle,
                              weight_t* d_obj_val,
                              weight_t const* d_costs,
                              vertex_t const* d_row_assignments,
diff --git a/cpp/include/raft/solver/detail/lap_kernels.cuh b/cpp/include/raft/solver/detail/lap_kernels.cuh
index d66a9d72d5..69930a1460 100644
--- a/cpp/include/raft/solver/detail/lap_kernels.cuh
+++ b/cpp/include/raft/solver/detail/lap_kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  * Copyright 2020 KETAN DATE & RAKESH NAGI
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,7 +26,7 @@
 
 #include "../linear_assignment_types.hpp"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <thrust/execution_policy.h>
diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh
index 3e17b557f2..7904c04ede 100644
--- a/cpp/include/raft/solver/linear_assignment.cuh
+++ b/cpp/include/raft/solver/linear_assignment.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  * Copyright 2020 KETAN DATE & RAKESH NAGI
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/execution_policy.h>
@@ -61,7 +61,7 @@ class LinearAssignmentProblem {
   Vertices<vertex_t, weight_t> d_vertices_dev;
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
-  raft::handle_t const& handle_;
+  raft::device_resources const& handle_;
   rmm::device_uvector<int> row_covers_v;
   rmm::device_uvector<int> col_covers_v;
   rmm::device_uvector<weight_t> row_duals_v;
@@ -84,7 +84,7 @@ class LinearAssignmentProblem {
    * @param batchsize
    * @param epsilon
    */
-  LinearAssignmentProblem(raft::handle_t const& handle,
+  LinearAssignmentProblem(raft::device_resources const& handle,
                           vertex_t size,
                           vertex_t batchsize,
                           weight_t epsilon)
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index abdacdc426..09f4135a51 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-void coo_to_csr(const raft::handle_t& handle,
+void coo_to_csr(raft::device_resources const& handle,
                 const int* srcRows,
                 const int* srcCols,
                 const value_t* srcVals,
@@ -90,7 +90,7 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
  *                         number of non-zeros in adj.
  */
 template <typename index_t = int>
-void adj_to_csr(const raft::handle_t& handle,
+void adj_to_csr(raft::device_resources const& handle,
                 const bool* adj,         // Row-major adjacency matrix
                 const index_t* row_ind,  // Precomputed row indices
                 index_t num_rows,        // # rows of adj
diff --git a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
index 4549fbe343..87c534d7b8 100644
--- a/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/adj_to_csr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include <cooperative_groups.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/vectorized.cuh>
@@ -129,7 +129,7 @@ __global__ void __launch_bounds__(adj_to_csr_tpb)
  *                         number of non-zeros in adj.
  */
 template <typename index_t = int>
-void adj_to_csr(const raft::handle_t& handle,
+void adj_to_csr(raft::device_resources const& handle,
                 const bool* adj,         // row-major adjacency matrix
                 const index_t* row_ind,  // precomputed row indices
                 index_t num_rows,        // # rows of adj
diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index acb77de358..3f155854c0 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include <cusparse_v2.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -44,7 +44,7 @@ namespace convert {
 namespace detail {
 
 template <typename value_t>
-void coo_to_csr(const raft::handle_t& handle,
+void coo_to_csr(raft::device_resources const& handle,
                 const int* srcRows,
                 const int* srcCols,
                 const value_t* srcVals,
diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h
index 3bb2db7902..6ae6874466 100644
--- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -779,7 +779,7 @@ cusparseStatus_t cusparsegemmi(  // NOLINT
   auto return_value =
     cusparsespmm(handle, opB, opA, alpha, matB, matA, beta, matC, alg, ext_buf, stream);
 
-  raft::handle_t rhandle;
+  raft::device_resources rhandle;
   raft::linalg::transpose(rhandle, CT.data(), C, n, m, stream);
   // destroy matrix/vector descriptors
   CUSPARSE_CHECK(cusparseDestroyDnMat(matA));
diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h
index a69352d74b..1e5aeb5210 100644
--- a/cpp/include/raft/sparse/distance/common.h
+++ b/cpp/include/raft/sparse/distance/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 namespace sparse {
@@ -24,7 +24,7 @@ namespace distance {
 
 template <typename value_idx, typename value_t>
 struct distances_config_t {
-  distances_config_t(const raft::handle_t& handle_) : handle(handle_) {}
+  distances_config_t(raft::device_resources const& handle_) : handle(handle_) {}
 
   // left side
   value_idx a_nrows;
@@ -42,7 +42,7 @@ struct distances_config_t {
   value_idx* b_indices;
   value_t* b_data;
 
-  const raft::handle_t& handle;
+  raft::device_resources const& handle;
 };
 
 template <typename value_t>
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index e791de10bb..d45e643780 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -22,11 +22,12 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <raft/core/operators.cuh>
+#include <raft/core/operators.hpp>
 #include <raft/sparse/convert/coo.cuh>
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/sparse/distance/detail/coo_spmv.cuh>
-#include <raft/sparse/distance/detail/operators.cuh>
 #include <raft/sparse/linalg/transpose.cuh>
 #include <rmm/device_uvector.hpp>
 
@@ -63,8 +64,12 @@ class ip_distances_t : public distances_t<value_t> {
     /**
      * Compute pairwise distances and return dense matrix in row-major format
      */
-    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd());
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(out_distances,
+                                                               *config_,
+                                                               coo_rows_b.data(),
+                                                               raft::mul_op(),
+                                                               raft::add_op(),
+                                                               raft::atomic_add_op());
   }
 
   value_idx* b_rows_coo() { return coo_rows_b.data(); }
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index 1f55dadc58..2f165b3ff2 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -112,7 +112,7 @@ __global__ void compute_correlation_warp_kernel(value_t* __restrict__ C,
   value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1);
   value_t R_denom = n * R_l2 - (R_l1 * R_l1);
 
-  value_t val = 1 - (numer / sqrt(Q_denom * R_denom));
+  value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom));
 
   // correct for small instabilities
   C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
@@ -292,7 +292,7 @@ class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, v
       this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
-        return sqrt(abs(input) * neg);
+        return raft::sqrt(abs(input) * neg);
       },
       this->config_->handle.get_stream());
   }
@@ -379,7 +379,7 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
                config_->b_nrows,
                config_->handle.get_stream(),
                [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-                 value_t norms = sqrt(q_norm) * sqrt(r_norm);
+                 value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm);
                  // deal with potential for 0 in denominator by forcing 0/1 instead
                  value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
 
@@ -429,9 +429,9 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
       out_dists,
       *config_,
       coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); },
-      Sum(),
-      AtomicAdd());
+      [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); },
+      raft::add_op(),
+      raft::atomic_add_op());
 
     raft::linalg::unaryOp<value_t>(
       out_dists,
@@ -440,7 +440,7 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
       [=] __device__(value_t input) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
         bool rectifier = (1 - input) > 0;
-        return sqrt(rectifier * (1 - input));
+        return raft::sqrt(rectifier * (1 - input));
       },
       config_->handle.get_stream());
   }
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index 0707eb2a9b..f67109afbc 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <limits.h>
 
+#include <raft/core/operators.cuh>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -28,7 +30,6 @@
 
 #include <raft/sparse/convert/coo.cuh>
 #include <raft/sparse/distance/common.h>
-#include <raft/sparse/distance/detail/operators.cuh>
 
 #include <nvfunctional>
 
@@ -88,7 +89,8 @@ class l1_unexpanded_distances_t : public distances_t<value_t> {
 
   void compute(value_t* out_dists)
   {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Sum(), AtomicAdd());
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op());
   }
 
  private:
@@ -104,7 +106,8 @@ class l2_unexpanded_distances_t : public distances_t<value_t> {
 
   void compute(value_t* out_dists)
   {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(), Sum(), AtomicAdd());
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op());
   }
 
  protected:
@@ -129,7 +132,7 @@ class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_id
       this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
-        return sqrt(abs(input) * neg);
+        return raft::sqrt(abs(input) * neg);
       },
       this->config_->handle.get_stream());
   }
@@ -145,7 +148,8 @@ class linf_unexpanded_distances_t : public distances_t<value_t> {
 
   void compute(value_t* out_dists)
   {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Max(), AtomicMax());
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op());
   }
 
  private:
@@ -172,8 +176,8 @@ class canberra_unexpanded_distances_t : public distances_t<value_t> {
         // forcing 1/0 instead
         return ((d != 0) * fabs(a - b)) / (d + (d == 0));
       },
-      Sum(),
-      AtomicAdd());
+      raft::add_op(),
+      raft::atomic_add_op());
   }
 
  private:
@@ -191,15 +195,19 @@ class lp_unexpanded_distances_t : public distances_t<value_t> {
 
   void compute(value_t* out_dists)
   {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p), Sum(), AtomicAdd());
-
-    float one_over_p = 1.0f / p;
-    raft::linalg::unaryOp<value_t>(
-      out_dists,
+    unexpanded_lp_distances<value_idx, value_t>(
       out_dists,
-      config_->a_nrows * config_->b_nrows,
-      [=] __device__(value_t input) { return pow(input, one_over_p); },
-      config_->handle.get_stream());
+      config_,
+      raft::compose_op(raft::pow_const_op<value_t>(p), raft::sub_op()),
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    value_t one_over_p = value_t{1} / p;
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   config_->a_nrows * config_->b_nrows,
+                                   raft::pow_const_op<value_t>(one_over_p),
+                                   config_->handle.get_stream());
   }
 
  private:
@@ -217,15 +225,15 @@ class hamming_unexpanded_distances_t : public distances_t<value_t> {
 
   void compute(value_t* out_dists)
   {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(), Sum(), AtomicAdd());
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op());
 
     value_t n_cols = 1.0 / config_->a_ncols;
-    raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
-      [=] __device__(value_t input) { return input * n_cols; },
-      config_->handle.get_stream());
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   config_->a_nrows * config_->b_nrows,
+                                   raft::mul_const_op<value_t>(n_cols),
+                                   config_->handle.get_stream());
   }
 
  private:
@@ -259,14 +267,14 @@ class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
 
         return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero)));
       },
-      Sum(),
-      AtomicAdd());
+      raft::add_op(),
+      raft::atomic_add_op());
 
     raft::linalg::unaryOp<value_t>(
       out_dists,
       out_dists,
       config_->a_nrows * config_->b_nrows,
-      [=] __device__(value_t input) { return sqrt(0.5 * input); },
+      [=] __device__(value_t input) { return raft::sqrt(0.5 * input); },
       config_->handle.get_stream());
   }
 
@@ -299,15 +307,14 @@ class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
       *config_,
       coo_rows.data(),
       [] __device__(value_t a, value_t b) { return a * log(a / b); },
-      Sum(),
-      AtomicAdd());
-
-    raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
-      [=] __device__(value_t input) { return 0.5 * input; },
-      config_->handle.get_stream());
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   config_->a_nrows * config_->b_nrows,
+                                   raft::mul_const_op<value_t>(0.5),
+                                   config_->handle.get_stream());
   }
 
  private:
diff --git a/cpp/include/raft/sparse/distance/detail/operators.cuh b/cpp/include/raft/sparse/distance/detail/operators.cuh
deleted file mode 100644
index 138b21e85b..0000000000
--- a/cpp/include/raft/sparse/distance/detail/operators.cuh
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/util/device_atomics.cuh>
-
-namespace raft {
-namespace sparse {
-namespace distance {
-namespace detail {
-
-struct Sum {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return a + b;
-  }
-};
-
-struct NotEqual {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return a != b;
-  }
-};
-
-struct SqDiff {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return (a - b) * (a - b);
-  }
-};
-
-struct PDiff {
-  float p;
-
-  PDiff(float p_) : p(p_) {}
-
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return pow(a - b, p);
-  }
-};
-
-struct Max {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return fmax(a, b);
-  }
-};
-
-struct AtomicAdd {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
-  {
-    return atomicAdd(a, b);
-  }
-};
-
-struct AtomicMax {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
-  {
-    return atomicMax(a, b);
-  }
-};
-
-struct Product {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return a * b;
-  }
-};
-
-struct AbsDiff {
-  template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
-    return fabs(a - b);
-  }
-};
-}  // namespace detail
-}  // namespace distance
-}  // namespace sparse
-};  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index cdc0e62130..3be33820cc 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ namespace spectral {
 namespace detail {
 
 template <typename T>
-void fit_embedding(const raft::handle_t& handle,
+void fit_embedding(raft::device_resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
@@ -88,7 +88,7 @@ void fit_embedding(const raft::handle_t& handle,
     using size_type_t  = index_type;
     using value_type_t = value_type;
 
-    std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+    std::pair<value_type_t, index_type_t> solve(raft::device_resources const& handle,
                                                 size_type_t n_obs_vecs,
                                                 size_type_t dim,
                                                 value_type_t const* __restrict__ obs,
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index 358e7d6d29..4ecd447cc4 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -325,7 +325,7 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices,
  * Symmetrizes a COO matrix
  */
 template <typename value_idx, typename value_t>
-void symmetrize(const raft::handle_t& handle,
+void symmetrize(raft::device_resources const& handle,
                 const value_idx* rows,
                 const value_idx* cols,
                 const value_t* vals,
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 0a97619e87..35d85e893f 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 #ifndef __SPARSE_SPECTRAL_H
 #define __SPARSE_SPECTRAL_H
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/linalg/detail/spectral.cuh>
 
 namespace raft {
@@ -24,7 +24,7 @@ namespace sparse {
 namespace spectral {
 
 template <typename T>
-void fit_embedding(const raft::handle_t& handle,
+void fit_embedding(raft::device_resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index a01145376a..f34ba4dbd0 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -149,7 +149,7 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices,
  * Symmetrizes a COO matrix
  */
 template <typename value_idx, typename value_t>
-void symmetrize(const raft::handle_t& handle,
+void symmetrize(raft::device_resources const& handle,
                 const value_idx* rows,
                 const value_idx* cols,
                 const value_t* vals,
diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh
index ae527fe34c..dd5a56bed1 100644
--- a/cpp/include/raft/sparse/linalg/transpose.cuh
+++ b/cpp/include/raft/sparse/linalg/transpose.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/linalg/detail/transpose.h>
 
 namespace raft {
@@ -40,7 +40,7 @@ namespace linalg {
  * @param[in] stream : Cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_transpose(const raft::handle_t& handle,
+void csr_transpose(raft::device_resources const& handle,
                    const value_idx* csr_indptr,
                    const value_idx* csr_indices,
                    const value_t* csr_data,
diff --git a/cpp/include/raft/sparse/neighbors/brute_force.cuh b/cpp/include/raft/sparse/neighbors/brute_force.cuh
index 9639ddc24c..515213d250 100644
--- a/cpp/include/raft/sparse/neighbors/brute_force.cuh
+++ b/cpp/include/raft/sparse/neighbors/brute_force.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/neighbors/detail/knn.cuh>
 
@@ -61,7 +61,7 @@ void knn(const value_idx* idxIndptr,
          value_idx* output_indices,
          value_t* output_dists,
          int k,
-         const raft::handle_t& handle,
+         raft::device_resources const& handle,
          size_t batch_size_index             = 2 << 14,  // approx 1M
          size_t batch_size_query             = 2 << 14,
          raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
diff --git a/cpp/include/raft/sparse/neighbors/connect_components.cuh b/cpp/include/raft/sparse/neighbors/connect_components.cuh
index e468643518..90343c1215 100644
--- a/cpp/include/raft/sparse/neighbors/connect_components.cuh
+++ b/cpp/include/raft/sparse/neighbors/connect_components.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/neighbors/detail/connect_components.cuh>
@@ -64,7 +64,7 @@ value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream
  */
 template <typename value_idx, typename value_t, typename red_op>
 void connect_components(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::sparse::COO<value_t, value_idx>& out,
   const value_t* X,
   const value_idx* orig_colors,
diff --git a/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh b/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
index 38ba1137ac..583ff4dfdc 100644
--- a/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -320,7 +320,7 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
  */
 template <typename value_idx, typename value_t, typename red_op>
 void connect_components(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::sparse::COO<value_t, value_idx>& out,
   const value_t* X,
   const value_idx* orig_colors,
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn.cuh b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
index 38e67036fe..7bedec9830 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -129,7 +129,7 @@ class sparse_knn_t {
                value_idx* output_indices_,
                value_t* output_dists_,
                int k_,
-               const raft::handle_t& handle_,
+               raft::device_resources const& handle_,
                size_t batch_size_index_             = 2 << 14,  // approx 1M
                size_t batch_size_query_             = 2 << 14,
                raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded,
@@ -422,7 +422,7 @@ class sparse_knn_t {
 
   int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
 
-  const raft::handle_t& handle;
+  raft::device_resources const& handle;
 };
 
 };  // namespace raft::sparse::neighbors::detail
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index ffd742f080..d53f2f8df3 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -94,7 +94,7 @@ void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
  * @param c
  */
 template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t& handle,
+void knn_graph(raft::device_resources const& handle,
                const value_t* X,
                size_t m,
                size_t n,
diff --git a/cpp/include/raft/sparse/neighbors/knn.cuh b/cpp/include/raft/sparse/neighbors/knn.cuh
index 14404adcb4..d5714fbbd1 100644
--- a/cpp/include/raft/sparse/neighbors/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,7 +72,7 @@ void brute_force_knn(const value_idx* idxIndptr,
                      value_idx* output_indices,
                      value_t* output_dists,
                      int k,
-                     const raft::handle_t& handle,
+                     raft::device_resources const& handle,
                      size_t batch_size_index             = 2 << 14,  // approx 1M
                      size_t batch_size_query             = 2 << 14,
                      raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
diff --git a/cpp/include/raft/sparse/neighbors/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
index 582df703db..dab4b53482 100644
--- a/cpp/include/raft/sparse/neighbors/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ namespace raft::sparse::neighbors {
  * @param c
  */
 template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t& handle,
+void knn_graph(raft::device_resources const& handle,
                const value_t* X,
                std::size_t m,
                std::size_t n,
diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh
index 2b2566f107..8cdfa49c45 100644
--- a/cpp/include/raft/sparse/op/detail/reduce.cuh
+++ b/cpp/include/raft/sparse/op/detail/reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -124,7 +124,7 @@ void compute_duplicates_mask(
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t& handle,
+void max_duplicates(raft::device_resources const& handle,
                     raft::sparse::COO<value_t, value_idx>& out,
                     const value_idx* rows,
                     const value_idx* cols,
diff --git a/cpp/include/raft/sparse/op/detail/slice.cuh b/cpp/include/raft/sparse/op/detail/slice.cuh
index 193d246b4b..4d2f1a4195 100644
--- a/cpp/include/raft/sparse/op/detail/slice.cuh
+++ b/cpp/include/raft/sparse/op/detail/slice.cuh
@@ -18,6 +18,7 @@
 
 #include <cusparse_v2.h>
 
+#include <raft/core/operators.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/util/cuda_utils.cuh>
@@ -70,12 +71,11 @@ void csr_row_slice_indptr(value_idx start_row,
   // we add another 1 to stop row.
   raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream);
 
-  raft::linalg::unaryOp<value_idx>(
-    indptr_out,
-    indptr_out,
-    (stop_row + 2) - start_row,
-    [s_offset] __device__(value_idx input) { return input - s_offset; },
-    stream);
+  raft::linalg::unaryOp<value_idx>(indptr_out,
+                                   indptr_out,
+                                   (stop_row + 2) - start_row,
+                                   raft::sub_const_op<value_idx>(s_offset),
+                                   stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 488d926fe9..7418b26ec8 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/op/detail/filter.cuh>
 
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index 80b479f98d..5223100b2a 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/op/detail/reduce.cuh>
 
@@ -69,7 +69,7 @@ void compute_duplicates_mask(
  * @param[in] n number of columns in COO input matrix
  */
 template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t& handle,
+void max_duplicates(raft::device_resources const& handle,
                     raft::sparse::COO<value_t, value_idx>& out,
                     const value_idx* rows,
                     const value_idx* cols,
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index d73d05785d..17e3659355 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #define __SPARSE_ROW_OP_H
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/op/detail/row_op.cuh>
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/op/slice.cuh b/cpp/include/raft/sparse/op/slice.cuh
index 30f7a97ffc..22d3f0168d 100644
--- a/cpp/include/raft/sparse/op/slice.cuh
+++ b/cpp/include/raft/sparse/op/slice.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/op/detail/slice.cuh>
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh
index ddb4b2830c..e4e69a93c7 100644
--- a/cpp/include/raft/sparse/op/sort.cuh
+++ b/cpp/include/raft/sparse/op/sort.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/op/detail/sort.h>
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 49f4e01362..63bc98b404 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 #include <cuda.h>
 #include <curand.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/spectral/detail/lapack.hpp>
 #include <raft/spectral/detail/warn_dbg.hpp>
@@ -80,7 +80,7 @@ inline curandStatus_t curandGenerateNormalX(
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(handle_t const& handle,
+int performLanczosIteration(raft::device_resources const& handle,
                             spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
                             index_type_t* iter,
                             index_type_t maxIter,
@@ -541,7 +541,7 @@ static int francisQRIteration(index_type_t n,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(handle_t const& handle,
+static int lanczosRestart(raft::device_resources const& handle,
                           index_type_t n,
                           index_type_t iter,
                           index_type_t iter_new,
@@ -744,7 +744,7 @@ static int lanczosRestart(handle_t const& handle,
  */
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -988,7 +988,7 @@ int computeSmallestEigenvectors(
 
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -1091,7 +1091,7 @@ int computeSmallestEigenvectors(
  */
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -1338,7 +1338,7 @@ int computeLargestEigenvectors(
 
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
diff --git a/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh
index d68d9f68b0..3ed58ea4ef 100644
--- a/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,17 +60,18 @@ inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::handle_t& handle_,
-                                                                 const edge_t* offsets_,
-                                                                 const vertex_t* indices_,
-                                                                 const weight_t* weights_,
-                                                                 const vertex_t v_,
-                                                                 const edge_t e_,
-                                                                 vertex_t* color_,
-                                                                 cudaStream_t stream_,
-                                                                 bool symmetrize_output_,
-                                                                 bool initialize_colors_,
-                                                                 int iterations_)
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
+  raft::device_resources const& handle_,
+  const edge_t* offsets_,
+  const vertex_t* indices_,
+  const weight_t* weights_,
+  const vertex_t v_,
+  const edge_t e_,
+  vertex_t* color_,
+  cudaStream_t stream_,
+  bool symmetrize_output_,
+  bool initialize_colors_,
+  int iterations_)
   : handle(handle_),
     offsets(offsets_),
     indices(indices_),
diff --git a/cpp/include/raft/sparse/solver/lanczos.cuh b/cpp/include/raft/sparse/solver/lanczos.cuh
index 9b5301988a..cdfaaa97f2 100644
--- a/cpp/include/raft/sparse/solver/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/lanczos.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,7 +66,7 @@ namespace raft::sparse::solver {
  */
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
@@ -130,7 +130,7 @@ int computeSmallestEigenvectors(
  */
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
diff --git a/cpp/include/raft/sparse/solver/mst.cuh b/cpp/include/raft/sparse/solver/mst.cuh
index a941ce7c80..4f7600824a 100644
--- a/cpp/include/raft/sparse/solver/mst.cuh
+++ b/cpp/include/raft/sparse/solver/mst.cuh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ namespace raft::sparse::solver {
  * when an msf is encountered)
  */
 template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t = weight_t>
-Graph_COO<vertex_t, edge_t, weight_t> mst(const raft::handle_t& handle,
+Graph_COO<vertex_t, edge_t, weight_t> mst(raft::device_resources const& handle,
                                           edge_t const* offsets,
                                           vertex_t const* indices,
                                           weight_t const* weights,
diff --git a/cpp/include/raft/sparse/solver/mst_solver.cuh b/cpp/include/raft/sparse/solver/mst_solver.cuh
index a10b74d77b..c10d7caf59 100644
--- a/cpp/include/raft/sparse/solver/mst_solver.cuh
+++ b/cpp/include/raft/sparse/solver/mst_solver.cuh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -39,7 +39,7 @@ struct Graph_COO {
 template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 class MST_solver {
  public:
-  MST_solver(const raft::handle_t& handle_,
+  MST_solver(raft::device_resources const& handle_,
              const edge_t* offsets_,
              const vertex_t* indices_,
              const weight_t* weights_,
@@ -56,7 +56,7 @@ class MST_solver {
   ~MST_solver() {}
 
  private:
-  const raft::handle_t& handle;
+  raft::device_resources const& handle;
   cudaStream_t stream;
   bool symmetrize_output, initialize_colors;
   int iterations;
diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh
index befb5524ac..3d11ffbef4 100644
--- a/cpp/include/raft/spatial/knn/ann.cuh
+++ b/cpp/include/raft/spatial/knn/ann.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ namespace raft::spatial::knn {
  */
 template <typename T = float, typename value_idx = int>
 [[deprecated("Consider using new-style raft::spatial::knn::*::build functions")]] inline void
-approx_knn_build_index(raft::handle_t& handle,
+approx_knn_build_index(raft::device_resources& handle,
                        raft::spatial::knn::knnIndex* index,
                        knnIndexParam* params,
                        raft::distance::DistanceType metric,
@@ -67,7 +67,7 @@ approx_knn_build_index(raft::handle_t& handle,
  */
 template <typename T = float, typename value_idx = int>
 [[deprecated("Consider using new-style raft::spatial::knn::*::search functions")]] inline void
-approx_knn_search(raft::handle_t& handle,
+approx_knn_search(raft::device_resources& handle,
                   float* distances,
                   int64_t* indices,
                   raft::spatial::knn::knnIndex* index,
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index a0d79a1b77..0e9e323b84 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,12 +22,10 @@
 
 #include "detail/processing.hpp"
 #include "ivf_flat_types.hpp"
+#include <raft/neighbors/ivf_pq_types.hpp>
 
 #include <raft/distance/distance_types.hpp>
 
-#include <faiss/gpu/GpuIndex.h>
-#include <raft/spatial/knn/faiss_mr.hpp>
-
 namespace raft {
 namespace spatial {
 namespace knn {
@@ -36,13 +34,14 @@ struct knnIndex {
   raft::distance::DistanceType metric;
   float metricArg;
   int nprobe;
-  std::unique_ptr<faiss::gpu::GpuIndex> index;
   std::unique_ptr<MetricProcessor<float>> metric_processor;
+
   std::unique_ptr<const ivf_flat::index<float, int64_t>> ivf_flat_float_;
   std::unique_ptr<const ivf_flat::index<uint8_t, int64_t>> ivf_flat_uint8_t_;
   std::unique_ptr<const ivf_flat::index<int8_t, int64_t>> ivf_flat_int8_t_;
 
-  std::unique_ptr<raft::spatial::knn::RmmGpuResources> gpu_res;
+  std::unique_ptr<const raft::neighbors::ivf_pq::index<int64_t>> ivf_pq;
+
   int device;
 
   template <typename T, typename IdxT>
@@ -70,16 +69,6 @@ inline auto knnIndex::ivf_flat<int8_t, int64_t>()
   return ivf_flat_int8_t_;
 }
 
-enum QuantizerType : unsigned int {
-  QT_8bit,
-  QT_4bit,
-  QT_8bit_uniform,
-  QT_4bit_uniform,
-  QT_fp16,
-  QT_8bit_direct,
-  QT_6bit
-};
-
 struct knnIndexParam {
   virtual ~knnIndexParam() {}
 };
@@ -98,11 +87,6 @@ struct IVFPQParam : IVFParam {
   bool usePrecomputedTables;
 };
 
-struct IVFSQParam : IVFParam {
-  QuantizerType qtype;
-  bool encodeResidual;
-};
-
 inline auto from_legacy_index_params(const IVFFlatParam& legacy,
                                      raft::distance::DistanceType metric,
                                      float metric_arg)
diff --git a/cpp/include/raft/spatial/knn/ball_cover.cuh b/cpp/include/raft/spatial/knn/ball_cover.cuh
index fdc2d41161..dda353e1c6 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,14 +34,14 @@
 namespace raft::spatial::knn {
 
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void rbc_build_index(const raft::handle_t& handle,
+void rbc_build_index(raft::device_resources const& handle,
                      BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index)
 {
   raft::neighbors::ball_cover::build_index(handle, index);
 }
 
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void rbc_all_knn_query(const raft::handle_t& handle,
+void rbc_all_knn_query(raft::device_resources const& handle,
                        BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
                        int_t k,
                        idx_t* inds,
@@ -54,7 +54,7 @@ void rbc_all_knn_query(const raft::handle_t& handle,
 }
 
 template <typename idx_t, typename value_t, typename int_t>
-void rbc_knn_query(const raft::handle_t& handle,
+void rbc_knn_query(raft::device_resources const& handle,
                    const BallCoverIndex<idx_t, value_t, int_t>& index,
                    int_t k,
                    const value_t* query,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
deleted file mode 100644
index 961cc76381..0000000000
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ /dev/null
@@ -1,1085 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "ann_utils.cuh"
-
-#include <thrust/gather.h>
-#include <thrust/transform.h>
-
-#include <raft/cluster/detail/kmeans_common.cuh>
-#include <raft/common/nvtx.hpp>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/distance/distance.cuh>
-#include <raft/distance/distance_types.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/normalize.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/matrix/argmin.cuh>
-#include <raft/matrix/matrix.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_vector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-
-namespace raft::spatial::knn::detail::kmeans {
-
-constexpr static inline const float kAdjustCentersWeight = 7.0f;
-
-/**
- * @brief Predict labels for the dataset; floats only.
- *
- * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows
- * * n_cluster * sizeof(float)).
- *
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param handle
- * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
- * @param n_clusters number of clusters/centers
- * @param dim dimensionality of the data
- * @param[in] dataset a pointer to the data [n_rows, dim]
- * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows]
- * @param n_rows number samples in the `dataset`
- * @param[out] labels output predictions [n_rows]
- * @param metric
- * @param stream
- * @param mr (optional) memory resource to use for temporary allocations
- */
-template <typename IdxT, typename LabelT>
-inline void predict_float_core(const handle_t& handle,
-                               const float* centers,
-                               uint32_t n_clusters,
-                               uint32_t dim,
-                               const float* dataset,
-                               const float* dataset_norm,
-                               IdxT n_rows,
-                               LabelT* labels,
-                               raft::distance::DistanceType metric,
-                               rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
-{
-  switch (metric) {
-    case raft::distance::DistanceType::L2Expanded:
-    case raft::distance::DistanceType::L2SqrtExpanded: {
-      auto workspace = raft::make_device_mdarray<char, IdxT>(
-        handle, mr, make_extents<IdxT>((sizeof(int)) * n_rows));
-
-      auto minClusterAndDistance = raft::make_device_mdarray<raft::KeyValuePair<IdxT, float>, IdxT>(
-        handle, mr, make_extents<IdxT>(n_rows));
-      raft::KeyValuePair<IdxT, float> initial_value(0, std::numeric_limits<float>::max());
-      thrust::fill(handle.get_thrust_policy(),
-                   minClusterAndDistance.data_handle(),
-                   minClusterAndDistance.data_handle() + minClusterAndDistance.size(),
-                   initial_value);
-
-      auto centroidsNorm =
-        raft::make_device_mdarray<float, uint32_t>(handle, mr, make_extents<uint32_t>(n_clusters));
-      raft::linalg::rowNorm<float, IdxT>(
-        centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream);
-
-      raft::distance::fusedL2NNMinReduce<float, raft::KeyValuePair<IdxT, float>, IdxT>(
-        minClusterAndDistance.data_handle(),
-        dataset,
-        centers,
-        dataset_norm,
-        centroidsNorm.data_handle(),
-        n_rows,
-        n_clusters,
-        dim,
-        (void*)workspace.data_handle(),
-        (metric == raft::distance::DistanceType::L2Expanded) ? false : true,
-        false,
-        stream);
-
-      // todo(lsugy): use KVP + iterator in caller.
-      // Copy keys to output labels
-      thrust::transform(handle.get_thrust_policy(),
-                        minClusterAndDistance.data_handle(),
-                        minClusterAndDistance.data_handle() + n_rows,
-                        labels,
-                        [=] __device__(raft::KeyValuePair<IdxT, float> kvp) {
-                          return static_cast<LabelT>(kvp.key);
-                        });
-      break;
-    }
-    case raft::distance::DistanceType::InnerProduct: {
-      // TODO: pass buffer
-      rmm::device_uvector<float> distances(n_rows * n_clusters, stream, mr);
-
-      float alpha = -1.0;
-      float beta  = 0.0;
-
-      linalg::gemm(handle,
-                   true,
-                   false,
-                   n_clusters,
-                   n_rows,
-                   dim,
-                   &alpha,
-                   centers,
-                   dim,
-                   dataset,
-                   dim,
-                   &beta,
-                   distances.data(),
-                   n_clusters,
-                   stream);
-
-      auto distances_const_view = raft::make_device_matrix_view<const float, IdxT, row_major>(
-        distances.data(), n_rows, static_cast<IdxT>(n_clusters));
-      auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(labels, n_rows);
-      raft::matrix::argmin(handle, distances_const_view, labels_view);
-      break;
-    }
-    default: {
-      RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
-    }
-  }
-}
-
-/**
- * @brief Suggest a minibatch size for kmeans prediction.
- *
- * This function is used as a heuristic to split the work over a large dataset
- * to reduce the size of temporary memory allocations.
- *
- * @param n_clusters number of clusters in kmeans clustering
- * @param n_rows dataset size
- * @return a suggested minibatch size
- */
-template <typename IdxT>
-constexpr inline auto calc_minibatch_size(uint32_t n_clusters,
-                                          IdxT n_rows,
-                                          uint32_t dim,
-                                          raft::distance::DistanceType metric,
-                                          bool is_float) -> IdxT
-{
-  n_clusters = std::max<uint32_t>(1, n_clusters);
-
-  // Estimate memory needs per row (i.e element of the batch).
-  IdxT mem_per_row = 0;
-  /* fusedL2NN only needs one integer per row for a mutex.
-   * Other metrics require storing a distance matrix. */
-  if (metric != raft::distance::DistanceType::L2Expanded &&
-      metric != raft::distance::DistanceType::L2SqrtExpanded) {
-    mem_per_row += sizeof(float) * n_clusters;
-  } else {
-    mem_per_row += sizeof(int);
-  }
-  // If we need to convert to float, space required for the converted batch.
-  if (!is_float) { mem_per_row += sizeof(float) * dim; }
-
-  // Heuristic: calculate the minibatch size in order to use at most 1GB of memory.
-  IdxT minibatch_size = (1 << 30) / mem_per_row;
-  minibatch_size      = 64 * ceildiv(minibatch_size, (IdxT)64);
-  minibatch_size      = std::min<IdxT>(minibatch_size, n_rows);
-  return minibatch_size;
-}
-
-/**
- * @brief Given the data and labels, calculate cluster centers and sizes in one sweep.
- *
- * Let `S_i = {x_k | x_k \in dataset & labels[k] == i}` be the vectors in the dataset with label i.
- *
- * On exit,
- *   `centers_i = (\sum_{x \in S_i} x + w_i * center_i) / (|S_i| + w_i)`,
- *     where  `w_i = reset_counters ?  0 : cluster_size[i]`.
- *
- * In other words, the updated cluster centers are a weighted average of the existing cluster
- * center, and the coordinates of the points labeled with i. _This allows calling this function
- * multiple times with different datasets with the same effect as if calling this function once
- * on the combined dataset_.
- *
- * NB: all pointers must be accessible on the device.
- *
- * @tparam T      element type
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param[inout] centers pointer to the output [n_clusters, dim]
- * @param[inout] cluster_sizes number of rows in each cluster [n_clusters]
- * @param n_clusters number of clusters/centers
- * @param dim dimensionality of the data
- * @param[in] dataset a pointer to the data [n_rows, dim]
- * @param n_rows number samples in the `dataset`
- * @param[in] labels output predictions [n_rows]
- * @param reset_counters whether to clear the output arrays before calculating.
- *    When set to `false`, this function may be used to update existing centers and sizes using
- *    the weighted average principle.
- * @param stream
- * @param mr (optional) memory resource to use for temporary allocations on the device
- */
-template <typename T, typename IdxT, typename LabelT>
-void calc_centers_and_sizes(const handle_t& handle,
-                            float* centers,
-                            uint32_t* cluster_sizes,
-                            uint32_t n_clusters,
-                            uint32_t dim,
-                            const T* dataset,
-                            IdxT n_rows,
-                            const LabelT* labels,
-                            bool reset_counters,
-                            rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr = nullptr)
-{
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-
-  if (!reset_counters) {
-    raft::linalg::matrixVectorOp(
-      centers,
-      centers,
-      cluster_sizes,
-      (int64_t)dim,
-      (int64_t)n_clusters,
-      true,
-      false,
-      [=] __device__(float c, uint32_t s) -> float { return c * s; },
-      stream);
-  }
-
-  rmm::device_uvector<char> workspace(0, stream, mr);
-
-  // If we reset the counters, we can compute directly the new sizes in cluster_sizes.
-  // If we don't reset, we compute in a temporary buffer and add in a separate step.
-  rmm::device_uvector<uint32_t> temp_cluster_sizes(0, stream, mr);
-  uint32_t* temp_sizes = cluster_sizes;
-  if (!reset_counters) {
-    temp_cluster_sizes.resize(n_clusters, stream);
-    temp_sizes = temp_cluster_sizes.data();
-  }
-
-  utils::mapping<float> mapping_op;
-  cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(dataset,
-                                                                                  mapping_op);
-
-  // todo(lsugy): use iterator from KV output of fusedL2NN
-  raft::linalg::reduce_rows_by_key(mapping_itr,
-                                   static_cast<int64_t>(dim),
-                                   labels,
-                                   nullptr,
-                                   static_cast<int64_t>(n_rows),
-                                   static_cast<int64_t>(dim),
-                                   static_cast<int64_t>(n_clusters),
-                                   centers,
-                                   stream,
-                                   reset_counters);
-
-  // Compute weight of each cluster
-  raft::cluster::detail::countLabels(handle,
-                                     labels,
-                                     temp_sizes,
-                                     static_cast<int64_t>(n_rows),
-                                     static_cast<int64_t>(n_clusters),
-                                     workspace);
-
-  // Add previous sizes if necessary
-  if (!reset_counters) {
-    raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream);
-  }
-
-  raft::linalg::matrixVectorOp(
-    centers,
-    centers,
-    cluster_sizes,
-    static_cast<int64_t>(dim),
-    static_cast<int64_t>(n_clusters),
-    true,
-    false,
-    [=] __device__(float mat, uint32_t vec) {
-      if (vec == 0u)
-        return 0.0f;
-      else
-        return mat / vec;
-    },
-    stream);
-}
-
-/** Computes the L2 norm of the dataset, converting to float if necessary */
-template <typename T, typename IdxT>
-void compute_norm(float* dataset_norm,
-                  const T* dataset,
-                  IdxT dim,
-                  IdxT n_rows,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr = nullptr)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans::compute_norm");
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-  rmm::device_uvector<float> dataset_float(0, stream, mr);
-
-  const float* dataset_ptr = nullptr;
-
-  if (std::is_same_v<float, T>) {
-    dataset_ptr = reinterpret_cast<const float*>(dataset);
-  } else {
-    dataset_float.resize(n_rows * dim, stream);
-
-    linalg::unaryOp(dataset_float.data(), dataset, n_rows * dim, utils::mapping<float>{}, stream);
-
-    dataset_ptr = (const float*)dataset_float.data();
-  }
-
-  raft::linalg::rowNorm<float, IdxT>(
-    dataset_norm, dataset_ptr, dim, n_rows, raft::linalg::L2Norm, true, stream);
-}
-
-/**
- * @brief Predict labels for the dataset.
- *
- * @tparam T      element type
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param handle
- * @param[in] centers a pointer to the row-major matrix of cluster centers [n_clusters, dim]
- * @param n_clusters number of clusters/centers
- * @param dim dimensionality of the data
- * @param[in] dataset a pointer to the data [n_rows, dim]
- * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows]
- * @param n_rows number samples in the `dataset`
- * @param[out] labels output predictions [n_rows]
- * @param metric
- * @param stream
- * @param mr (optional) memory resource to use for temporary allocations
- */
-template <typename T, typename IdxT, typename LabelT>
-void predict(const handle_t& handle,
-             const float* centers,
-             uint32_t n_clusters,
-             uint32_t dim,
-             const T* dataset,
-             IdxT n_rows,
-             LabelT* labels,
-             raft::distance::DistanceType metric,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr = nullptr,
-             const float* dataset_norm           = nullptr)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-  IdxT max_minibatch_size =
-    calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v<T, float>);
-  rmm::device_uvector<float> cur_dataset(
-    std::is_same_v<T, float> ? 0 : max_minibatch_size * dim, stream, mr);
-  bool need_compute_norm =
-    dataset_norm == nullptr && (metric == raft::distance::DistanceType::L2Expanded ||
-                                metric == raft::distance::DistanceType::L2SqrtExpanded);
-  rmm::device_uvector<float> cur_dataset_norm(
-    need_compute_norm ? max_minibatch_size : 0, stream, mr);
-  const float* dataset_norm_ptr = nullptr;
-  auto cur_dataset_ptr          = cur_dataset.data();
-  for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
-    IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);
-
-    if constexpr (std::is_same_v<T, float>) {
-      cur_dataset_ptr = const_cast<float*>(dataset + offset * dim);
-    } else {
-      linalg::unaryOp(cur_dataset_ptr,
-                      dataset + offset * dim,
-                      (IdxT)(minibatch_size * dim),
-                      utils::mapping<float>{},
-                      stream);
-    }
-
-    // Compute the norm now if it hasn't been pre-computed.
-    if (need_compute_norm) {
-      compute_norm<float, IdxT>(
-        cur_dataset_norm.data(), cur_dataset_ptr, (IdxT)dim, (IdxT)minibatch_size, stream, mr);
-      dataset_norm_ptr = cur_dataset_norm.data();
-    } else if (dataset_norm != nullptr) {
-      dataset_norm_ptr = dataset_norm + offset;
-    }
-
-    predict_float_core<IdxT, LabelT>(handle,
-                                     centers,
-                                     n_clusters,
-                                     dim,
-                                     cur_dataset_ptr,
-                                     dataset_norm_ptr,
-                                     minibatch_size,
-                                     labels + offset,
-                                     metric,
-                                     stream,
-                                     mr);
-  }
-}
-
-template <typename T, uint32_t BlockDimY, typename IdxT, typename LabelT>
-__global__ void __launch_bounds__((WarpSize * BlockDimY))
-  adjust_centers_kernel(float* centers,  // [n_clusters, dim]
-                        uint32_t n_clusters,
-                        uint32_t dim,
-                        const T* dataset,  // [n_rows, dim]
-                        IdxT n_rows,
-                        const LabelT* labels,           // [n_rows]
-                        const uint32_t* cluster_sizes,  // [n_clusters]
-                        float threshold,
-                        uint32_t average,
-                        uint32_t seed,
-                        uint32_t* count)
-{
-  uint32_t l = threadIdx.y + BlockDimY * blockIdx.y;
-  if (l >= n_clusters) return;
-  auto csize = cluster_sizes[l];
-  // skip big clusters
-  if (csize > static_cast<uint32_t>(average * threshold)) return;
-
-  // choose a "random" i that belongs to a rather large cluster
-  IdxT i;
-  uint32_t j = laneId();
-  if (j == 0) {
-    do {
-      auto old = static_cast<IdxT>(atomicAdd(count, 1));
-      i        = (seed * (old + 1)) % n_rows;
-    } while (cluster_sizes[labels[i]] < average);
-  }
-  i = raft::shfl(i, 0);
-
-  // Adjust the center of the selected smaller cluster to gravitate towards
-  // a sample from the selected larger cluster.
-  const IdxT li = static_cast<IdxT>(labels[i]);
-  // Weight of the current center for the weighted average.
-  // We dump it for anomalously small clusters, but keep constant otherwise.
-  const float wc = csize > kAdjustCentersWeight ? kAdjustCentersWeight : float(csize);
-  // Weight for the datapoint used to shift the center.
-  const float wd = 1.0;
-  for (; j < dim; j += WarpSize) {
-    float val = 0;
-    val += wc * centers[j + dim * li];
-    val += wd * utils::mapping<float>{}(dataset[j + static_cast<IdxT>(dim) * i]);
-    val /= wc + wd;
-    centers[j + dim * l] = val;
-  }
-}
-
-/**
- * @brief Adjust centers for clusters that have small number of entries.
- *
- * For each cluster, where the cluster size is not bigger than a threshold, the center is moved
- * towards a data point that belongs to a large cluster.
- *
- * NB: if this function returns `true`, you should update the labels.
- *
- * NB: all pointers are used either on the host side or on the device side together.
- *
- * @tparam T element type
- *
- * @param[inout] centers cluster centers [n_clusters, dim]
- * @param n_clusters number of rows in `centers`
- * @param dim number of columns in `centers` and `dataset`
- * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim]
- * @param n_rows number of rows in `dataset`
- * @param[in] labels a host pointer to the cluster indices [n_rows]
- * @param[in] cluster_sizes number of rows in each cluster [n_clusters]
- * @param threshold defines a criterion for adjusting a cluster
- *                   (cluster_sizes <= average_size * threshold)
- *                   0 <= threshold < 1
- * @param device_memory  memory resource to use for temporary allocations
- * @param stream
- *
- * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated).
- */
-template <typename T, typename IdxT, typename LabelT>
-auto adjust_centers(float* centers,
-                    uint32_t n_clusters,
-                    uint32_t dim,
-                    const T* dataset,
-                    IdxT n_rows,
-                    const LabelT* labels,
-                    const uint32_t* cluster_sizes,
-                    float threshold,
-                    rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* device_memory) -> bool
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
-  if (n_clusters == 0) { return false; }
-  constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
-                                      601,  659,  733,  809,  863,  941,  1013, 1069, 1151, 1223,
-                                      1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987,
-                                      2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741};
-  static IdxT i        = 0;
-  static IdxT i_primes = 0;
-
-  bool adjusted    = false;
-  uint32_t average = static_cast<uint32_t>(n_rows / static_cast<IdxT>(n_clusters));
-  uint32_t ofst;
-  do {
-    i_primes = (i_primes + 1) % kPrimes.size();
-    ofst     = kPrimes[i_primes];
-  } while (n_rows % ofst == 0);
-
-  switch (utils::check_pointer_residency(centers, dataset, labels, cluster_sizes)) {
-    case utils::pointer_residency::host_and_device:
-    case utils::pointer_residency::device_only: {
-      constexpr uint32_t kBlockDimY = 4;
-      const dim3 block_dim(WarpSize, kBlockDimY, 1);
-      const dim3 grid_dim(1, raft::ceildiv(n_clusters, kBlockDimY), 1);
-      rmm::device_scalar<uint32_t> update_count(0, stream, device_memory);
-      adjust_centers_kernel<T, kBlockDimY><<<grid_dim, block_dim, 0, stream>>>(centers,
-                                                                               n_clusters,
-                                                                               dim,
-                                                                               dataset,
-                                                                               n_rows,
-                                                                               labels,
-                                                                               cluster_sizes,
-                                                                               threshold,
-                                                                               average,
-                                                                               ofst,
-                                                                               update_count.data());
-      adjusted = update_count.value(stream) > 0;  // NB: rmm scalar performs the sync
-    } break;
-    case utils::pointer_residency::host_only: {
-      stream.synchronize();
-      for (uint32_t l = 0; l < n_clusters; l++) {
-        auto csize = cluster_sizes[l];
-        // skip big clusters
-        if (csize > static_cast<uint32_t>(average * threshold)) continue;
-        // choose a "random" i that belongs to a rather large cluster
-        do {
-          i = (i + ofst) % n_rows;
-        } while (cluster_sizes[labels[i]] < average);
-        // Adjust the center of the selected smaller cluster to gravitate towards
-        // a sample from the selected larger cluster.
-        const IdxT li = static_cast<IdxT>(labels[i]);
-        // Weight of the current center for the weighted average.
-        // We dump it for anomalously small clusters, but keep constant otherwise.
-        const float wc = std::min<float>(csize, kAdjustCentersWeight);
-        // Weight for the datapoint used to shift the center.
-        const float wd = 1.0;
-        for (uint32_t j = 0; j < dim; j++) {
-          float val = 0;
-          val += wc * centers[j + dim * li];
-          val += wd * utils::mapping<float>{}(dataset[j + static_cast<IdxT>(dim) * i]);
-          val /= wc + wd;
-          centers[j + dim * l] = val;
-        }
-        adjusted = true;
-      }
-      stream.synchronize();
-    } break;
-    default: RAFT_FAIL("All pointers must reside on the same side, host or device.");
-  }
-  return adjusted;
-}
-
-/**
- * @brief Expectation-maximization-balancing combined in an iterative process.
- *
- * Note, the `cluster_centers` is assumed to be already initialized here.
- * Thus, this function can be used for fine-tuning existing clusters;
- * to train from scratch, use `build_clusters` function below.
- *
- * @tparam T      element type
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param handle
- * @param n_iters the requested number of iteration
- * @param dim the dimensionality of the dataset
- * @param[in] dataset a pointer to a managed row-major array [n_rows, dim]
- * @param[in] dataset_norm pointer to the precomputed norm (for L2 metrics only) [n_rows]
- * @param n_rows the number of rows in the dataset
- * @param n_cluster the requested number of clusters
- * @param[inout] cluster_centers a pointer to a managed row-major array [n_clusters, dim]
- * @param[out] cluster_labels a pointer to a managed row-major array [n_rows]
- * @param[out] cluster_sizes a pointer to a managed row-major array [n_clusters]
- * @param metric the distance type (there is a tweak in place for the similarity-based metrics)
- * @param balancing_pullback
- *   if the cluster centers are rebalanced on this number of iterations,
- *   one extra iteration is performed (this could happen several times) (default should be `2`).
- *   In other words, the first and then every `ballancing_pullback`-th rebalancing operation adds
- *   one more iteration to the main cycle.
- * @param balancing_threshold
- *   the rebalancing takes place if any cluster is smaller than `avg_size * balancing_threshold`
- *   on a given iteration (default should be `~ 0.25`).
- * @param stream
- * @param device_memory
- *   a memory resource for device allocations (makes sense to provide a memory pool here)
- */
-template <typename T, typename IdxT, typename LabelT>
-void balancing_em_iters(const handle_t& handle,
-                        uint32_t n_iters,
-                        uint32_t dim,
-                        const T* dataset,
-                        const float* dataset_norm,
-                        IdxT n_rows,
-                        uint32_t n_clusters,
-                        float* cluster_centers,
-                        LabelT* cluster_labels,
-                        uint32_t* cluster_sizes,
-                        raft::distance::DistanceType metric,
-                        uint32_t balancing_pullback,
-                        float balancing_threshold,
-                        rmm::cuda_stream_view stream,
-                        rmm::mr::device_memory_resource* device_memory)
-{
-  uint32_t balancing_counter = balancing_pullback;
-  for (uint32_t iter = 0; iter < n_iters; iter++) {
-    // Balancing step - move the centers around to equalize cluster sizes
-    // (but not on the first iteration)
-    if (iter > 0 && kmeans::adjust_centers(cluster_centers,
-                                           n_clusters,
-                                           dim,
-                                           dataset,
-                                           n_rows,
-                                           cluster_labels,
-                                           cluster_sizes,
-                                           balancing_threshold,
-                                           stream,
-                                           device_memory)) {
-      if (balancing_counter++ >= balancing_pullback) {
-        balancing_counter -= balancing_pullback;
-        n_iters++;
-      }
-    }
-    switch (metric) {
-      // For some metrics, cluster calculation and adjustment tends to favor zero center vectors.
-      // To avoid converging to zero, we normalize the center vectors on every iteration.
-      case raft::distance::DistanceType::InnerProduct:
-      case raft::distance::DistanceType::CosineExpanded:
-      case raft::distance::DistanceType::CorrelationExpanded: {
-        auto clusters_in_view =
-          raft::make_device_matrix_view<const float, uint32_t, raft::row_major>(
-            cluster_centers, n_clusters, dim);
-        auto clusters_out_view = raft::make_device_matrix_view<float, uint32_t, raft::row_major>(
-          cluster_centers, n_clusters, dim);
-        raft::linalg::row_normalize(
-          handle, clusters_in_view, clusters_out_view, raft::linalg::L2Norm);
-        break;
-      }
-      default: break;
-    }
-    // E: Expectation step - predict labels
-    predict<T, IdxT, LabelT>(handle,
-                             cluster_centers,
-                             n_clusters,
-                             dim,
-                             dataset,
-                             n_rows,
-                             cluster_labels,
-                             metric,
-                             stream,
-                             device_memory,
-                             dataset_norm);
-    // M: Maximization step - calculate optimal cluster centers
-    calc_centers_and_sizes(handle,
-                           cluster_centers,
-                           cluster_sizes,
-                           n_clusters,
-                           dim,
-                           dataset,
-                           n_rows,
-                           cluster_labels,
-                           true,
-                           stream,
-                           device_memory);
-  }
-}
-
-/** Randomly initialize cluster centers and then call `balancing_em_iters`. */
-template <typename T, typename IdxT, typename LabelT>
-void build_clusters(const handle_t& handle,
-                    uint32_t n_iters,
-                    uint32_t dim,
-                    const T* dataset,
-                    IdxT n_rows,
-                    uint32_t n_clusters,
-                    float* cluster_centers,
-                    LabelT* cluster_labels,
-                    uint32_t* cluster_sizes,
-                    raft::distance::DistanceType metric,
-                    rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* device_memory,
-                    const float* dataset_norm = nullptr)
-{
-  RAFT_EXPECTS(static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(dim) <=
-                 static_cast<uint64_t>(std::numeric_limits<IdxT>::max()),
-               "the chosen index type cannot represent all indices for the given dataset");
-
-  // "randomly initialize labels"
-  auto f = [n_clusters] __device__(LabelT * out, IdxT i) {
-    *out = LabelT(i % static_cast<IdxT>(n_clusters));
-  };
-  linalg::writeOnlyUnaryOp<LabelT, decltype(f), IdxT>(cluster_labels, n_rows, f, stream);
-
-  // update centers to match the initialized labels.
-  calc_centers_and_sizes(handle,
-                         cluster_centers,
-                         cluster_sizes,
-                         n_clusters,
-                         dim,
-                         dataset,
-                         n_rows,
-                         cluster_labels,
-                         true,
-                         stream,
-                         device_memory);
-
-  // run EM
-  balancing_em_iters<T, IdxT, LabelT>(handle,
-                                      n_iters,
-                                      dim,
-                                      dataset,
-                                      dataset_norm,
-                                      n_rows,
-                                      n_clusters,
-                                      cluster_centers,
-                                      cluster_labels,
-                                      cluster_sizes,
-                                      metric,
-                                      2,
-                                      0.25f,
-                                      stream,
-                                      device_memory);
-}
-
-/** Calculate how many fine clusters should belong to each mesocluster. */
-template <typename IdxT>
-inline auto arrange_fine_clusters(uint32_t n_clusters,
-                                  uint32_t n_mesoclusters,
-                                  IdxT n_rows,
-                                  const uint32_t* mesocluster_sizes)
-{
-  std::vector<uint32_t> fine_clusters_nums(n_mesoclusters);
-  std::vector<uint32_t> fine_clusters_csum(n_mesoclusters + 1);
-  fine_clusters_csum[0] = 0;
-
-  uint32_t n_lists_rem       = n_clusters;
-  uint32_t n_nonempty_ms_rem = 0;
-  for (uint32_t i = 0; i < n_mesoclusters; i++) {
-    n_nonempty_ms_rem += mesocluster_sizes[i] > 0 ? 1 : 0;
-  }
-  IdxT n_rows_rem                 = n_rows;
-  IdxT mesocluster_size_sum       = 0;
-  uint32_t mesocluster_size_max   = 0;
-  uint32_t fine_clusters_nums_max = 0;
-  for (uint32_t i = 0; i < n_mesoclusters; i++) {
-    if (i < n_mesoclusters - 1) {
-      // Although the algorithm is meant to produce balanced clusters, when something
-      // goes wrong, we may get empty clusters (e.g. during development/debugging).
-      // The code below ensures a proportional arrangement of fine cluster numbers
-      // per mesocluster, even if some clusters are empty.
-      if (mesocluster_sizes[i] == 0) {
-        fine_clusters_nums[i] = 0;
-      } else {
-        n_nonempty_ms_rem--;
-        auto s = uint32_t((double)n_lists_rem * mesocluster_sizes[i] / n_rows_rem + .5);
-        s      = std::min<uint32_t>(s, n_lists_rem - n_nonempty_ms_rem);
-        fine_clusters_nums[i] = std::max<uint32_t>(s, 1);
-      }
-    } else {
-      fine_clusters_nums[i] = n_lists_rem;
-    }
-    n_lists_rem -= fine_clusters_nums[i];
-    n_rows_rem -= mesocluster_sizes[i];
-    mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]);
-    mesocluster_size_sum += mesocluster_sizes[i];
-    fine_clusters_nums_max    = max(fine_clusters_nums_max, fine_clusters_nums[i]);
-    fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i];
-  }
-
-  RAFT_EXPECTS(mesocluster_size_sum == n_rows,
-               "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)",
-               static_cast<size_t>(mesocluster_size_sum),
-               static_cast<size_t>(n_rows));
-  RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters,
-               "fine cluster numbers do not add up (%u) to the total number of clusters (%u)",
-               fine_clusters_csum[n_mesoclusters],
-               n_clusters);
-
-  return std::make_tuple(mesocluster_size_max,
-                         fine_clusters_nums_max,
-                         std::move(fine_clusters_nums),
-                         std::move(fine_clusters_csum));
-}
-
-/**
- *  Given the (coarse) mesoclusters and the distribution of fine clusters within them,
- *  build the fine clusters.
- *
- *  Processing one mesocluster at a time:
- *   1. Copy mesocluster data into a separate buffer
- *   2. Predict fine cluster
- *   3. Refince the fine cluster centers
- *
- *  As a result, the fine clusters are what is returned by `build_hierarchical`;
- *  this function returns the total number of fine clusters, which can be checked to be
- *  the same as the requested number of clusters.
- */
-template <typename T, typename IdxT, typename LabelT>
-auto build_fine_clusters(const handle_t& handle,
-                         uint32_t n_iters,
-                         uint32_t dim,
-                         const T* dataset_mptr,
-                         const float* dataset_norm_mptr,
-                         const LabelT* labels_mptr,
-                         IdxT n_rows,
-                         const uint32_t* fine_clusters_nums,
-                         const uint32_t* fine_clusters_csum,
-                         const uint32_t* mesocluster_sizes,
-                         uint32_t n_mesoclusters,
-                         uint32_t mesocluster_size_max,
-                         uint32_t fine_clusters_nums_max,
-                         float* cluster_centers,
-                         raft::distance::DistanceType metric,
-                         rmm::mr::device_memory_resource* managed_memory,
-                         rmm::mr::device_memory_resource* device_memory,
-                         rmm::cuda_stream_view stream) -> uint32_t
-{
-  rmm::device_uvector<IdxT> mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory);
-  rmm::device_uvector<float> mc_trainset_buf(mesocluster_size_max * dim, stream, device_memory);
-  rmm::device_uvector<float> mc_trainset_norm_buf(mesocluster_size_max, stream, device_memory);
-  auto mc_trainset_ids  = mc_trainset_ids_buf.data();
-  auto mc_trainset      = mc_trainset_buf.data();
-  auto mc_trainset_norm = mc_trainset_norm_buf.data();
-
-  // label (cluster ID) of each vector
-  rmm::device_uvector<LabelT> mc_trainset_labels(mesocluster_size_max, stream, device_memory);
-
-  rmm::device_uvector<float> mc_trainset_ccenters(
-    fine_clusters_nums_max * dim, stream, device_memory);
-  // number of vectors in each cluster
-  rmm::device_uvector<uint32_t> mc_trainset_csizes_tmp(
-    fine_clusters_nums_max, stream, device_memory);
-
-  // Training clusters in each meso-cluster
-  uint32_t n_clusters_done = 0;
-  for (uint32_t i = 0; i < n_mesoclusters; i++) {
-    uint32_t k = 0;
-    for (IdxT j = 0; j < n_rows; j++) {
-      if (labels_mptr[j] == (LabelT)i) { mc_trainset_ids[k++] = j; }
-    }
-    if (k != mesocluster_sizes[i])
-      RAFT_LOG_WARN("Incorrect mesocluster size at %d. %d vs %d", i, k, mesocluster_sizes[i]);
-    if (k == 0) {
-      RAFT_LOG_DEBUG("Empty cluster %d", i);
-      RAFT_EXPECTS(fine_clusters_nums[i] == 0,
-                   "Number of fine clusters must be zero for the empty mesocluster (got %d)",
-                   fine_clusters_nums[i]);
-      continue;
-    } else {
-      RAFT_EXPECTS(fine_clusters_nums[i] > 0,
-                   "Number of fine clusters must be non-zero for a non-empty mesocluster");
-    }
-
-    utils::copy_selected((IdxT)mesocluster_sizes[i],
-                         (IdxT)dim,
-                         dataset_mptr,
-                         mc_trainset_ids,
-                         (IdxT)dim,
-                         mc_trainset,
-                         (IdxT)dim,
-                         stream);
-    if (metric == raft::distance::DistanceType::L2Expanded ||
-        metric == raft::distance::DistanceType::L2SqrtExpanded) {
-      thrust::gather(handle.get_thrust_policy(),
-                     mc_trainset_ids,
-                     mc_trainset_ids + mesocluster_sizes[i],
-                     dataset_norm_mptr,
-                     mc_trainset_norm);
-    }
-
-    build_clusters<float, IdxT, LabelT>(handle,
-                                        n_iters,
-                                        dim,
-                                        mc_trainset,
-                                        mesocluster_sizes[i],
-                                        fine_clusters_nums[i],
-                                        mc_trainset_ccenters.data(),
-                                        mc_trainset_labels.data(),
-                                        mc_trainset_csizes_tmp.data(),
-                                        metric,
-                                        stream,
-                                        device_memory,
-                                        mc_trainset_norm);
-
-    raft::copy(cluster_centers + (dim * fine_clusters_csum[i]),
-               mc_trainset_ccenters.data(),
-               fine_clusters_nums[i] * dim,
-               stream);
-    handle.sync_stream(stream);
-    n_clusters_done += fine_clusters_nums[i];
-  }
-  return n_clusters_done;
-}
-
-/**
- * @brief Hierarchical balanced k-means
- *
- * @tparam T      element type
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param handle
- * @param n_iters number of training iterations
- * @param dim number of columns in `centers` and `dataset`
- * @param[in] dataset a device pointer to the source dataset [n_rows, dim]
- * @param n_rows number of rows in the input
- * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim]
- * @param n_cluster
- * @param metric the distance type
- * @param stream
- */
-template <typename T, typename IdxT>
-void build_hierarchical(const handle_t& handle,
-                        uint32_t n_iters,
-                        uint32_t dim,
-                        const T* dataset,
-                        IdxT n_rows,
-                        float* cluster_centers,
-                        uint32_t n_clusters,
-                        raft::distance::DistanceType metric,
-                        rmm::cuda_stream_view stream)
-{
-  using LabelT = uint32_t;
-
-  RAFT_EXPECTS(static_cast<uint64_t>(n_rows) * static_cast<uint64_t>(dim) <=
-                 static_cast<uint64_t>(std::numeric_limits<IdxT>::max()),
-               "the chosen index type cannot represent all indices for the given dataset");
-
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "kmeans::build_hierarchical(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
-
-  uint32_t n_mesoclusters = std::min<uint32_t>(n_clusters, std::sqrt(n_clusters) + 0.5);
-  RAFT_LOG_DEBUG("kmeans::build_hierarchical: n_mesoclusters: %u", n_mesoclusters);
-
-  rmm::mr::managed_memory_resource managed_memory;
-  rmm::mr::device_memory_resource* device_memory = nullptr;
-  IdxT max_minibatch_size =
-    calc_minibatch_size(n_clusters, n_rows, dim, metric, std::is_same_v<T, float>);
-  auto pool_guard = raft::get_pool_memory_resource(device_memory, max_minibatch_size * dim * 4);
-  if (pool_guard) {
-    RAFT_LOG_DEBUG(
-      "kmeans::build_hierarchical: using pool memory resource with initial size %zu bytes",
-      pool_guard->pool_size());
-  }
-
-  // Precompute the L2 norm of the dataset if relevant.
-  const float* dataset_norm = nullptr;
-  rmm::device_uvector<float> dataset_norm_buf(0, stream, device_memory);
-  if (metric == raft::distance::DistanceType::L2Expanded ||
-      metric == raft::distance::DistanceType::L2SqrtExpanded) {
-    dataset_norm_buf.resize(n_rows, stream);
-    for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
-      IdxT minibatch_size = std::min<IdxT>(max_minibatch_size, n_rows - offset);
-      compute_norm<T, IdxT>(dataset_norm_buf.data() + offset,
-                            dataset + dim * offset,
-                            (IdxT)dim,
-                            (IdxT)minibatch_size,
-                            stream,
-                            device_memory);
-    }
-    dataset_norm = (const float*)dataset_norm_buf.data();
-  }
-
-  // build coarse clusters (mesoclusters)
-  rmm::device_uvector<LabelT> mesocluster_labels_buf(n_rows, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
-  {
-    rmm::device_uvector<float> mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory);
-    build_clusters<T, IdxT, LabelT>(handle,
-                                    n_iters,
-                                    dim,
-                                    dataset,
-                                    n_rows,
-                                    n_mesoclusters,
-                                    mesocluster_centers_buf.data(),
-                                    mesocluster_labels_buf.data(),
-                                    mesocluster_sizes_buf.data(),
-                                    metric,
-                                    stream,
-                                    device_memory,
-                                    dataset_norm);
-  }
-
-  auto mesocluster_sizes  = mesocluster_sizes_buf.data();
-  auto mesocluster_labels = mesocluster_labels_buf.data();
-
-  handle.sync_stream(stream);
-
-  // build fine clusters
-  auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] =
-    arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes);
-
-  if (mesocluster_size_max * n_mesoclusters > 2 * n_rows) {
-    RAFT_LOG_WARN("build_hierarchical: built unbalanced mesoclusters");
-    RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters);
-    RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters);
-  }
-
-  auto n_clusters_done = build_fine_clusters<T, IdxT, LabelT>(handle,
-                                                              n_iters,
-                                                              dim,
-                                                              dataset,
-                                                              dataset_norm,
-                                                              mesocluster_labels,
-                                                              n_rows,
-                                                              fine_clusters_nums.data(),
-                                                              fine_clusters_csum.data(),
-                                                              mesocluster_sizes,
-                                                              n_mesoclusters,
-                                                              mesocluster_size_max,
-                                                              fine_clusters_nums_max,
-                                                              cluster_centers,
-                                                              metric,
-                                                              &managed_memory,
-                                                              device_memory,
-                                                              stream);
-  RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");
-
-  rmm::device_uvector<uint32_t> cluster_sizes(n_clusters, stream, device_memory);
-  rmm::device_uvector<LabelT> labels(n_rows, stream, device_memory);
-
-  // Fine-tuning kmeans for all clusters
-  //
-  // (*) Since the likely cluster centroids have been calculated
-  // hierarchically already, the number of iteration for fine-tuning
-  // kmeans for whole clusters should be reduced. However, there
-  // is a possibility that the clusters could be unbalanced here,
-  // in which case the actual number of iterations would be increased.
-  //
-  balancing_em_iters<T, IdxT, LabelT>(handle,
-                                      std::max<uint32_t>(n_iters / 10, 2),
-                                      dim,
-                                      dataset,
-                                      dataset_norm,
-                                      n_rows,
-                                      n_clusters,
-                                      cluster_centers,
-                                      labels.data(),
-                                      cluster_sizes.data(),
-                                      metric,
-                                      5,
-                                      0.2f,
-                                      stream,
-                                      device_memory);
-}
-
-}  // namespace raft::spatial::knn::detail::kmeans
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index e5900ffd69..427e812cda 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,89 +18,25 @@
 
 #include "../ann_common.h"
 #include "../ivf_flat.cuh"
-#include "knn_brute_force_faiss.cuh"
 
-#include "common_faiss.h"
 #include "processing.cuh"
+#include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/label/classlabels.cuh>
-#include <raft/spatial/knn/faiss_mr.hpp>
+#include <raft/neighbors/ivf_pq.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
 #include <thrust/iterator/transform_iterator.h>
 
 namespace raft::spatial::knn::detail {
 
-inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
-{
-  switch (qtype) {
-    case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
-    case QuantizerType::QT_8bit_uniform:
-      return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform;
-    case QuantizerType::QT_4bit_uniform:
-      return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform;
-    case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
-    case QuantizerType::QT_8bit_direct:
-      return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct;
-    case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
-    default: return (faiss::ScalarQuantizer::QuantizerType)qtype;
-  }
-}
-
-template <typename IntType = int>
-void approx_knn_ivfflat_build_index(knnIndex* index,
-                                    const IVFFlatParam& params,
-                                    IntType n,
-                                    IntType D)
-{
-  faiss::gpu::GpuIndexIVFFlatConfig config;
-  config.device                  = index->device;
-  faiss::MetricType faiss_metric = build_faiss_metric(index->metric);
-  index->index.reset(
-    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res.get(), D, params.nlist, faiss_metric, config));
-}
-
-template <typename IntType = int>
-void approx_knn_ivfpq_build_index(knnIndex* index, const IVFPQParam& params, IntType n, IntType D)
-{
-  faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device                  = index->device;
-  config.usePrecomputedTables    = params.usePrecomputedTables;
-  config.interleavedLayout       = params.n_bits != 8;
-  faiss::MetricType faiss_metric = build_faiss_metric(index->metric);
-  index->index.reset(new faiss::gpu::GpuIndexIVFPQ(
-    index->gpu_res.get(), D, params.nlist, params.M, params.n_bits, faiss_metric, config));
-}
-
-template <typename IntType = int>
-void approx_knn_ivfsq_build_index(knnIndex* index, const IVFSQParam& params, IntType n, IntType D)
-{
-  faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
-  config.device                                     = index->device;
-  faiss::MetricType faiss_metric                    = build_faiss_metric(index->metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params.qtype);
-  index->index.reset(new faiss::gpu::GpuIndexIVFScalarQuantizer(
-    index->gpu_res.get(), D, params.nlist, faiss_qtype, faiss_metric, params.encodeResidual));
-}
-
 template <typename T = float, typename IntType = int>
-void approx_knn_build_index(const handle_t& handle,
+void approx_knn_build_index(raft::device_resources const& handle,
                             knnIndex* index,
                             knnIndexParam* params,
                             raft::distance::DistanceType metric,
@@ -110,7 +46,6 @@ void approx_knn_build_index(const handle_t& handle,
                             IntType D)
 {
   auto stream      = handle.get_stream();
-  index->index     = nullptr;
   index->metric    = metric;
   index->metricArg = metricArg;
   if (dynamic_cast<const IVFParam*>(params)) {
@@ -118,46 +53,42 @@ void approx_knn_build_index(const handle_t& handle,
   }
   auto ivf_ft_pams = dynamic_cast<IVFFlatParam*>(params);
   auto ivf_pq_pams = dynamic_cast<IVFPQParam*>(params);
-  auto ivf_sq_pams = dynamic_cast<IVFSQParam*>(params);
 
   if constexpr (std::is_same_v<T, float>) {
     index->metric_processor = create_processor<float>(metric, n, D, 0, false, stream);
+    // For cosine/correlation distance, the metric processor translates distance
+    // to inner product via pre/post processing - pass the translated metric to
+    // ANN index
+    if (metric == raft::distance::DistanceType::CosineExpanded ||
+        metric == raft::distance::DistanceType::CorrelationExpanded) {
+      metric = index->metric = raft::distance::DistanceType::InnerProduct;
+    }
   }
   if constexpr (std::is_same_v<T, float>) { index->metric_processor->preprocess(index_array); }
 
-  if (ivf_ft_pams && (metric == raft::distance::DistanceType::L2Unexpanded ||
-                      metric == raft::distance::DistanceType::L2Expanded ||
-                      metric == raft::distance::DistanceType::InnerProduct)) {
+  if (ivf_ft_pams) {
     auto new_params               = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
     index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
       ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
+  } else if (ivf_pq_pams) {
+    neighbors::ivf_pq::index_params params;
+    params.metric     = metric;
+    params.metric_arg = metricArg;
+    params.n_lists    = ivf_pq_pams->nlist;
+    params.pq_bits    = ivf_pq_pams->n_bits;
+    params.pq_dim     = ivf_pq_pams->M;
+    // TODO: handle ivf_pq_pams->usePrecomputedTables ?
+    index->ivf_pq = std::make_unique<const neighbors::ivf_pq::index<int64_t>>(
+      neighbors::ivf_pq::build(handle, params, index_array, int64_t(n), D));
   } else {
-    RAFT_CUDA_TRY(cudaGetDevice(&(index->device)));
-    index->gpu_res.reset(new raft::spatial::knn::RmmGpuResources());
-    index->gpu_res->noTempMemory();
-    index->gpu_res->setDefaultStream(index->device, stream);
-    if (ivf_ft_pams) {
-      approx_knn_ivfflat_build_index(index, *ivf_ft_pams, n, D);
-    } else if (ivf_pq_pams) {
-      approx_knn_ivfpq_build_index(index, *ivf_pq_pams, n, D);
-    } else if (ivf_sq_pams) {
-      approx_knn_ivfsq_build_index(index, *ivf_sq_pams, n, D);
-    } else {
-      RAFT_FAIL("Unrecognized index type.");
-    }
-    if constexpr (std::is_same_v<T, float>) {
-      index->index->train(n, index_array);
-      index->index->add(n, index_array);
-    } else {
-      RAFT_FAIL("FAISS-based index supports only float data.");
-    }
+    RAFT_FAIL("Unrecognized index type.");
   }
 
   if constexpr (std::is_same_v<T, float>) { index->metric_processor->revert(index_array); }
 }
 
 template <typename T = float, typename IntType = int>
-void approx_knn_search(const handle_t& handle,
+void approx_knn_search(raft::device_resources const& handle,
                        float* distances,
                        int64_t* indices,
                        knnIndex* index,
@@ -165,26 +96,22 @@ void approx_knn_search(const handle_t& handle,
                        T* query_array,
                        IntType n)
 {
-  auto faiss_ivf = dynamic_cast<GpuIndexIVF*>(index->index.get());
-  if (faiss_ivf) { faiss_ivf->setNumProbes(index->nprobe); }
-
   if constexpr (std::is_same_v<T, float>) {
     index->metric_processor->preprocess(query_array);
     index->metric_processor->set_num_queries(k);
   }
 
   // search
-  if (faiss_ivf) {
-    if constexpr (std::is_same_v<T, float>) {
-      faiss_ivf->search(n, query_array, k, distances, indices);
-    } else {
-      RAFT_FAIL("FAISS-based index supports only float data.");
-    }
-  } else if (index->ivf_flat<T, int64_t>()) {
+  if (index->ivf_flat<T, int64_t>()) {
     ivf_flat::search_params params;
     params.n_probes = index->nprobe;
     ivf_flat::search(
       handle, params, *(index->ivf_flat<T, int64_t>()), query_array, n, k, indices, distances);
+  } else if (index->ivf_pq) {
+    neighbors::ivf_pq::search_params params;
+    params.n_probes = index->nprobe;
+    neighbors::ivf_pq::search(
+      handle, params, *index->ivf_pq, query_array, n, k, indices, distances);
   } else {
     RAFT_FAIL("The model is not trained");
   }
@@ -202,11 +129,7 @@ void approx_knn_search(const handle_t& handle,
     float p = 0.5;  // standard l2
     if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
     raft::linalg::unaryOp<float>(
-      distances,
-      distances,
-      n * k,
-      [p] __device__(float input) { return powf(input, p); },
-      handle.get_stream());
+      distances, distances, n * k, raft::pow_const_op<float>(p), handle.get_stream());
   }
   if constexpr (std::is_same_v<T, float>) { index->metric_processor->postprocess(distances); }
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index b721915187..395714a161 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,19 @@
 
 #pragma once
 
+#include <raft/core/logger.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/integer_utils.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <memory>
+#include <optional>
 
 namespace raft::spatial::knn::detail::utils {
 
@@ -359,4 +366,202 @@ void copy_selected(IdxT n_rows,
   }
 }
 
+/**
+ * A batch input iterator over the data source.
+ * Given an input pointer, it decides whether the current device has access to the data and
+ * gives it back to the user in batches. Three scenarios are possible:
+ *
+ *  1. if `source == nullptr`: then `batch.data() == nullptr`
+ *  2. if `source` is accessible from the device, `batch.data()` points directly at the source at
+ *     the proper offsets on each iteration.
+ *  3. if `source` is not accessible from the device, `batch.data()` points to an intermediate
+ *     buffer; the corresponding data is copied in the given `stream` on every iterator dereference
+ *     (i.e. batches can be skipped). Dereferencing the same batch two times in a row does not force
+ *     the copy.
+ *
+ * In all three scenarios, the number of iterations, batch offsets and sizes are the same.
+ *
+ * The iterator can be reused. If the number of iterations is one, at most one copy will ever be
+ * invoked (i.e. small datasets are not reloaded multiple times).
+ */
+template <typename T>
+struct batch_load_iterator {
+  using size_type = size_t;
+
+  /** A single batch of data residing in device memory. */
+  struct batch {
+    /** Logical width of a single row in a batch, in elements of type `T`. */
+    [[nodiscard]] auto row_width() const -> size_type { return row_width_; }
+    /** Logical offset of the batch, in rows (`row_width()`) */
+    [[nodiscard]] auto offset() const -> size_type { return pos_.value_or(0) * batch_size_; }
+    /** Logical size of the batch, in rows (`row_width()`) */
+    [[nodiscard]] auto size() const -> size_type { return batch_len_; }
+    /** Device-accessible pointer to the batch data (`nullptr` if the source is `nullptr`). */
+    [[nodiscard]] auto data() const -> const T* { return const_cast<const T*>(dev_ptr_); }
+    /** Whether this batch copies the data (i.e. the source is inaccessible from the device). */
+    [[nodiscard]] auto does_copy() const -> bool { return needs_copy_; }
+
+   private:
+    batch(const T* source,
+          size_type n_rows,
+          size_type row_width,
+          size_type batch_size,
+          rmm::cuda_stream_view stream,
+          rmm::mr::device_memory_resource* mr)
+      : stream_(stream),
+        buf_(0, stream, mr),
+        source_(source),
+        dev_ptr_(nullptr),
+        n_rows_(n_rows),
+        row_width_(row_width),
+        batch_size_(std::min(batch_size, n_rows)),
+        pos_(std::nullopt),
+        n_iters_(raft::div_rounding_up_safe(n_rows, batch_size)),
+        needs_copy_(false)
+    {
+      if (source_ == nullptr) { return; }
+      cudaPointerAttributes attr;
+      RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, source_));
+      dev_ptr_ = reinterpret_cast<T*>(attr.devicePointer);
+      if (dev_ptr_ == nullptr) {
+        buf_.resize(row_width_ * batch_size_, stream);
+        dev_ptr_    = buf_.data();
+        needs_copy_ = true;
+      }
+    }
+    rmm::cuda_stream_view stream_;
+    rmm::device_uvector<T> buf_;
+    const T* source_;
+    size_type n_rows_;
+    size_type row_width_;
+    size_type batch_size_;
+    size_type n_iters_;
+    bool needs_copy_;
+
+    std::optional<size_type> pos_;
+    size_type batch_len_;
+    T* dev_ptr_;
+
+    friend class batch_load_iterator<T>;
+
+    /**
+     * Changes the state of the batch to point at the `pos` index.
+     * If necessary, copies the data from the source in the registered stream.
+     */
+    void load(const size_type& pos)
+    {
+      // No-op if the data is already loaded, or it's the end of the input.
+      if (pos == pos_ || pos >= n_iters_) { return; }
+      pos_.emplace(pos);
+      batch_len_ = std::min(batch_size_, n_rows_ - std::min(offset(), n_rows_));
+      if (source_ == nullptr) { return; }
+      if (needs_copy_) {
+        if (size() > 0) {
+          RAFT_LOG_DEBUG("batch_load_iterator::copy(offset = %zu, size = %zu, row_width = %zu)",
+                         size_t(offset()),
+                         size_t(size()),
+                         size_t(row_width()));
+          copy(dev_ptr_, source_ + offset() * row_width(), size() * row_width(), stream_);
+        }
+      } else {
+        dev_ptr_ = const_cast<T*>(source_) + offset() * row_width();
+      }
+    }
+  };
+
+  using value_type = batch;
+  using reference  = const value_type&;
+  using pointer    = const value_type*;
+
+  /**
+   * Create a batch iterator over the data `source`.
+   *
+   * For convenience, the data `source` is read in logical units of size `row_width`; batch sizes
+   * and offsets are calculated in logical rows. Hence, one can interpret the data as a contiguous
+   * row-major matrix of size [n_rows, row_width], and the batches are the sub-matrices of size
+   * [x<=batch_size, row_width].
+   *
+   * @param source the input data -- host, device, or nullptr.
+   * @param n_rows the size of the input in logical rows.
+   * @param row_width the size of the logical row in the elements of type `T`.
+   * @param batch_size the desired size of the batch.
+   * @param stream the ordering for the host->device copies, if applicable.
+   * @param mr a custom memory resource for the intermediate buffer, if applicable.
+   */
+  batch_load_iterator(const T* source,
+                      size_type n_rows,
+                      size_type row_width,
+                      size_type batch_size,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+    : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr)), cur_pos_(0)
+  {
+  }
+  /**
+   * Whether this iterator copies the data on every iteration
+   * (i.e. the source is inaccessible from the device).
+   */
+  [[nodiscard]] auto does_copy() const -> bool { return cur_batch_->does_copy(); }
+  /** Reset the iterator position to `begin()` */
+  void reset() { cur_pos_ = 0; }
+  /** Reset the iterator position to `end()` */
+  void reset_to_end() { cur_pos_ = cur_batch_->n_iters_; }
+  [[nodiscard]] auto begin() const -> const batch_load_iterator<T>
+  {
+    batch_load_iterator<T> x(*this);
+    x.reset();
+    return x;
+  }
+  [[nodiscard]] auto end() const -> const batch_load_iterator<T>
+  {
+    batch_load_iterator<T> x(*this);
+    x.reset_to_end();
+    return x;
+  }
+  [[nodiscard]] auto operator*() const -> reference
+  {
+    cur_batch_->load(cur_pos_);
+    return *cur_batch_;
+  }
+  [[nodiscard]] auto operator->() const -> pointer
+  {
+    cur_batch_->load(cur_pos_);
+    return cur_batch_.get();
+  }
+  friend auto operator==(const batch_load_iterator<T>& x, const batch_load_iterator<T>& y) -> bool
+  {
+    return x.cur_batch_ == y.cur_batch_ && x.cur_pos_ == y.cur_pos_;
+  };
+  friend auto operator!=(const batch_load_iterator<T>& x, const batch_load_iterator<T>& y) -> bool
+  {
+    return x.cur_batch_ != y.cur_batch_ || x.cur_pos_ != y.cur_pos_;
+  };
+  auto operator++() -> batch_load_iterator<T>&
+  {
+    ++cur_pos_;
+    return *this;
+  }
+  auto operator++(int) -> batch_load_iterator<T>
+  {
+    batch_load_iterator<T> x(*this);
+    ++cur_pos_;
+    return x;
+  }
+  auto operator--() -> batch_load_iterator<T>&
+  {
+    --cur_pos_;
+    return *this;
+  }
+  auto operator--(int) -> batch_load_iterator<T>
+  {
+    batch_load_iterator<T> x(*this);
+    --cur_pos_;
+    return x;
+  }
+
+ private:
+  std::shared_ptr<value_type> cur_batch_;
+  size_type cur_pos_;
+};
+
 }  // namespace raft::spatial::knn::detail::utils
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
index 32a8f0ed33..7b3cf2d8f7 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,11 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include "../ball_cover_types.hpp"
 #include "ball_cover/common.cuh"
 #include "ball_cover/registers.cuh"
-#include "block_select_faiss.cuh"
 #include "haversine_distance.cuh"
 #include "knn_brute_force_faiss.cuh"
 #include "selection_faiss.cuh"
@@ -31,6 +30,8 @@
 
 #include <raft/util/cuda_utils.cuh>
 
+#include <raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh>
+
 #include <raft/matrix/matrix.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/sparse/convert/csr.cuh>
@@ -38,10 +39,6 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/utils/Heap.h>
-
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
@@ -67,7 +64,7 @@ namespace detail {
  * @param index
  */
 template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
-void sample_landmarks(const raft::handle_t& handle,
+void sample_landmarks(raft::device_resources const& handle,
                       BallCoverIndex<value_idx, value_t, value_int>& index)
 {
   rmm::device_uvector<value_idx> R_1nn_cols2(index.n_landmarks, handle.get_stream());
@@ -119,7 +116,7 @@ void sample_landmarks(const raft::handle_t& handle,
  * @param index
  */
 template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
-void construct_landmark_1nn(const raft::handle_t& handle,
+void construct_landmark_1nn(raft::device_resources const& handle,
                             const value_idx* R_knn_inds_ptr,
                             const value_t* R_knn_dists_ptr,
                             value_int k,
@@ -173,7 +170,7 @@ void construct_landmark_1nn(const raft::handle_t& handle,
  * @param R_knn_dists
  */
 template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
-void k_closest_landmarks(const raft::handle_t& handle,
+void k_closest_landmarks(raft::device_resources const& handle,
                          const BallCoverIndex<value_idx, value_t, value_int>& index,
                          const value_t* query_pts,
                          value_int n_query_pts,
@@ -209,7 +206,7 @@ void k_closest_landmarks(const raft::handle_t& handle,
  * @param index
  */
 template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
-void compute_landmark_radii(const raft::handle_t& handle,
+void compute_landmark_radii(raft::device_resources const& handle,
                             BallCoverIndex<value_idx, value_t, value_int>& index)
 {
   auto entries = thrust::make_counting_iterator<value_idx>(0);
@@ -240,7 +237,7 @@ template <typename value_idx,
           typename value_t,
           typename value_int = std::uint32_t,
           typename dist_func>
-void perform_rbc_query(const raft::handle_t& handle,
+void perform_rbc_query(raft::device_resources const& handle,
                        const BallCoverIndex<value_idx, value_t, value_int>& index,
                        const value_t* query,
                        value_int n_query_pts,
@@ -342,7 +339,7 @@ template <typename value_idx = std::int64_t,
           typename value_t,
           typename value_int = std::uint32_t,
           typename distance_func>
-void rbc_build_index(const raft::handle_t& handle,
+void rbc_build_index(raft::device_resources const& handle,
                      BallCoverIndex<value_idx, value_t, value_int>& index,
                      distance_func dfunc)
 {
@@ -401,7 +398,7 @@ template <typename value_idx = std::int64_t,
           typename value_t,
           typename value_int = std::uint32_t,
           typename distance_func>
-void rbc_all_knn_query(const raft::handle_t& handle,
+void rbc_all_knn_query(raft::device_resources const& handle,
                        BallCoverIndex<value_idx, value_t, value_int>& index,
                        value_int k,
                        value_idx* inds,
@@ -470,7 +467,7 @@ template <typename value_idx = std::int64_t,
           typename value_t,
           typename value_int = std::uint32_t,
           typename distance_func>
-void rbc_knn_query(const raft::handle_t& handle,
+void rbc_knn_query(raft::device_resources const& handle,
                    const BallCoverIndex<value_idx, value_t, value_int>& index,
                    value_int k,
                    const value_t* query,
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
index b09cf0da10..0a6718f5a5 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -71,7 +71,7 @@ struct EuclideanFunc : public DistFunc<value_t, value_int> {
       sum_sq += diff * diff;
     }
 
-    return sqrt(sum_sq);
+    return raft::sqrt(sum_sq);
   }
 };
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index 9c5307e683..394d27235b 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include "common.cuh"
 
 #include "../../ball_cover_types.hpp"
-#include "../block_select_faiss.cuh"
+#include "../faiss_select/key_value_block_select.cuh"
 #include "../haversine_distance.cuh"
 #include "../selection_faiss.cuh"
 
@@ -28,10 +28,6 @@
 
 #include <raft/util/cuda_utils.cuh>
 
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/utils/Heap.h>
-
 #include <thrust/fill.h>
 
 namespace raft {
@@ -173,10 +169,10 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
                                               dist_func dfunc,
                                               value_int* dist_counter)
 {
-  static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  static constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
-  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
+  __shared__ KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
 
   const value_t* x_ptr = X + (n_cols * blockIdx.x);
   value_t local_x_ptr[col_q];
@@ -184,21 +180,21 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
     local_x_ptr[j] = x_ptr[j];
   }
 
-  faiss::gpu::KeyValueBlockSelect<value_t,
-                                  value_idx,
-                                  false,
-                                  faiss::gpu::Comparator<value_t>,
-                                  warp_q,
-                                  thread_q,
-                                  tpb>
-    heap(faiss::gpu::Limits<value_t>::getMax(),
-         faiss::gpu::Limits<value_t>::getMax(),
+  faiss_select::KeyValueBlockSelect<value_t,
+                                    value_idx,
+                                    false,
+                                    faiss_select::Comparator<value_t>,
+                                    warp_q,
+                                    thread_q,
+                                    tpb>
+    heap(std::numeric_limits<value_t>::max(),
+         std::numeric_limits<value_t>::max(),
          -1,
          shared_memK,
          shared_memV,
          k);
 
-  const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize);
+  const value_int n_k = Pow2<WarpSize>::roundDown(k);
   value_int i         = threadIdx.x;
   for (; i < n_k; i += tpb) {
     value_idx ind = knn_inds[blockIdx.x * k + i];
@@ -225,7 +221,7 @@ __global__ void compute_final_dists_registers(const value_t* X_index,
       // Round R_size to the nearest warp threads so they can
       // all be computing in parallel.
 
-      const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+      const value_int limit = Pow2<WarpSize>::roundDown(R_size);
 
       i = threadIdx.x;
       for (; i < limit; i += tpb) {
@@ -335,10 +331,10 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
                                            distance_func dfunc,
                                            float weight = 1.0)
 {
-  static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  static constexpr value_int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
-  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
+  __shared__ KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
 
   // TODO: Separate kernels for different widths:
   // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x"
@@ -353,15 +349,15 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
   }
 
   // Each warp works on 1 R
-  faiss::gpu::KeyValueBlockSelect<value_t,
-                                  value_idx,
-                                  false,
-                                  faiss::gpu::Comparator<value_t>,
-                                  warp_q,
-                                  thread_q,
-                                  tpb>
-    heap(faiss::gpu::Limits<value_t>::getMax(),
-         faiss::gpu::Limits<value_t>::getMax(),
+  faiss_select::KeyValueBlockSelect<value_t,
+                                    value_idx,
+                                    false,
+                                    faiss_select::Comparator<value_t>,
+                                    warp_q,
+                                    thread_q,
+                                    tpb>
+    heap(std::numeric_limits<value_t>::max(),
+         std::numeric_limits<value_t>::max(),
          -1,
          shared_memK,
          shared_memV,
@@ -391,7 +387,7 @@ __global__ void block_rbc_kernel_registers(const value_t* X_index,
 
     value_idx R_size = R_stop_offset - R_start_offset;
 
-    value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+    value_int limit = Pow2<WarpSize>::roundDown(R_size);
     value_int i     = threadIdx.x;
     for (; i < limit; i += tpb) {
       // Index and distance of current candidate's nearest landmark
@@ -471,7 +467,7 @@ template <typename value_idx,
           typename value_int = std::uint32_t,
           int dims           = 2,
           typename dist_func>
-void rbc_low_dim_pass_one(const raft::handle_t& handle,
+void rbc_low_dim_pass_one(raft::device_resources const& handle,
                           const BallCoverIndex<value_idx, value_t, value_int>& index,
                           const value_t* query,
                           const value_int n_query_rows,
@@ -603,7 +599,7 @@ template <typename value_idx,
           typename value_int = std::uint32_t,
           int dims           = 2,
           typename dist_func>
-void rbc_low_dim_pass_two(const raft::handle_t& handle,
+void rbc_low_dim_pass_two(raft::device_resources const& handle,
                           const BallCoverIndex<value_idx, value_t, value_int>& index,
                           const value_t* query,
                           const value_int n_query_rows,
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index b098d0991d..57076350f0 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -19,7 +19,7 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include <faiss/gpu/GpuDistance.h>
+#include <faiss/MetricType.h>
 #include <raft/distance/distance_types.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
index 19862d743d..e4843acee9 100644
--- a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh
new file mode 100644
index 0000000000..173c06af30
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/Comparators.cuh
@@ -0,0 +1,29 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+namespace raft::spatial::knn::detail::faiss_select {
+
+template <typename T>
+struct Comparator {
+  __device__ static inline bool lt(T a, T b) { return a < b; }
+
+  __device__ static inline bool gt(T a, T b) { return a > b; }
+};
+
+template <>
+struct Comparator<half> {
+  __device__ static inline bool lt(half a, half b) { return __hlt(a, b); }
+
+  __device__ static inline bool gt(half a, half b) { return __hgt(a, b); }
+};
+
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh
new file mode 100644
index 0000000000..d923b41ded
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh
@@ -0,0 +1,277 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh>
+#include <raft/spatial/knn/detail/faiss_select/StaticUtils.h>
+
+namespace raft::spatial::knn::detail::faiss_select {
+
+// Merge pairs of lists of size L <= blockDim.x (NumThreads)
+template <int NumThreads,
+          typename K,
+          typename V,
+          int N,
+          int L,
+          bool AllThreads,
+          bool Dir,
+          typename Comp,
+          bool FullMerge>  // FullMerge is not referenced in this function's body
+inline __device__ void blockMergeSmall(K* listK, V* listV)
+{
+  static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
+  static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2");
+  static_assert(L <= NumThreads, "merge list size must be <= NumThreads");
+
+  // Which pair of lists this thread works on (L consecutive threads per pair)
+  int mergeId = threadIdx.x / L;
+
+  // This thread's index within its group of L threads
+  int tid = threadIdx.x % L;
+
+  // listK points to a region of size N * 2 * L; step to this pair's 2 * L entries
+  listK += 2 * L * mergeId;
+  listV += 2 * L * mergeId;
+
+  // It's not a bitonic merge, both lists are in the same direction,
+  // so handle the first swap assuming the second list is reversed
+  int pos    = L - 1 - tid;
+  int stride = 2 * tid + 1;
+
+  if (AllThreads || (threadIdx.x < N * L)) {  // if !AllThreads, only the first N * L threads work
+    K ka = listK[pos];
+    K kb = listK[pos + stride];
+
+    bool swap           = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+    listK[pos]          = swap ? kb : ka;
+    listK[pos + stride] = swap ? ka : kb;
+
+    V va                = listV[pos];
+    V vb                = listV[pos + stride];
+    listV[pos]          = swap ? vb : va;
+    listV[pos + stride] = swap ? va : vb;
+
+    // FIXME: is this a CUDA 9 compiler bug? Reference-based variant left disabled:
+    // K& ka = listK[pos];
+    // K& kb = listK[pos + stride];
+
+    // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+    // swap(s, ka, kb);
+
+    // V& va = listV[pos];
+    // V& vb = listV[pos + stride];
+    // swap(s, va, vb);
+  }
+
+  __syncthreads();  // reached by every thread, including those that skipped the branch above
+
+#pragma unroll
+  for (int stride = L / 2; stride > 0; stride /= 2) {  // halve the compare distance each pass
+    int pos = 2 * tid - (tid & (stride - 1));
+
+    if (AllThreads || (threadIdx.x < N * L)) {
+      K ka = listK[pos];
+      K kb = listK[pos + stride];
+
+      bool swap           = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      listK[pos]          = swap ? kb : ka;
+      listK[pos + stride] = swap ? ka : kb;
+
+      V va                = listV[pos];
+      V vb                = listV[pos + stride];
+      listV[pos]          = swap ? vb : va;
+      listV[pos + stride] = swap ? va : vb;
+
+      // FIXME: is this a CUDA 9 compiler bug? Reference-based variant left disabled:
+      // K& ka = listK[pos];
+      // K& kb = listK[pos + stride];
+
+      // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      // swap(s, ka, kb);
+
+      // V& va = listV[pos];
+      // V& vb = listV[pos + stride];
+      // swap(s, va, vb);
+    }
+
+    __syncthreads();  // make this pass's writes visible before the next stride reads them
+  }
+}
+
+// Merge pairs of sorted lists of size L >= blockDim.x (NumThreads)
+template <int NumThreads, typename K, typename V, int L, bool Dir, typename Comp, bool FullMerge>
+inline __device__ void blockMergeLarge(K* listK, V* listV)
+{
+  static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
+  static_assert(L >= WarpSize, "merge list size must be >= 32");
+  static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2");
+  static_assert(L >= NumThreads, "merge list size must be >= NumThreads");
+
+  // For L > NumThreads, each thread has to perform more work
+  // per stride: L / NumThreads compare-exchanges in the first pass.
+  constexpr int kLoopPerThread = L / NumThreads;
+
+  // It's not a bitonic merge, both lists are in the same direction,
+  // so handle the first swap assuming the second list is reversed
+#pragma unroll
+  for (int loop = 0; loop < kLoopPerThread; ++loop) {
+    int tid    = loop * NumThreads + threadIdx.x;  // virtual thread index in [0, L)
+    int pos    = L - 1 - tid;
+    int stride = 2 * tid + 1;
+
+    K ka = listK[pos];
+    K kb = listK[pos + stride];
+
+    bool swap           = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+    listK[pos]          = swap ? kb : ka;
+    listK[pos + stride] = swap ? ka : kb;
+
+    V va                = listV[pos];
+    V vb                = listV[pos + stride];
+    listV[pos]          = swap ? vb : va;
+    listV[pos + stride] = swap ? va : vb;
+
+    // FIXME: is this a CUDA 9 compiler bug? Reference-based variant left disabled:
+    // K& ka = listK[pos];
+    // K& kb = listK[pos + stride];
+
+    // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+    // swap(s, ka, kb);
+
+    // V& va = listV[pos];
+    // V& vb = listV[pos + stride];
+    // swap(s, va, vb);
+  }
+
+  __syncthreads();  // first-pass writes must land before the strided passes read them
+
+  constexpr int kSecondLoopPerThread = FullMerge ? kLoopPerThread : kLoopPerThread / 2;
+
+#pragma unroll
+  for (int stride = L / 2; stride > 0; stride /= 2) {  // halve the compare distance each pass
+#pragma unroll
+    for (int loop = 0; loop < kSecondLoopPerThread; ++loop) {
+      int tid = loop * NumThreads + threadIdx.x;
+      int pos = 2 * tid - (tid & (stride - 1));
+
+      K ka = listK[pos];
+      K kb = listK[pos + stride];
+
+      bool swap           = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      listK[pos]          = swap ? kb : ka;
+      listK[pos + stride] = swap ? ka : kb;
+
+      V va                = listV[pos];
+      V vb                = listV[pos + stride];
+      listV[pos]          = swap ? vb : va;
+      listV[pos + stride] = swap ? va : vb;
+
+      // FIXME: is this a CUDA 9 compiler bug? Reference-based variant left disabled:
+      // K& ka = listK[pos];
+      // K& kb = listK[pos + stride];
+
+      // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      // swap(s, ka, kb);
+
+      // V& va = listV[pos];
+      // V& vb = listV[pos + stride];
+      // swap(s, va, vb);
+    }
+
+    __syncthreads();  // one barrier per stride, executed by every thread
+  }
+}
+
+/// Class template to prevent static_assert from firing for
+/// mixing smaller/larger than block cases
+template <int NumThreads,
+          typename K,
+          typename V,
+          int N,
+          int L,
+          bool Dir,
+          typename Comp,
+          bool SmallerThanBlock,
+          bool FullMerge>
+struct BlockMerge {  // primary template has no members; merge() lives in the specializations
+};
+
+/// Merging lists smaller than a block
+template <int NumThreads,
+          typename K,
+          typename V,
+          int N,
+          int L,
+          bool Dir,
+          typename Comp,
+          bool FullMerge>
+struct BlockMerge<NumThreads, K, V, N, L, Dir, Comp, true, FullMerge> {
+  static inline __device__ void merge(K* listK, V* listV)
+  {
+    constexpr int kNumParallelMerges = NumThreads / L;  // pairs the block can merge per pass
+    constexpr int kNumIterations     = N / kNumParallelMerges;
+
+    static_assert(L <= NumThreads, "list must be <= NumThreads");
+    static_assert((N < kNumParallelMerges) || (kNumIterations * kNumParallelMerges == N),
+                  "improper selection of N and L");
+
+    if constexpr (N < kNumParallelMerges) {
+      // Resolved at compile time: we only need L threads per each list to perform the merge
+      blockMergeSmall<NumThreads, K, V, N, L, false, Dir, Comp, FullMerge>(listK, listV);
+    } else {
+      // All threads participate; each pass covers kNumParallelMerges pairs of lists
+#pragma unroll
+      for (int i = 0; i < kNumIterations; ++i) {
+        int start = i * kNumParallelMerges * 2 * L;
+
+        blockMergeSmall<NumThreads, K, V, N, L, true, Dir, Comp, FullMerge>(listK + start,
+                                                                            listV + start);
+      }
+    }
+  }
+};
+
+/// Merging lists larger than a block
+template <int NumThreads,
+          typename K,
+          typename V,
+          int N,
+          int L,
+          bool Dir,
+          typename Comp,
+          bool FullMerge>
+struct BlockMerge<NumThreads, K, V, N, L, Dir, Comp, false, FullMerge> {
+  static inline __device__ void merge(K* listK, V* listV)
+  {
+    // The N pairs of lists are merged one after another
+#pragma unroll
+    for (int i = 0; i < N; ++i) {
+      int start = i * 2 * L;  // offset of the i-th pair (2 * L elements per pair)
+
+      blockMergeLarge<NumThreads, K, V, L, Dir, Comp, FullMerge>(listK + start, listV + start);
+    }
+  }
+};
+
+template <int NumThreads,
+          typename K,
+          typename V,
+          int N,
+          int L,
+          bool Dir,
+          typename Comp,
+          bool FullMerge = true>
+inline __device__ void blockMerge(K* listK, V* listV)
+{
+  constexpr bool kSmallerThanBlock = (L <= NumThreads);  // picks the BlockMerge specialization
+
+  BlockMerge<NumThreads, K, V, N, L, Dir, Comp, kSmallerThanBlock, FullMerge>::merge(listK, listV);
+}
+
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh
new file mode 100644
index 0000000000..2cb01f9199
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh
@@ -0,0 +1,25 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+namespace raft::spatial::knn::detail::faiss_select {
+
+template <typename T>
+inline __device__ void swap(bool swap, T& x, T& y)
+{
+  T tmp = x;             // keep the old x for the assignment to y
+  x     = swap ? y : x;  // both assignments always run; the flag only selects the value
+  y     = swap ? tmp : y;
+}
+
+template <typename T>
+inline __device__ void assign(bool assign, T& x, T y)
+{
+  x = assign ? y : x;  // x takes y when the flag is set, otherwise it is rewritten with itself
+}
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh
new file mode 100644
index 0000000000..bce739b2d8
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh
@@ -0,0 +1,521 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh>
+#include <raft/spatial/knn/detail/faiss_select/StaticUtils.h>
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::spatial::knn::detail::faiss_select {
+
+//
+// This file contains functions to:
+//
+// -perform bitonic merges on pairs of sorted lists, held in
+// registers. Each list contains N * WarpSize (multiple of 32)
+// elements for some N.
+// The bitonic merge is implemented for arbitrary sizes;
+// sorted list A of size N1 * WarpSize registers
+// sorted list B of size N2 * WarpSize registers =>
+// sorted list C of size (N1 + N2) * WarpSize registers. N1 and N2
+// are >= 1 and don't have to be powers of 2.
+//
+// -perform bitonic sorts on a set of N * WarpSize key/value pairs
+// held in registers, by using the above bitonic merge as a
+// primitive.
+// N can be an arbitrary N >= 1; i.e., the bitonic sort here supports
+// odd sizes and doesn't require the input to be a power of 2.
+//
+// The sort or merge network is completely statically instantiated via
+// template specialization / expansion and constexpr, and it uses warp
+// shuffles to exchange values between warp lanes.
+//
+// A note about comparisons:
+//
+// For a sorting network of keys only, we only need one
+// comparison (a < b). However, what we really need to know is
+// if one lane chooses to exchange a value, then the
+// corresponding lane should also do the exchange.
+// Thus, if one just uses the negation !(x < y) in the higher
+// lane, this will also include the case where (x == y). Thus, one
+// lane in fact performs an exchange and the other doesn't, but
+// because the only value being exchanged is equivalent, nothing has
+// changed.
+// So, you can get away with just one comparison and its negation.
+//
+// If we're sorting keys and values, where equivalent keys can
+// exist, then this is a problem, since we want to treat (x, v1)
+// as not equivalent to (x, v2).
+//
+// To remedy this, you can either compare with a lexicographic
+// ordering (a.k < b.k || (a.k == b.k && a.v < b.v)), which since
+// we're predicating all of the choices results in 3 comparisons
+// being executed, or we can invert the selection so that there is no
+// middle choice of equality; the other lane will likewise
+// check that (b.k > a.k) (the higher lane has the values
+// swapped). Then, the first lane swaps if and only if the
+// second lane swaps; if both lanes have equivalent keys, no
+// swap will be performed. This results in only two comparisons
+// being executed.
+//
+// If you don't consider values as well, then this does not produce a
+// consistent ordering among (k, v) pairs with equivalent keys but
+// different values; for us, we don't really care about ordering or
+// stability here.
+//
+// I have tried both re-arranging the order in the higher lane to get
+// away with one comparison or adding the value to the check; both
+// result in greater register consumption or lower speed than just
+// performing both < and > comparisons with the variables, so I just
+// stick with this.
+
+// This function merges WarpSize / (2 * L) lists in parallel using warp
+// shuffles.
+// It works on at most size-16 lists, as we need 32 threads for this
+// shuffle merge.
+//
+// If IsBitonic is false, the first stage is reversed, so we don't
+// need to sort directionally. It's still technically a bitonic sort.
+template <typename K, typename V, int L, bool Dir, typename Comp, bool IsBitonic>
+inline __device__ void warpBitonicMergeLE16(K& k, V& v)
+{
+  static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
+  static_assert(L <= WarpSize / 2, "merge list size must be <= 16");
+
+  int laneId = raft::laneId();
+
+  if (!IsBitonic) {
+    // Reverse the first comparison stage.
+    // For example, merging a list of size 8 has the exchanges:
+    // 0 <-> 15, 1 <-> 14, ...
+    K otherK = shfl_xor(k, 2 * L - 1);
+    V otherV = shfl_xor(v, 2 * L - 1);
+
+    // Whether we are the lower lane of the exchange (bit L of laneId is clear)
+    bool small = !(laneId & L);
+
+    if (Dir) {
+      // See the comment above how performing both of these
+      // comparisons in the warp seems to win out over the
+      // alternatives in practice
+      bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK);
+      assign(s, k, otherK);
+      assign(s, v, otherV);
+
+    } else {
+      bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK);
+      assign(s, k, otherK);
+      assign(s, v, otherV);
+    }
+  }
+
+#pragma unroll
+  for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) {
+    K otherK = shfl_xor(k, stride);  // partner lane is laneId ^ stride
+    V otherV = shfl_xor(v, stride);
+
+    // Whether we are the lower lane of the exchange (stride bit of laneId is clear)
+    bool small = !(laneId & stride);
+
+    if (Dir) {
+      bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK);
+      assign(s, k, otherK);
+      assign(s, v, otherV);
+
+    } else {
+      bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK);
+      assign(s, k, otherK);
+      assign(s, v, otherV);
+    }
+  }
+}
+
+// Template for performing a bitonic merge of an arbitrary set of
+// registers
+template <typename K, typename V, int N, bool Dir, typename Comp, bool Low, bool Pow2>
+struct BitonicMergeStep {  // empty primary template; merge() is provided by the specializations
+};
+
+//
+// Power-of-2 merge specialization
+//
+
+// All merges eventually call this
+template <typename K, typename V, bool Dir, typename Comp, bool Low>
+struct BitonicMergeStep<K, V, 1, Dir, Comp, Low, true> {
+  static inline __device__ void merge(K k[1], V v[1])
+  {
+    // Use warp shuffles: bitonic merge with L = 16, i.e. strides 16, 8, 4, 2, 1
+    warpBitonicMergeLE16<K, V, 16, Dir, Comp, true>(k[0], v[0]);
+  }
+};
+
+template <typename K, typename V, int N, bool Dir, typename Comp, bool Low>
+struct BitonicMergeStep<K, V, N, Dir, Comp, Low, true> {
+  static inline __device__ void merge(K k[N], V v[N])
+  {
+    static_assert(utils::isPowerOf2(N), "must be power of 2");
+    static_assert(N > 1, "must be N > 1");
+
+#pragma unroll
+    for (int i = 0; i < N / 2; ++i) {  // compare-exchange element i with element i + N / 2
+      K& ka = k[i];
+      V& va = v[i];
+
+      K& kb = k[i + N / 2];
+      V& vb = v[i + N / 2];
+
+      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      swap(s, ka, kb);
+      swap(s, va, vb);
+    }
+
+    {  // recurse on the lower N / 2 elements through a local copy
+      K newK[N / 2];
+      V newV[N / 2];
+
+#pragma unroll
+      for (int i = 0; i < N / 2; ++i) {
+        newK[i] = k[i];
+        newV[i] = v[i];
+      }
+
+      BitonicMergeStep<K, V, N / 2, Dir, Comp, true, true>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < N / 2; ++i) {  // copy the merged half back
+        k[i] = newK[i];
+        v[i] = newV[i];
+      }
+    }
+
+    {  // recurse on the upper N / 2 elements through a local copy
+      K newK[N / 2];
+      V newV[N / 2];
+
+#pragma unroll
+      for (int i = 0; i < N / 2; ++i) {
+        newK[i] = k[i + N / 2];
+        newV[i] = v[i + N / 2];
+      }
+
+      BitonicMergeStep<K, V, N / 2, Dir, Comp, false, true>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < N / 2; ++i) {  // copy the merged half back
+        k[i + N / 2] = newK[i];
+        v[i + N / 2] = newV[i];
+      }
+    }
+  }
+};
+
+//
+// Non-power-of-2 merge specialization
+//
+
+// Low recursion
+template <typename K, typename V, int N, bool Dir, typename Comp>
+struct BitonicMergeStep<K, V, N, Dir, Comp, true, false> {
+  static inline __device__ void merge(K k[N], V v[N])
+  {
+    static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
+    static_assert(N >= 3, "must be N >= 3");
+
+    constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N);
+
+#pragma unroll
+    for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
+      K& ka = k[i];
+      V& va = v[i];
+
+      K& kb = k[i + kNextHighestPowerOf2 / 2];  // partner sits kNextHighestPowerOf2 / 2 away
+      V& vb = v[i + kNextHighestPowerOf2 / 2];
+
+      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      swap(s, ka, kb);
+      swap(s, va, vb);
+    }
+
+    constexpr int kLowSize  = N - kNextHighestPowerOf2 / 2;  // low part takes the remainder
+    constexpr int kHighSize = kNextHighestPowerOf2 / 2;
+    {  // recurse on the first kLowSize elements through a local copy
+      K newK[kLowSize];
+      V newV[kLowSize];
+
+#pragma unroll
+      for (int i = 0; i < kLowSize; ++i) {
+        newK[i] = k[i];
+        newV[i] = v[i];
+      }
+
+      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
+      // FIXME: compiler doesn't like this expression? compiler bug?
+      //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
+      BitonicMergeStep<K,
+                       V,
+                       kLowSize,
+                       Dir,
+                       Comp,
+                       true,  // low
+                       kLowIsPowerOf2>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < kLowSize; ++i) {
+        k[i] = newK[i];
+        v[i] = newV[i];
+      }
+    }
+
+    {  // recurse on the remaining kHighSize elements through a local copy
+      K newK[kHighSize];
+      V newV[kHighSize];
+
+#pragma unroll
+      for (int i = 0; i < kHighSize; ++i) {
+        newK[i] = k[i + kLowSize];
+        newV[i] = v[i + kLowSize];
+      }
+
+      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
+      // FIXME: compiler doesn't like this expression? compiler bug?
+      //      constexpr bool kHighIsPowerOf2 =
+      //      utils::isPowerOf2(kHighSize);
+      BitonicMergeStep<K,
+                       V,
+                       kHighSize,
+                       Dir,
+                       Comp,
+                       false,  // high
+                       kHighIsPowerOf2>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < kHighSize; ++i) {
+        k[i + kLowSize] = newK[i];
+        v[i + kLowSize] = newV[i];
+      }
+    }
+  }
+};
+
+// High recursion
+template <typename K, typename V, int N, bool Dir, typename Comp>
+struct BitonicMergeStep<K, V, N, Dir, Comp, false, false> {
+  static inline __device__ void merge(K k[N], V v[N])
+  {
+    static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
+    static_assert(N >= 3, "must be N >= 3");
+
+    constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N);
+
+#pragma unroll
+    for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
+      K& ka = k[i];
+      V& va = v[i];
+
+      K& kb = k[i + kNextHighestPowerOf2 / 2];  // partner sits kNextHighestPowerOf2 / 2 away
+      V& vb = v[i + kNextHighestPowerOf2 / 2];
+
+      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
+      swap(s, ka, kb);
+      swap(s, va, vb);
+    }
+
+    constexpr int kLowSize  = kNextHighestPowerOf2 / 2;
+    constexpr int kHighSize = N - kNextHighestPowerOf2 / 2;  // high part takes the remainder
+    {  // recurse on the first kLowSize elements through a local copy
+      K newK[kLowSize];
+      V newV[kLowSize];
+
+#pragma unroll
+      for (int i = 0; i < kLowSize; ++i) {
+        newK[i] = k[i];
+        newV[i] = v[i];
+      }
+
+      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
+      // FIXME: compiler doesn't like this expression? compiler bug?
+      //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
+      BitonicMergeStep<K,
+                       V,
+                       kLowSize,
+                       Dir,
+                       Comp,
+                       true,  // low
+                       kLowIsPowerOf2>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < kLowSize; ++i) {
+        k[i] = newK[i];
+        v[i] = newV[i];
+      }
+    }
+
+    {  // recurse on the remaining kHighSize elements through a local copy
+      K newK[kHighSize];
+      V newV[kHighSize];
+
+#pragma unroll
+      for (int i = 0; i < kHighSize; ++i) {
+        newK[i] = k[i + kLowSize];
+        newV[i] = v[i + kLowSize];
+      }
+
+      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
+      // FIXME: compiler doesn't like this expression? compiler bug?
+      //      constexpr bool kHighIsPowerOf2 =
+      //      utils::isPowerOf2(kHighSize);
+      BitonicMergeStep<K,
+                       V,
+                       kHighSize,
+                       Dir,
+                       Comp,
+                       false,  // high
+                       kHighIsPowerOf2>::merge(newK, newV);
+
+#pragma unroll
+      for (int i = 0; i < kHighSize; ++i) {
+        k[i + kLowSize] = newK[i];
+        v[i + kLowSize] = newV[i];
+      }
+    }
+  }
+};
+
+/// Merges two sets of registers across the warp of any size;
+/// i.e., merges a sorted k/v list of size WarpSize * N1 with a
+/// sorted k/v list of size WarpSize * N2, where N1 and N2 are any
+/// value >= 1
+template <typename K, typename V, int N1, int N2, bool Dir, typename Comp, bool FullMerge = true>
+inline __device__ void warpMergeAnyRegisters(K k1[N1], V v1[N1], K k2[N2], V v2[N2])
+{
+  constexpr int kSmallestN = N1 < N2 ? N1 : N2;
+
+#pragma unroll
+  for (int i = 0; i < kSmallestN; ++i) {
+    K& ka = k1[N1 - 1 - i];  // walk the first list from its last element backwards
+    V& va = v1[N1 - 1 - i];
+
+    K& kb = k2[i];  // walk the second list from its first element forwards
+    V& vb = v2[i];
+
+    K otherKa;  // only assigned and read when FullMerge is true
+    V otherVa;
+
+    if (FullMerge) {
+      // We need the other values
+      otherKa = shfl_xor(ka, WarpSize - 1);
+      otherVa = shfl_xor(va, WarpSize - 1);
+    }
+
+    K otherKb = shfl_xor(kb, WarpSize - 1);
+    V otherVb = shfl_xor(vb, WarpSize - 1);
+
+    // ka is always first in the list, so we needn't use our lane
+    // in this comparison
+    bool swapa = Dir ? Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb);
+    assign(swapa, ka, otherKb);
+    assign(swapa, va, otherVb);
+
+    // kb is always second in the list, so we needn't use our lane
+    // in this comparison
+    if (FullMerge) {
+      bool swapb = Dir ? Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa);
+      assign(swapb, kb, otherKa);
+      assign(swapb, vb, otherVa);
+
+    } else {
+      // We don't care about updating elements in the second list
+    }
+  }
+
+  BitonicMergeStep<K, V, N1, Dir, Comp, true, utils::isPowerOf2(N1)>::merge(k1, v1);
+  if (FullMerge) {
+    // Only if we care about N2 do we need to bother merging it fully
+    BitonicMergeStep<K, V, N2, Dir, Comp, false, utils::isPowerOf2(N2)>::merge(k2, v2);
+  }
+}
+
+// Recursive template that uses the above bitonic merge to perform a
+// bitonic sort
+template <typename K, typename V, int N, bool Dir, typename Comp>
+struct BitonicSortStep {
+  static inline __device__ void sort(K k[N], V v[N])
+  {
+    static_assert(N > 1, "did not hit specialized case");
+
+    // Sort recursively: split into halves of size N / 2 and N - N / 2
+    constexpr int kSizeA = N / 2;
+    constexpr int kSizeB = N - kSizeA;
+
+    K aK[kSizeA];
+    V aV[kSizeA];
+
+#pragma unroll
+    for (int i = 0; i < kSizeA; ++i) {  // copy the first half into local arrays
+      aK[i] = k[i];
+      aV[i] = v[i];
+    }
+
+    BitonicSortStep<K, V, kSizeA, Dir, Comp>::sort(aK, aV);
+
+    K bK[kSizeB];
+    V bV[kSizeB];
+
+#pragma unroll
+    for (int i = 0; i < kSizeB; ++i) {  // copy the second half into local arrays
+      bK[i] = k[i + kSizeA];
+      bV[i] = v[i + kSizeA];
+    }
+
+    BitonicSortStep<K, V, kSizeB, Dir, Comp>::sort(bK, bV);
+
+    // Merge halves (FullMerge keeps its default of true)
+    warpMergeAnyRegisters<K, V, kSizeA, kSizeB, Dir, Comp>(aK, aV, bK, bV);
+
+#pragma unroll
+    for (int i = 0; i < kSizeA; ++i) {  // write the merged first half back
+      k[i] = aK[i];
+      v[i] = aV[i];
+    }
+
+#pragma unroll
+    for (int i = 0; i < kSizeB; ++i) {  // write the merged second half back
+      k[i + kSizeA] = bK[i];
+      v[i + kSizeA] = bV[i];
+    }
+  }
+};
+
+// Single warp (N == 1) sorting specialization
+template <typename K, typename V, bool Dir, typename Comp>
+struct BitonicSortStep<K, V, 1, Dir, Comp> {
+  static inline __device__ void sort(K k[1], V v[1])
+  {
+    // Update this code if this changes
+    // should go from 1 -> WarpSize in multiples of 2
+    static_assert(WarpSize == 32, "unexpected warp size");
+
+    warpBitonicMergeLE16<K, V, 1, Dir, Comp, false>(k[0], v[0]);  // L doubles on each call
+    warpBitonicMergeLE16<K, V, 2, Dir, Comp, false>(k[0], v[0]);
+    warpBitonicMergeLE16<K, V, 4, Dir, Comp, false>(k[0], v[0]);
+    warpBitonicMergeLE16<K, V, 8, Dir, Comp, false>(k[0], v[0]);
+    warpBitonicMergeLE16<K, V, 16, Dir, Comp, false>(k[0], v[0]);
+  }
+};
+
+/// Sort a list of WarpSize * N elements in registers, where N is an
+/// arbitrary >= 1
+template <typename K, typename V, int N, bool Dir, typename Comp>
+inline __device__ void warpSortAnyRegisters(K k[N], V v[N])
+{
+  BitonicSortStep<K, V, N, Dir, Comp>::sort(k, v);  // thin entry point over BitonicSortStep
+}
+
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh
new file mode 100644
index 0000000000..e4faff7a6c
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/Select.cuh
@@ -0,0 +1,555 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/faiss_select/Comparators.cuh>
+#include <raft/spatial/knn/detail/faiss_select/MergeNetworkBlock.cuh>
+#include <raft/spatial/knn/detail/faiss_select/MergeNetworkWarp.cuh>
+
+#include <raft/core/kvp.hpp>
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::spatial::knn::detail::faiss_select {
+
+// Primary template for the final block-wide merge; deliberately empty. Only the NumWarps
+// specializations (1, 2, 4, 8) define merge(), standing in for a constexpr loop expansion
+template <int NumWarps,
+          int NumThreads,
+          typename K,
+          typename V,
+          int NumWarpQ,
+          bool Dir,
+          typename Comp>
+struct FinalBlockMerge {
+};
+
+template <int NumThreads, typename K, typename V, int NumWarpQ, bool Dir, typename Comp>
+struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> {  // NumWarps == 1
+  static inline __device__ void merge(K* sharedK, V* sharedV)
+  {
+    // Intentionally empty: with a single warp there is no other queue to merge with
+  }
+};
+
+template <int NumThreads, typename K, typename V, int NumWarpQ, bool Dir, typename Comp>
+struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> {  // NumWarps == 2: one pass
+  static inline __device__ void merge(K* sharedK, V* sharedV)
+  {
+    // Only one pass; being the final merge, it doesn't need to fully merge the second list
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 2), NumWarpQ, !Dir, Comp, false>(sharedK,
+                                                                                           sharedV);
+  }
+};
+
+template <int NumThreads, typename K, typename V, int NumWarpQ, bool Dir, typename Comp>
+struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> {  // NumWarps == 4: two passes
+  static inline __device__ void merge(K* sharedK, V* sharedV)
+  {
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 2), NumWarpQ, !Dir, Comp>(sharedK,
+                                                                                    sharedV);
+    // Second pass is the final merge, which doesn't need to fully merge the second list
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 4), NumWarpQ * 2, !Dir, Comp, false>(
+      sharedK, sharedV);
+  }
+};
+
+template <int NumThreads, typename K, typename V, int NumWarpQ, bool Dir, typename Comp>
+struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> {  // NumWarps == 8: three passes
+  static inline __device__ void merge(K* sharedK, V* sharedV)
+  {
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 2), NumWarpQ, !Dir, Comp>(sharedK,
+                                                                                    sharedV);
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 4), NumWarpQ * 2, !Dir, Comp>(sharedK,
+                                                                                        sharedV);
+    // Third pass is the final merge, which doesn't need to fully merge the second list
+    blockMerge<NumThreads, K, V, NumThreads / (WarpSize * 8), NumWarpQ * 4, !Dir, Comp, false>(
+      sharedK, sharedV);
+  }
+};
+
+// Block-wide k-selection: per-thread queues are merged into per-warp queues kept in smemK/smemV.
+// `Dir` true, produce largest values; `Dir` false, produce smallest values.
+template <typename K,
+          typename V,
+          bool Dir,
+          typename Comp,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
+struct BlockSelect {
+  static constexpr int kNumWarps          = ThreadsPerBlock / WarpSize;
+  static constexpr int kTotalWarpSortSize = NumWarpQ;  // smem stride between warp queues
+  /// smemK/smemV hold one NumWarpQ-slot queue per warp, back to back; presumably k <= NumWarpQ.
+  __device__ inline BlockSelect(K initKVal, V initVVal, K* smemK, V* smemV, int k)
+    : initK(initKVal),
+      initV(initVVal),
+      numVals(0),
+      warpKTop(initKVal),
+      sharedK(smemK),
+      sharedV(smemV),
+      kMinus1(k - 1)
+  {
+    static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2");
+    static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2");
+
+    // Fill the per-thread queue keys and values with the defaults
+#pragma unroll
+    for (int i = 0; i < NumThreadQ; ++i) {
+      threadK[i] = initK;
+      threadV[i] = initV;
+    }
+
+    int laneId = raft::laneId();
+    int warpId = threadIdx.x / WarpSize;
+    warpK      = sharedK + warpId * kTotalWarpSortSize;
+    warpV      = sharedV + warpId * kTotalWarpSortSize;
+
+    // Fill this warp's queue with the defaults, lanes striding by WarpSize (only the actual
+    // queue space needs filling, not where we write the per-thread queues for merging)
+    for (int i = laneId; i < NumWarpQ; i += WarpSize) {
+      warpK[i] = initK;
+      warpV[i] = initV;
+    }
+
+    warpFence();
+  }
+  /// Pushes (k, v) onto the front of this thread's queue if k beats warpKTop; otherwise a no-op.
+  __device__ inline void addThreadQ(K k, V v)
+  {
+    if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) {
+      // Shift every entry one slot right (the last one is dropped), freeing slot 0
+#pragma unroll
+      for (int i = NumThreadQ - 1; i > 0; --i) {
+        threadK[i] = threadK[i - 1];
+        threadV[i] = threadV[i - 1];
+      }
+
+      threadK[0] = k;
+      threadV[0] = v;
+      ++numVals;
+    }
+  }
+  /// Warp-collective: if any lane's thread queue is full, merge into the warp queue and reset.
+  __device__ inline void checkThreadQ()
+  {
+    bool needSort = (numVals == NumThreadQ);
+
+#if CUDA_VERSION >= 9000
+    needSort = __any_sync(0xffffffff, needSort);
+#else
+    needSort = __any(needSort);
+#endif
+
+    if (!needSort) {
+      // no lanes have triggered a sort
+      return;
+    }
+
+    // This has a trailing warpFence
+    mergeWarpQ();
+
+    // Any top-k elements have been merged into the warp queue; we're
+    // free to reset the thread queues
+    numVals = 0;
+
+#pragma unroll
+    for (int i = 0; i < NumThreadQ; ++i) {
+      threadK[i] = initK;
+      threadV[i] = initV;
+    }
+
+    // New candidates must now beat the entry at index k - 1 of the warp queue
+    warpKTop = warpK[kMinus1];
+
+    warpFence();
+  }
+
+  /// This function handles sorting and merging together the
+  /// per-thread queues with the warp-wide queue, creating a sorted
+  /// list across both
+  __device__ inline void mergeWarpQ()
+  {
+    int laneId = raft::laneId();
+
+    // Sort all of the per-thread queues
+    warpSortAnyRegisters<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);
+
+    constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize;
+    K warpKRegisters[kNumWarpQRegisters];
+    V warpVRegisters[kNumWarpQRegisters];
+    // Load the warp queue from smem: local slot i takes element i * WarpSize + laneId
+#pragma unroll
+    for (int i = 0; i < kNumWarpQRegisters; ++i) {
+      warpKRegisters[i] = warpK[i * WarpSize + laneId];
+      warpVRegisters[i] = warpV[i * WarpSize + laneId];
+    }
+
+    warpFence();
+
+    // The warp queue is already sorted, and now that we've sorted the
+    // per-thread queue, merge both sorted lists together, producing
+    // one sorted list
+    warpMergeAnyRegisters<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
+      warpKRegisters, warpVRegisters, threadK, threadV);
+
+    // Write the merged warp queue back to smem using the same indexing as the load
+#pragma unroll
+    for (int i = 0; i < kNumWarpQRegisters; ++i) {
+      warpK[i * WarpSize + laneId] = warpKRegisters[i];
+      warpV[i * WarpSize + laneId] = warpVRegisters[i];
+    }
+
+    warpFence();
+  }
+
+  /// WARNING: all threads in a warp must participate in this.
+  /// Otherwise, you must call the constituent parts separately.
+  __device__ inline void add(K k, V v)
+  {
+    addThreadQ(k, v);
+    checkThreadQ();
+  }
+  /// Block-collective (uses __syncthreads): flush thread queues, then merge all warp queues.
+  __device__ inline void reduce()
+  {
+    // Have all warps dump and merge their queues; this will produce
+    // the final per-warp results
+    mergeWarpQ();
+
+    // block-wide dep; thus far, all warps have been completely
+    // independent
+    __syncthreads();
+
+    // All warp queues are contiguous in smem.
+    // Now, we have kNumWarps lists of NumWarpQ elements.
+    // This is a power of 2.
+    FinalBlockMerge<kNumWarps, ThreadsPerBlock, K, V, NumWarpQ, Dir, Comp>::merge(sharedK, sharedV);
+
+    // The block-wide merge has a trailing syncthreads
+  }
+
+  // Default element key
+  const K initK;
+
+  // Default element value
+  const V initV;
+
+  // Number of valid elements in our thread queue
+  int numVals;
+
+  // The k-th highest (Dir) or lowest (!Dir) element
+  K warpKTop;
+
+  // Thread queue values
+  K threadK[NumThreadQ];
+  V threadV[NumThreadQ];
+
+  // Queues for all warps
+  K* sharedK;
+  V* sharedV;
+
+  // Our warp's queue (points into sharedK/sharedV)
+  // warpK[0] is highest (Dir) or lowest (!Dir)
+  K* warpK;
+  V* warpV;
+
+  // Cached k - 1: the warp-queue index read to refresh warpKTop
+  int kMinus1;
+};
+
+/// Specialization for k == 1 (NumWarpQ == 1): each thread only tracks its single best pair
+template <typename K, typename V, bool Dir, typename Comp, int NumThreadQ, int ThreadsPerBlock>
+struct BlockSelect<K, V, Dir, Comp, 1, NumThreadQ, ThreadsPerBlock> {
+  static constexpr int kNumWarps = ThreadsPerBlock / WarpSize;
+  /// The k argument is ignored; reduce() writes one slot per warp into smemK/smemV.
+  __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k)
+    : threadK(initK), threadV(initV), sharedK(smemK), sharedV(smemV)
+  {
+  }
+  /// Keeps (k, v) if it beats the current best: Comp::gt when Dir, Comp::lt otherwise.
+  __device__ inline void addThreadQ(K k, V v)
+  {
+    bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
+    threadK   = swap ? k : threadK;
+    threadV   = swap ? v : threadV;
+  }
+
+  __device__ inline void checkThreadQ()
+  {
+    // We don't need to do anything here, since the warp doesn't
+    // cooperate until the end
+  }
+
+  __device__ inline void add(K k, V v) { addThreadQ(k, v); }
+  /// Block-collective: leaves the block-wide best pair in sharedK[0] / sharedV[0].
+  __device__ inline void reduce()
+  {
+    // Reduce within the warp
+    KeyValuePair<K, V> pair(threadK, threadV);
+
+    if (Dir) {
+      pair = warpReduce(pair, max_op{});
+    } else {
+      pair = warpReduce(pair, min_op{});
+    }
+
+    // Each warp writes out a single value
+    int laneId = raft::laneId();
+    int warpId = threadIdx.x / WarpSize;
+
+    if (laneId == 0) {
+      sharedK[warpId] = pair.key;
+      sharedV[warpId] = pair.value;
+    }
+
+    __syncthreads();
+
+    // We typically use this for small blocks (<= 128), just having the
+    // first thread in the block perform the reduction across warps is
+    // faster
+    if (threadIdx.x == 0) {
+      threadK = sharedK[0];
+      threadV = sharedV[0];
+
+#pragma unroll
+      for (int i = 1; i < kNumWarps; ++i) {
+        K k = sharedK[i];
+        V v = sharedV[i];
+
+        bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
+        threadK   = swap ? k : threadK;
+        threadV   = swap ? v : threadV;
+      }
+
+      // Thread 0 reads and then overwrites slot 0 itself; this relies on a thread's own smem
+      // accesses being ordered with respect to each other, so no barrier is placed here
+      sharedK[0] = threadK;
+      sharedV[0] = threadV;
+    }
+
+    // In case other threads wish to read this value
+    __syncthreads();
+  }
+
+  // Best pair seen so far: threadK is the highest (Dir) or lowest (!Dir) key
+  K threadK;
+  V threadV;
+
+  // Where we reduce in smem
+  K* sharedK;
+  V* sharedV;
+};
+
+//
+// per-warp WarpSelect
+//
+
+// Per-warp k-selection; the warp queue lives in per-lane member arrays rather than in smem.
+// `Dir` true, produce largest values; `Dir` false, produce smallest values.
+template <typename K,
+          typename V,
+          bool Dir,
+          typename Comp,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
+struct WarpSelect {
+  static constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize;  // warp-queue slots per lane
+  /// Only the lane index (k - 1) % WarpSize is retained from k (stored in kLane).
+  __device__ inline WarpSelect(K initKVal, V initVVal, int k)
+    : initK(initKVal), initV(initVVal), numVals(0), warpKTop(initKVal), kLane((k - 1) % WarpSize)
+  {
+    static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2");
+    static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2");
+
+    // Fill the per-thread queue keys and values with the defaults
+#pragma unroll
+    for (int i = 0; i < NumThreadQ; ++i) {
+      threadK[i] = initK;
+      threadV[i] = initV;
+    }
+
+    // Fill this lane's share of the warp queue with the defaults
+#pragma unroll
+    for (int i = 0; i < kNumWarpQRegisters; ++i) {
+      warpK[i] = initK;
+      warpV[i] = initV;
+    }
+  }
+  /// Pushes (k, v) onto the front of this thread's queue if k beats warpKTop; otherwise a no-op.
+  __device__ inline void addThreadQ(K k, V v)
+  {
+    if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) {
+      // Shift every entry one slot right (the last one is dropped), freeing slot 0
+#pragma unroll
+      for (int i = NumThreadQ - 1; i > 0; --i) {
+        threadK[i] = threadK[i - 1];
+        threadV[i] = threadV[i - 1];
+      }
+
+      threadK[0] = k;
+      threadV[0] = v;
+      ++numVals;
+    }
+  }
+  /// Warp-collective: if any lane's thread queue is full, merge into the warp queue and reset.
+  __device__ inline void checkThreadQ()
+  {
+    bool needSort = (numVals == NumThreadQ);
+
+#if CUDA_VERSION >= 9000
+    needSort = __any_sync(0xffffffff, needSort);
+#else
+    needSort = __any(needSort);
+#endif
+
+    if (!needSort) {
+      // no lanes have triggered a sort
+      return;
+    }
+
+    mergeWarpQ();
+
+    // Any top-k elements have been merged into the warp queue; we're
+    // free to reset the thread queues
+    numVals = 0;
+
+#pragma unroll
+    for (int i = 0; i < NumThreadQ; ++i) {
+      threadK[i] = initK;
+      threadV[i] = initV;
+    }
+
+    // New threshold to beat: lane kLane's copy of the last warp-queue slot, via shfl
+    warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane);
+  }
+
+  /// This function handles sorting and merging together the
+  /// per-thread queues with the warp-wide queue, creating a sorted
+  /// list across both
+  __device__ inline void mergeWarpQ()
+  {
+    // Sort all of the per-thread queues
+    warpSortAnyRegisters<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);
+
+    // The warp queue is already sorted, and now that we've sorted the
+    // per-thread queue, merge both sorted lists together, producing
+    // one sorted list
+    warpMergeAnyRegisters<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
+      warpK, warpV, threadK, threadV);
+  }
+
+  /// WARNING: all threads in a warp must participate in this.
+  /// Otherwise, you must call the constituent parts separately.
+  __device__ inline void add(K k, V v)
+  {
+    addThreadQ(k, v);
+    checkThreadQ();
+  }
+  /// Final flush of the thread queues into the warp queue; no cross-warp step happens here.
+  __device__ inline void reduce()
+  {
+    // Have all warps dump and merge their queues; this will produce
+    // the final per-warp results
+    mergeWarpQ();
+  }
+
+  /// Writes the warp-queue entries whose index i * WarpSize + laneId is < k to outK / outV
+  __device__ inline void writeOut(K* outK, V* outV, int k)
+  {
+    int laneId = raft::laneId();
+
+#pragma unroll
+    for (int i = 0; i < kNumWarpQRegisters; ++i) {
+      int idx = i * WarpSize + laneId;
+
+      if (idx < k) {
+        outK[idx] = warpK[i];
+        outV[idx] = warpV[i];
+      }
+    }
+  }
+
+  // Default element key
+  const K initK;
+
+  // Default element value
+  const V initV;
+
+  // Number of valid elements in our thread queue
+  int numVals;
+
+  // The k-th highest (Dir) or lowest (!Dir) element
+  K warpKTop;
+
+  // Thread queue values
+  K threadK[NumThreadQ];
+  V threadV[NumThreadQ];
+
+  // warpK[0] is highest (Dir) or lowest (!Dir)
+  K warpK[kNumWarpQRegisters];
+  V warpV[kNumWarpQRegisters];
+
+  // This is what lane we should load an approximation (>=k) to the
+  // kth element from the last register in the warp queue (i.e.,
+  // warpK[kNumWarpQRegisters - 1]).
+  int kLane;
+};
+
+/// Specialization for k == 1 (NumWarpQ == 1): each thread only tracks its single best pair
+template <typename K, typename V, bool Dir, typename Comp, int NumThreadQ, int ThreadsPerBlock>
+struct WarpSelect<K, V, Dir, Comp, 1, NumThreadQ, ThreadsPerBlock> {
+  static constexpr int kNumWarps = ThreadsPerBlock / WarpSize;  // not referenced in this struct
+  /// The k argument is ignored.
+  __device__ inline WarpSelect(K initK, V initV, int k) : threadK(initK), threadV(initV) {}
+  /// Keeps (k, v) if it beats the current best: Comp::gt when Dir, Comp::lt otherwise.
+  __device__ inline void addThreadQ(K k, V v)
+  {
+    bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK);
+    threadK   = swap ? k : threadK;
+    threadV   = swap ? v : threadV;
+  }
+
+  __device__ inline void checkThreadQ()
+  {
+    // We don't need to do anything here, since the warp doesn't
+    // cooperate until the end
+  }
+
+  __device__ inline void add(K k, V v) { addThreadQ(k, v); }
+  /// Stores the result of warpReduce (max_op when Dir, min_op otherwise) in threadK/threadV.
+  __device__ inline void reduce()
+  {
+    // Reduce within the warp
+    KeyValuePair<K, V> pair(threadK, threadV);
+
+    if (Dir) {
+      pair = warpReduce(pair, max_op{});
+    } else {
+      pair = warpReduce(pair, min_op{});
+    }
+
+    threadK = pair.key;
+    threadV = pair.value;
+  }
+
+  /// Lane 0 writes its threadK / threadV to *outK / *outV; the k argument is ignored
+  __device__ inline void writeOut(K* outK, V* outV, int k)
+  {
+    if (raft::laneId() == 0) {
+      *outK = threadK;
+      *outV = threadV;
+    }
+  }
+
+  // Best pair seen so far: threadK is the highest (Dir) or lowest (!Dir) key
+  K threadK;
+  V threadV;
+};
+
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h b/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h
new file mode 100644
index 0000000000..bac051b68c
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/StaticUtils.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+// allow usage for non-CUDA files
+#ifndef __host__
+#define __host__
+#define __device__
+#endif
+
+namespace raft::spatial::knn::detail::faiss_select::utils {
+
+template <typename T>
+constexpr __host__ __device__ bool isPowerOf2(T v)
+{
+  return (v && !(v & (v - 1)));  // true iff v is non-zero and has exactly one bit set
+}
+
+static_assert(isPowerOf2(2048), "isPowerOf2");
+static_assert(!isPowerOf2(3333), "isPowerOf2");
+// Returns a power of 2 strictly greater than v: a power-of-2 input is doubled (see asserts).
+template <typename T>
+constexpr __host__ __device__ T nextHighestPowerOf2(T v)
+{
+  return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (log2(v) + 1)));
+}
+// NOTE(review): `log2` is not declared in this header; confirm which header supplies it.
+static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2");
+
+static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2");
+
+static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, "nextHighestPowerOf2");
+static_assert(nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL,
+              "nextHighestPowerOf2");
+
+}  // namespace raft::spatial::knn::detail::faiss_select::utils
diff --git a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh
similarity index 80%
rename from cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh
rename to cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh
index 34240fba64..617a26a243 100644
--- a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/faiss_select/key_value_block_select.cuh
@@ -2,26 +2,19 @@
  * Copyright (c) Facebook, Inc. and its affiliates.
  *
  * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
  */
 
 #pragma once
 
-#include <faiss/gpu/utils/DeviceDefs.cuh>
-#include <faiss/gpu/utils/MergeNetworkUtils.cuh>
-#include <faiss/gpu/utils/PtxUtils.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/StaticUtils.h>
-#include <faiss/gpu/utils/WarpShuffles.cuh>
-
-#include "warp_select_faiss.cuh"
+#include <raft/spatial/knn/detail/faiss_select/MergeNetworkUtils.cuh>
+#include <raft/spatial/knn/detail/faiss_select/Select.cuh>
 
 // TODO: Need to think further about the impact (and new boundaries created) on the registers
 // because this will change the max k that can be processed. One solution might be to break
 // up k into multiple batches for larger k.
 
-namespace faiss {
-namespace gpu {
+namespace raft::spatial::knn::detail::faiss_select {
 
 // `Dir` true, produce largest values.
 // `Dir` false, produce smallest values.
@@ -33,7 +26,7 @@ template <typename K,
           int NumThreadQ,
           int ThreadsPerBlock>
 struct KeyValueBlockSelect {
-  static constexpr int kNumWarps          = ThreadsPerBlock / kWarpSize;
+  static constexpr int kNumWarps          = ThreadsPerBlock / WarpSize;
   static constexpr int kTotalWarpSortSize = NumWarpQ;
 
   __device__ inline KeyValueBlockSelect(
@@ -59,14 +52,14 @@ struct KeyValueBlockSelect {
       threadV[i].value = initVv;
     }
 
-    int laneId = getLaneId();
-    int warpId = threadIdx.x / kWarpSize;
+    int laneId = raft::laneId();
+    int warpId = threadIdx.x / WarpSize;
     warpK      = sharedK + warpId * kTotalWarpSortSize;
     warpV      = sharedV + warpId * kTotalWarpSortSize;
 
     // Fill warp queue (only the actual queue space is fine, not where
     // we write the per-thread queues for merging)
-    for (int i = laneId; i < NumWarpQ; i += kWarpSize) {
+    for (int i = laneId; i < NumWarpQ; i += WarpSize) {
       warpK[i]       = initK;
       warpV[i].key   = initVk;
       warpV[i].value = initVv;
@@ -134,20 +127,20 @@ struct KeyValueBlockSelect {
   /// list across both
   __device__ inline void mergeWarpQ()
   {
-    int laneId = getLaneId();
+    int laneId = raft::laneId();
 
     // Sort all of the per-thread queues
-    warpSortAnyRegistersKVP<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);
+    warpSortAnyRegisters<K, KeyValuePair<K, V>, NumThreadQ, !Dir, Comp>(threadK, threadV);
 
-    constexpr int kNumWarpQRegisters = NumWarpQ / kWarpSize;
+    constexpr int kNumWarpQRegisters = NumWarpQ / WarpSize;
     K warpKRegisters[kNumWarpQRegisters];
     KeyValuePair<K, V> warpVRegisters[kNumWarpQRegisters];
 
 #pragma unroll
     for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpKRegisters[i]       = warpK[i * kWarpSize + laneId];
-      warpVRegisters[i].key   = warpV[i * kWarpSize + laneId].key;
-      warpVRegisters[i].value = warpV[i * kWarpSize + laneId].value;
+      warpKRegisters[i]       = warpK[i * WarpSize + laneId];
+      warpVRegisters[i].key   = warpV[i * WarpSize + laneId].key;
+      warpVRegisters[i].value = warpV[i * WarpSize + laneId].value;
     }
 
     warpFence();
@@ -155,15 +148,15 @@ struct KeyValueBlockSelect {
     // The warp queue is already sorted, and now that we've sorted the
     // per-thread queue, merge both sorted lists together, producing
     // one sorted list
-    warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
+    warpMergeAnyRegisters<K, KeyValuePair<K, V>, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
       warpKRegisters, warpVRegisters, threadK, threadV);
 
     // Write back out the warp queue
 #pragma unroll
     for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpK[i * kWarpSize + laneId]       = warpKRegisters[i];
-      warpV[i * kWarpSize + laneId].key   = warpVRegisters[i].key;
-      warpV[i * kWarpSize + laneId].value = warpVRegisters[i].value;
+      warpK[i * WarpSize + laneId]       = warpKRegisters[i];
+      warpV[i * WarpSize + laneId].key   = warpVRegisters[i].key;
+      warpV[i * WarpSize + laneId].value = warpVRegisters[i].value;
     }
 
     warpFence();
@@ -228,5 +221,4 @@ struct KeyValueBlockSelect {
   int kMinus1;
 };
 
-}  // namespace gpu
-}  // namespace faiss
\ No newline at end of file
+}  // namespace raft::spatial::knn::detail::faiss_select
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index 41f1df85fe..f1f160a154 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
  */
 #pragma once
 #include <cub/cub.cuh>
-#include <faiss/gpu/utils/Select.cuh>
 #include <limits>
 #include <raft/linalg/norm.cuh>
+#include <raft/spatial/knn/detail/faiss_select/Select.cuh>
 // TODO: Need to hide the PairwiseDistance class impl and expose to public API
 #include "processing.cuh"
+#include <raft/core/operators.hpp>
 #include <raft/distance/detail/distance.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace raft {
 namespace spatial {
@@ -217,8 +219,8 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x
   constexpr auto identity = std::numeric_limits<AccT>::max();
   constexpr auto keyMax   = std::numeric_limits<uint32_t>::max();
   constexpr auto Dir      = false;
-  typedef faiss::gpu::
-    WarpSelect<AccT, uint32_t, Dir, faiss::gpu::Comparator<AccT>, NumWarpQ, NumThreadQ, 32>
+  typedef faiss_select::
+    WarpSelect<AccT, uint32_t, Dir, faiss_select::Comparator<AccT>, NumWarpQ, NumThreadQ, 32>
       myWarpSelect;
 
   auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__(
@@ -566,8 +568,6 @@ void fusedL2UnexpKnnImpl(const DataT* x,
     acc += diff * diff;
   };
 
-  auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; };
-
   typedef cub::KeyValuePair<uint32_t, AccT> Pair;
 
   if (isRowMajor) {
@@ -578,7 +578,7 @@ void fusedL2UnexpKnnImpl(const DataT* x,
                                                           IdxT,
                                                           KPolicy,
                                                           decltype(core_lambda),
-                                                          decltype(fin_op),
+                                                          raft::identity_op,
                                                           32,
                                                           2,
                                                           usePrevTopKs,
@@ -590,7 +590,7 @@ void fusedL2UnexpKnnImpl(const DataT* x,
                                                           IdxT,
                                                           KPolicy,
                                                           decltype(core_lambda),
-                                                          decltype(fin_op),
+                                                          raft::identity_op,
                                                           64,
                                                           3,
                                                           usePrevTopKs,
@@ -630,7 +630,7 @@ void fusedL2UnexpKnnImpl(const DataT* x,
                                                                   ldb,
                                                                   ldd,
                                                                   core_lambda,
-                                                                  fin_op,
+                                                                  raft::identity_op{},
                                                                   sqrt,
                                                                   (uint32_t)numOfNN,
                                                                   (int*)workspace,
@@ -757,8 +757,6 @@ void fusedL2ExpKnnImpl(const DataT* x,
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
-  auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; };
-
   typedef cub::KeyValuePair<uint32_t, AccT> Pair;
 
   if (isRowMajor) {
@@ -769,7 +767,7 @@ void fusedL2ExpKnnImpl(const DataT* x,
                                                         IdxT,
                                                         KPolicy,
                                                         decltype(core_lambda),
-                                                        decltype(fin_op),
+                                                        raft::identity_op,
                                                         32,
                                                         2,
                                                         usePrevTopKs,
@@ -781,7 +779,7 @@ void fusedL2ExpKnnImpl(const DataT* x,
                                                         IdxT,
                                                         KPolicy,
                                                         decltype(core_lambda),
-                                                        decltype(fin_op),
+                                                        raft::identity_op,
                                                         64,
                                                         3,
                                                         usePrevTopKs,
@@ -818,14 +816,15 @@ void fusedL2ExpKnnImpl(const DataT* x,
     DataT* xn = (DataT*)workspace;
     DataT* yn = (DataT*)workspace;
 
-    auto norm_op = [] __device__(DataT in) { return in; };
-
     if (x != y) {
       yn += m;
-      raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
-      raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+      raft::linalg::rowNorm(
+        xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+      raft::linalg::rowNorm(
+        yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
     } else {
-      raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+      raft::linalg::rowNorm(
+        xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
     }
     fusedL2ExpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(x,
                                                                 y,
@@ -838,7 +837,7 @@ void fusedL2ExpKnnImpl(const DataT* x,
                                                                 ldb,
                                                                 ldd,
                                                                 core_lambda,
-                                                                fin_op,
+                                                                raft::identity_op{},
                                                                 sqrt,
                                                                 (uint32_t)numOfNN,
                                                                 mutexes,
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 5c03f8f67c..7d361ba4fb 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,15 +18,11 @@
 
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
 
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/spatial/knn/detail/faiss_select/Select.cuh>
 
 namespace raft {
 namespace spatial {
@@ -36,11 +32,11 @@ namespace detail {
 template <typename value_t>
 DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
 {
-  value_t sin_0 = sin(0.5 * (x1 - y1));
-  value_t sin_1 = sin(0.5 * (x2 - y2));
-  value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1;
+  value_t sin_0 = raft::sin(0.5 * (x1 - y1));
+  value_t sin_1 = raft::sin(0.5 * (x2 - y2));
+  value_t rdist = sin_0 * sin_0 + raft::cos(x1) * raft::cos(y1) * sin_1 * sin_1;
 
-  return 2 * asin(sqrt(rdist));
+  return 2 * raft::asin(raft::sqrt(rdist));
 }
 
 /**
@@ -64,21 +60,21 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
                                      size_t n_index_rows,
                                      int k)
 {
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
 
-  faiss::gpu::
-    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(faiss::gpu::Limits<value_t>::getMax(),
+  faiss_select::
+    BlockSelect<value_t, value_idx, false, faiss_select::Comparator<value_t>, warp_q, thread_q, tpb>
+      heap(std::numeric_limits<value_t>::max(),
            std::numeric_limits<value_idx>::max(),
            smemK,
            smemV,
            k);
 
   // Grid is exactly sized to rows available
-  int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
+  int limit = Pow2<WarpSize>::roundDown(n_index_rows);
 
   const value_t* query_ptr = query + (blockIdx.x * 2);
   value_t x1               = query_ptr[0];
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index 14c4dd85f1..c417a97531 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,21 +17,26 @@
 #pragma once
 
 #include "../ivf_flat_types.hpp"
-#include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
 
-#include <raft/core/handle.hpp>
+#include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/serialize.hpp>
 #include <raft/linalg/add.cuh>
+#include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
 #include <raft/stats/histogram.cuh>
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <cstdint>
+#include <fstream>
+
 namespace raft::spatial::knn::ivf_flat::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
@@ -114,7 +119,7 @@ __global__ void build_index_kernel(const LabelT* labels,
 
 /** See raft::spatial::knn::ivf_flat::extend docs */
 template <typename T, typename IdxT>
-inline auto extend(const handle_t& handle,
+inline auto extend(raft::device_resources const& handle,
                    const index<T, IdxT>& orig_index,
                    const T* new_vectors,
                    const IdxT* new_indices,
@@ -132,15 +137,18 @@ inline auto extend(const handle_t& handle,
                "You must pass data indices when the index is non-empty.");
 
   rmm::device_uvector<LabelT> new_labels(n_rows, stream);
-  kmeans::predict<T, IdxT, LabelT>(handle,
-                                   orig_index.centers().data_handle(),
-                                   n_lists,
-                                   dim,
-                                   new_vectors,
-                                   n_rows,
-                                   new_labels.data(),
-                                   orig_index.metric(),
-                                   stream);
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.metric     = orig_index.metric();
+  auto new_vectors_view    = raft::make_device_matrix_view<const T, IdxT>(new_vectors, n_rows, dim);
+  auto orig_centroids_view = raft::make_device_matrix_view<const float, IdxT>(
+    orig_index.centers().data_handle(), n_lists, dim);
+  auto labels_view = raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_rows);
+  raft::cluster::kmeans_balanced::predict(handle,
+                                          kmeans_params,
+                                          new_vectors_view,
+                                          orig_centroids_view,
+                                          labels_view,
+                                          utils::mapping<float>{});
 
   index<T, IdxT> ext_index(
     handle, orig_index.metric(), n_lists, orig_index.adaptive_centers(), dim);
@@ -155,16 +163,19 @@ inline auto extend(const handle_t& handle,
   if (ext_index.adaptive_centers()) {
     raft::copy(
       list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream);
-    kmeans::calc_centers_and_sizes(handle,
-                                   centers_ptr,
-                                   list_sizes_ptr,
-                                   n_lists,
-                                   dim,
-                                   new_vectors,
-                                   n_rows,
-                                   new_labels.data(),
-                                   false,
-                                   stream);
+    auto centroids_view = raft::make_device_matrix_view<float, IdxT>(centers_ptr, n_lists, dim);
+    auto list_sizes_view =
+      raft::make_device_vector_view<std::remove_pointer_t<decltype(list_sizes_ptr)>, IdxT>(
+        list_sizes_ptr, n_lists);
+    auto const_labels_view =
+      raft::make_device_vector_view<const LabelT, IdxT>(new_labels.data(), n_rows);
+    raft::cluster::kmeans_balanced::helpers::calc_centers_and_sizes(handle,
+                                                                    new_vectors_view,
+                                                                    const_labels_view,
+                                                                    centroids_view,
+                                                                    list_sizes_view,
+                                                                    false,
+                                                                    utils::mapping<float>{});
   } else {
     raft::stats::histogram<uint32_t, IdxT>(raft::stats::HistTypeAuto,
                                            reinterpret_cast<int32_t*>(list_sizes_ptr),
@@ -189,8 +200,7 @@ inline auto extend(const handle_t& handle,
   update_host(&index_size, list_offsets_ptr + n_lists, 1, stream);
   handle.sync_stream(stream);
 
-  ext_index.allocate(
-    handle, index_size, ext_index.metric() == raft::distance::DistanceType::L2Expanded);
+  ext_index.allocate(handle, index_size);
 
   // Populate index with the old data
   if (orig_index.size() > 0) {
@@ -244,8 +254,7 @@ inline auto extend(const handle_t& handle,
                             n_lists,
                             raft::linalg::L2Norm,
                             true,
-                            stream,
-                            raft::SqrtOp<float>());
+                            stream);
       RAFT_LOG_TRACE_VEC(ext_index.center_norms()->data_handle(), std::min<uint32_t>(dim, 20));
     }
   }
@@ -256,9 +265,11 @@ inline auto extend(const handle_t& handle,
 
 /** See raft::spatial::knn::ivf_flat::build docs */
 template <typename T, typename IdxT>
-inline auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<T, IdxT>
+inline auto build(raft::device_resources const& handle,
+                  const index_params& params,
+                  const T* dataset,
+                  IdxT n_rows,
+                  uint32_t dim) -> index<T, IdxT>
 {
   auto stream = handle.get_stream();
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
@@ -286,15 +297,15 @@ inline auto build(
                                     n_rows_train,
                                     cudaMemcpyDefault,
                                     stream));
-    kmeans::build_hierarchical<T, IdxT>(handle,
-                                        params.kmeans_n_iters,
-                                        index.dim(),
-                                        trainset.data(),
-                                        n_rows_train,
-                                        index.centers().data_handle(),
-                                        index.n_lists(),
-                                        index.metric(),
-                                        stream);
+    auto trainset_const_view =
+      raft::make_device_matrix_view<const T, IdxT>(trainset.data(), n_rows_train, index.dim());
+    auto centers_view = raft::make_device_matrix_view<float, IdxT>(
+      index.centers().data_handle(), index.n_lists(), index.dim());
+    raft::cluster::kmeans_balanced_params kmeans_params;
+    kmeans_params.n_iters = params.kmeans_n_iters;
+    kmeans_params.metric  = index.metric();
+    raft::cluster::kmeans_balanced::fit(
+      handle, kmeans_params, trainset_const_view, centers_view, utils::mapping<float>{});
   }
 
   // add the data if necessary
@@ -323,7 +334,7 @@ inline auto build(
  * @param[in] n_candidates  of neighbor_candidates
  */
 template <typename T, typename IdxT>
-inline void fill_refinement_index(const handle_t& handle,
+inline void fill_refinement_index(raft::device_resources const& handle,
                                   index<T, IdxT>* refinement_index,
                                   const T* dataset,
                                   const IdxT* candidate_idx,
@@ -338,27 +349,27 @@ inline void fill_refinement_index(const handle_t& handle,
     "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));
 
   rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
-  linalg::writeOnlyUnaryOp(
-    new_labels.data(),
-    n_queries * n_candidates,
-    [n_candidates] __device__(LabelT * out, uint32_t i) { *out = i / n_candidates; },
-    stream);
+  auto new_labels_view =
+    raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_queries * n_candidates);
+  linalg::map_offset(
+    handle,
+    new_labels_view,
+    raft::compose_op(raft::cast_op<LabelT>(), raft::div_const_op<IdxT>(n_candidates)));
 
   auto list_sizes_ptr   = refinement_index->list_sizes().data_handle();
   auto list_offsets_ptr = refinement_index->list_offsets().data_handle();
   // We do not fill centers and center norms, since we will not run coarse search.
 
   // Calculate new offsets
-  uint32_t n_roundup = Pow2<kIndexGroupSize>::roundUp(n_candidates);
-  linalg::writeOnlyUnaryOp(
-    refinement_index->list_offsets().data_handle(),
-    refinement_index->list_offsets().size(),
-    [n_roundup] __device__(IdxT * out, uint32_t i) { *out = i * n_roundup; },
-    stream);
+  uint32_t n_roundup     = Pow2<kIndexGroupSize>::roundUp(n_candidates);
+  auto list_offsets_view = raft::make_device_vector_view<IdxT, IdxT>(
+    list_offsets_ptr, refinement_index->list_offsets().size());
+  linalg::map_offset(handle,
+                     list_offsets_view,
+                     raft::compose_op(raft::cast_op<IdxT>(), raft::mul_const_op<IdxT>(n_roundup)));
 
   IdxT index_size = n_roundup * n_lists;
-  refinement_index->allocate(
-    handle, index_size, refinement_index->metric() == raft::distance::DistanceType::L2Expanded);
+  refinement_index->allocate(handle, index_size);
 
   RAFT_CUDA_TRY(cudaMemsetAsync(list_sizes_ptr, 0, n_lists * sizeof(uint32_t), stream));
 
@@ -377,4 +388,110 @@ inline void fill_refinement_index(const handle_t& handle,
                                          refinement_index->veclen());
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
+
+// Serialization version 2
+// No backward compatibility yet; that is, can't add additional fields without breaking
+// backward compatibility.
+// TODO(hcho3) Implement next-gen serializer for IVF that allows for expansion in a backward
+//             compatible fashion.
+constexpr int serialization_version = 2;
+
+static_assert(sizeof(index<double, std::uint64_t>) == 408,
+              "The size of the index struct has changed since the last update; "
+              "paste in the new size and consider updating the save/load logic");
+
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index_ IVF-Flat index
+ * @note Fails via RAFT_FAIL if the file cannot be opened or the stream is bad after close().
+ */
+template <typename T, typename IdxT>
+void serialize(raft::device_resources const& handle,
+               const std::string& filename,
+               const index<T, IdxT>& index_)
+{
+  std::ofstream of(filename, std::ios::out | std::ios::binary);
+  if (!of) { RAFT_FAIL("Cannot open %s", filename.c_str()); }
+
+  RAFT_LOG_DEBUG(
+    "Saving IVF-Flat index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
+  serialize_scalar(handle, of, serialization_version);
+  serialize_scalar(handle, of, index_.size());
+  serialize_scalar(handle, of, index_.dim());
+  serialize_scalar(handle, of, index_.n_lists());
+  serialize_scalar(handle, of, index_.metric());
+  serialize_scalar(handle, of, index_.veclen());
+  serialize_scalar(handle, of, index_.adaptive_centers());
+  serialize_mdspan(handle, of, index_.data());
+  serialize_mdspan(handle, of, index_.indices());
+  serialize_mdspan(handle, of, index_.list_sizes());
+  serialize_mdspan(handle, of, index_.list_offsets());
+  serialize_mdspan(handle, of, index_.centers());
+  if (index_.center_norms()) {
+    bool has_norms = true;
+    serialize_scalar(handle, of, has_norms);
+    serialize_mdspan(handle, of, *index_.center_norms());
+  } else {
+    bool has_norms = false;
+    serialize_scalar(handle, of, has_norms);
+  }
+  of.close();
+  if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
+}
+
+/** Load an index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @return the IVF-Flat index read from the file
+ * @note Fails via RAFT_FAIL on open failure, version mismatch, or inconsistent center norms.
+ */
+template <typename T, typename IdxT>
+auto deserialize(raft::device_resources const& handle, const std::string& filename)
+  -> index<T, IdxT>
+{
+  std::ifstream infile(filename, std::ios::in | std::ios::binary);
+
+  if (!infile) { RAFT_FAIL("Cannot open %s", filename.c_str()); }
+
+  auto ver = deserialize_scalar<int>(handle, infile);
+  if (ver != serialization_version) {
+    RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver);
+  }
+  auto n_rows           = deserialize_scalar<IdxT>(handle, infile);
+  auto dim              = deserialize_scalar<std::uint32_t>(handle, infile);
+  auto n_lists          = deserialize_scalar<std::uint32_t>(handle, infile);
+  auto metric           = deserialize_scalar<raft::distance::DistanceType>(handle, infile);
+  auto veclen           = deserialize_scalar<std::uint32_t>(handle, infile);
+  bool adaptive_centers = deserialize_scalar<bool>(handle, infile);
+
+  index<T, IdxT> index_ =
+    raft::spatial::knn::ivf_flat::index<T, IdxT>(handle, metric, n_lists, adaptive_centers, dim);
+  // Call allocate() for n_rows first, then read the arrays in the order serialize() wrote them.
+  index_.allocate(handle, n_rows);
+  auto data = index_.data();
+  deserialize_mdspan(handle, infile, data);
+  deserialize_mdspan(handle, infile, index_.indices());
+  deserialize_mdspan(handle, infile, index_.list_sizes());
+  deserialize_mdspan(handle, infile, index_.list_offsets());
+  deserialize_mdspan(handle, infile, index_.centers());
+  bool has_norms = deserialize_scalar<bool>(handle, infile);
+  if (has_norms) {
+    if (!index_.center_norms()) {
+      RAFT_FAIL("Error inconsistent center norms");
+    } else {
+      auto center_norms = *index_.center_norms();
+      deserialize_mdspan(handle, infile, center_norms);
+    }
+  }
+  infile.close();
+  return index_;
+}
 }  // namespace raft::spatial::knn::ivf_flat::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
index 94f4dc96c6..7f70d4b8a5 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,18 +18,21 @@
 
 #include "../ivf_flat_types.hpp"
 #include "ann_utils.cuh"
-#include "topk.cuh"
-#include "topk/warpsort_topk.cuh"
 
 #include <raft/core/cudart_utils.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/norm.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/device_loads_stores.cuh>
+#include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
@@ -662,9 +665,11 @@ template <int Capacity,
           typename T,
           typename AccT,
           typename IdxT,
-          typename Lambda>
+          typename Lambda,
+          typename PostLambda>
 __global__ void __launch_bounds__(kThreadsPerBlock)
   interleaved_scan_kernel(Lambda compute_dist,
+                          PostLambda post_process,
                           const uint32_t query_smem_elems,
                           const T* query,
                           const uint32_t* coarse_index,
@@ -697,8 +702,13 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
   __syncthreads();
 
-  using block_sort_t = topk::block_sort<topk::warp_sort_filtered, Capacity, Ascending, float, IdxT>;
-  block_sort_t queue(k, interleaved_scan_kernel_smem + query_smem_elems * sizeof(T));
+  using block_sort_t = matrix::detail::select::warpsort::block_sort<
+    matrix::detail::select::warpsort::warp_sort_filtered,
+    Capacity,
+    Ascending,
+    float,
+    IdxT>;
+  block_sort_t queue(k);
 
   {
     using align_warp  = Pow2<WarpSize>;
@@ -775,8 +785,9 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
   }
 
   // finalize and store selected neighbours
-  queue.done();
-  queue.store(distances, neighbors);
+  __syncthreads();
+  queue.done(interleaved_scan_kernel_smem);
+  queue.store(distances, neighbors, post_process);
 }
 
 /**
@@ -804,8 +815,10 @@ template <int Capacity,
           typename T,
           typename AccT,
           typename IdxT,
-          typename Lambda>
+          typename Lambda,
+          typename PostLambda>
 void launch_kernel(Lambda lambda,
+                   PostLambda post_process,
                    const ivf_flat::index<T, IdxT>& index,
                    const T* queries,
                    const uint32_t* coarse_index,
@@ -820,14 +833,16 @@ void launch_kernel(Lambda lambda,
   RAFT_EXPECTS(Veclen == index.veclen(),
                "Configured Veclen does not match the index interleaving pattern.");
   constexpr auto kKernel =
-    interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda>;
+    interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda, PostLambda>;
   const int max_query_smem = 16384;
   int query_smem_elems =
     std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
   int smem_size              = query_smem_elems * sizeof(T);
   constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
-  smem_size += raft::spatial::knn::detail::topk::calc_smem_size_for_block_wide<AccT, IdxT>(
-    kThreadsPerBlock / kSubwarpSize, k);
+  auto block_merge_mem =
+    raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<AccT, IdxT>(
+      kThreadsPerBlock / kSubwarpSize, k);
+  smem_size += std::max<int>(smem_size, block_merge_mem);
 
   // power-of-two less than cuda limit (for better addr alignment)
   constexpr uint32_t kMaxGridY = 32768;
@@ -850,6 +865,7 @@ void launch_kernel(Lambda lambda,
       n_probes,
       smem_size);
     kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
+                                                        post_process,
                                                         query_smem_elems,
                                                         queries,
                                                         coarse_index,
@@ -885,7 +901,7 @@ struct euclidean_dist<Veclen, uint8_t, uint32_t> {
       const auto diff = __vabsdiffu4(x, y);
       acc             = dp4a(diff, diff, acc);
     } else {
-      const auto diff = x - y;
+      const auto diff = __usad(x, y, 0u);
       acc += diff * diff;
     }
   }
@@ -896,8 +912,12 @@ struct euclidean_dist<Veclen, int8_t, int32_t> {
   __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
   {
     if constexpr (Veclen > 1) {
-      const auto diff = static_cast<int32_t>(__vabsdiffs4(x, y));
-      acc             = dp4a(diff, diff, acc);
+      // Note that we enforce here that the unsigned version of dp4a is used, because the difference
+      // between two int8 numbers can be greater than 127 and therefore represented as a negative
+      // number in int8. Casting from int8 to int32 would yield incorrect results, while casting
+      // from uint8 to uint32 is correct.
+      const auto diff = __vabsdiffs4(x, y);
+      acc             = dp4a(diff, diff, static_cast<uint32_t>(acc));
     } else {
       const auto diff = x - y;
       acc += diff * diff;
@@ -936,7 +956,18 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg
                            T,
                            AccT,
                            IdxT,
-                           euclidean_dist<Veclen, T, AccT>>({}, std::forward<Args>(args)...);
+                           euclidean_dist<Veclen, T, AccT>,
+                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
+    case raft::distance::DistanceType::L2SqrtExpanded:
+    case raft::distance::DistanceType::L2SqrtUnexpanded:
+      return launch_kernel<Capacity,
+                           Veclen,
+                           Ascending,
+                           T,
+                           AccT,
+                           IdxT,
+                           euclidean_dist<Veclen, T, AccT>,
+                           raft::sqrt_op>({}, {}, std::forward<Args>(args)...);
     case raft::distance::DistanceType::InnerProduct:
       return launch_kernel<Capacity,
                            Veclen,
@@ -944,7 +975,8 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg
                            T,
                            AccT,
                            IdxT,
-                           inner_prod_dist<Veclen, T, AccT>>({}, std::forward<Args>(args)...);
+                           inner_prod_dist<Veclen, T, AccT>,
+                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
     // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
     default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
   }
@@ -957,7 +989,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg
 template <typename T,
           typename AccT,
           typename IdxT,
-          int Capacity = topk::kMaxCapacity,
+          int Capacity = matrix::detail::select::warpsort::kMaxCapacity,
           int Veclen   = std::max<int>(1, 16 / sizeof(T))>
 struct select_interleaved_scan_kernel {
   /**
@@ -981,12 +1013,12 @@ struct select_interleaved_scan_kernel {
           capacity, veclen, select_min, std::forward<Args>(args)...);
       }
     }
-    // NB: this is the limitation of the topk::block_topk structures that use a huge number of
+    // NB: this is the limitation of the warpsort structures that use a huge number of
     //     registers (used in the main kernel here).
     RAFT_EXPECTS(capacity == Capacity,
                  "Capacity must be power-of-two not bigger than the maximum allowed size "
-                 "topk::kMaxCapacity (%d).",
-                 topk::kMaxCapacity);
+                 "matrix::detail::select::warpsort::kMaxCapacity (%d).",
+                 matrix::detail::select::warpsort::kMaxCapacity);
     RAFT_EXPECTS(
       veclen == Veclen,
       "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
@@ -1012,7 +1044,7 @@ struct select_interleaved_scan_kernel {
  * @param metric type of the measured distance
  * @param n_probes number of nearest clusters to query
  * @param k number of nearest neighbors.
- *            NB: the maximum value of `k` is limited statically by `topk::kMaxCapacity`.
+ *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
  * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
  * metric.
  * @param[out] neighbors device pointer to the result indices for each query and cluster
@@ -1037,7 +1069,7 @@ void ivfflat_interleaved_scan(const ivf_flat::index<T, IdxT>& index,
                               uint32_t& grid_dim_x,
                               rmm::cuda_stream_view stream)
 {
-  const int capacity = raft::spatial::knn::detail::topk::calc_capacity(k);
+  const int capacity = bound_by_power_of_two(k);
   select_interleaved_scan_kernel<T, AccT, IdxT>::run(capacity,
                                                      index.veclen(),
                                                      select_min,
@@ -1055,7 +1087,7 @@ void ivfflat_interleaved_scan(const ivf_flat::index<T, IdxT>& index,
 }
 
 template <typename T, typename AccT, typename IdxT>
-void search_impl(const handle_t& handle,
+void search_impl(raft::device_resources const& handle,
                  const index<T, IdxT>& index,
                  const T* queries,
                  uint32_t n_queries,
@@ -1100,28 +1132,32 @@ void search_impl(const handle_t& handle,
   float beta  = 0.0f;
 
   // todo(lsugy): raft distance? (if performance is similar/better than gemm)
-  if (index.metric() == raft::distance::DistanceType::L2Expanded) {
-    alpha = -2.0f;
-    beta  = 1.0f;
-    raft::linalg::rowNorm(query_norm_dev.data(),
-                          converted_queries_ptr,
-                          static_cast<IdxT>(index.dim()),
-                          static_cast<IdxT>(n_queries),
-                          raft::linalg::L2Norm,
-                          true,
-                          stream,
-                          raft::SqrtOp<float>());
-    utils::outer_add(query_norm_dev.data(),
-                     (IdxT)n_queries,
-                     index.center_norms()->data_handle(),
-                     (IdxT)index.n_lists(),
-                     distance_buffer_dev.data(),
-                     stream);
-    RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min<uint32_t>(20, index.dim()));
-    RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min<uint32_t>(20, index.n_lists()));
-  } else {
-    alpha = 1.0f;
-    beta  = 0.0f;
+  switch (index.metric()) {
+    case raft::distance::DistanceType::L2Expanded:
+    case raft::distance::DistanceType::L2SqrtExpanded: {
+      alpha = -2.0f;
+      beta  = 1.0f;
+      raft::linalg::rowNorm(query_norm_dev.data(),
+                            converted_queries_ptr,
+                            static_cast<IdxT>(index.dim()),
+                            static_cast<IdxT>(n_queries),
+                            raft::linalg::L2Norm,
+                            true,
+                            stream);
+      utils::outer_add(query_norm_dev.data(),
+                       (IdxT)n_queries,
+                       index.center_norms()->data_handle(),
+                       (IdxT)index.n_lists(),
+                       distance_buffer_dev.data(),
+                       stream);
+      RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min<uint32_t>(20, index.dim()));
+      RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min<uint32_t>(20, index.n_lists()));
+      break;
+    }
+    default: {
+      alpha = 1.0f;
+      beta  = 0.0f;
+    }
   }
 
   linalg::gemm(handle,
@@ -1141,16 +1177,16 @@ void search_impl(const handle_t& handle,
                stream);
 
   RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min<uint32_t>(20, index.n_lists()));
-  select_topk<AccT, uint32_t>(distance_buffer_dev.data(),
-                              nullptr,
-                              n_queries,
-                              index.n_lists(),
-                              n_probes,
-                              coarse_distances_dev.data(),
-                              coarse_indices_dev.data(),
-                              select_min,
-                              stream,
-                              search_mr);
+  matrix::detail::select_k<AccT, uint32_t>(distance_buffer_dev.data(),
+                                           nullptr,
+                                           n_queries,
+                                           index.n_lists(),
+                                           n_probes,
+                                           coarse_distances_dev.data(),
+                                           coarse_indices_dev.data(),
+                                           select_min,
+                                           stream,
+                                           search_mr);
   RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes);
   RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes);
 
@@ -1199,16 +1235,16 @@ void search_impl(const handle_t& handle,
 
   // Merge topk values from different blocks
   if (grid_dim_x > 1) {
-    select_topk<AccT, IdxT>(refined_distances_dev.data(),
-                            refined_indices_dev.data(),
-                            n_queries,
-                            k * grid_dim_x,
-                            k,
-                            distances,
-                            neighbors,
-                            select_min,
-                            stream,
-                            search_mr);
+    matrix::detail::select_k<AccT, IdxT>(refined_distances_dev.data(),
+                                         refined_indices_dev.data(),
+                                         n_queries,
+                                         k * grid_dim_x,
+                                         k,
+                                         distances,
+                                         neighbors,
+                                         select_min,
+                                         stream,
+                                         search_mr);
   }
 }
 
@@ -1234,7 +1270,7 @@ inline bool is_min_close(distance::DistanceType metric)
 
 /** See raft::spatial::knn::ivf_flat::search docs */
 template <typename T, typename IdxT>
-inline void search(const handle_t& handle,
+inline void search(raft::device_resources const& handle,
                    const search_params& params,
                    const index<T, IdxT>& index,
                    const T* queries,
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
index 9262ef6baf..66a4207b20 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,25 +16,30 @@
 
 #pragma once
 
-#include "ann_kmeans_balanced.cuh"
 #include "ann_utils.cuh"
 
 #include <raft/neighbors/ivf_pq_types.hpp>
 
+#include <raft/cluster/kmeans_balanced.cuh>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/serialize.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/detail/qr.cuh>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/stats/histogram.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -46,9 +51,14 @@
 #include <thrust/binary_search.h>
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/scan.h>
 #include <thrust/sequence.h>
 
+#include <cstdint>
+#include <fstream>
+#include <variant>
+
 namespace raft::spatial::knn::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
@@ -59,7 +69,9 @@ using raft::neighbors::ivf_pq::index_params;
 using raft::neighbors::ivf_pq::kIndexGroupSize;
 using raft::neighbors::ivf_pq::kIndexGroupVecLen;
 
-using pq_codes_exts = extents<size_t, dynamic_extent, dynamic_extent, kIndexGroupVecLen>;
+using pq_vec_t        = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
+using pq_new_vec_exts = extents<size_t, dynamic_extent, dynamic_extent>;
+using pq_int_vec_exts = extents<size_t, dynamic_extent, dynamic_extent, kIndexGroupSize>;
 
 namespace {
 
@@ -115,80 +127,53 @@ struct bitfield_view_t {
   }
 };
 
-/*
-  NB: label type is uint32_t although it can only contain values up to `1 << pq_bits`.
-      We keep it this way to not force one more overload for kmeans::predict.
- */
-template <uint32_t PqBits, size_t VecLen>
-__device__ void ivfpq_encode_core(uint32_t n_rows,
-                                  uint32_t pq_dim,
-                                  const uint32_t* label,
-                                  uint8_t* output)
+template <uint32_t BlockDim, typename T, typename S>
+__launch_bounds__(BlockDim) __global__ void copy_warped_kernel(
+  T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
 {
-  constexpr uint32_t kChunkSize = (VecLen * 8u) / PqBits;
-  TxN_t<uint8_t, VecLen> vec;
-  for (uint32_t j = 0; j < pq_dim;) {
-    vec.fill(0);
-    bitfield_view_t<PqBits> out{vec.val.data};
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++, label += n_rows) {
-      out[k] = static_cast<uint8_t>(*label);
-    }
-    vec.store(output, 0);
-    output += VecLen;
+  using warp    = Pow2<WarpSize>;
+  size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x));
+  uint32_t i    = warp::mod(threadIdx.x);
+  if (row_ix >= n_rows) return;
+  out += row_ix * ld_out;
+  in += row_ix * ld_in;
+  auto f = utils::mapping<T>{};
+  for (uint32_t col_ix = i; col_ix < n_cols; col_ix += warp::Value) {
+    auto x = f(in[col_ix]);
+    __syncwarp();
+    out[col_ix] = x;
   }
 }
 
-template <uint32_t BlockDim, uint32_t PqBits>
-__launch_bounds__(BlockDim) __global__
-  void ivfpq_encode_kernel(uint32_t pq_dim,
-                           const uint32_t* label,  // [pq_dim, n_rows]
-                           device_mdspan<uint8_t, pq_codes_exts, row_major> output  // [n_rows, ..]
-  )
-{
-  uint32_t i = threadIdx.x + BlockDim * blockIdx.x;
-  if (i >= output.extent(0)) return;
-  ivfpq_encode_core<PqBits, output.static_extent(2)>(
-    output.extent(0),
-    pq_dim,
-    label + i,
-    output.data_handle() + output.extent(1) * output.extent(2) * i);
-}
-}  // namespace
-
 /**
- * Compress the cluster labels into an encoding with pq_bits bits, and transform it into a form to
- * facilitate vectorized loads
+ * Copy the data one warp-per-row:
+ *
+ *  1. load the data per-warp
+ *  2. apply the `utils::mapping<T>{}`
+ *  3. sync within warp
+ *  4. store the data.
+ *
+ * Assuming sizeof(T) >= sizeof(S) and the data is properly aligned (see the usage in `build`), this
+ * makes it possible to re-structure the data within rows in-place.
  */
-inline void ivfpq_encode(uint32_t pq_dim,
-                         uint32_t pq_bits,       // 4 <= pq_bits <= 8
-                         const uint32_t* label,  // [pq_dim, n_rows]
-                         device_mdspan<uint8_t, pq_codes_exts, row_major> output,  // [n_rows, ..]
-                         rmm::cuda_stream_view stream)
+template <typename T, typename S>
+void copy_warped(T* out,
+                 uint32_t ld_out,
+                 const S* in,
+                 uint32_t ld_in,
+                 uint32_t n_cols,
+                 size_t n_rows,
+                 rmm::cuda_stream_view stream)
 {
   constexpr uint32_t kBlockDim = 128;
   dim3 threads(kBlockDim, 1, 1);
-  dim3 blocks(raft::ceildiv<uint32_t>(output.extent(0), kBlockDim), 1, 1);
-  switch (pq_bits) {
-    case 4:
-      return ivfpq_encode_kernel<kBlockDim, 4>
-        <<<blocks, threads, 0, stream>>>(pq_dim, label, output);
-    case 5:
-      return ivfpq_encode_kernel<kBlockDim, 5>
-        <<<blocks, threads, 0, stream>>>(pq_dim, label, output);
-    case 6:
-      return ivfpq_encode_kernel<kBlockDim, 6>
-        <<<blocks, threads, 0, stream>>>(pq_dim, label, output);
-    case 7:
-      return ivfpq_encode_kernel<kBlockDim, 7>
-        <<<blocks, threads, 0, stream>>>(pq_dim, label, output);
-    case 8:
-      return ivfpq_encode_kernel<kBlockDim, 8>
-        <<<blocks, threads, 0, stream>>>(pq_dim, label, output);
-    default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
-  }
+  dim3 blocks(div_rounding_up_safe<size_t>(n_rows, kBlockDim / WarpSize), 1, 1);
+  copy_warped_kernel<kBlockDim, T, S>
+    <<<blocks, threads, 0, stream>>>(out, ld_out, in, ld_in, n_cols, n_rows);
 }
 
+}  // namespace
+
 /**
  * @brief Fill-in a random orthogonal transformation matrix.
  *
@@ -199,7 +184,7 @@ inline void ivfpq_encode(uint32_t pq_dim,
  * @param[out] rotation_matrix device pointer to a row-major matrix of size [n_rows, n_cols].
  * @param rng random number generator state
  */
-inline void make_rotation_matrix(const handle_t& handle,
+inline void make_rotation_matrix(raft::device_resources const& handle,
                                  bool force_random_rotation,
                                  uint32_t n_rows,
                                  uint32_t n_cols,
@@ -228,8 +213,11 @@ inline void make_rotation_matrix(const handle_t& handle,
     }
   } else {
     uint32_t stride = n + 1;
-    auto f = [stride] __device__(float* out, uint32_t i) -> void { *out = float(i % stride == 0); };
-    linalg::writeOnlyUnaryOp(rotation_matrix, n * n, f, stream);
+    auto rotation_matrix_view =
+      raft::make_device_vector_view<float, uint32_t>(rotation_matrix, n * n);
+    linalg::map_offset(handle, rotation_matrix_view, [stride] __device__(uint32_t i) {
+      return static_cast<float>(i % stride == 0u);
+    });
   }
 }
 
@@ -240,7 +228,7 @@ inline void make_rotation_matrix(const handle_t& handle,
  *
  */
 template <typename T, typename IdxT>
-void select_residuals(const handle_t& handle,
+void select_residuals(raft::device_resources const& handle,
                       float* residuals,
                       IdxT n_rows,
                       uint32_t dim,
@@ -255,18 +243,14 @@ void select_residuals(const handle_t& handle,
 {
   auto stream = handle.get_stream();
   rmm::device_uvector<float> tmp(size_t(n_rows) * size_t(dim), stream, device_memory);
-  utils::copy_selected<float, T>(
-    n_rows, (IdxT)dim, dataset, row_ids, (IdxT)dim, tmp.data(), (IdxT)dim, stream);
+  // Note: the number of rows of the input dataset isn't actually n_rows, but matrix::gather doesn't
+  // need to know it, any strictly positive number would work.
+  cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(
+    dataset, utils::mapping<float>{});
+  raft::matrix::gather(mapping_itr, (IdxT)dim, n_rows, row_ids, n_rows, tmp.data(), stream);
 
   raft::matrix::linewiseOp(
-    tmp.data(),
-    tmp.data(),
-    IdxT(dim),
-    n_rows,
-    true,
-    [] __device__(float a, float b) { return a - b; },
-    stream,
-    center);
+    tmp.data(), tmp.data(), IdxT(dim), n_rows, true, raft::sub_op{}, stream, center);
 
   float alpha = 1.0;
   float beta  = 0.0;
@@ -288,166 +272,52 @@ void select_residuals(const handle_t& handle,
 }
 
 /**
+ * @brief Compute residual vectors from the source dataset given by selected indices.
+ *
+ * The residual has the form
+ *  `rotation_matrix %* (dataset[:, :] - centers[labels[:], 0:dim])`
  *
- * @param handle,
- * @param n_rows
- * @param data_dim
- * @param rot_dim
- * @param pq_dim
- * @param pq_len
- * @param pq_bits
- * @param n_clusters
- * @param codebook_kind
- * @param max_cluster_size
- * @param cluster_centers           // [n_clusters, data_dim]
- * @param rotation_matrix     // [rot_dim, data_dim]
- * @param dataset                 // [n_rows]
- * @param data_indices
- *    tells which indices to select in the dataset for each cluster [n_rows];
- *    it should be partitioned by the clusters by now.
- * @param cluster_sizes    // [n_clusters]
- * @param cluster_offsets  // [n_clusters + 1]
- * @param pq_centers  // [...] (see ivf_pq::index::pq_centers() layout)
- * @param pq_dataset
- *   // [n_rows, ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits), kIndexGroupVecLen]
- *   NB: in contrast to the final interleaved layout in ivf_pq::index::pq_dataset(), this function
- *       produces a non-interleaved data; it gets interleaved later when adding the data to the
- *       index.
- * @param device_memory
  */
 template <typename T, typename IdxT>
-void compute_pq_codes(
-  const handle_t& handle,
+void flat_compute_residuals(
+  raft::device_resources const& handle,
+  float* residuals,  // [n_rows, rot_dim]
   IdxT n_rows,
-  uint32_t data_dim,
-  uint32_t rot_dim,
-  uint32_t pq_dim,
-  uint32_t pq_len,
-  uint32_t pq_bits,
-  uint32_t n_clusters,
-  codebook_gen codebook_kind,
-  uint32_t max_cluster_size,
-  float* cluster_centers,
-  const float* rotation_matrix,
-  const T* dataset,
-  const IdxT* data_indices,
-  const uint32_t* cluster_sizes,
-  const IdxT* cluster_offsets,
-  device_mdspan<const float, typename index<IdxT>::pq_centers_extents, row_major> pq_centers,
-  device_mdspan<uint8_t, pq_codes_exts, row_major> pq_dataset,
+  device_mdspan<const float, extent_2d<uint32_t>, row_major> rotation_matrix,  // [rot_dim, dim]
+  device_mdspan<const float, extent_2d<uint32_t>, row_major> centers,          // [n_lists, dim_ext]
+  const T* dataset,                                                            // [n_rows, dim]
+  const uint32_t* labels,                                                      // [n_rows]
   rmm::mr::device_memory_resource* device_memory)
 {
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "ivf_pq::compute_pq_codes(n_rows = %zu, data_dim = %u, rot_dim = %u (%u * %u), n_clusters = "
-    "%u)",
-    size_t(n_rows),
-    data_dim,
-    rot_dim,
-    pq_dim,
-    pq_len,
-    n_clusters);
-  auto stream = handle.get_stream();
-
-  //
-  // Compute PQ code
-  //
-
-  uint32_t pq_width = 1 << pq_bits;
-  rmm::device_uvector<float> pq_centers_tmp(pq_len * pq_width, stream, device_memory);
-  rmm::device_uvector<float> rot_vectors(
-    size_t(max_cluster_size) * size_t(rot_dim), stream, device_memory);
-  rmm::device_uvector<float> sub_vectors(
-    size_t(max_cluster_size) * size_t(pq_dim * pq_len), stream, device_memory);
-  rmm::device_uvector<uint32_t> sub_vector_labels(
-    size_t(max_cluster_size) * size_t(pq_dim), stream, device_memory);
-
-  for (uint32_t l = 0; l < n_clusters; l++) {
-    auto cluster_size = cluster_sizes[l];
-    common::nvtx::range<common::nvtx::domain::raft> cluster_scope(
-      "ivf_pq::compute_pq_codes::cluster[%u](size = %u)", l, cluster_size);
-    if (cluster_size == 0) continue;
-
-    select_residuals(handle,
-                     rot_vectors.data(),
-                     IdxT(cluster_size),
-                     data_dim,
-                     rot_dim,
-                     rotation_matrix,
-                     cluster_centers + size_t(l) * size_t(data_dim),
-                     dataset,
-                     data_indices + cluster_offsets[l],
-                     device_memory);
-
-    //
-    // Change the order of the vector data to facilitate processing in
-    // each vector subspace.
-    //   input:  rot_vectors[cluster_size, rot_dim] = [cluster_size, pq_dim, pq_len]
-    //   output: sub_vectors[pq_dim, cluster_size, pq_len]
-    //
-    for (uint32_t i = 0; i < pq_dim; i++) {
-      RAFT_CUDA_TRY(
-        cudaMemcpy2DAsync(sub_vectors.data() + size_t(i) * size_t(pq_len) * size_t(cluster_size),
-                          sizeof(float) * pq_len,
-                          rot_vectors.data() + i * pq_len,
-                          sizeof(float) * rot_dim,
-                          sizeof(float) * pq_len,
-                          cluster_size,
-                          cudaMemcpyDefault,
-                          stream));
-    }
-
-    if (codebook_kind == codebook_gen::PER_CLUSTER) {
-      linalg::writeOnlyUnaryOp(
-        pq_centers_tmp.data(),
-        pq_len * pq_width,
-        [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) {
-          auto i0 = i / pq_len;
-          auto i1 = i % pq_len;
-          *out    = pq_centers(l, i1, i0);
-        },
-        stream);
-    }
-
-    //
-    // Find a label (cluster ID) for each vector subspace.
-    //
-    for (uint32_t j = 0; j < pq_dim; j++) {
-      if (codebook_kind == codebook_gen::PER_SUBSPACE) {
-        linalg::writeOnlyUnaryOp(
-          pq_centers_tmp.data(),
-          pq_len * pq_width,
-          [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) {
-            auto i0 = i / pq_len;
-            auto i1 = i % pq_len;
-            *out    = pq_centers(j, i1, i0);
-          },
-          stream);
-      }
-      kmeans::predict(handle,
-                      pq_centers_tmp.data(),
-                      pq_width,
-                      pq_len,
-                      sub_vectors.data() + size_t(j) * size_t(cluster_size) * size_t(pq_len),
-                      cluster_size,
-                      sub_vector_labels.data() + size_t(j) * size_t(cluster_size),
-                      raft::distance::DistanceType::L2Expanded,
-                      stream,
-                      device_memory);
-    }
-
-    //
-    // PQ encoding
-    //
-    ivfpq_encode(
-      pq_dim,
-      pq_bits,
-      sub_vector_labels.data(),
-      make_mdspan<uint8_t, IdxT, row_major, false, true>(
-        pq_dataset.data_handle() +
-          size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2),
-        make_extents<IdxT>(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))),
-      stream);
-  }
+  auto stream  = handle.get_stream();
+  auto dim     = rotation_matrix.extent(1);
+  auto rot_dim = rotation_matrix.extent(0);
+  rmm::device_uvector<float> tmp(n_rows * dim, stream, device_memory);
+  auto tmp_view = raft::make_device_vector_view<float, IdxT>(tmp.data(), tmp.size());
+  linalg::map_offset(handle, tmp_view, [centers, dataset, labels, dim] __device__(size_t i) {
+    auto row_ix = i / dim;
+    auto el_ix  = i % dim;
+    auto label  = labels[row_ix];
+    return utils::mapping<float>{}(dataset[i]) - centers(label, el_ix);
+  });
+
+  float alpha = 1.0f;
+  float beta  = 0.0f;
+  linalg::gemm(handle,
+               true,
+               false,
+               rot_dim,
+               n_rows,
+               dim,
+               &alpha,
+               rotation_matrix.data_handle(),
+               dim,
+               tmp.data(),
+               dim,
+               &beta,
+               residuals,
+               rot_dim,
+               stream);
 }
 
 template <uint32_t BlockDim, typename IdxT>
@@ -487,7 +357,7 @@ auto calculate_offsets_and_indices(IdxT n_rows,
   IdxT cumsum = 0;
   update_device(cluster_offsets, &cumsum, 1, stream);
   thrust::inclusive_scan(
-    exec_policy, cluster_sizes, cluster_sizes + n_lists, cluster_offsets + 1, thrust::plus<IdxT>{});
+    exec_policy, cluster_sizes, cluster_sizes + n_lists, cluster_offsets + 1, add_op{});
   update_host(&cumsum, cluster_offsets + n_lists, 1, stream);
   uint32_t max_cluster_size =
     *thrust::max_element(exec_policy, cluster_sizes, cluster_sizes + n_lists);
@@ -505,33 +375,32 @@ auto calculate_offsets_and_indices(IdxT n_rows,
 }
 
 template <typename IdxT>
-void transpose_pq_centers(index<IdxT>& index,
-                          const float* pq_centers_source,
-                          rmm::cuda_stream_view stream)
+void transpose_pq_centers(const device_resources& handle,
+                          index<IdxT>& index,
+                          const float* pq_centers_source)
 {
+  auto stream  = handle.get_stream();
   auto extents = index.pq_centers().extents();
   static_assert(extents.rank() == 3);
   auto extents_source =
     make_extents<uint32_t>(extents.extent(0), extents.extent(2), extents.extent(1));
   auto span_source =
     make_mdspan<const float, uint32_t, row_major, false, true>(pq_centers_source, extents_source);
-  linalg::writeOnlyUnaryOp(
-    index.pq_centers().data_handle(),
-    index.pq_centers().size(),
-    [span_source, extents] __device__(float* out, size_t i) {
-      uint32_t ii[3];
-      for (int r = 2; r > 0; r--) {
-        ii[r] = i % extents.extent(r);
-        i /= extents.extent(r);
-      }
-      ii[0] = i;
-      *out  = span_source(ii[0], ii[2], ii[1]);
-    },
-    stream);
+  auto pq_centers_view = raft::make_device_vector_view<float, IdxT>(
+    index.pq_centers().data_handle(), index.pq_centers().size());
+  linalg::map_offset(handle, pq_centers_view, [span_source, extents] __device__(size_t i) {
+    uint32_t ii[3];
+    for (int r = 2; r > 0; r--) {
+      ii[r] = i % extents.extent(r);
+      i /= extents.extent(r);
+    }
+    ii[0] = i;
+    return span_source(ii[0], ii[2], ii[1]);
+  });
 }
 
 template <typename IdxT>
-void train_per_subset(const handle_t& handle,
+void train_per_subset(raft::device_resources const& handle,
                       index<IdxT>& index,
                       size_t n_rows,
                       const float* trainset,   // [n_rows, dim]
@@ -583,25 +452,35 @@ void train_per_subset(const handle_t& handle,
                  index.pq_len(),
                  stream);
 
+    // clone the handle and attach the device memory resource to it
+    const device_resources new_handle(handle, device_memory);
+
     // train PQ codebook for this subspace
-    kmeans::build_clusters(handle,
-                           kmeans_n_iters,
-                           index.pq_len(),
-                           sub_trainset.data(),
-                           n_rows,
-                           index.pq_book_size(),
-                           pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
-                           sub_labels.data(),
-                           pq_cluster_sizes.data(),
-                           raft::distance::DistanceType::L2Expanded,
-                           stream,
-                           device_memory);
+    auto sub_trainset_view =
+      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
+    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+      pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
+      index.pq_book_size(),
+      index.pq_len());
+    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
+    auto cluster_sizes_view =
+      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+    raft::cluster::kmeans_balanced_params kmeans_params;
+    kmeans_params.n_iters = kmeans_n_iters;
+    kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+    raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle,
+                                                            kmeans_params,
+                                                            sub_trainset_view,
+                                                            centers_tmp_view,
+                                                            sub_labels_view,
+                                                            cluster_sizes_view,
+                                                            utils::mapping<float>{});
   }
-  transpose_pq_centers(index, pq_centers_tmp.data(), stream);
+  transpose_pq_centers(handle, index, pq_centers_tmp.data());
 }
 
 template <typename IdxT>
-void train_per_cluster(const handle_t& handle,
+void train_per_cluster(raft::device_resources const& handle,
                        index<IdxT>& index,
                        size_t n_rows,
                        const float* trainset,   // [n_rows, dim]
@@ -654,44 +533,427 @@ void train_per_cluster(const handle_t& handle,
                      indices + cluster_offsets[l],
                      device_memory);
 
+    // clone the handle and attach the device memory resource to it
+    const device_resources new_handle(handle, device_memory);
+
     // limit the cluster size to bound the training time.
     // [sic] we interpret the data as pq_len-dimensional
     size_t big_enough     = 256ul * std::max<size_t>(index.pq_book_size(), index.pq_dim());
     size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
     auto pq_n_rows        = uint32_t(std::min(big_enough, available_rows));
     // train PQ codebook for this cluster
-    kmeans::build_clusters(
-      handle,
-      kmeans_n_iters,
-      index.pq_len(),
-      rot_vectors.data(),
-      pq_n_rows,
+    auto rot_vectors_view = raft::make_device_matrix_view<const float, IdxT>(
+      rot_vectors.data(), pq_n_rows, index.pq_len());
+    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+      pq_centers_tmp.data() + static_cast<size_t>(index.pq_book_size()) *
+                                static_cast<size_t>(index.pq_len()) * static_cast<size_t>(l),
       index.pq_book_size(),
-      pq_centers_tmp.data() + size_t(index.pq_book_size()) * size_t(index.pq_len()) * size_t(l),
-      pq_labels.data(),
-      pq_cluster_sizes.data(),
-      raft::distance::DistanceType::L2Expanded,
-      stream,
-      device_memory);
+      index.pq_len());
+    auto pq_labels_view =
+      raft::make_device_vector_view<uint32_t, IdxT>(pq_labels.data(), pq_n_rows);
+    auto pq_cluster_sizes_view =
+      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+    raft::cluster::kmeans_balanced_params kmeans_params;
+    kmeans_params.n_iters = kmeans_n_iters;
+    kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+    raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle,
+                                                            kmeans_params,
+                                                            rot_vectors_view,
+                                                            centers_tmp_view,
+                                                            pq_labels_view,
+                                                            pq_cluster_sizes_view,
+                                                            utils::mapping<float>{});
   }
-  transpose_pq_centers(index, pq_centers_tmp.data(), stream);
+  transpose_pq_centers(handle, index, pq_centers_tmp.data());
+}
+
+/**
+ * Sort clusters by their size (descending).
+ *
+ * @return Number of non-empty clusters
+ */
+inline auto reorder_clusters_by_size_desc(raft::device_resources const& handle,
+                                          uint32_t* ordering,
+                                          uint32_t* cluster_sizes_out,
+                                          const uint32_t* cluster_sizes_in,
+                                          uint32_t n_clusters,
+                                          rmm::mr::device_memory_resource* device_memory)
+  -> uint32_t
+{
+  auto stream = handle.get_stream();
+  rmm::device_uvector<uint32_t> cluster_ordering_in(n_clusters, stream, device_memory);
+  thrust::sequence(handle.get_thrust_policy(),
+                   cluster_ordering_in.data(),
+                   cluster_ordering_in.data() + n_clusters);
+
+  int begin_bit             = 0;
+  int end_bit               = sizeof(uint32_t) * 8;
+  size_t cub_workspace_size = 0;
+  cub::DeviceRadixSort::SortPairsDescending(nullptr,
+                                            cub_workspace_size,
+                                            cluster_sizes_in,
+                                            cluster_sizes_out,
+                                            cluster_ordering_in.data(),
+                                            ordering,
+                                            n_clusters,
+                                            begin_bit,
+                                            end_bit,
+                                            stream);
+  rmm::device_buffer cub_workspace(cub_workspace_size, stream, device_memory);
+  cub::DeviceRadixSort::SortPairsDescending(cub_workspace.data(),
+                                            cub_workspace_size,
+                                            cluster_sizes_in,
+                                            cluster_sizes_out,
+                                            cluster_ordering_in.data(),
+                                            ordering,
+                                            n_clusters,
+                                            begin_bit,
+                                            end_bit,
+                                            stream);
+
+  return thrust::lower_bound(handle.get_thrust_policy(),
+                             cluster_sizes_out,
+                             cluster_sizes_out + n_clusters,
+                             0,
+                             thrust::greater<uint32_t>()) -
+         cluster_sizes_out;
 }
 
 /**
- * See raft::spatial::knn::ivf_pq::extend docs.
+ * Compute the code: find the closest cluster in each pq_dim-subspace.
  *
- * This version requires `new_vectors` and `new_indices` (if non-null) to be on-device.
+ * @tparam SubWarpSize
+ *   how many threads work on a single vector;
+ *   bounded by either WarpSize or pq_book_size.
+ *
+ * @param pq_centers
+ *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
+ *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_len, pq_book_size]
+ * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len].
+ *   the input must be already transformed to floats, rotated, and the level 1 cluster
+ *   center must be already subtracted (i.e. this is the residual of a single input vector).
+ * @param codebook_kind
+ * @param j index along pq_dim "dimension"
+ * @param cluster_ix is used for PER_CLUSTER codebooks.
+ */
+template <uint32_t SubWarpSize>
+__device__ auto compute_pq_code(
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  device_mdspan<const float, extent_2d<uint32_t>, row_major> new_vector,
+  codebook_gen codebook_kind,
+  uint32_t j,
+  uint32_t cluster_ix) -> uint8_t
+{
+  using subwarp_align = Pow2<SubWarpSize>;
+  uint32_t lane_id    = subwarp_align::mod(laneId());
+  uint32_t partition_ix;
+  switch (codebook_kind) {
+    case codebook_gen::PER_CLUSTER: {
+      partition_ix = cluster_ix;
+    } break;
+    case codebook_gen::PER_SUBSPACE: {
+      partition_ix = j;
+    } break;
+    default: __builtin_unreachable();
+  }
+
+  const uint32_t pq_book_size = pq_centers.extent(2);
+  const uint32_t pq_len       = pq_centers.extent(1);
+  float min_dist              = std::numeric_limits<float>::infinity();
+  uint8_t code                = 0;
+  // calculate the distance for each PQ cluster, find the minimum for each thread
+  for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) {
+    // NB: the L2 quantizers on residuals are always trained on L2 metric.
+    float d = 0.0f;
+    for (uint32_t k = 0; k < pq_len; k++) {
+      auto t = new_vector(j, k) - pq_centers(partition_ix, k, i);
+      d += t * t;
+    }
+    if (d < min_dist) {
+      min_dist = d;
+      code     = uint8_t(i);
+    }
+  }
+  // reduce among threads
+#pragma unroll
+  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+    const auto other_code = shfl_xor(code, stride, SubWarpSize);
+    if (other_dist < min_dist) {
+      min_dist = other_dist;
+      code     = other_code;
+    }
+  }
+  return code;
+}
+
+template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
+__launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
+  device_mdspan<const float, extent_2d<IdxT>, row_major> new_vectors,
+  std::variant<IdxT, const IdxT*> src_offset_or_indices,
+  const uint32_t* new_labels,
+  device_mdspan<uint32_t, extent_1d<uint32_t>, row_major> list_sizes,
+  device_mdspan<const IdxT, extent_1d<uint32_t>, row_major> list_offsets,
+  device_mdspan<IdxT, extent_1d<IdxT>, row_major> pq_indices,
+  device_mdspan<pq_vec_t, pq_int_vec_exts, row_major> pq_dataset,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  using subwarp_align             = Pow2<kSubWarpSize>;
+  const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x});
+  if (row_ix >= new_vectors.extent(0)) { return; }
+
+  const uint32_t cluster_ix = new_labels[row_ix];
+  uint32_t out_incluster_ix;
+  if (lane_id == 0) { out_incluster_ix = atomicAdd(&list_sizes(cluster_ix), 1); }
+  out_incluster_ix  = shfl(out_incluster_ix, 0, kSubWarpSize);
+  const IdxT out_ix = list_offsets(cluster_ix) + out_incluster_ix;
+
+  // write the label
+  if (lane_id == 0) {
+    if (std::holds_alternative<IdxT>(src_offset_or_indices)) {
+      pq_indices(out_ix) = std::get<IdxT>(src_offset_or_indices) + row_ix;
+    } else {
+      pq_indices(out_ix) = std::get<const IdxT*>(src_offset_or_indices)[row_ix];
+    }
+  }
+
+  // write the codes
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
+  const uint32_t pq_len     = pq_centers.extent(1);
+  const uint32_t pq_dim     = new_vectors.extent(1) / pq_len;
+
+  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
+  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
+  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk for writing
+    if (lane_id == 0) { code = pq_vec_t{}; }
+    // fill-in the values, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // find the label
+      using layout_t   = typename decltype(new_vectors)::layout_type;
+      using accessor_t = typename decltype(new_vectors)::accessor_type;
+      auto one_vector  = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
+        &new_vectors(row_ix, 0), extent_2d<uint32_t>{pq_dim, pq_len});
+      auto l = compute_pq_code<kSubWarpSize>(pq_centers, one_vector, codebook_kind, j, cluster_ix);
+      if (lane_id == 0) { out[k] = l; }
+    }
+    // write the chunk into the dataset
+    if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; }
+  }
+}
+
+/**
+ * Assuming the index already has some data and allocated the space for more, write more data in it.
+ * There must be enough free space in `pq_dataset()` and `indices()`, as computed using
+ * `list_offsets()` and `list_sizes()`.
+ *
+ * NB: Since the pq_dataset is stored in the interleaved blocked format (see ivf_pq_types.hpp), one
+ * cannot just concatenate the old and the new codes; the positions for the codes are determined the
+ * same way as in the ivfpq_compute_similarity_kernel (see ivf_pq_search.cuh).
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param handle
+ * @param index
+ * @param[in] new_vectors
+ *    a pointer to a row-major device array [n_rows, index.dim()];
+ * @param[in] src_offset_or_indices
+ *    references for the new data:
+ *      either a starting index for the auto-indexing
+ *      or a pointer to a device array of explicit indices [n_rows];
+ * @param[in] new_labels
+ *    cluster ids (first-level quantization) - a device array [n_rows];
+ * @param n_rows
+ *    the number of records to write in.
+ * @param mr
+ *    a memory resource to use for device allocations
  */
 template <typename T, typename IdxT>
-inline auto extend_device(const handle_t& handle,
-                          const index<IdxT>& orig_index,
-                          const T* new_vectors,
-                          const IdxT* new_indices,
-                          IdxT n_rows) -> index<IdxT>
+void process_and_fill_codes(raft::device_resources const& handle,
+                            index<IdxT>& index,
+                            const T* new_vectors,
+                            std::variant<IdxT, const IdxT*> src_offset_or_indices,
+                            const uint32_t* new_labels,
+                            IdxT n_rows,
+                            rmm::mr::device_memory_resource* mr)
+{
+  if (n_rows == 0) { return; }  // nothing to encode; a zero-sized grid is an invalid launch
+  pq_int_vec_exts pq_extents = make_extents<size_t>(index.pq_dataset().extent(0),
+                                                    index.pq_dataset().extent(1),
+                                                    index.pq_dataset().static_extent(2));
+  auto pq_dataset            = make_mdspan<pq_vec_t, size_t, row_major, false, true>(
+    reinterpret_cast<pq_vec_t*>(index.pq_dataset().data_handle()), pq_extents);
+  // Scratch [n_rows, rot_dim] buffer: filled by flat_compute_residuals, read by the kernel.
+  auto new_vectors_residual =
+    make_device_mdarray<float>(handle, mr, make_extents<IdxT>(n_rows, index.rot_dim()));
+  flat_compute_residuals(handle,
+                         new_vectors_residual.data_handle(),
+                         n_rows,
+                         index.rotation_matrix(),
+                         index.centers(),
+                         new_vectors,
+                         new_labels,
+                         mr);
+  // min(WarpSize, pq_book_size) threads cooperate on each vector; size the grid to match.
+  constexpr uint32_t kBlockSize  = 256;
+  const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index.pq_book_size());
+  dim3 blocks(div_rounding_up_safe<IdxT>(n_rows, kBlockSize / threads_per_vec), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return process_and_fill_codes_kernel<kBlockSize, 4, IdxT>;
+      case 5: return process_and_fill_codes_kernel<kBlockSize, 5, IdxT>;
+      case 6: return process_and_fill_codes_kernel<kBlockSize, 6, IdxT>;
+      case 7: return process_and_fill_codes_kernel<kBlockSize, 7, IdxT>;
+      case 8: return process_and_fill_codes_kernel<kBlockSize, 8, IdxT>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index.pq_bits());
+  kernel<<<blocks, threads, 0, handle.get_stream()>>>(new_vectors_residual.view(),
+                                                      src_offset_or_indices,
+                                                      new_labels,
+                                                      index.list_sizes(),
+                                                      index.list_offsets(),
+                                                      index.indices(),
+                                                      pq_dataset,
+                                                      index.pq_centers(),
+                                                      index.codebook_kind());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * Fill the `target` index with the data from the `source`, except `list_offsets`.
+ * The `target` index must have the same settings and valid `list_offsets`, and must have been
+ * pre-allocated to fit the whole `source` data (checked here: size, n_lists, rot_dim, dim_ext).
+ * As a result, the `target` index is in a valid state; it's identical to the `source`, except
+ * that its clusters follow `cluster_ordering` and it may have more unused space in `pq_dataset`.
+ *
+ * @param target the index to be filled-in; its `list_offsets` are only read
+ * @param source the index to get data from
+ * @param cluster_ordering
+ *   a pointer to the managed data [n_clusters], read on the host and passed to `matrix::gather`;
+ *   the mapping `source_label = cluster_ordering[target_label]`
+ * @param stream the CUDA stream for all copies; synchronized once if `source` is non-empty
+ */
+template <typename IdxT>
+void copy_index_data(index<IdxT>& target,
+                     const index<IdxT>& source,
+                     const uint32_t* cluster_ordering,
+                     rmm::cuda_stream_view stream)
+{
+  RAFT_EXPECTS(target.size() >= source.size(),
+               "The target index must be not smaller than the source index.");
+  RAFT_EXPECTS(target.n_lists() == source.n_lists(),
+               "The target and the source are not compatible (different numbers of clusters).");
+  RAFT_EXPECTS(target.rot_dim() == source.rot_dim() && target.dim_ext() == source.dim_ext(),
+               "The target and the source are not compatible (different dimensionality).");
+
+  // Copy the parts that do not depend on the cluster ordering
+  copy(target.rotation_matrix().data_handle(),
+       source.rotation_matrix().data_handle(),
+       source.rotation_matrix().size(),
+       stream);
+
+  // Gather the per-cluster data (sizes, centers, rotated centers) via `cluster_ordering`
+  raft::matrix::gather(source.list_sizes().data_handle(),
+                       IdxT{1},
+                       static_cast<IdxT>(source.n_lists()),
+                       cluster_ordering,
+                       static_cast<IdxT>(target.n_lists()),
+                       target.list_sizes().data_handle(),
+                       stream);
+  raft::matrix::gather(source.centers().data_handle(),
+                       static_cast<IdxT>(target.dim_ext()),
+                       static_cast<IdxT>(source.n_lists()),
+                       cluster_ordering,
+                       static_cast<IdxT>(target.n_lists()),
+                       target.centers().data_handle(),
+                       stream);
+  raft::matrix::gather(source.centers_rot().data_handle(),
+                       static_cast<IdxT>(target.rot_dim()),
+                       static_cast<IdxT>(source.n_lists()),
+                       cluster_ordering,
+                       static_cast<IdxT>(target.n_lists()),
+                       target.centers_rot().data_handle(),
+                       stream);
+  switch (source.codebook_kind()) {
+    case codebook_gen::PER_SUBSPACE: {  // copied as-is, no reordering
+      copy(target.pq_centers().data_handle(),
+           source.pq_centers().data_handle(),
+           source.pq_centers().size(),
+           stream);
+    } break;
+    case codebook_gen::PER_CLUSTER: {  // one row of pq_book_size * pq_len values per cluster
+      raft::matrix::gather(source.pq_centers().data_handle(),
+                           static_cast<IdxT>(source.pq_book_size() * source.pq_len()),
+                           static_cast<IdxT>(source.n_lists()),
+                           cluster_ordering,
+                           static_cast<IdxT>(target.n_lists()),
+                           target.pq_centers().data_handle(),
+                           stream);
+    } break;
+    default: RAFT_FAIL("Unreachable code");
+  }
+
+  // Copy the source clusters' indices and PQ codes to their new offsets in the target.
+  if (source.size() > 0) {
+    std::vector<IdxT> target_cluster_offsets(target.n_lists() + 1);
+    std::vector<IdxT> source_cluster_offsets(target.n_lists() + 1);
+    std::vector<uint32_t> source_cluster_sizes(target.n_lists());
+    copy(target_cluster_offsets.data(),
+         target.list_offsets().data_handle(),
+         target.list_offsets().size(),
+         stream);
+    copy(source_cluster_offsets.data(),
+         source.list_offsets().data_handle(),
+         source.list_offsets().size(),
+         stream);
+    copy(source_cluster_sizes.data(),
+         source.list_sizes().data_handle(),
+         source.list_sizes().size(),
+         stream);
+    stream.synchronize();  // the host-side vectors above must be complete before they are read
+    auto data_exts = target.pq_dataset().extents();  // data_unit: dataset elements per vector
+    auto data_unit = size_t(data_exts.extent(3)) * size_t(data_exts.extent(1));
+    auto data_mod  = size_t(data_exts.extent(2));  // copied sizes are rounded up to this
+    for (uint32_t l = 0; l < target.n_lists(); l++) {
+      auto k                   = cluster_ordering[l];
+      auto source_cluster_size = source_cluster_sizes[k];
+      if (source_cluster_size > 0) {
+        copy(target.indices().data_handle() + target_cluster_offsets[l],
+             source.indices().data_handle() + source_cluster_offsets[k],
+             source_cluster_size,
+             stream);
+        copy(target.pq_dataset().data_handle() + target_cluster_offsets[l] * data_unit,
+             source.pq_dataset().data_handle() + source_cluster_offsets[k] * data_unit,
+             round_up_safe<size_t>(source_cluster_size, data_mod) * data_unit,
+             stream);
+      }
+    }
+  }
+}
+
+/** See raft::spatial::knn::ivf_pq::extend docs */
+template <typename T, typename IdxT>
+auto extend(raft::device_resources const& handle,
+            const index<IdxT>& orig_index,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) -> index<IdxT>
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim());
-  auto stream = handle.get_stream();
+  auto stream           = handle.get_stream();
+  const auto n_clusters = orig_index.n_lists();
 
   RAFT_EXPECTS(new_indices != nullptr || orig_index.size() == 0,
                "You must pass data indices when the index is non-empty.");
@@ -699,13 +961,6 @@ inline auto extend_device(const handle_t& handle,
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, uint8_t> || std::is_same_v<T, int8_t>,
                 "Unsupported data type");
 
-  switch (new_indices != nullptr ? utils::check_pointer_residency(new_vectors, new_indices)
-                                 : utils::check_pointer_residency(new_vectors)) {
-    case utils::pointer_residency::device_only:
-    case utils::pointer_residency::host_and_device: break;
-    default: RAFT_FAIL("[ivf_pq::extend_device] The added data must be available on device.");
-  }
-
   rmm::mr::device_memory_resource* device_memory = nullptr;
   auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024);
   if (pool_guard) {
@@ -717,154 +972,138 @@ inline auto extend_device(const handle_t& handle,
   rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource> managed_memory(
     &managed_memory_upstream, 1024 * 1024);
 
-  //
-  // The cluster_centers stored in index contain data other than cluster
-  // centroids to speed up the search. Here, only the cluster centroids
-  // are extracted.
-  //
-  const auto n_clusters = orig_index.n_lists();
+  // Try to allocate an index with the same parameters and the projected new size
+  // (which can be slightly larger than index.size() + n_rows, due to padding).
+  // If this fails, the index would be too big to fit in the device anyway.
+  std::optional<index<IdxT>> placeholder_index(std::in_place_t{},
+                                               handle,
+                                               orig_index.metric(),
+                                               orig_index.codebook_kind(),
+                                               n_clusters,
+                                               orig_index.dim(),
+                                               orig_index.pq_bits(),
+                                               orig_index.pq_dim(),
+                                               orig_index.n_nonempty_lists());
+  placeholder_index->allocate(
+    handle,
+    orig_index.size() + n_rows + (kIndexGroupSize - 1) * std::min<IdxT>(n_clusters, n_rows));
+
+  // Available device memory
+  size_t free_mem, total_mem;
+  RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem));
+
+  // Decide on an approximate threshold when we'd better start saving device memory by using
+  // managed allocations for large device buffers
+  rmm::mr::device_memory_resource* labels_mr  = device_memory;
+  rmm::mr::device_memory_resource* batches_mr = device_memory;
+  if (n_rows *
+        (orig_index.dim() * sizeof(T) + orig_index.pq_dim() + sizeof(IdxT) + sizeof(uint32_t)) >
+      free_mem) {
+    labels_mr = &managed_memory;
+  }
+  // Allocate a buffer for the new labels (classifying the new data)
+  rmm::device_uvector<uint32_t> new_data_labels(n_rows, stream, labels_mr);
+  if (labels_mr == device_memory) { free_mem -= sizeof(uint32_t) * n_rows; }
 
-  rmm::device_uvector<float> cluster_centers(
-    size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory);
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(),
-                                  sizeof(float) * orig_index.dim(),
-                                  orig_index.centers().data_handle(),
-                                  sizeof(float) * orig_index.dim_ext(),
-                                  sizeof(float) * orig_index.dim(),
-                                  n_clusters,
-                                  cudaMemcpyDefault,
-                                  stream));
-
-  //
-  // Use the existing cluster centroids to find the label (cluster ID)
-  // of the vector to be added.
-  //
-
-  rmm::device_uvector<uint32_t> new_data_labels(n_rows, stream, device_memory);
-  utils::memzero(new_data_labels.data(), n_rows, stream);
-  rmm::device_uvector<uint32_t> new_cluster_sizes_buf(n_clusters, stream, &managed_memory);
-  auto new_cluster_sizes = new_cluster_sizes_buf.data();
-  utils::memzero(new_cluster_sizes, n_clusters, stream);
-
-  kmeans::predict(handle,
-                  cluster_centers.data(),
-                  n_clusters,
-                  orig_index.dim(),
-                  new_vectors,
-                  n_rows,
-                  new_data_labels.data(),
-                  orig_index.metric(),
-                  stream);
-  raft::stats::histogram<uint32_t, IdxT>(raft::stats::HistTypeAuto,
-                                         reinterpret_cast<int32_t*>(new_cluster_sizes),
-                                         IdxT(n_clusters),
-                                         new_data_labels.data(),
-                                         n_rows,
-                                         1,
-                                         stream);
-
-  //
-  // Make new_cluster_offsets, new_data_indices
-  //
-  rmm::device_uvector<IdxT> new_data_indices(n_rows, stream, &managed_memory);
-  rmm::device_uvector<IdxT> new_cluster_offsets(n_clusters + 1, stream, &managed_memory);
-  uint32_t new_max_cluster_size = calculate_offsets_and_indices(n_rows,
-                                                                n_clusters,
-                                                                new_data_labels.data(),
-                                                                new_cluster_sizes,
-                                                                new_cluster_offsets.data(),
-                                                                new_data_indices.data(),
-                                                                stream);
-
-  //
-  // Compute PQ code for new vectors
-  //
-  pq_codes_exts new_pq_exts = make_extents<size_t>(
-    n_rows, orig_index.pq_dataset().extent(1), orig_index.pq_dataset().static_extent(3));
-  auto new_pq_codes = make_device_mdarray<uint8_t>(handle, device_memory, new_pq_exts);
-  compute_pq_codes<T>(handle,
-                      n_rows,
-                      orig_index.dim(),
-                      orig_index.rot_dim(),
-                      orig_index.pq_dim(),
-                      orig_index.pq_len(),
-                      orig_index.pq_bits(),
-                      n_clusters,
-                      orig_index.codebook_kind(),
-                      new_max_cluster_size,
-                      cluster_centers.data(),
-                      orig_index.rotation_matrix().data_handle(),
-                      new_vectors,
-                      new_data_indices.data(),
-                      new_cluster_sizes,
-                      new_cluster_offsets.data(),
-                      orig_index.pq_centers(),
-                      new_pq_codes.view(),
-                      device_memory);
+  // Calculate the batch size for the input data if it's not accessible directly from the device
+  constexpr size_t kReasonableMaxBatchSize = 65536;
+  size_t max_batch_size                    = std::min<size_t>(n_rows, kReasonableMaxBatchSize);
+  {
+    size_t size_factor = 0;
+    // we'll use two temporary buffers for converted inputs when computing the codes.
+    size_factor += (orig_index.dim() + orig_index.rot_dim()) * sizeof(float);
+    // ...and another buffer for indices
+    size_factor += sizeof(IdxT);
+    // if the input data is not accessible on device, we'd need a buffer for it.
+    switch (utils::check_pointer_residency(new_vectors)) {
+      case utils::pointer_residency::device_only:
+      case utils::pointer_residency::host_and_device: break;
+      default: size_factor += orig_index.dim() * sizeof(T);
+    }
+    // the same with indices
+    if (new_indices != nullptr) {
+      switch (utils::check_pointer_residency(new_indices)) {
+        case utils::pointer_residency::device_only:
+        case utils::pointer_residency::host_and_device: break;
+        default: size_factor += sizeof(IdxT);
+      }
+    }
+    // make the batch size fit into the remaining memory
+    while (size_factor * max_batch_size > free_mem && max_batch_size > 128) {
+      max_batch_size >>= 1;
+    }
+    if (size_factor * max_batch_size > free_mem) {
+      // if that still doesn't fit, resort to the UVM
+      batches_mr     = &managed_memory;
+      max_batch_size = kReasonableMaxBatchSize;
+    } else {
+      // If we're keeping the batches in device memory, update the available mem tracker.
+      free_mem -= size_factor * max_batch_size;
+    }
+  }
+
+  // Predict the cluster labels for the new data, in batches if necessary
+  utils::batch_load_iterator<T> vec_batches(
+    new_vectors, n_rows, orig_index.dim(), max_batch_size, stream, batches_mr);
+  // Release the placeholder memory, because we don't intend to allocate any more long-living
+  // temporary buffers before we allocate the ext_index data.
+  // This memory could potentially speed up UVM accesses, if any.
+  placeholder_index.reset();
+  {
+    // The cluster centers in the index are stored padded, which is not acceptable by
+    // the kmeans_balanced::predict. Thus, we need the restructuring copy.
+    rmm::device_uvector<float> cluster_centers(
+      size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory);
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(),
+                                    sizeof(float) * orig_index.dim(),
+                                    orig_index.centers().data_handle(),
+                                    sizeof(float) * orig_index.dim_ext(),
+                                    sizeof(float) * orig_index.dim(),
+                                    n_clusters,
+                                    cudaMemcpyDefault,
+                                    stream));
+    for (const auto& batch : vec_batches) {
+      auto batch_data_view =
+        raft::make_device_matrix_view<const T, IdxT>(batch.data(), batch.size(), orig_index.dim());
+      auto batch_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(
+        new_data_labels.data() + batch.offset(), batch.size());
+      auto centers_view = raft::make_device_matrix_view<const float, IdxT>(
+        cluster_centers.data(), n_clusters, orig_index.dim());
+      raft::cluster::kmeans_balanced_params kmeans_params;
+      kmeans_params.metric = orig_index.metric();
+      raft::cluster::kmeans_balanced::predict(handle,
+                                              kmeans_params,
+                                              batch_data_view,
+                                              centers_view,
+                                              batch_labels_view,
+                                              utils::mapping<float>{});
+    }
+  }
 
   // Get the combined cluster sizes and sort the clusters in decreasing order
   // (this makes it easy to estimate the max number of samples during search).
-  rmm::device_uvector<uint32_t> old_cluster_sizes_buf(n_clusters, stream, &managed_memory);
-  rmm::device_uvector<uint32_t> ext_cluster_sizes_buf(n_clusters, stream, &managed_memory);
-  rmm::device_uvector<IdxT> old_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory);
-  rmm::device_uvector<IdxT> ext_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory);
   rmm::device_uvector<uint32_t> cluster_ordering_buf(n_clusters, stream, &managed_memory);
-  auto old_cluster_sizes   = old_cluster_sizes_buf.data();
-  auto ext_cluster_sizes   = ext_cluster_sizes_buf.data();
-  auto old_cluster_offsets = old_cluster_offsets_buf.data();
-  auto ext_cluster_offsets = ext_cluster_offsets_buf.data();
-  auto cluster_ordering    = cluster_ordering_buf.data();
-  copy(old_cluster_offsets,
-       orig_index.list_offsets().data_handle(),
-       orig_index.list_offsets().size(),
-       stream);
-  copy(old_cluster_sizes,
-       orig_index.list_sizes().data_handle(),
-       orig_index.list_sizes().size(),
-       stream);
-
+  rmm::device_uvector<uint32_t> ext_cluster_sizes_buf(n_clusters, stream, device_memory);
+  auto cluster_ordering     = cluster_ordering_buf.data();
+  auto ext_cluster_sizes    = ext_cluster_sizes_buf.data();
   uint32_t n_nonempty_lists = 0;
   {
-    rmm::device_uvector<uint32_t> ext_cluster_sizes_buf_in(n_clusters, stream, device_memory);
-    rmm::device_uvector<uint32_t> cluster_ordering_in(n_clusters, stream, device_memory);
-    auto ext_cluster_sizes_in = ext_cluster_sizes_buf_in.data();
-    linalg::add(ext_cluster_sizes_in, old_cluster_sizes, new_cluster_sizes, n_clusters, stream);
-
-    thrust::sequence(handle.get_thrust_policy(),
-                     cluster_ordering_in.data(),
-                     cluster_ordering_in.data() + n_clusters);
-
-    int begin_bit             = 0;
-    int end_bit               = sizeof(uint32_t) * 8;
-    size_t cub_workspace_size = 0;
-    cub::DeviceRadixSort::SortPairsDescending(nullptr,
-                                              cub_workspace_size,
-                                              ext_cluster_sizes_in,
-                                              ext_cluster_sizes,
-                                              cluster_ordering_in.data(),
-                                              cluster_ordering,
-                                              n_clusters,
-                                              begin_bit,
-                                              end_bit,
-                                              stream);
-    rmm::device_buffer cub_workspace(cub_workspace_size, stream, device_memory);
-    cub::DeviceRadixSort::SortPairsDescending(cub_workspace.data(),
-                                              cub_workspace_size,
-                                              ext_cluster_sizes_in,
-                                              ext_cluster_sizes,
-                                              cluster_ordering_in.data(),
-                                              cluster_ordering,
-                                              n_clusters,
-                                              begin_bit,
-                                              end_bit,
-                                              stream);
-
-    n_nonempty_lists = thrust::lower_bound(handle.get_thrust_policy(),
-                                           ext_cluster_sizes,
-                                           ext_cluster_sizes + n_clusters,
-                                           0,
-                                           thrust::greater<uint32_t>()) -
-                       ext_cluster_sizes;
+    rmm::device_uvector<uint32_t> new_cluster_sizes_buf(n_clusters, stream, device_memory);
+    auto new_cluster_sizes = new_cluster_sizes_buf.data();
+    raft::stats::histogram<uint32_t, IdxT>(raft::stats::HistTypeAuto,
+                                           reinterpret_cast<int32_t*>(new_cluster_sizes),
+                                           IdxT(n_clusters),
+                                           new_data_labels.data(),
+                                           n_rows,
+                                           1,
+                                           stream);
+    linalg::add(new_cluster_sizes,
+                new_cluster_sizes,
+                orig_index.list_sizes().data_handle(),
+                n_clusters,
+                stream);
+    n_nonempty_lists = reorder_clusters_by_size_desc(
+      handle, cluster_ordering, ext_cluster_sizes, new_cluster_sizes, n_clusters, device_memory);
   }
 
   // Assemble the extended index
@@ -876,195 +1115,70 @@ inline auto extend_device(const handle_t& handle,
                         orig_index.pq_bits(),
                         orig_index.pq_dim(),
                         n_nonempty_lists);
-  // calculate extended cluster offsets
+  // calculate extended cluster offsets and allocate the index data
   {
-    using group_align = Pow2<kIndexGroupSize>;
-    IdxT size         = 0;
+    auto ext_cluster_offsets = ext_index.list_offsets().data_handle();
+    using group_align        = Pow2<kIndexGroupSize>;
+    IdxT size                = 0;
     update_device(ext_cluster_offsets, &size, 1, stream);
-    thrust::inclusive_scan(
-      handle.get_thrust_policy(),
-      ext_cluster_sizes,
-      ext_cluster_sizes + n_clusters,
-      ext_cluster_offsets + 1,
-      [] __device__(IdxT a, IdxT b) { return group_align::roundUp(a) + group_align::roundUp(b); });
+    auto sizes_padded = thrust::make_transform_iterator(
+      ext_cluster_sizes, [] __device__ __host__(uint32_t x) -> IdxT {
+        return IdxT{Pow2<kIndexGroupSize>::roundUp(x)};
+      });
+    thrust::inclusive_scan(handle.get_thrust_policy(),
+                           sizes_padded,
+                           sizes_padded + n_clusters,
+                           ext_cluster_offsets + 1,
+                           add_op{});
     update_host(&size, ext_cluster_offsets + n_clusters, 1, stream);
-    handle.sync_stream();
-    copy(ext_index.list_offsets().data_handle(),
-         ext_cluster_offsets,
-         ext_index.list_offsets().size(),
-         stream);
-    copy(ext_index.list_sizes().data_handle(),
-         ext_cluster_sizes,
-         ext_index.list_sizes().size(),
-         stream);
+    handle.sync_stream();  // syncs `size`, `cluster_ordering`
     ext_index.allocate(handle, size);
   }
 
-  // Copy the unchanged parts
-  copy(ext_index.rotation_matrix().data_handle(),
-       orig_index.rotation_matrix().data_handle(),
-       orig_index.rotation_matrix().size(),
-       stream);
+  // pre-fill the extended index with the data from the original index
+  copy_index_data(ext_index, orig_index, cluster_ordering, stream);
 
-  // copy cluster-ordering-dependent data
-  utils::copy_selected(n_clusters,
-                       ext_index.dim_ext(),
-                       orig_index.centers().data_handle(),
-                       cluster_ordering,
-                       orig_index.dim_ext(),
-                       ext_index.centers().data_handle(),
-                       ext_index.dim_ext(),
-                       stream);
-  utils::copy_selected(n_clusters,
-                       ext_index.rot_dim(),
-                       orig_index.centers_rot().data_handle(),
-                       cluster_ordering,
-                       orig_index.rot_dim(),
-                       ext_index.centers_rot().data_handle(),
-                       ext_index.rot_dim(),
-                       stream);
-  switch (orig_index.codebook_kind()) {
-    case codebook_gen::PER_SUBSPACE: {
-      copy(ext_index.pq_centers().data_handle(),
-           orig_index.pq_centers().data_handle(),
-           orig_index.pq_centers().size(),
-           stream);
-    } break;
-    case codebook_gen::PER_CLUSTER: {
-      auto d = orig_index.pq_book_size() * orig_index.pq_len();
-      utils::copy_selected(n_clusters,
-                           d,
-                           orig_index.pq_centers().data_handle(),
-                           cluster_ordering,
-                           d,
-                           ext_index.pq_centers().data_handle(),
-                           d,
-                           stream);
-    } break;
-    default: RAFT_FAIL("Unreachable code");
-  }
-
-  // Make ext_indices
-  handle.sync_stream();  // make sure cluster sizes are up-to-date
-  auto ext_indices = ext_index.indices().data_handle();
-  for (uint32_t l = 0; l < ext_index.n_lists(); l++) {
-    auto k                = cluster_ordering[l];
-    auto old_cluster_size = old_cluster_sizes[k];
-    auto new_cluster_size = new_cluster_sizes[k];
-    if (old_cluster_size > 0) {
-      copy(ext_indices + ext_cluster_offsets[l],
-           orig_index.indices().data_handle() + old_cluster_offsets[k],
-           old_cluster_size,
-           stream);
-    }
-    if (new_cluster_size > 0) {
-      if (new_indices == nullptr) {
-        // implies the orig index is empty
-        copy(ext_indices + ext_cluster_offsets[l] + old_cluster_size,
-             new_data_indices.data() + new_cluster_offsets.data()[k],
-             new_cluster_size,
-             stream);
-      } else {
-        utils::copy_selected((IdxT)new_cluster_size,
-                             (IdxT)1,
-                             new_indices,
-                             new_data_indices.data() + new_cluster_offsets.data()[k],
-                             (IdxT)1,
-                             ext_indices + ext_cluster_offsets[l] + old_cluster_size,
-                             (IdxT)1,
-                             stream);
-      }
+  // update the labels to correspond to the new cluster ordering
+  {
+    rmm::device_uvector<uint32_t> cluster_ordering_rev_buf(n_clusters, stream, &managed_memory);
+    auto cluster_ordering_rev = cluster_ordering_rev_buf.data();
+    for (uint32_t i = 0; i < n_clusters; i++) {
+      cluster_ordering_rev[cluster_ordering[i]] = i;
     }
+    linalg::unaryOp(
+      new_data_labels.data(),
+      new_data_labels.data(),
+      new_data_labels.size(),
+      [cluster_ordering_rev] __device__(uint32_t i) { return cluster_ordering_rev[i]; },
+      stream);
   }
 
-  /* Extend the pq_dataset */
-  // For simplicity and performance, we reinterpret the last dimension of the dataset
-  // as a single vector element.
-  using vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
-
-  auto data_unit      = ext_index.pq_dataset().extent(1);
-  auto ext_pq_dataset = make_mdspan<vec_t, size_t, row_major, false, true>(
-    reinterpret_cast<vec_t*>(ext_index.pq_dataset().data_handle()),
-    make_extents<size_t>(
-      ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2)));
-
-  for (uint32_t l = 0; l < ext_index.n_lists(); l++) {
-    // Extend the data cluster-by-cluster;
-    // The original/old index stores the data interleaved;
-    // the new data produced by `compute_pq_codes` is not interleaved.
-    auto k                = cluster_ordering[l];
-    auto old_cluster_size = old_cluster_sizes[k];
-    auto old_pq_dataset   = make_mdspan<const vec_t, size_t, row_major, false, true>(
-      reinterpret_cast<const vec_t*>(orig_index.pq_dataset().data_handle()) +
-        data_unit * old_cluster_offsets[k],
-      make_extents<size_t>(div_rounding_up_safe(old_cluster_size, kIndexGroupSize),
-                           data_unit,
-                           ext_pq_dataset.extent(2)));
-    auto new_pq_data = make_mdspan<vec_t, size_t, row_major, false, true>(
-      reinterpret_cast<vec_t*>(new_pq_codes.data_handle()) +
-        data_unit * new_cluster_offsets.data()[k],
-      make_extents<size_t>(new_cluster_sizes[k], data_unit));
-    // Write all cluster data, vec-by-vec
-    linalg::writeOnlyUnaryOp(
-      ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l],
-      data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]),
-      [old_pq_dataset, new_pq_data, old_cluster_size] __device__(vec_t * out, size_t i_flat) {
-        // find the proper 3D index from the flat offset
-        size_t i[3];
-        for (int r = 2; r > 0; r--) {
-          i[r] = i_flat % old_pq_dataset.extent(r);
-          i_flat /= old_pq_dataset.extent(r);
-        }
-        i[0]        = i_flat;
-        auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2];
-        if (row_ix < old_cluster_size) {
-          // First, pack the original/old data
-          *out = old_pq_dataset(i[0], i[1], i[2]);
-        } else {
-          // Then add the new data
-          row_ix -= old_cluster_size;
-          if (row_ix < new_pq_data.extent(0)) {
-            *out = new_pq_data(row_ix, i[1]);
-          } else {
-            *out = vec_t{};
-          }
-        }
-      },
-      stream);
+  // fill the extended index with the new data (possibly, in batches)
+  utils::batch_load_iterator<IdxT> idx_batches(
+    new_indices, n_rows, 1, max_batch_size, stream, batches_mr);
+  for (const auto& vec_batch : vec_batches) {
+    const auto& idx_batch = *idx_batches++;
+    process_and_fill_codes(handle,
+                           ext_index,
+                           vec_batch.data(),
+                           new_indices != nullptr
+                             ? std::variant<IdxT, const IdxT*>(idx_batch.data())
+                             : std::variant<IdxT, const IdxT*>(IdxT(idx_batch.offset())),
+                           new_data_labels.data() + vec_batch.offset(),
+                           IdxT(vec_batch.size()),
+                           batches_mr);
   }
 
   return ext_index;
 }
 
-/** See raft::spatial::knn::ivf_pq::extend docs */
-template <typename T, typename IdxT>
-inline auto extend(const handle_t& handle,
-                   const index<IdxT>& orig_index,
-                   const T* new_vectors,
-                   const IdxT* new_indices,
-                   IdxT n_rows) -> index<IdxT>
-{
-  size_t vec_size = sizeof(T) * size_t(n_rows) * size_t(orig_index.dim());
-  size_t ind_size = sizeof(IdxT) * size_t(n_rows);
-  return utils::with_mapped_memory_t{
-    new_vectors, vec_size, [&](const T* new_vectors_dev) {
-      return utils::with_mapped_memory_t{
-        new_indices, ind_size, [&](const IdxT* new_indices_dev) {
-          return extend_device<T, IdxT>(
-            handle, orig_index, new_vectors_dev, new_indices_dev, n_rows);
-        }}();
-    }}();
-}
-
-/**
- * See raft::spatial::knn::ivf_pq::build docs.
- *
- * This version requires `dataset` to be on-device.
- */
+/** See raft::spatial::knn::ivf_pq::build docs */
 template <typename T, typename IdxT>
-inline auto build_device(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<IdxT>
+auto build(raft::device_resources const& handle,
+           const index_params& params,
+           const T* dataset,
+           IdxT n_rows,
+           uint32_t dim) -> index<IdxT>
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_pq::build(%zu, %u)", size_t(n_rows), dim);
@@ -1073,12 +1187,6 @@ inline auto build_device(
 
   RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset");
 
-  switch (utils::check_pointer_residency(dataset)) {
-    case utils::pointer_residency::device_only:
-    case utils::pointer_residency::host_and_device: break;
-    default: RAFT_FAIL("[ivf_pq::build_device] The dataset pointer must be available on device.");
-  }
-
   auto stream = handle.get_stream();
 
   index<IdxT> index(handle, params, dim);
@@ -1127,15 +1235,43 @@ inline auto build_device(
                                     cudaMemcpyDefault,
                                     stream));
   } else {
-    auto dim = index.dim();
-    linalg::writeOnlyUnaryOp(
-      trainset.data(),
-      size_t(index.dim()) * n_rows_train,
-      [dataset, trainset_ratio, dim] __device__(float* out, size_t i) {
+    size_t dim = index.dim();
+    cudaPointerAttributes dataset_attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&dataset_attr, dataset));
+    if (dataset_attr.devicePointer != nullptr) {
+      // data is available on device: just run the kernel to copy and map the data
+      auto p = reinterpret_cast<T*>(dataset_attr.devicePointer);
+      auto trainset_view =
+        raft::make_device_vector_view<float, IdxT>(trainset.data(), dim * n_rows_train);
+      linalg::map_offset(handle, trainset_view, [p, trainset_ratio, dim] __device__(size_t i) {
         auto col = i % dim;
-        *out     = utils::mapping<float>{}(dataset[(i - col) * size_t(trainset_ratio) + col]);
-      },
-      stream);
+        return utils::mapping<float>{}(p[(i - col) * size_t(trainset_ratio) + col]);
+      });
+    } else {
+      // data is not accessible from the device: first copy it, then map in place
+      auto trainset_tmp = reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(trainset.data()) +
+                                               (sizeof(float) - sizeof(T)) * index.dim());
+      // We copy the data in strides, one row at a time, and place the smaller rows of type T
+      // at the end of float rows.
+      RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset_tmp,
+                                      sizeof(float) * index.dim(),
+                                      dataset,
+                                      sizeof(T) * index.dim() * trainset_ratio,
+                                      sizeof(T) * index.dim(),
+                                      n_rows_train,
+                                      cudaMemcpyDefault,
+                                      stream));
+      // Transform the input `{T -> float}`, one row per warp.
+      // The threads in each warp copy the data synchronously; this and the layout of the data
+      // (content is aligned to the end of the rows) together allow doing the transform in-place.
+      copy_warped(trainset.data(),
+                  index.dim(),
+                  trainset_tmp,
+                  index.dim() * sizeof(float) / sizeof(T),
+                  index.dim(),
+                  n_rows_train,
+                  stream);
+    }
   }
 
   // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, dim_ext]!
@@ -1144,28 +1280,27 @@ inline auto build_device(
   auto cluster_centers = cluster_centers_buf.data();
 
   // Train balanced hierarchical kmeans clustering
-  kmeans::build_hierarchical(handle,
-                             params.kmeans_n_iters,
-                             index.dim(),
-                             trainset.data(),
-                             n_rows_train,
-                             cluster_centers,
-                             index.n_lists(),
-                             index.metric(),
-                             stream);
+  auto trainset_const_view =
+    raft::make_device_matrix_view<const float, IdxT>(trainset.data(), n_rows_train, index.dim());
+  auto centers_view =
+    raft::make_device_matrix_view<float, IdxT>(cluster_centers, index.n_lists(), index.dim());
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.n_iters = params.kmeans_n_iters;
+  kmeans_params.metric  = index.metric();
+  raft::cluster::kmeans_balanced::fit(
+    handle, kmeans_params, trainset_const_view, centers_view, utils::mapping<float>{});
 
   // Trainset labels are needed for training PQ codebooks
   rmm::device_uvector<uint32_t> labels(n_rows_train, stream, big_memory_resource);
-  kmeans::predict(handle,
-                  cluster_centers,
-                  index.n_lists(),
-                  index.dim(),
-                  trainset.data(),
-                  n_rows_train,
-                  labels.data(),
-                  index.metric(),
-                  stream,
-                  device_memory);
+  auto centers_const_view =
+    raft::make_device_matrix_view<const float, IdxT>(cluster_centers, index.n_lists(), index.dim());
+  auto labels_view = raft::make_device_vector_view<uint32_t, IdxT>(labels.data(), n_rows_train);
+  raft::cluster::kmeans_balanced::predict(handle,
+                                          kmeans_params,
+                                          trainset_const_view,
+                                          centers_const_view,
+                                          labels_view,
+                                          utils::mapping<float>());
 
   {
     // combine cluster_centers and their norms
@@ -1185,8 +1320,7 @@ inline auto build_device(
                           index.n_lists(),
                           raft::linalg::L2Norm,
                           true,
-                          stream,
-                          raft::SqrtOp<float>());
+                          stream);
     RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle() + index.dim(),
                                     sizeof(float) * index.dim_ext(),
                                     center_norms.data(),
@@ -1250,23 +1384,126 @@ inline auto build_device(
 
   // add the data if necessary
   if (params.add_data_on_build) {
-    return detail::extend_device<T, IdxT>(handle, index, dataset, nullptr, n_rows);
+    return detail::extend<T, IdxT>(handle, index, dataset, nullptr, n_rows);
   } else {
     return index;
   }
 }
 
-/** See raft::spatial::knn::ivf_pq::build docs */
-template <typename T, typename IdxT>
-inline auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<IdxT>
+// Serialization version 2
+// No backward compatibility yet; that is, can't add additional fields without breaking
+// backward compatibility.
+// TODO(hcho3) Implement next-gen serializer for IVF that allows for expansion in a backward
+//             compatible fashion.
+constexpr int serialization_version = 2;
+
+static_assert(sizeof(index<std::uint64_t>) == 560,
+              "The size of the index struct has changed since the last update; "
+              "paste in the new size and consider updating the save/load logic");
+
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle_ the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index_ IVF-PQ index
+ *
+ */
+template <typename IdxT>
+void serialize(raft::device_resources const& handle_,
+               const std::string& filename,
+               const index<IdxT>& index_)
 {
-  size_t data_size = sizeof(T) * size_t(n_rows) * size_t(dim);
-  return utils::with_mapped_memory_t{dataset, data_size, [&](const T* dataset_dev) {
-                                       return build_device<T, IdxT>(
-                                         handle, params, dataset_dev, n_rows, dim);
-                                     }}();
+  std::ofstream of(filename, std::ios::out | std::ios::binary);
+  if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+
+  RAFT_LOG_DEBUG("Size %zu, dim %d, pq_dim %d, pq_bits %d",
+                 static_cast<size_t>(index_.size()),
+                 static_cast<int>(index_.dim()),
+                 static_cast<int>(index_.pq_dim()),
+                 static_cast<int>(index_.pq_bits()));
+
+  serialize_scalar(handle_, of, serialization_version);
+  serialize_scalar(handle_, of, index_.size());
+  serialize_scalar(handle_, of, index_.dim());
+  serialize_scalar(handle_, of, index_.pq_bits());
+  serialize_scalar(handle_, of, index_.pq_dim());
+
+  serialize_scalar(handle_, of, index_.metric());
+  serialize_scalar(handle_, of, index_.codebook_kind());
+  serialize_scalar(handle_, of, index_.n_lists());
+  serialize_scalar(handle_, of, index_.n_nonempty_lists());
+
+  serialize_mdspan(handle_, of, index_.pq_centers());
+  serialize_mdspan(handle_, of, index_.pq_dataset());
+  serialize_mdspan(handle_, of, index_.indices());
+  serialize_mdspan(handle_, of, index_.rotation_matrix());
+  serialize_mdspan(handle_, of, index_.list_offsets());
+  serialize_mdspan(handle_, of, index_.list_sizes());
+  serialize_mdspan(handle_, of, index_.centers());
+  serialize_mdspan(handle_, of, index_.centers_rot());
+
+  of.close();
+  if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
+  return;
+}
+
+/**
+ * Load index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle_ the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @return the IVF-PQ index loaded from the file
+ *
+ */
+template <typename IdxT>
+auto deserialize(raft::device_resources const& handle_, const std::string& filename) -> index<IdxT>
+{
+  std::ifstream infile(filename, std::ios::in | std::ios::binary);
+
+  if (!infile) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+
+  auto ver = deserialize_scalar<int>(handle_, infile);
+  if (ver != serialization_version) {
+    RAFT_FAIL("serialization version mismatch %d vs. %d", ver, serialization_version);
+  }
+  auto n_rows  = deserialize_scalar<IdxT>(handle_, infile);
+  auto dim     = deserialize_scalar<std::uint32_t>(handle_, infile);
+  auto pq_bits = deserialize_scalar<std::uint32_t>(handle_, infile);
+  auto pq_dim  = deserialize_scalar<std::uint32_t>(handle_, infile);
+
+  auto metric        = deserialize_scalar<raft::distance::DistanceType>(handle_, infile);
+  auto codebook_kind = deserialize_scalar<raft::neighbors::ivf_pq::codebook_gen>(handle_, infile);
+  auto n_lists       = deserialize_scalar<std::uint32_t>(handle_, infile);
+  auto n_nonempty_lists = deserialize_scalar<std::uint32_t>(handle_, infile);
+
+  RAFT_LOG_DEBUG("n_rows %zu, dim %d, pq_dim %d, pq_bits %d, n_lists %d",
+                 static_cast<std::size_t>(n_rows),
+                 static_cast<int>(dim),
+                 static_cast<int>(pq_dim),
+                 static_cast<int>(pq_bits),
+                 static_cast<int>(n_lists));
+
+  auto index_ = raft::neighbors::ivf_pq::index<IdxT>(
+    handle_, metric, codebook_kind, n_lists, dim, pq_bits, pq_dim, n_nonempty_lists);
+  index_.allocate(handle_, n_rows);
+
+  deserialize_mdspan(handle_, infile, index_.pq_centers());
+  deserialize_mdspan(handle_, infile, index_.pq_dataset());
+  deserialize_mdspan(handle_, infile, index_.indices());
+  deserialize_mdspan(handle_, infile, index_.rotation_matrix());
+  deserialize_mdspan(handle_, infile, index_.list_offsets());
+  deserialize_mdspan(handle_, infile, index_.list_sizes());
+  deserialize_mdspan(handle_, infile, index_.centers());
+  deserialize_mdspan(handle_, infile, index_.centers_rot());
+
+  infile.close();
+
+  return index_;
 }
 
 }  // namespace raft::spatial::knn::ivf_pq::detail
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh
index c1a3682f47..c3326f8fac 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,18 +17,21 @@
 #pragma once
 
 #include "ann_utils.cuh"
-#include "topk.cuh"
-#include "topk/warpsort_topk.cuh"
 
 #include <raft/neighbors/ivf_pq_types.hpp>
 
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/device_loads_stores.cuh>
@@ -129,11 +132,11 @@ struct fp_8bit {
  * Select the clusters to probe and, as a side-effect, translate the queries type `T -> float`
  *
  * Assuming the number of clusters is not that big (a few thousands), we do a plain GEMM
- * followed by select_topk to select the clusters to probe. There's no need to return the similarity
+ * followed by select_k to select the clusters to probe. There's no need to return the similarity
  * scores here.
  */
 template <typename T>
-void select_clusters(const handle_t& handle,
+void select_clusters(raft::device_resources const& handle,
                      uint32_t* clusters_to_probe,  // [n_queries, n_probes]
                      float* float_queries,         // [n_queries, dim_ext]
                      uint32_t n_queries,
@@ -147,7 +150,6 @@ void select_clusters(const handle_t& handle,
                      rmm::mr::device_memory_resource* mr)
 {
   auto stream = handle.get_stream();
-  rmm::device_uvector<float> qc_distances(n_queries * n_lists, stream, mr);
   /* NOTE[qc_distances]
 
   We compute query-center distances to choose the clusters to probe.
@@ -157,37 +159,38 @@ void select_clusters(const handle_t& handle,
       cluster_centers[i, dim()] contains the squared norm of the center vector i;
       we extend the dimension K of the GEMM to compute it together with all the dot products:
 
-      `cq_distances[i, j] = |cluster_centers[j]|^2 - 2 * (queries[i], cluster_centers[j])`
+      `qc_distances[i, j] = |cluster_centers[j]|^2 - 2 * (queries[i], cluster_centers[j])`
 
       This is a monotonous mapping of the proper L2 distance.
 
     IP distance:
-      `cq_distances[i, j] = - (queries[i], cluster_centers[j])`
+      `qc_distances[i, j] = - (queries[i], cluster_centers[j])`
 
       This is a negative inner-product distance. We minimize it to find the similar clusters.
 
-      NB: cq_distances is NOT used further in ivfpq_search.
+      NB: qc_distances is NOT used further in ivfpq_search.
  */
   float norm_factor;
   switch (metric) {
+    case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded: norm_factor = 1.0 / -2.0; break;
     case raft::distance::DistanceType::InnerProduct: norm_factor = 0.0; break;
     default: RAFT_FAIL("Unsupported distance type %d.", int(metric));
   }
-  linalg::writeOnlyUnaryOp(
-    float_queries,
-    dim_ext * n_queries,
-    [queries, dim, dim_ext, norm_factor] __device__(float* out, uint32_t ix) {
+  auto float_queries_view =
+    raft::make_device_vector_view<float, uint32_t>(float_queries, dim_ext * n_queries);
+  linalg::map_offset(
+    handle, float_queries_view, [queries, dim, dim_ext, norm_factor] __device__(uint32_t ix) {
       uint32_t col = ix % dim_ext;
       uint32_t row = ix / dim_ext;
-      *out         = col < dim ? utils::mapping<float>{}(queries[col + dim * row]) : norm_factor;
-    },
-    stream);
+      return col < dim ? utils::mapping<float>{}(queries[col + dim * row]) : norm_factor;
+    });
 
   float alpha;
   float beta;
   uint32_t gemm_k = dim;
   switch (metric) {
+    case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded: {
       alpha  = -2.0;
       beta   = 0.0;
@@ -200,6 +203,7 @@ void select_clusters(const handle_t& handle,
     } break;
     default: RAFT_FAIL("Unsupported distance type %d.", int(metric));
   }
+  rmm::device_uvector<float> qc_distances(n_queries * n_lists, stream, mr);
   linalg::gemm(handle,
                true,
                false,
@@ -218,16 +222,16 @@ void select_clusters(const handle_t& handle,
 
   // Select neighbor clusters for each query.
   rmm::device_uvector<float> cluster_dists(n_queries * n_probes, stream, mr);
-  select_topk<float, uint32_t>(qc_distances.data(),
-                               nullptr,
-                               n_queries,
-                               n_lists,
-                               n_probes,
-                               cluster_dists.data(),
-                               clusters_to_probe,
-                               true,
-                               stream,
-                               mr);
+  matrix::detail::select_k<float, uint32_t>(qc_distances.data(),
+                                            nullptr,
+                                            n_queries,
+                                            n_lists,
+                                            n_probes,
+                                            cluster_dists.data(),
+                                            clusters_to_probe,
+                                            true,
+                                            stream,
+                                            mr);
 }
 
 /**
@@ -418,14 +422,12 @@ void postprocess_distances(float* out,        // [n_queries, topk]
   switch (metric) {
     case distance::DistanceType::L2Unexpanded:
     case distance::DistanceType::L2Expanded: {
-      linalg::unaryOp(
-        out,
-        in,
-        len,
-        [scaling_factor] __device__(ScoreT x) -> float {
-          return scaling_factor * scaling_factor * float(x);
-        },
-        stream);
+      linalg::unaryOp(out,
+                      in,
+                      len,
+                      raft::compose_op(raft::mul_const_op<float>{scaling_factor * scaling_factor},
+                                       raft::cast_op<float>{}),
+                      stream);
     } break;
     case distance::DistanceType::L2SqrtUnexpanded:
     case distance::DistanceType::L2SqrtExpanded: {
@@ -433,18 +435,17 @@ void postprocess_distances(float* out,        // [n_queries, topk]
         out,
         in,
         len,
-        [scaling_factor] __device__(ScoreT x) -> float { return scaling_factor * sqrtf(float(x)); },
+        raft::compose_op{
+          raft::mul_const_op<float>{scaling_factor}, raft::sqrt_op{}, raft::cast_op<float>{}},
         stream);
     } break;
     case distance::DistanceType::InnerProduct: {
-      linalg::unaryOp(
-        out,
-        in,
-        len,
-        [scaling_factor] __device__(ScoreT x) -> float {
-          return -scaling_factor * scaling_factor * float(x);
-        },
-        stream);
+      linalg::unaryOp(out,
+                      in,
+                      len,
+                      raft::compose_op(raft::mul_const_op<float>{-scaling_factor * scaling_factor},
+                                       raft::cast_op<float>{}),
+                      stream);
     } break;
     default: RAFT_FAIL("Unexpected metric.");
   }
@@ -452,14 +453,15 @@ void postprocess_distances(float* out,        // [n_queries, topk]
 
 template <typename T, typename IdxT>
 struct dummy_block_sort_t {
-  using queue_t = topk::warp_sort_distributed<WarpSize, true, T, IdxT>;
+  using queue_t = matrix::detail::select::warpsort::warp_sort_distributed<WarpSize, true, T, IdxT>;
   template <typename... Args>
-  __device__ dummy_block_sort_t(int k, uint8_t* smem_buf, Args...){};
+  __device__ dummy_block_sort_t(int k, Args...){};
 };
 
 template <int Capacity, typename T, typename IdxT>
 struct pq_block_sort {
-  using type = topk::block_sort<topk::warp_sort_distributed, Capacity, true, T, IdxT>;
+  using type = matrix::detail::select::warpsort::
+    block_sort<matrix::detail::select::warpsort::warp_sort_distributed, Capacity, true, T, IdxT>;
 };
 
 template <typename T, typename IdxT>
@@ -712,6 +714,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows,
     if constexpr (PrecompBaseDiff) {
       // Reduce number of memory reads later by pre-computing parts of the score
       switch (metric) {
+        case distance::DistanceType::L2SqrtExpanded:
         case distance::DistanceType::L2Expanded: {
           for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
             base_diff[i] = query[i] - cluster_center[i];
@@ -745,6 +748,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows,
           float pq_c = *cur_pq_center;
           cur_pq_center += PqShift;
           switch (metric) {
+            case distance::DistanceType::L2SqrtExpanded:
             case distance::DistanceType::L2Expanded: {
               float diff;
               if constexpr (PrecompBaseDiff) {
@@ -806,11 +810,12 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows,
     constexpr OutT kDummy = upper_bound<OutT>();
     OutT query_kth        = kDummy;
     if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); }
-    local_topk_t block_topk(topk, smem_buf, query_kth);
+    local_topk_t block_topk(topk, nullptr, query_kth);
     OutT early_stop_limit = kDummy;
     switch (metric) {
       // If the metric is non-negative, we can use the query_kth approximation as an early stop
       // threshold to skip some iterations when computing the score. Add such metrics here.
+      case distance::DistanceType::L2SqrtExpanded:
       case distance::DistanceType::L2Expanded: {
         early_stop_limit = query_kth;
       } break;
@@ -842,7 +847,7 @@ __global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows,
     if constexpr (kManageLocalTopK) {
       // sync threads before the topk merging operation, because we reuse smem_buf
       __syncthreads();
-      block_topk.done();
+      block_topk.done(smem_buf);
       block_topk.store(out_scores, out_indices);
       if (threadIdx.x == 0) { atomicMin(query_kths + query_ix, float(out_scores[topk - 1])); }
     } else {
@@ -1052,9 +1057,11 @@ struct ivfpq_compute_similarity {
 
       [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t
       {
-        return manage_local_topk ? topk::template calc_smem_size_for_block_wide<OutT, IdxT>(
-                                     n_threads / subwarp_size, topk)
-                                 : 0;
+        return manage_local_topk
+                 ? matrix::detail::select::warpsort::template calc_smem_size_for_block_wide<OutT,
+                                                                                            IdxT>(
+                     n_threads / subwarp_size, topk)
+                 : 0;
       }
     } ltk_mem{manage_local_topk, topk};
 
@@ -1173,7 +1180,13 @@ struct ivfpq_compute_similarity {
               // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm),
               // the locality is not going to improve with increasing the number of blocks per SM.
               // Hence, the only metric here is the occupancy.
-              select_it = tmp.occupancy > cur.occupancy;
+              bool improves_occupancy = tmp.occupancy > cur.occupancy;
+              // Otherwise, the performance still improves with a smaller block size,
+              // given there is enough work to do
+              bool improves_parallelism =
+                tmp.occupancy == cur.occupancy &&
+                7u * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks;
+              select_it = improves_occupancy || improves_parallelism;
             } else {
               // If we don't use shared memory for the lookup table, increasing the number of blocks
               // is very taxing on the global memory usage.
@@ -1249,7 +1262,7 @@ inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_que
  *      is guaranteed to fit into GPU memory.
  */
 template <typename ScoreT, typename LutT, typename IdxT>
-void ivfpq_search_worker(const handle_t& handle,
+void ivfpq_search_worker(raft::device_resources const& handle,
                          const index<IdxT>& index,
                          uint32_t max_samples,
                          uint32_t n_probes,
@@ -1403,16 +1416,16 @@ void ivfpq_search_worker(const handle_t& handle,
 
   // Select topk vectors for each query
   rmm::device_uvector<ScoreT> topk_dists(n_queries * topK, stream, mr);
-  select_topk<ScoreT, IdxT>(distances_buf.data(),
-                            neighbors_ptr,
-                            n_queries,
-                            topk_len,
-                            topK,
-                            topk_dists.data(),
-                            neighbors,
-                            true,
-                            stream,
-                            mr);
+  matrix::detail::select_k<ScoreT, IdxT>(distances_buf.data(),
+                                         neighbors_ptr,
+                                         n_queries,
+                                         topk_len,
+                                         topK,
+                                         topk_dists.data(),
+                                         neighbors,
+                                         true,
+                                         stream,
+                                         mr);
 
   // Postprocessing
   postprocess_distances(
@@ -1515,11 +1528,7 @@ inline auto get_max_batch_size(uint32_t k,
   };
   constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024;
   if (ws_size(max_batch_size) > kMaxWsSize) {
-    uint32_t smaller_batch_size = 1;
-    // take powers of two for better alignment
-    while (smaller_batch_size * 2 <= max_batch_size) {
-      smaller_batch_size <<= 1;
-    }
+    uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size);
     // gradually reduce the batch size until we fit into the max size limit.
     while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) {
       smaller_batch_size >>= 1;
@@ -1531,7 +1540,7 @@ inline auto get_max_batch_size(uint32_t k,
 
 /** See raft::spatial::knn::ivf_pq::search docs */
 template <typename T, typename IdxT>
-inline void search(const handle_t& handle,
+inline void search(raft::device_resources const& handle,
                    const search_params& params,
                    const index<IdxT>& index,
                    const T* queries,
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 0c33c3f38f..2b2b5cee0c 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,15 +23,12 @@
 #include <rmm/device_uvector.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/utils/Heap.h>
 
 #include <cstdint>
 #include <iostream>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/spatial/knn/detail/faiss_select/Select.cuh>
 #include <raft/spatial/knn/faiss_mr.hpp>
 #include <set>
 #include <thrust/iterator/transform_iterator.h>
@@ -63,7 +60,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
                                        int k,
                                        value_idx* translations)
 {
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
+  constexpr int kNumWarps = tpb / WarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
@@ -71,8 +68,8 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   /**
    * Uses shared memory
    */
-  faiss::gpu::
-    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
+  faiss_select::
+    BlockSelect<value_t, value_idx, false, faiss_select::Comparator<value_t>, warp_q, thread_q, tpb>
       heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
@@ -90,7 +87,7 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   value_t* inKStart   = inK + (row_idx + col);
   value_idx* inVStart = inV + (row_idx + col);
 
-  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
+  int limit             = Pow2<WarpSize>::roundDown(total_k);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
@@ -136,7 +133,7 @@ inline void knn_merge_parts_impl(value_t* inK,
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
   auto block              = dim3(n_threads);
 
-  auto kInit = faiss::gpu::Limits<value_t>::getMax();
+  auto kInit = std::numeric_limits<value_t>::max();
   auto vInit = -1;
   knn_merge_parts_kernel<value_idx, value_t, warp_q, thread_q, n_threads>
     <<<grid, block, 0, stream>>>(
@@ -218,7 +215,7 @@ inline void knn_merge_parts(value_t* inK,
  */
 template <typename IntType = int, typename IdxType = std::int64_t, typename value_t = float>
 void brute_force_knn_impl(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   std::vector<value_t*>& input,
   std::vector<IntType>& sizes,
   IntType D,
diff --git a/cpp/include/raft/spatial/knn/detail/processing.cuh b/cpp/include/raft/spatial/knn/detail/processing.cuh
index a80c1c1935..b4b1cb2c14 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.cuh
+++ b/cpp/include/raft/spatial/knn/detail/processing.cuh
@@ -17,6 +17,7 @@
 
 #include "processing.hpp"
 
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/norm.cuh>
@@ -59,32 +60,16 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
                           raft::linalg::NormType::L2Norm,
                           row_major_,
                           stream_,
-                          [] __device__(math_t in) { return sqrtf(in); });
+                          raft::sqrt_op{});
 
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
-      stream_);
+      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, raft::div_op{}, stream_);
   }
 
   void revert(math_t* data)
   {
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
-      stream_);
+      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, raft::mul_op{}, stream_);
   }
 
   void postprocess(math_t* data)
@@ -122,12 +107,11 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
                          true,
                          cosine::stream_);
 
-    raft::linalg::unaryOp(
-      means_.data(),
-      means_.data(),
-      cosine::n_rows_,
-      [=] __device__(math_t in) { return in * normalizer_const; },
-      cosine::stream_);
+    raft::linalg::unaryOp(means_.data(),
+                          means_.data(),
+                          cosine::n_rows_,
+                          raft::mul_const_op<math_t>(normalizer_const),
+                          cosine::stream_);
 
     raft::stats::meanCenter(data,
                             data,
diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
index 239379aad5..2cdc0fae91 100644
--- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,12 +20,7 @@
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/utils/Heap.h>
+#include <raft/spatial/knn/detail/faiss_select/Select.cuh>
 
 namespace raft {
 namespace spatial {
@@ -55,9 +50,14 @@ __global__ void select_k_kernel(const key_t* inK,
   __shared__ key_t smemK[kNumWarps * warp_q];
   __shared__ payload_t smemV[kNumWarps * warp_q];
 
-  faiss::gpu::
-    BlockSelect<key_t, payload_t, select_min, faiss::gpu::Comparator<key_t>, warp_q, thread_q, tpb>
-      heap(initK, initV, smemK, smemV, k);
+  faiss_select::BlockSelect<key_t,
+                            payload_t,
+                            select_min,
+                            faiss_select::Comparator<key_t>,
+                            warp_q,
+                            thread_q,
+                            tpb>
+    heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int row = blockIdx.x;
diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh
deleted file mode 100644
index b6ffbd5122..0000000000
--- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh
+++ /dev/null
@@ -1,763 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include <cub/cub.cuh>
-
-#include <faiss/gpu/utils/DeviceDefs.cuh>
-#include <faiss/gpu/utils/MergeNetworkUtils.cuh>
-#include <faiss/gpu/utils/PtxUtils.cuh>
-#include <faiss/gpu/utils/StaticUtils.h>
-#include <faiss/gpu/utils/WarpShuffles.cuh>
-
-namespace faiss {
-namespace gpu {
-
-template <typename _Key, typename _Value>
-struct KeyValuePair {
-  typedef _Key Key;      ///< Key data type
-  typedef _Value Value;  ///< Value data type
-
-  Key key;      ///< Item key
-  Value value;  ///< Item value
-
-  /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair() {}
-
-  /// Copy Constructors
-  __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp)
-    : key(kvp.key), value(kvp.value)
-  {
-  }
-
-  __host__ __device__ __forceinline__ KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp)
-    : key(kvp.key), value(kvp.value)
-  {
-  }
-
-  /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value)
-    : key(key), value(value)
-  {
-  }
-
-  /// Inequality operator
-  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b)
-  {
-    return (value != b.value) || (key != b.key);
-  }
-};
-
-//
-// This file contains functions to:
-//
-// -perform bitonic merges on pairs of sorted lists, held in
-// registers. Each list contains N * kWarpSize (multiple of 32)
-// elements for some N.
-// The bitonic merge is implemented for arbitrary sizes;
-// sorted list A of size N1 * kWarpSize registers
-// sorted list B of size N2 * kWarpSize registers =>
-// sorted list C if size (N1 + N2) * kWarpSize registers. N1 and N2
-// are >= 1 and don't have to be powers of 2.
-//
-// -perform bitonic sorts on a set of N * kWarpSize key/value pairs
-// held in registers, by using the above bitonic merge as a
-// primitive.
-// N can be an arbitrary N >= 1; i.e., the bitonic sort here supports
-// odd sizes and doesn't require the input to be a power of 2.
-//
-// The sort or merge network is completely statically instantiated via
-// template specialization / expansion and constexpr, and it uses warp
-// shuffles to exchange values between warp lanes.
-//
-// A note about comparisons:
-//
-// For a sorting network of keys only, we only need one
-// comparison (a < b). However, what we really need to know is
-// if one lane chooses to exchange a value, then the
-// corresponding lane should also do the exchange.
-// Thus, if one just uses the negation !(x < y) in the higher
-// lane, this will also include the case where (x == y). Thus, one
-// lane in fact performs an exchange and the other doesn't, but
-// because the only value being exchanged is equivalent, nothing has
-// changed.
-// So, you can get away with just one comparison and its negation.
-//
-// If we're sorting keys and values, where equivalent keys can
-// exist, then this is a problem, since we want to treat (x, v1)
-// as not equivalent to (x, v2).
-//
-// To remedy this, you can either compare with a lexicographic
-// ordering (a.k < b.k || (a.k == b.k && a.v < b.v)), which since
-// we're predicating all of the choices results in 3 comparisons
-// being executed, or we can invert the selection so that there is no
-// middle choice of equality; the other lane will likewise
-// check that (b.k > a.k) (the higher lane has the values
-// swapped). Then, the first lane swaps if and only if the
-// second lane swaps; if both lanes have equivalent keys, no
-// swap will be performed. This results in only two comparisons
-// being executed.
-//
-// If you don't consider values as well, then this does not produce a
-// consistent ordering among (k, v) pairs with equivalent keys but
-// different values; for us, we don't really care about ordering or
-// stability here.
-//
-// I have tried both re-arranging the order in the higher lane to get
-// away with one comparison or adding the value to the check; both
-// result in greater register consumption or lower speed than just
-// performing both < and > comparisons with the variables, so I just
-// stick with this.
-
-// This function merges kWarpSize / 2L lists in parallel using warp
-// shuffles.
-// It works on at most size-16 lists, as we need 32 threads for this
-// shuffle merge.
-//
-// If IsBitonic is false, the first stage is reversed, so we don't
-// need to sort directionally. It's still technically a bitonic sort.
-template <typename K, typename V, int L, bool Dir, typename Comp, bool IsBitonic>
-inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v)
-{
-  static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
-  static_assert(L <= kWarpSize / 2, "merge list size must be <= 16");
-
-  int laneId = getLaneId();
-
-  if (!IsBitonic) {
-    // Reverse the first comparison stage.
-    // For example, merging a list of size 8 has the exchanges:
-    // 0 <-> 15, 1 <-> 14, ...
-    K otherK  = shfl_xor(k, 2 * L - 1);
-    K otherVk = shfl_xor(v.key, 2 * L - 1);
-    V otherVv = shfl_xor(v.value, 2 * L - 1);
-
-    KeyValuePair<K, V> otherV = KeyValuePair(otherVk, otherVv);
-
-    // Whether we are the lesser thread in the exchange
-    bool small = !(laneId & L);
-
-    if (Dir) {
-      // See the comment above how performing both of these
-      // comparisons in the warp seems to win out over the
-      // alternatives in practice
-      bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK);
-      assign(s, k, otherK);
-      assign(s, v.key, otherV.key);
-      assign(s, v.value, otherV.value);
-
-    } else {
-      bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK);
-      assign(s, k, otherK);
-      assign(s, v.value, otherV.value);
-      assign(s, v.key, otherV.key);
-    }
-  }
-
-#pragma unroll
-  for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) {
-    K otherK  = shfl_xor(k, stride);
-    K otherVk = shfl_xor(v.key, stride);
-    V otherVv = shfl_xor(v.value, stride);
-
-    KeyValuePair<K, V> otherV = KeyValuePair(otherVk, otherVv);
-
-    // Whether we are the lesser thread in the exchange
-    bool small = !(laneId & stride);
-
-    if (Dir) {
-      bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK);
-      assign(s, k, otherK);
-      assign(s, v.key, otherV.key);
-      assign(s, v.value, otherV.value);
-
-    } else {
-      bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK);
-      assign(s, k, otherK);
-      assign(s, v.key, otherV.key);
-      assign(s, v.value, otherV.value);
-    }
-  }
-}
-
-// Template for performing a bitonic merge of an arbitrary set of
-// registers
-template <typename K, typename V, int N, bool Dir, typename Comp, bool Low, bool Pow2>
-struct BitonicMergeStepKVP {
-};
-
-//
-// Power-of-2 merge specialization
-//
-
-// All merges eventually call this
-template <typename K, typename V, bool Dir, typename Comp, bool Low>
-struct BitonicMergeStepKVP<K, V, 1, Dir, Comp, Low, true> {
-  static inline __device__ void merge(K k[1], KeyValuePair<K, V> v[1])
-  {
-    // Use warp shuffles
-    warpBitonicMergeLE16KVP<K, V, 16, Dir, Comp, true>(k[0], v[0]);
-  }
-};
-
-template <typename K, typename V, int N, bool Dir, typename Comp, bool Low>
-struct BitonicMergeStepKVP<K, V, N, Dir, Comp, Low, true> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
-  {
-    static_assert(utils::isPowerOf2(N), "must be power of 2");
-    static_assert(N > 1, "must be N > 1");
-
-#pragma unroll
-    for (int i = 0; i < N / 2; ++i) {
-      K& ka                  = k[i];
-      KeyValuePair<K, V>& va = v[i];
-
-      K& kb                  = k[i + N / 2];
-      KeyValuePair<K, V>& vb = v[i + N / 2];
-
-      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
-      swap(s, ka, kb);
-      swap(s, va.key, vb.key);
-      swap(s, va.value, vb.value);
-    }
-
-    {
-      K newK[N / 2];
-      KeyValuePair<K, V> newV[N / 2];
-
-#pragma unroll
-      for (int i = 0; i < N / 2; ++i) {
-        newK[i]       = k[i];
-        newV[i].key   = v[i].key;
-        newV[i].value = v[i].value;
-      }
-
-      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, true, true>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < N / 2; ++i) {
-        k[i]       = newK[i];
-        v[i].key   = newV[i].key;
-        v[i].value = newV[i].value;
-      }
-    }
-
-    {
-      K newK[N / 2];
-      KeyValuePair<K, V> newV[N / 2];
-
-#pragma unroll
-      for (int i = 0; i < N / 2; ++i) {
-        newK[i]       = k[i + N / 2];
-        newV[i].key   = v[i + N / 2].key;
-        newV[i].value = v[i + N / 2].value;
-      }
-
-      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, false, true>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < N / 2; ++i) {
-        k[i + N / 2]       = newK[i];
-        v[i + N / 2].key   = newV[i].key;
-        v[i + N / 2].value = newV[i].value;
-      }
-    }
-  }
-};
-
-//
-// Non-power-of-2 merge specialization
-//
-
-// Low recursion
-template <typename K, typename V, int N, bool Dir, typename Comp>
-struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
-  {
-    static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
-    static_assert(N >= 3, "must be N >= 3");
-
-    constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N);
-
-#pragma unroll
-    for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
-      K& ka                  = k[i];
-      KeyValuePair<K, V>& va = v[i];
-
-      K& kb                  = k[i + kNextHighestPowerOf2 / 2];
-      KeyValuePair<K, V>& vb = v[i + kNextHighestPowerOf2 / 2];
-
-      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
-      swap(s, ka, kb);
-      swap(s, va.key, vb.key);
-      swap(s, va.value, vb.value);
-    }
-
-    constexpr int kLowSize  = N - kNextHighestPowerOf2 / 2;
-    constexpr int kHighSize = kNextHighestPowerOf2 / 2;
-    {
-      K newK[kLowSize];
-      KeyValuePair<K, V> newV[kLowSize];
-
-#pragma unroll
-      for (int i = 0; i < kLowSize; ++i) {
-        newK[i]       = k[i];
-        newV[i].key   = v[i].key;
-        newV[i].value = v[i].value;
-      }
-
-      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
-      // FIXME: compiler doesn't like this expression? compiler bug?
-      //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
-      BitonicMergeStepKVP<K,
-                          V,
-                          kLowSize,
-                          Dir,
-                          Comp,
-                          true,  // low
-                          kLowIsPowerOf2>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < kLowSize; ++i) {
-        k[i]       = newK[i];
-        v[i].key   = newV[i].key;
-        v[i].value = newV[i].value;
-      }
-    }
-
-    {
-      K newK[kHighSize];
-      KeyValuePair<K, V> newV[kHighSize];
-
-#pragma unroll
-      for (int i = 0; i < kHighSize; ++i) {
-        newK[i]       = k[i + kLowSize];
-        newV[i].key   = v[i + kLowSize].key;
-        newV[i].value = v[i + kLowSize].value;
-      }
-
-      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
-      // FIXME: compiler doesn't like this expression? compiler bug?
-      //      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize);
-      BitonicMergeStepKVP<K,
-                          V,
-                          kHighSize,
-                          Dir,
-                          Comp,
-                          false,  // high
-                          kHighIsPowerOf2>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < kHighSize; ++i) {
-        k[i + kLowSize]       = newK[i];
-        v[i + kLowSize].key   = newV[i].key;
-        v[i + kLowSize].value = newV[i].value;
-      }
-    }
-  }
-};
-
-// High recursion
-template <typename K, typename V, int N, bool Dir, typename Comp>
-struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
-  {
-    static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
-    static_assert(N >= 3, "must be N >= 3");
-
-    constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N);
-
-#pragma unroll
-    for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
-      K& ka                  = k[i];
-      KeyValuePair<K, V>& va = v[i];
-
-      K& kb                  = k[i + kNextHighestPowerOf2 / 2];
-      KeyValuePair<K, V>& vb = v[i + kNextHighestPowerOf2 / 2];
-
-      bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
-      swap(s, ka, kb);
-      swap(s, va.key, vb.key);
-      swap(s, va.value, vb.value);
-    }
-
-    constexpr int kLowSize  = kNextHighestPowerOf2 / 2;
-    constexpr int kHighSize = N - kNextHighestPowerOf2 / 2;
-    {
-      K newK[kLowSize];
-      KeyValuePair<K, V> newV[kLowSize];
-
-#pragma unroll
-      for (int i = 0; i < kLowSize; ++i) {
-        newK[i]       = k[i];
-        newV[i].key   = v[i].key;
-        newV[i].value = v[i].value;
-      }
-
-      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
-      // FIXME: compiler doesn't like this expression? compiler bug?
-      //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
-      BitonicMergeStepKVP<K,
-                          V,
-                          kLowSize,
-                          Dir,
-                          Comp,
-                          true,  // low
-                          kLowIsPowerOf2>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < kLowSize; ++i) {
-        k[i]       = newK[i];
-        v[i].key   = newV[i].key;
-        v[i].value = newV[i].value;
-      }
-    }
-
-    {
-      K newK[kHighSize];
-      KeyValuePair<K, V> newV[kHighSize];
-
-#pragma unroll
-      for (int i = 0; i < kHighSize; ++i) {
-        newK[i]       = k[i + kLowSize];
-        newV[i].key   = v[i + kLowSize].key;
-        newV[i].value = v[i + kLowSize].value;
-      }
-
-      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
-      // FIXME: compiler doesn't like this expression? compiler bug?
-      //      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize);
-      BitonicMergeStepKVP<K,
-                          V,
-                          kHighSize,
-                          Dir,
-                          Comp,
-                          false,  // high
-                          kHighIsPowerOf2>::merge(newK, newV);
-
-#pragma unroll
-      for (int i = 0; i < kHighSize; ++i) {
-        k[i + kLowSize]       = newK[i];
-        v[i + kLowSize].key   = newV[i].key;
-        v[i + kLowSize].value = newV[i].value;
-      }
-    }
-  }
-};
-
-/// Merges two sets of registers across the warp of any size;
-/// i.e., merges a sorted k/v list of size kWarpSize * N1 with a
-/// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any
-/// value >= 1
-template <typename K, typename V, int N1, int N2, bool Dir, typename Comp, bool FullMerge = true>
-inline __device__ void warpMergeAnyRegistersKVP(K k1[N1],
-                                                KeyValuePair<K, V> v1[N1],
-                                                K k2[N2],
-                                                KeyValuePair<K, V> v2[N2])
-{
-  constexpr int kSmallestN = N1 < N2 ? N1 : N2;
-
-#pragma unroll
-  for (int i = 0; i < kSmallestN; ++i) {
-    K& ka                  = k1[N1 - 1 - i];
-    KeyValuePair<K, V>& va = v1[N1 - 1 - i];
-
-    K& kb                  = k2[i];
-    KeyValuePair<K, V>& vb = v2[i];
-
-    K otherKa;
-    KeyValuePair<K, V> otherVa;
-
-    if (FullMerge) {
-      // We need the other values
-      otherKa    = shfl_xor(ka, kWarpSize - 1);
-      K otherVak = shfl_xor(va.key, kWarpSize - 1);
-      V otherVav = shfl_xor(va.value, kWarpSize - 1);
-      otherVa    = KeyValuePair(otherVak, otherVav);
-    }
-
-    K otherKb  = shfl_xor(kb, kWarpSize - 1);
-    K otherVbk = shfl_xor(vb.key, kWarpSize - 1);
-    V otherVbv = shfl_xor(vb.value, kWarpSize - 1);
-
-    // ka is always first in the list, so we needn't use our lane
-    // in this comparison
-    bool swapa = Dir ? Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb);
-    assign(swapa, ka, otherKb);
-    assign(swapa, va.key, otherVbk);
-    assign(swapa, va.value, otherVbv);
-
-    // kb is always second in the list, so we needn't use our lane
-    // in this comparison
-    if (FullMerge) {
-      bool swapb = Dir ? Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa);
-      assign(swapb, kb, otherKa);
-      assign(swapb, vb.key, otherVa.key);
-      assign(swapb, vb.value, otherVa.value);
-
-    } else {
-      // We don't care about updating elements in the second list
-    }
-  }
-
-  BitonicMergeStepKVP<K, V, N1, Dir, Comp, true, utils::isPowerOf2(N1)>::merge(k1, v1);
-  if (FullMerge) {
-    // Only if we care about N2 do we need to bother merging it fully
-    BitonicMergeStepKVP<K, V, N2, Dir, Comp, false, utils::isPowerOf2(N2)>::merge(k2, v2);
-  }
-}
-
-// Recursive template that uses the above bitonic merge to perform a
-// bitonic sort
-template <typename K, typename V, int N, bool Dir, typename Comp>
-struct BitonicSortStepKVP {
-  static inline __device__ void sort(K k[N], KeyValuePair<K, V> v[N])
-  {
-    static_assert(N > 1, "did not hit specialized case");
-
-    // Sort recursively
-    constexpr int kSizeA = N / 2;
-    constexpr int kSizeB = N - kSizeA;
-
-    K aK[kSizeA];
-    KeyValuePair<K, V> aV[kSizeA];
-
-#pragma unroll
-    for (int i = 0; i < kSizeA; ++i) {
-      aK[i]       = k[i];
-      aV[i].key   = v[i].key;
-      aV[i].value = v[i].value;
-    }
-
-    BitonicSortStepKVP<K, V, kSizeA, Dir, Comp>::sort(aK, aV);
-
-    K bK[kSizeB];
-    KeyValuePair<K, V> bV[kSizeB];
-
-#pragma unroll
-    for (int i = 0; i < kSizeB; ++i) {
-      bK[i]       = k[i + kSizeA];
-      bV[i].key   = v[i + kSizeA].key;
-      bV[i].value = v[i + kSizeA].value;
-    }
-
-    BitonicSortStepKVP<K, V, kSizeB, Dir, Comp>::sort(bK, bV);
-
-    // Merge halves
-    warpMergeAnyRegistersKVP<K, V, kSizeA, kSizeB, Dir, Comp>(aK, aV, bK, bV);
-
-#pragma unroll
-    for (int i = 0; i < kSizeA; ++i) {
-      k[i]       = aK[i];
-      v[i].key   = aV[i].key;
-      v[i].value = aV[i].value;
-    }
-
-#pragma unroll
-    for (int i = 0; i < kSizeB; ++i) {
-      k[i + kSizeA]       = bK[i];
-      v[i + kSizeA].key   = bV[i].key;
-      v[i + kSizeA].value = bV[i].value;
-    }
-  }
-};
-
-// Single warp (N == 1) sorting specialization
-template <typename K, typename V, bool Dir, typename Comp>
-struct BitonicSortStepKVP<K, V, 1, Dir, Comp> {
-  static inline __device__ void sort(K k[1], KeyValuePair<K, V> v[1])
-  {
-    // Update this code if this changes
-    // should go from 1 -> kWarpSize in multiples of 2
-    static_assert(kWarpSize == 32, "unexpected warp size");
-
-    warpBitonicMergeLE16KVP<K, V, 1, Dir, Comp, false>(k[0], v[0]);
-    warpBitonicMergeLE16KVP<K, V, 2, Dir, Comp, false>(k[0], v[0]);
-    warpBitonicMergeLE16KVP<K, V, 4, Dir, Comp, false>(k[0], v[0]);
-    warpBitonicMergeLE16KVP<K, V, 8, Dir, Comp, false>(k[0], v[0]);
-    warpBitonicMergeLE16KVP<K, V, 16, Dir, Comp, false>(k[0], v[0]);
-  }
-};
-
-/// Sort a list of kWarpSize * N elements in registers, where N is an
-/// arbitrary >= 1
-template <typename K, typename V, int N, bool Dir, typename Comp>
-inline __device__ void warpSortAnyRegistersKVP(K k[N], KeyValuePair<K, V> v[N])
-{
-  BitonicSortStepKVP<K, V, N, Dir, Comp>::sort(k, v);
-}
-
-// `Dir` true, produce largest values.
-// `Dir` false, produce smallest values.
-template <typename K,
-          typename V,
-          bool Dir,
-          typename Comp,
-          int NumWarpQ,
-          int NumThreadQ,
-          int ThreadsPerBlock>
-struct KeyValueWarpSelect {
-  static constexpr int kNumWarpQRegisters = NumWarpQ / faiss::gpu::kWarpSize;
-
-  __device__ inline KeyValueWarpSelect(K initKVal, faiss::gpu::KeyValuePair<K, V> initVVal, int k)
-    : initK(initKVal),
-      initV(initVVal),
-      numVals(0),
-      warpKTop(initKVal),
-      warpKTopRDist(initKVal),
-      kLane((k - 1) % faiss::gpu::kWarpSize)
-  {
-    static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2");
-    static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2");
-
-    // Fill the per-thread queue keys with the default value
-#pragma unroll
-    for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i]       = initK;
-      threadV[i].key   = initV.key;
-      threadV[i].value = initV.value;
-    }
-
-    // Fill the warp queue with the default value
-#pragma unroll
-    for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpK[i]       = initK;
-      warpV[i].key   = initV.key;
-      warpV[i].value = initV.value;
-    }
-  }
-
-  __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair<K, V>& v)
-  {
-    if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) {
-      // Rotate right
-#pragma unroll
-      for (int i = NumThreadQ - 1; i > 0; --i) {
-        threadK[i]       = threadK[i - 1];
-        threadV[i].key   = threadV[i - 1].key;
-        threadV[i].value = threadV[i - 1].value;
-      }
-
-      threadK[0]       = k;
-      threadV[0].key   = v.key;
-      threadV[0].value = v.value;
-      ++numVals;
-    }
-  }
-  /// This function handles sorting and merging together the
-  /// per-thread queues with the warp-wide queue, creating a sorted
-  /// list across both
-
-  // TODO
-  __device__ inline void mergeWarpQ()
-  {
-    // Sort all of the per-thread queues
-    faiss::gpu::warpSortAnyRegistersKVP<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);
-
-    // The warp queue is already sorted, and now that we've sorted the
-    // per-thread queue, merge both sorted lists together, producing
-    // one sorted list
-    faiss::gpu::warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
-      warpK, warpV, threadK, threadV);
-  }
-
-  /// WARNING: all threads in a warp must participate in this.
-  /// Otherwise, you must call the constituent parts separately.
-  __device__ inline void add(K k, faiss::gpu::KeyValuePair<K, V>& v)
-  {
-    addThreadQ(k, v);
-    checkThreadQ();
-  }
-
-  __device__ inline void reduce()
-  {
-    // Have all warps dump and merge their queues; this will produce
-    // the final per-warp results
-    mergeWarpQ();
-  }
-
-  __device__ inline void checkThreadQ()
-  {
-    bool needSort = (numVals == NumThreadQ);
-
-#if CUDA_VERSION >= 9000
-    needSort = __any_sync(0xffffffff, needSort);
-#else
-    needSort = __any(needSort);
-#endif
-
-    if (!needSort) {
-      // no lanes have triggered a sort
-      return;
-    }
-
-    mergeWarpQ();
-
-    // Any top-k elements have been merged into the warp queue; we're
-    // free to reset the thread queues
-    numVals = 0;
-
-#pragma unroll
-    for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i]       = initK;
-      threadV[i].key   = initV.key;
-      threadV[i].value = initV.value;
-    }
-
-    // We have to beat at least this element
-    warpKTopRDist = shfl(warpV[kNumWarpQRegisters - 1].key, kLane);
-    warpKTop      = shfl(warpK[kNumWarpQRegisters - 1], kLane);
-  }
-
-  /// Dump final k selected values for this warp out
-  __device__ inline void writeOut(K* outK, V* outV, int k)
-  {
-    int laneId = faiss::gpu::getLaneId();
-
-#pragma unroll
-    for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      int idx = i * faiss::gpu::kWarpSize + laneId;
-
-      if (idx < k) {
-        outK[idx] = warpK[i];
-        outV[idx] = warpV[i].value;
-      }
-    }
-  }
-
-  // Default element key
-  const K initK;
-
-  // Default element value
-  const faiss::gpu::KeyValuePair<K, V> initV;
-
-  // Number of valid elements in our thread queue
-  int numVals;
-
-  // The k-th highest (Dir) or lowest (!Dir) element
-  K warpKTop;
-
-  // TopK's distance to closest landmark
-  K warpKTopRDist;
-
-  // Thread queue values
-  K threadK[NumThreadQ];
-  faiss::gpu::KeyValuePair<K, V> threadV[NumThreadQ];
-
-  // warpK[0] is highest (Dir) or lowest (!Dir)
-  K warpK[kNumWarpQRegisters];
-  faiss::gpu::KeyValuePair<K, V> warpV[kNumWarpQRegisters];
-
-  // This is what lane we should load an approximation (>=k) to the
-  // kth element from the last register in the warp queue (i.e.,
-  // warpK[kNumWarpQRegisters - 1]).
-  int kLane;
-};
-
-}  // namespace gpu
-}  // namespace faiss
diff --git a/cpp/include/raft/spatial/knn/knn.cuh b/cpp/include/raft/spatial/knn/knn.cuh
index 95f7aab9da..ca2c248392 100644
--- a/cpp/include/raft/spatial/knn/knn.cuh
+++ b/cpp/include/raft/spatial/knn/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,12 +18,11 @@
 
 #include "detail/knn_brute_force_faiss.cuh"
 #include "detail/selection_faiss.cuh"
-#include <raft/core/device_mdspan.hpp>
-
-#include "detail/topk/radix_topk.cuh"
-#include "detail/topk/warpsort_topk.cuh"
 
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/matrix/detail/select_radix.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
 
 namespace raft::spatial::knn {
 
@@ -88,6 +87,8 @@ enum class SelectKAlgo {
  * Note, depending on the selected algorithm, the values within rows of `out_keys` are not
  * necessarily sorted. See the `SelectKAlgo` enumeration for more details.
  *
+ * Note: This call is deprecated, please use `raft/matrix/select_k.cuh`
+ *
  * @tparam idx_t
  *   the payload type (what is being selected together with the keys).
  * @tparam value_t
@@ -122,16 +123,17 @@ enum class SelectKAlgo {
  *   the implementation of the algorithm
  */
 template <typename idx_t = int, typename value_t = float>
-inline void select_k(const value_t* in_keys,
-                     const idx_t* in_values,
-                     size_t n_inputs,
-                     size_t input_len,
-                     value_t* out_keys,
-                     idx_t* out_values,
-                     bool select_min,
-                     int k,
-                     cudaStream_t stream,
-                     SelectKAlgo algo = SelectKAlgo::FAISS)
+[[deprecated("Use function `select_k` from `raft/matrix/select_k.cuh`")]] inline void select_k(
+  const value_t* in_keys,
+  const idx_t* in_values,
+  size_t n_inputs,
+  size_t input_len,
+  value_t* out_keys,
+  idx_t* out_values,
+  bool select_min,
+  int k,
+  cudaStream_t stream,
+  SelectKAlgo algo = SelectKAlgo::FAISS)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("select-%s-%d (%zu, %zu) algo-%d",
                                                             select_min ? "min" : "max",
@@ -151,17 +153,17 @@ inline void select_k(const value_t* in_keys,
       break;
 
     case SelectKAlgo::RADIX_8_BITS:
-      detail::topk::radix_topk<value_t, idx_t, 8, 512>(
+      matrix::detail::select::radix::select_k<value_t, idx_t, 8, 512>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream);
       break;
 
     case SelectKAlgo::RADIX_11_BITS:
-      detail::topk::radix_topk<value_t, idx_t, 11, 512>(
+      matrix::detail::select::radix::select_k<value_t, idx_t, 11, 512>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream);
       break;
 
     case SelectKAlgo::WARP_SORT:
-      detail::topk::warp_sort_topk<value_t, idx_t>(
+      matrix::detail::select::warpsort::select_k<value_t, idx_t>(
         in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream);
       break;
 
@@ -193,7 +195,7 @@ inline void select_k(const value_t* in_keys,
  *            as input vector.
  */
 template <typename idx_t = std::int64_t, typename value_t = float, typename value_int = int>
-void brute_force_knn(raft::handle_t const& handle,
+void brute_force_knn(raft::device_resources const& handle,
                      std::vector<value_t*>& input,
                      std::vector<value_int>& sizes,
                      value_int D,
diff --git a/cpp/include/raft/spectral/cluster_solvers.cuh b/cpp/include/raft/spectral/cluster_solvers.cuh
index 6f9ebcd6af..1cb7aefd13 100644
--- a/cpp/include/raft/spectral/cluster_solvers.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ struct kmeans_solver_t {
   {
   }
 
-  std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+  std::pair<value_type_t, index_type_t> solve(raft::device_resources const& handle,
                                               size_type_t n_obs_vecs,
                                               size_type_t dim,
                                               value_type_t const* __restrict__ obs,
diff --git a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
index 0beb1e5836..17dcf6b07c 100644
--- a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,7 +52,7 @@ struct kmeans_solver_deprecated_t {
   {
   }
 
-  std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+  std::pair<value_type_t, index_type_t> solve(raft::device_resources const& handle,
                                               size_type_t n_obs_vecs,
                                               size_type_t dim,
                                               value_type_t const* __restrict__ obs,
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index f225438841..e32b718117 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/util/cudart_utils.hpp>
@@ -89,7 +89,7 @@ struct vector_view_t {
 template <typename value_type>
 class vector_t {
  public:
-  vector_t(handle_t const& raft_handle, size_type sz)
+  vector_t(device_resources const& raft_handle, size_type sz)
     : buffer_(sz, raft_handle.get_stream()), thrust_policy(raft_handle.get_thrust_policy())
   {
   }
@@ -128,7 +128,7 @@ class vector_t {
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
-  sparse_matrix_t(handle_t const& raft_handle,
+  sparse_matrix_t(device_resources const& raft_handle,
                   index_type const* row_offsets,
                   index_type const* col_indices,
                   value_type const* values,
@@ -145,7 +145,7 @@ struct sparse_matrix_t {
   {
   }
 
-  sparse_matrix_t(handle_t const& raft_handle,
+  sparse_matrix_t(device_resources const& raft_handle,
                   index_type const* row_offsets,
                   index_type const* col_indices,
                   value_type const* values,
@@ -162,7 +162,7 @@ struct sparse_matrix_t {
   }
 
   template <typename CSRView>
-  sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view)
+  sparse_matrix_t(device_resources const& raft_handle, CSRView const& csr_view)
     : handle_(raft_handle),
       row_offsets_(csr_view.offsets),
       col_indices_(csr_view.indices),
@@ -276,7 +276,7 @@ struct sparse_matrix_t {
 #endif
   }
 
-  handle_t const& get_handle(void) const { return handle_; }
+  device_resources const& get_handle(void) const { return handle_; }
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
   cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const
@@ -292,7 +292,7 @@ struct sparse_matrix_t {
   // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence,
   // aggregate
 
-  handle_t const& handle_;
+  raft::device_resources const& handle_;
   index_type const* row_offsets_;
   index_type const* col_indices_;
   value_type const* values_;
@@ -303,7 +303,7 @@ struct sparse_matrix_t {
 
 template <typename index_type, typename value_type>
 struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
-  laplacian_matrix_t(handle_t const& raft_handle,
+  laplacian_matrix_t(device_resources const& raft_handle,
                      index_type const* row_offsets,
                      index_type const* col_indices,
                      value_type const* values,
@@ -318,7 +318,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
-  laplacian_matrix_t(handle_t const& raft_handle,
+  laplacian_matrix_t(device_resources const& raft_handle,
                      sparse_matrix_t<index_type, value_type> const& csr_m)
     : sparse_matrix_t<index_type, value_type>(raft_handle,
                                               csr_m.row_offsets_,
@@ -376,7 +376,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
 template <typename index_type, typename value_type>
 struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
-  modularity_matrix_t(handle_t const& raft_handle,
+  modularity_matrix_t(device_resources const& raft_handle,
                       index_type const* row_offsets,
                       index_type const* col_indices,
                       value_type const* values,
@@ -388,7 +388,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
     edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
   }
 
-  modularity_matrix_t(handle_t const& raft_handle,
+  modularity_matrix_t(device_resources const& raft_handle,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
     : laplacian_matrix_t<index_type, value_type>(raft_handle, csr_m)
   {
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index b60ca719fb..160664bae8 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,7 +81,7 @@ namespace detail {
  */
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
@@ -140,7 +140,7 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  *  @param modularity On exit, modularity
  */
 template <typename vertex_t, typename weight_t>
-void analyzeModularity(handle_t const& handle,
+void analyzeModularity(raft::device_resources const& handle,
                        raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                        vertex_t nClusters,
                        vertex_t const* __restrict__ clusters,
diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
index 1e0cc78826..6750f5d93f 100644
--- a/cpp/include/raft/spectral/detail/partition.hpp
+++ b/cpp/include/raft/spectral/detail/partition.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ namespace detail {
  */
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> partition(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
@@ -131,7 +131,7 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @return error flag.
  */
 template <typename vertex_t, typename weight_t>
-void analyzePartition(handle_t const& handle,
+void analyzePartition(raft::device_resources const& handle,
                       spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                       vertex_t nClusters,
                       const vertex_t* __restrict__ clusters,
diff --git a/cpp/include/raft/spectral/detail/spectral_util.cuh b/cpp/include/raft/spectral/detail/spectral_util.cuh
index 3a0ad1f96f..ae75031522 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.cuh
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -72,7 +72,7 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_ty
 
   // scale by alpha
   alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
-  alpha = std::sqrt(alpha);
+  alpha = raft::sqrt(alpha);
   for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
       index      = i + j * m;
@@ -116,7 +116,10 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, weight_t* eigVecs)
+void transform_eigen_matrix(raft::device_resources const& handle,
+                            edge_t n,
+                            vertex_t nEigVecs,
+                            weight_t* eigVecs)
 {
   auto stream             = handle.get_stream();
   auto cublas_h           = handle.get_cublas_handle();
@@ -207,7 +210,7 @@ struct equal_to_i_op {
 // Construct indicator vector for ith partition
 //
 template <typename vertex_t, typename edge_t, typename weight_t>
-bool construct_indicator(handle_t const& handle,
+bool construct_indicator(raft::device_resources const& handle,
                          edge_t index,
                          edge_t n,
                          weight_t& clustersize,
diff --git a/cpp/include/raft/spectral/eigen_solvers.cuh b/cpp/include/raft/spectral/eigen_solvers.cuh
index 88e4abe513..3f6959d2e2 100644
--- a/cpp/include/raft/spectral/eigen_solvers.cuh
+++ b/cpp/include/raft/spectral/eigen_solvers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ struct lanczos_solver_t {
   }
 
   index_type_t solve_smallest_eigenvectors(
-    handle_t const& handle,
+    raft::device_resources const& handle,
     matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
@@ -73,7 +73,7 @@ struct lanczos_solver_t {
   }
 
   index_type_t solve_largest_eigenvectors(
-    handle_t const& handle,
+    raft::device_resources const& handle,
     matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
diff --git a/cpp/include/raft/spectral/modularity_maximization.cuh b/cpp/include/raft/spectral/modularity_maximization.cuh
index 61d85aefaa..29d432c042 100644
--- a/cpp/include/raft/spectral/modularity_maximization.cuh
+++ b/cpp/include/raft/spectral/modularity_maximization.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ namespace spectral {
  */
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
@@ -70,7 +70,7 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  *  @param modularity On exit, modularity
  */
 template <typename vertex_t, typename weight_t>
-void analyzeModularity(handle_t const& handle,
+void analyzeModularity(raft::device_resources const& handle,
                        matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                        vertex_t nClusters,
                        vertex_t const* __restrict__ clusters,
diff --git a/cpp/include/raft/spectral/partition.cuh b/cpp/include/raft/spectral/partition.cuh
index 2d21f2223c..0dec230752 100644
--- a/cpp/include/raft/spectral/partition.cuh
+++ b/cpp/include/raft/spectral/partition.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ namespace spectral {
  */
 template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> partition(
-  handle_t const& handle,
+  raft::device_resources const& handle,
   matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
@@ -78,7 +78,7 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @param cost On exit, partition cost function.
  */
 template <typename vertex_t, typename weight_t>
-void analyzePartition(handle_t const& handle,
+void analyzePartition(raft::device_resources const& handle,
                       matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                       vertex_t nClusters,
                       const vertex_t* __restrict__ clusters,
diff --git a/cpp/include/raft/stats/accuracy.cuh b/cpp/include/raft/stats/accuracy.cuh
index 37cdc280f9..7a5780fbc9 100644
--- a/cpp/include/raft/stats/accuracy.cuh
+++ b/cpp/include/raft/stats/accuracy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,6 +40,11 @@ float accuracy(const math_t* predictions, const math_t* ref_predictions, int n,
   return detail::accuracy_score(predictions, ref_predictions, n, stream);
 }
 
+/**
+ * @defgroup stats_accuracy Accuracy Score
+ * @{
+ */
+
 /**
  * @brief Compute accuracy of predictions. Useful for classification.
  * @tparam value_t: data type for predictions (e.g., int for classification)
@@ -50,7 +55,7 @@ float accuracy(const math_t* predictions, const math_t* ref_predictions, int n,
  * @return: Accuracy score in [0, 1]; higher is better.
  */
 template <typename value_t, typename idx_t>
-float accuracy(const raft::handle_t& handle,
+float accuracy(raft::device_resources const& handle,
                raft::device_vector_view<const value_t, idx_t> predictions,
                raft::device_vector_view<const value_t, idx_t> ref_predictions)
 {
@@ -63,6 +68,9 @@ float accuracy(const raft::handle_t& handle,
                                 predictions.extent(0),
                                 handle.get_stream());
 }
+
+/** @} */  // end group stats_accuracy
+
 }  // namespace stats
 }  // namespace raft
 
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index 93fd07eb0b..d2815fe753 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,6 +49,11 @@ double adjusted_rand_index(const T* firstClusterArray,
   return detail::compute_adjusted_rand_index(firstClusterArray, secondClusterArray, size, stream);
 }
 
+/**
+ * @defgroup stats_adj_rand_index Adjusted Rand Index
+ * @{
+ */
+
 /**
  * @brief Function to calculate Adjusted RandIndex
  * @see https://en.wikipedia.org/wiki/Rand_index
@@ -61,7 +66,7 @@ double adjusted_rand_index(const T* firstClusterArray,
  * @return the Adjusted RandIndex
  */
 template <typename value_t, typename math_t, typename idx_t>
-double adjusted_rand_index(const raft::handle_t& handle,
+double adjusted_rand_index(raft::device_resources const& handle,
                            raft::device_vector_view<const value_t, idx_t> first_cluster_array,
                            raft::device_vector_view<const value_t, idx_t> second_cluster_array)
 {
@@ -75,6 +80,8 @@ double adjusted_rand_index(const raft::handle_t& handle,
                                                               handle.get_stream());
 }
 
+/** @} */  // end group stats_adj_rand_index
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index fd535e77d5..17ff658ac8 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,6 +47,11 @@ double completeness_score(const T* truthClusterArray,
     predClusterArray, truthClusterArray, size, lower_label_range, upper_label_range, stream);
 }
 
+/**
+ * @defgroup stats_completeness Completeness Score
+ * @{
+ */
+
 /**
  * @brief Function to calculate the completeness score between two clusters
  *
@@ -60,7 +65,7 @@ double completeness_score(const T* truthClusterArray,
  * @return the cluster completeness score
  */
 template <typename value_t, typename idx_t>
-double completeness_score(const raft::handle_t& handle,
+double completeness_score(raft::device_resources const& handle,
                           raft::device_vector_view<const value_t, idx_t> truth_cluster_array,
                           raft::device_vector_view<const value_t, idx_t> pred_cluster_array,
                           value_t lower_label_range,
@@ -77,6 +82,8 @@ double completeness_score(const raft::handle_t& handle,
                                    handle.get_stream());
 }
 
+/** @} */  // end group stats_completeness
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/contingency_matrix.cuh b/cpp/include/raft/stats/contingency_matrix.cuh
index f36d95daff..e309e8b4b9 100644
--- a/cpp/include/raft/stats/contingency_matrix.cuh
+++ b/cpp/include/raft/stats/contingency_matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/stats/detail/contingencyMatrix.cuh>
 
@@ -44,31 +44,6 @@ void getInputClassCardinality(
   detail::getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel);
 }
 
-/**
- * @brief use this to allocate output matrix size
- * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
- * @tparam value_t label type
- * @tparam idx_t Index type of matrix extent.
- * @param[in]  handle: the raft handle.
- * @param[in]  groundTruth: device 1-d array for ground truth (num of rows)
- * @param[out] minLabel: calculated min value in input array
- * @param[out] maxLabel: calculated max value in input array
- */
-template <typename value_t, typename idx_t>
-void get_input_class_cardinality(const raft::handle_t& handle,
-                                 raft::device_vector_view<const value_t, idx_t> groundTruth,
-                                 raft::host_scalar_view<value_t> minLabel,
-                                 raft::host_scalar_view<value_t> maxLabel)
-{
-  RAFT_EXPECTS(minLabel.data_handle() != nullptr, "Invalid minLabel pointer");
-  RAFT_EXPECTS(maxLabel.data_handle() != nullptr, "Invalid maxLabel pointer");
-  detail::getInputClassCardinality(groundTruth.data_handle(),
-                                   groundTruth.extent(0),
-                                   handle.get_stream(),
-                                   *minLabel.data_handle(),
-                                   *maxLabel.data_handle());
-}
-
 /**
  * @brief Calculate workspace size for running contingency matrix calculations
  * @tparam T label type
@@ -129,6 +104,36 @@ void contingencyMatrix(const T* groundTruth,
                                      maxLabel);
 }
 
+/**
+ * @defgroup contingency_matrix Contingency Matrix
+ * @{
+ */
+
+/**
+ * @brief use this to allocate output matrix size
+ * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
+ * @tparam value_t label type
+ * @tparam idx_t Index type of matrix extent.
+ * @param[in]  handle: the raft handle.
+ * @param[in]  groundTruth: device 1-d array for ground truth (num of rows)
+ * @param[out] minLabel: calculated min value in input array
+ * @param[out] maxLabel: calculated max value in input array
+ */
+template <typename value_t, typename idx_t>
+void get_input_class_cardinality(raft::device_resources const& handle,
+                                 raft::device_vector_view<const value_t, idx_t> groundTruth,
+                                 raft::host_scalar_view<value_t> minLabel,
+                                 raft::host_scalar_view<value_t> maxLabel)
+{
+  RAFT_EXPECTS(minLabel.data_handle() != nullptr, "Invalid minLabel pointer");
+  RAFT_EXPECTS(maxLabel.data_handle() != nullptr, "Invalid maxLabel pointer");
+  detail::getInputClassCardinality(groundTruth.data_handle(),
+                                   groundTruth.extent(0),
+                                   handle.get_stream(),
+                                   *minLabel.data_handle(),
+                                   *maxLabel.data_handle());
+}
+
 /**
  * @brief construct contingency matrix given input ground truth and prediction
  *        labels. Users should call function getInputClassCardinality to find
@@ -153,7 +158,7 @@ template <typename value_t,
           typename layout_t,
           typename opt_min_label_t,
           typename opt_max_label_t>
-void contingency_matrix(const raft::handle_t& handle,
+void contingency_matrix(raft::device_resources const& handle,
                         raft::device_vector_view<const value_t, idx_t> ground_truth,
                         raft::device_vector_view<const value_t, idx_t> predicted_label,
                         raft::device_matrix_view<out_t, idx_t, layout_t> out_mat,
@@ -191,6 +196,8 @@ void contingency_matrix(const raft::handle_t& handle,
                                             max_label_value);
 }
 
+/** @} */  // end group contingency_matrix
+
 /**
  * @brief Overload of `contingency_matrix` to help the
  *   compiler find the above overload, in case users pass in
diff --git a/cpp/include/raft/stats/cov.cuh b/cpp/include/raft/stats/cov.cuh
index a0c2ed2090..c0c387e067 100644
--- a/cpp/include/raft/stats/cov.cuh
+++ b/cpp/include/raft/stats/cov.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ namespace stats {
  * function returns!
  */
 template <typename Type>
-void cov(const raft::handle_t& handle,
+void cov(raft::device_resources const& handle,
          Type* covar,
          Type* data,
          const Type* mu,
@@ -59,6 +59,11 @@ void cov(const raft::handle_t& handle,
   detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
 }
 
+/**
+ * @defgroup stats_cov Covariance Matrix Construction
+ * @{
+ */
+
 /**
  * @brief Compute covariance of the input matrix
  *
@@ -80,7 +85,7 @@ void cov(const raft::handle_t& handle,
  * function returns!
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void cov(const raft::handle_t& handle,
+void cov(raft::device_resources const& handle,
          raft::device_matrix_view<value_t, idx_t, layout_t> data,
          raft::device_vector_view<const value_t, idx_t> mu,
          raft::device_matrix_view<value_t, idx_t, layout_t> covar,
@@ -107,6 +112,9 @@ void cov(const raft::handle_t& handle,
               stable,
               handle.get_stream());
 }
+
+/** @} */  // end group stats_cov
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/detail/batched/silhouette_score.cuh b/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
index 25a3721af1..a184fe22ef 100644
--- a/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
+++ b/cpp/include/raft/stats/detail/batched/silhouette_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,7 +111,7 @@ __global__ void compute_chunked_a_b_kernel(value_t* a,
 }
 
 template <typename value_idx, typename label_idx>
-rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,
+rmm::device_uvector<value_idx> get_cluster_counts(raft::device_resources const& handle,
                                                   const label_idx* y,
                                                   value_idx& n_rows,
                                                   label_idx& n_labels)
@@ -128,7 +128,7 @@ rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,
 }
 
 template <typename value_t, typename value_idx>
-rmm::device_uvector<value_t> get_pairwise_distance(const raft::handle_t& handle,
+rmm::device_uvector<value_t> get_pairwise_distance(raft::device_resources const& handle,
                                                    const value_t* left_begin,
                                                    const value_t* right_begin,
                                                    value_idx& n_left_rows,
@@ -146,7 +146,7 @@ rmm::device_uvector<value_t> get_pairwise_distance(const raft::handle_t& handle,
 }
 
 template <typename value_t, typename value_idx, typename label_idx>
-void compute_chunked_a_b(const raft::handle_t& handle,
+void compute_chunked_a_b(raft::device_resources const& handle,
                          value_t* a,
                          value_t* b,
                          value_idx& row_offset,
@@ -169,7 +169,7 @@ void compute_chunked_a_b(const raft::handle_t& handle,
 
 template <typename value_t, typename value_idx, typename label_idx>
 value_t silhouette_score(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const value_t* X,
   value_idx n_rows,
   value_idx n_cols,
@@ -249,19 +249,18 @@ value_t silhouette_score(
 
   // calculating row-wise minimum in b
   // this prim only supports int indices for now
-  raft::linalg::
-    reduce<value_t, value_t, value_idx, raft::Nop<value_t>, raft::stats::detail::MinOp<value_t>>(
-      b_ptr,
-      b_ptr,
-      n_labels,
-      n_rows,
-      std::numeric_limits<value_t>::max(),
-      true,
-      true,
-      stream,
-      false,
-      raft::Nop<value_t>(),
-      raft::stats::detail::MinOp<value_t>());
+  raft::linalg::reduce<value_t, value_t, value_idx, raft::identity_op, raft::min_op>(
+    b_ptr,
+    b_ptr,
+    n_labels,
+    n_rows,
+    std::numeric_limits<value_t>::max(),
+    true,
+    true,
+    stream,
+    false,
+    raft::identity_op(),
+    raft::min_op());
 
   // calculating the silhouette score per sample
   raft::linalg::binaryOp<value_t, raft::stats::detail::SilOp<value_t>, value_t, value_idx>(
diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh
index 24de58dd91..0561ac269b 100644
--- a/cpp/include/raft/stats/detail/cov.cuh
+++ b/cpp/include/raft/stats/detail/cov.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ namespace detail {
  * function returns!
  */
 template <typename Type>
-void cov(const raft::handle_t& handle,
+void cov(raft::device_resources const& handle,
          Type* covar,
          Type* data,
          const Type* mu,
diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh
index 61017511b1..6e1c07e1e3 100644
--- a/cpp/include/raft/stats/detail/mean_center.cuh
+++ b/cpp/include/raft/stats/detail/mean_center.cuh
@@ -49,15 +49,7 @@ void meanCenter(Type* out,
                 cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; },
-    stream);
+    out, data, mu, D, N, rowMajor, bcastAlongRows, raft::sub_op{}, stream);
 }
 
 /**
@@ -85,15 +77,7 @@ void meanAdd(Type* out,
              cudaStream_t stream)
 {
   raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; },
-    stream);
+    out, data, mu, D, N, rowMajor, bcastAlongRows, raft::add_op{}, stream);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/stats/detail/silhouette_score.cuh b/cpp/include/raft/stats/detail/silhouette_score.cuh
index 076d9b13e5..f3839b99c8 100644
--- a/cpp/include/raft/stats/detail/silhouette_score.cuh
+++ b/cpp/include/raft/stats/detail/silhouette_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <iostream>
 #include <math.h>
 #include <numeric>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/add.cuh>
@@ -172,20 +173,6 @@ struct SilOp {
   }
 };
 
-/**
- * @brief structure that defines the reduction Lambda to find minimum between elements
- */
-template <typename DataT>
-struct MinOp {
-  HDI DataT operator()(DataT a, DataT b)
-  {
-    if (a > b)
-      return b;
-    else
-      return a;
-  }
-};
-
 /**
  * @brief main function that returns the average silhouette score for a given set of data and its
  * clusterings
@@ -204,7 +191,7 @@ struct MinOp {
  */
 template <typename DataT, typename LabelT>
 DataT silhouette_score(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const DataT* X_in,
   int nRows,
   int nCols,
@@ -278,19 +265,19 @@ DataT silhouette_score(
   RAFT_CUDA_TRY(cudaMemsetAsync(
     averageDistanceBetweenSampleAndCluster.data(), 0, nRows * nLabels * sizeof(DataT), stream));
 
-  raft::linalg::matrixVectorOp<DataT, DivOp<DataT>>(averageDistanceBetweenSampleAndCluster.data(),
-                                                    sampleToClusterSumOfDistances.data(),
-                                                    binCountArray.data(),
-                                                    binCountArray.data(),
-                                                    nLabels,
-                                                    nRows,
-                                                    true,
-                                                    true,
-                                                    DivOp<DataT>(),
-                                                    stream);
+  raft::linalg::matrixVectorOp(averageDistanceBetweenSampleAndCluster.data(),
+                               sampleToClusterSumOfDistances.data(),
+                               binCountArray.data(),
+                               binCountArray.data(),
+                               nLabels,
+                               nRows,
+                               true,
+                               true,
+                               DivOp<DataT>(),
+                               stream);
 
   // calculating row-wise minimum
-  raft::linalg::reduce<DataT, DataT, int, raft::Nop<DataT>, MinOp<DataT>>(
+  raft::linalg::reduce<DataT, DataT, int, raft::identity_op, raft::min_op>(
     d_bArray.data(),
     averageDistanceBetweenSampleAndCluster.data(),
     nLabels,
@@ -300,8 +287,8 @@ DataT silhouette_score(
     true,
     stream,
     false,
-    raft::Nop<DataT>(),
-    MinOp<DataT>());
+    raft::identity_op{},
+    raft::min_op{});
 
   // calculating the silhouette score per sample using the d_aArray and d_bArray
   raft::linalg::binaryOp<DataT, SilOp<DataT>>(
@@ -311,12 +298,12 @@ DataT silhouette_score(
   rmm::device_scalar<DataT> d_avgSilhouetteScore(stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(d_avgSilhouetteScore.data(), 0, sizeof(DataT), stream));
 
-  raft::linalg::mapThenSumReduce<double, raft::Nop<DataT>>(d_avgSilhouetteScore.data(),
-                                                           nRows,
-                                                           raft::Nop<DataT>(),
-                                                           stream,
-                                                           perSampleSilScore,
-                                                           perSampleSilScore);
+  raft::linalg::mapThenSumReduce<double, raft::identity_op>(d_avgSilhouetteScore.data(),
+                                                            nRows,
+                                                            raft::identity_op(),
+                                                            stream,
+                                                            perSampleSilScore,
+                                                            perSampleSilScore);
 
   DataT avgSilhouetteScore = d_avgSilhouetteScore.value(stream);
 
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index ccea2ea5da..2f7e22ca8a 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ __global__ void stddevKernelColMajor(
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); }
+  if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); }
 }
 
 template <typename Type, typename IdxType, int TPB>
@@ -126,7 +126,7 @@ void stddev(Type* std,
       std,
       mu,
       D,
-      [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); },
+      [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); },
       stream);
   } else {
     stddevKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(std, data, mu, D, N);
diff --git a/cpp/include/raft/stats/detail/trustworthiness_score.cuh b/cpp/include/raft/stats/detail/trustworthiness_score.cuh
index feb3fe607d..23f84754da 100644
--- a/cpp/include/raft/stats/detail/trustworthiness_score.cuh
+++ b/cpp/include/raft/stats/detail/trustworthiness_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -87,7 +87,7 @@ __global__ void compute_rank(double* rank,
  * @param[out] distances KNN distances
  */
 template <raft::distance::DistanceType distance_type, typename math_t>
-void run_knn(const raft::handle_t& h,
+void run_knn(const raft::device_resources& h,
              math_t* input,
              int n,
              int d,
@@ -128,7 +128,7 @@ void run_knn(const raft::handle_t& h,
  * @return Trustworthiness score
  */
 template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(const raft::handle_t& h,
+double trustworthiness_score(const raft::device_resources& h,
                              const math_t* X,
                              math_t* X_embedded,
                              int n,
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index 43dbe4e7f1..ada0995f7d 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/linalg/reduce.cuh>
 #include <raft/stats/sum.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 namespace raft {
@@ -66,8 +67,8 @@ void weightedMean(Type* mu,
     stream,
     false,
     [weights] __device__(Type v, IdxType i) { return v * weights[i]; },
-    [] __device__(Type a, Type b) { return a + b; },
-    [WS] __device__(Type v) { return v / WS; });
+    raft::add_op{},
+    raft::div_const_op<Type>(WS));
 }
 };  // end namespace detail
 };  // end namespace stats
diff --git a/cpp/include/raft/stats/dispersion.cuh b/cpp/include/raft/stats/dispersion.cuh
index 9f995e4d5a..8600305d9e 100644
--- a/cpp/include/raft/stats/dispersion.cuh
+++ b/cpp/include/raft/stats/dispersion.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,6 +57,11 @@ DataT dispersion(const DataT* centroids,
     centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream);
 }
 
+/**
+ * @defgroup stats_cluster_dispersion Cluster Dispersion Metric
+ * @{
+ */
+
 /**
  * @brief Compute cluster dispersion metric. This is very useful for
  * automatically finding the 'k' (in kmeans) that improves this metric.
@@ -76,7 +81,7 @@ DataT dispersion(const DataT* centroids,
  */
 template <typename value_t, typename idx_t>
 value_t cluster_dispersion(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> centroids,
   raft::device_vector_view<const idx_t, idx_t> cluster_sizes,
   std::optional<raft::device_vector_view<value_t, idx_t>> global_centroid,
@@ -101,6 +106,8 @@ value_t cluster_dispersion(
                                             handle.get_stream());
 }
 
+/** @} */  // end group stats_cluster_dispersion
+
 /**
  * @brief Overload of `cluster_dispersion` to help the
  *   compiler find the above overload, in case users pass in
@@ -110,7 +117,7 @@ value_t cluster_dispersion(
  */
 template <typename value_t, typename idx_t>
 value_t cluster_dispersion(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> centroids,
   raft::device_vector_view<const idx_t, idx_t> cluster_sizes,
   std::nullopt_t global_centroid,
diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh
index 8a98a03c6b..d59dc8e37a 100644
--- a/cpp/include/raft/stats/entropy.cuh
+++ b/cpp/include/raft/stats/entropy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,6 +46,11 @@ double entropy(const T* clusterArray,
   return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
+/**
+ * @defgroup stats_entropy Entropy
+ * @{
+ */
+
 /**
  * @brief Function to calculate entropy
  * <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">more info on entropy</a>
@@ -59,7 +64,7 @@ double entropy(const T* clusterArray,
  * @return the entropy score
  */
 template <typename value_t, typename idx_t>
-double entropy(const raft::handle_t& handle,
+double entropy(raft::device_resources const& handle,
                raft::device_vector_view<const value_t, idx_t> cluster_array,
                const value_t lower_label_range,
                const value_t upper_label_range)
@@ -71,6 +76,9 @@ double entropy(const raft::handle_t& handle,
                          upper_label_range,
                          handle.get_stream());
 }
+
+/** @} */  // end group stats_entropy
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh
index 4ad5de0926..f829b0317e 100644
--- a/cpp/include/raft/stats/histogram.cuh
+++ b/cpp/include/raft/stats/histogram.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,6 +70,11 @@ void histogram(HistType type,
   detail::histogram<DataT, IdxT, BinnerOp>(type, bins, nbins, data, nrows, ncols, stream, binner);
 }
 
+/**
+ * @defgroup stats_histogram Histogram
+ * @{
+ */
+
 /**
  * @brief Perform histogram on the input data. It chooses the right load size
  * based on the input data vector length. It also supports large-bin cases
@@ -86,7 +91,7 @@ void histogram(HistType type,
  * @note signature of binner_op is `int func(value_t, IdxT);`
  */
 template <typename value_t, typename idx_t, typename binner_op = IdentityBinner<value_t, idx_t>>
-void histogram(const raft::handle_t& handle,
+void histogram(raft::device_resources const& handle,
                HistType type,
                raft::device_matrix_view<const value_t, idx_t, raft::col_major> data,
                raft::device_matrix_view<int, idx_t, raft::col_major> bins,
@@ -106,6 +111,9 @@ void histogram(const raft::handle_t& handle,
                                                handle.get_stream(),
                                                binner);
 }
+
+/** @} */  // end group stats_histogram
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/homogeneity_score.cuh b/cpp/include/raft/stats/homogeneity_score.cuh
index 91c479bc99..173d63e47e 100644
--- a/cpp/include/raft/stats/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/homogeneity_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,6 +48,11 @@ double homogeneity_score(const T* truthClusterArray,
     truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
+/**
+ * @defgroup stats_homogeneity_score Homogeneity Score
+ * @{
+ */
+
 /**
  * @brief Function to calculate the homogeneity score between two clusters
  * <a href="https://en.wikipedia.org/wiki/Homogeneity_(statistics)">more info on mutual
@@ -63,7 +68,7 @@ double homogeneity_score(const T* truthClusterArray,
  * @return the homogeneity score
  */
 template <typename value_t, typename idx_t>
-double homogeneity_score(const raft::handle_t& handle,
+double homogeneity_score(raft::device_resources const& handle,
                          raft::device_vector_view<const value_t, idx_t> truth_cluster_array,
                          raft::device_vector_view<const value_t, idx_t> pred_cluster_array,
                          value_t lower_label_range,
@@ -79,6 +84,9 @@ double homogeneity_score(const raft::handle_t& handle,
                                    upper_label_range,
                                    handle.get_stream());
 }
+
+/** @} */  // end group stats_homogeneity_score
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/information_criterion.cuh b/cpp/include/raft/stats/information_criterion.cuh
index 0edeed7f0b..b54f126859 100644
--- a/cpp/include/raft/stats/information_criterion.cuh
+++ b/cpp/include/raft/stats/information_criterion.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/batched/information_criterion.cuh>
 #include <raft/stats/stats_types.hpp>
 
@@ -65,11 +65,20 @@ void information_criterion_batched(ScalarT* d_ic,
     d_ic, d_loglikelihood, ic_type, n_params, batch_size, n_samples, stream);
 }
 
+/**
+ * @defgroup stats_information_criterion Information Criterion
+ * @{
+ */
+
 /**
  * Compute the given type of information criterion
  *
  * @note: it is safe to do the computation in-place (i.e give same pointer
  *        as input and output)
+ * See:
+ *  - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion
+ *  - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc
+ *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
  *
  * @tparam value_t data type
  * @tparam idx_t index type
@@ -82,7 +91,7 @@ void information_criterion_batched(ScalarT* d_ic,
  * @param[in]  n_samples        Number of samples in each series
  */
 template <typename value_t, typename idx_t>
-void information_criterion_batched(const raft::handle_t& handle,
+void information_criterion_batched(raft::device_resources const& handle,
                                    raft::device_vector_view<const value_t, idx_t> d_loglikelihood,
                                    raft::device_vector_view<value_t, idx_t> d_ic,
                                    IC_Type ic_type,
@@ -101,6 +110,8 @@ void information_criterion_batched(const raft::handle_t& handle,
                                          handle.get_stream());
 }
 
+/** @} */  // end group stats_information_criterion
+
 }  // namespace stats
 }  // namespace raft
 #endif
diff --git a/cpp/include/raft/stats/kl_divergence.cuh b/cpp/include/raft/stats/kl_divergence.cuh
index 265e87dc68..d27f736255 100644
--- a/cpp/include/raft/stats/kl_divergence.cuh
+++ b/cpp/include/raft/stats/kl_divergence.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,6 +42,11 @@ DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size,
   return detail::kl_divergence(modelPDF, candidatePDF, size, stream);
 }
 
+/**
+ * @defgroup kl_divergence Kullback-Leibler Divergence
+ * @{
+ */
+
 /**
  * @brief Function to calculate KL Divergence
  * <a href="https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence">more info on KL
@@ -55,7 +60,7 @@ DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size,
  * @return the KL Divergence value
  */
 template <typename value_t, typename idx_t>
-value_t kl_divergence(const raft::handle_t& handle,
+value_t kl_divergence(raft::device_resources const& handle,
                       raft::device_vector_view<const value_t, idx_t> modelPDF,
                       raft::device_vector_view<const value_t, idx_t> candidatePDF)
 {
@@ -66,6 +71,8 @@ value_t kl_divergence(const raft::handle_t& handle,
     modelPDF.data_handle(), candidatePDF.data_handle(), modelPDF.extent(0), handle.get_stream());
 }
 
+/** @} */  // end group kl_divergence
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 5a39e29a8c..a576e63bee 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/mean.cuh>
 
 namespace raft {
@@ -50,6 +50,11 @@ void mean(
   detail::mean(mu, data, D, N, sample, rowMajor, stream);
 }
 
+/**
+ * @defgroup stats_mean Mean
+ * @{
+ */
+
 /**
  * @brief Compute mean of the input matrix
  *
@@ -65,7 +70,7 @@ void mean(
  *   to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void mean(const raft::handle_t& handle,
+void mean(raft::device_resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
           raft::device_vector_view<value_t, idx_t> mu,
           bool sample)
@@ -85,6 +90,8 @@ void mean(const raft::handle_t& handle,
                handle.get_stream());
 }
 
+/** @} */  // end group stats_mean
+
 };  // namespace stats
 };  // namespace raft
 
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 9f49ff8be2..b333b3c8da 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,6 +52,38 @@ void meanCenter(Type* out,
   detail::meanCenter<Type, IdxType, TPB>(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
 }
 
+/**
+ * @brief Add the input matrix wrt its mean
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads per block of the cuda kernel launched
+ * @param out the output mean-added matrix
+ * @param data input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param rowMajor whether input is row or col major
+ * @param bcastAlongRows whether to broadcast vector along rows or columns
+ * @param stream cuda stream where to launch work
+ */
+template <typename Type, typename IdxType = int, int TPB = 256>
+void meanAdd(Type* out,
+             const Type* data,
+             const Type* mu,
+             IdxType D,
+             IdxType N,
+             bool rowMajor,
+             bool bcastAlongRows,
+             cudaStream_t stream)
+{
+  detail::meanAdd<Type, IdxType, TPB>(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
+}
+
+/**
+ * @defgroup stats_mean_center Mean Center
+ * @{
+ */
+
 /**
  * @brief Center the input matrix wrt its mean
  * @tparam value_t the data type
@@ -64,7 +96,7 @@ void meanCenter(Type* out,
  * @param[in]  bcast_along_rows whether to broadcast vector along rows or columns
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void mean_center(const raft::handle_t& handle,
+void mean_center(raft::device_resources const& handle,
                  raft::device_matrix_view<const value_t, idx_t, layout_t> data,
                  raft::device_vector_view<const value_t, idx_t> mu,
                  raft::device_matrix_view<value_t, idx_t, layout_t> out,
@@ -88,33 +120,6 @@ void mean_center(const raft::handle_t& handle,
                                      handle.get_stream());
 }
 
-/**
- * @brief Add the input matrix wrt its mean
- * @tparam Type the data type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads per block of the cuda kernel launched
- * @param out the output mean-added matrix
- * @param data input matrix
- * @param mu the mean vector
- * @param D number of columns of data
- * @param N number of rows of data
- * @param rowMajor whether input is row or col major
- * @param bcastAlongRows whether to broadcast vector along rows or columns
- * @param stream cuda stream where to launch work
- */
-template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type* out,
-             const Type* data,
-             const Type* mu,
-             IdxType D,
-             IdxType N,
-             bool rowMajor,
-             bool bcastAlongRows,
-             cudaStream_t stream)
-{
-  detail::meanAdd<Type, IdxType, TPB>(out, data, mu, D, N, rowMajor, bcastAlongRows, stream);
-}
-
 /**
  * @brief Add the input matrix wrt its mean
  * @tparam Type the data type
@@ -128,7 +133,7 @@ void meanAdd(Type* out,
  * @param[in]  bcast_along_rows whether to broadcast vector along rows or columns
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void mean_add(const raft::handle_t& handle,
+void mean_add(raft::device_resources const& handle,
               raft::device_matrix_view<const value_t, idx_t, layout_t> data,
               raft::device_vector_view<const value_t, idx_t> mu,
               raft::device_matrix_view<value_t, idx_t, layout_t> out,
@@ -151,6 +156,9 @@ void mean_add(const raft::handle_t& handle,
                                   bcast_along_rows,
                                   handle.get_stream());
 }
+
+/** @} */  // end group stats_mean_center
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/meanvar.cuh b/cpp/include/raft/stats/meanvar.cuh
index fab2184637..0ee21d1325 100644
--- a/cpp/include/raft/stats/meanvar.cuh
+++ b/cpp/include/raft/stats/meanvar.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,6 +56,11 @@ void meanvar(Type* mean,
   detail::meanvar(mean, var, data, D, N, sample, rowMajor, stream);
 }
 
+/**
+ * @defgroup stats_mean_var Mean and Variance
+ * @{
+ */
+
 /**
  * @brief Compute mean and variance for each column of a given matrix.
  *
@@ -75,7 +80,7 @@ void meanvar(Type* mean,
  * normalize the variance using N-1 or N, for true or false respectively.
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void meanvar(const raft::handle_t& handle,
+void meanvar(raft::device_resources const& handle,
              raft::device_matrix_view<const value_t, idx_t, layout_t> data,
              raft::device_vector_view<value_t, idx_t> mean,
              raft::device_vector_view<value_t, idx_t> var,
@@ -99,6 +104,8 @@ void meanvar(const raft::handle_t& handle,
                   handle.get_stream());
 }
 
+/** @} */  // end group stats_mean_var
+
 };  // namespace raft::stats
 
 #endif
diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh
index a3cbec08fe..8af4f7a92c 100644
--- a/cpp/include/raft/stats/minmax.cuh
+++ b/cpp/include/raft/stats/minmax.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,6 +70,11 @@ void minmax(const T* data,
     data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream);
 }
 
+/**
+ * @defgroup stats_minmax Min/Max
+ * @{
+ */
+
 /**
  * @brief Computes min/max across every column of the input matrix, as well as
  * optionally allow to subsample based on the given row/col ID mapping vectors
@@ -92,7 +97,7 @@ void minmax(const T* data,
  *    in shared memory
  */
 template <typename value_t, typename idx_t>
-void minmax(const raft::handle_t& handle,
+void minmax(raft::device_resources const& handle,
             raft::device_matrix_view<const value_t, idx_t, raft::col_major> data,
             std::optional<raft::device_vector_view<const unsigned, idx_t>> rowids,
             std::optional<raft::device_vector_view<const unsigned, idx_t>> colids,
@@ -131,6 +136,8 @@ void minmax(const raft::handle_t& handle,
                           handle.get_stream());
 }
 
+/** @} */  // end group stats_minmax
+
 };  // namespace stats
 };  // namespace raft
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh
index 6c7f588050..ca7f33d398 100644
--- a/cpp/include/raft/stats/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/mutual_info_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,6 +47,11 @@ double mutual_info_score(const T* firstClusterArray,
     firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
+/**
+ * @defgroup stats_mutual_info Mutual Information
+ * @{
+ */
+
 /**
  * @brief Function to calculate the mutual information between two clusters
  * <a href="https://en.wikipedia.org/wiki/Mutual_information">more info on mutual information</a>
@@ -60,7 +65,7 @@ double mutual_info_score(const T* firstClusterArray,
  * @return the mutual information score
  */
 template <typename value_t, typename idx_t>
-double mutual_info_score(const raft::handle_t& handle,
+double mutual_info_score(raft::device_resources const& handle,
                          raft::device_vector_view<const value_t, idx_t> first_cluster_array,
                          raft::device_vector_view<const value_t, idx_t> second_cluster_array,
                          value_t lower_label_range,
@@ -77,6 +82,9 @@ double mutual_info_score(const raft::handle_t& handle,
                                    upper_label_range,
                                    handle.get_stream());
 }
+
+/** @} */  // end group stats_mutual_info
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/r2_score.cuh b/cpp/include/raft/stats/r2_score.cuh
index 5b14c901de..1048deb7f3 100644
--- a/cpp/include/raft/stats/r2_score.cuh
+++ b/cpp/include/raft/stats/r2_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,6 +46,11 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
   return detail::r2_score(y, y_hat, n, stream);
 }
 
+/**
+ * @defgroup stats_r2_score Regression R2 Score
+ * @{
+ */
+
 /**
  * Calculates the "Coefficient of Determination" (R-Squared) score
  * normalizing the sum of squared errors by the total sum of squares.
@@ -64,7 +69,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
  * @note The constness of y and y_hat is currently casted away.
  */
 template <typename value_t, typename idx_t>
-value_t r2_score(const raft::handle_t& handle,
+value_t r2_score(raft::device_resources const& handle,
                  raft::device_vector_view<const value_t, idx_t> y,
                  raft::device_vector_view<const value_t, idx_t> y_hat)
 {
@@ -79,6 +84,8 @@ value_t r2_score(const raft::handle_t& handle,
                           handle.get_stream());
 }
 
+/** @} */  // end group stats_r2_score
+
 }  // namespace stats
 }  // namespace raft
 
diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh
index 70384412a8..25b92e4e10 100644
--- a/cpp/include/raft/stats/rand_index.cuh
+++ b/cpp/include/raft/stats/rand_index.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/rand_index.cuh>
 
 namespace raft {
@@ -39,6 +39,11 @@ double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cu
   return detail::compute_rand_index(firstClusterArray, secondClusterArray, size, stream);
 }
 
+/**
+ * @defgroup stats_rand_index Rand Index
+ * @{
+ */
+
 /**
  * @brief Function to calculate RandIndex
  * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
@@ -50,7 +55,7 @@ double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cu
  * @return: The RandIndex value.
  */
 template <typename value_t, typename idx_t>
-double rand_index(const raft::handle_t& handle,
+double rand_index(raft::device_resources const& handle,
                   raft::device_vector_view<const value_t, idx_t> first_cluster_array,
                   raft::device_vector_view<const value_t, idx_t> second_cluster_array)
 {
@@ -63,6 +68,9 @@ double rand_index(const raft::handle_t& handle,
                                     second_cluster_array.extent(0),
                                     handle.get_stream());
 }
+
+/** @} */  // end group stats_rand_index
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/regression_metrics.cuh b/cpp/include/raft/stats/regression_metrics.cuh
index 268440892c..7c3ca7386b 100644
--- a/cpp/include/raft/stats/regression_metrics.cuh
+++ b/cpp/include/raft/stats/regression_metrics.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/stats/detail/scores.cuh>
 
@@ -53,6 +53,11 @@ void regression_metrics(const T* predictions,
     predictions, ref_predictions, n, stream, mean_abs_error, mean_squared_error, median_abs_error);
 }
 
+/**
+ * @defgroup stats_regression_metrics Regression Metrics
+ * @{
+ */
+
 /**
  * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error
  * @tparam value_t the data type for predictions (e.g., float or double for regression).
@@ -68,7 +73,7 @@ void regression_metrics(const T* predictions,
  * ref_predictions[i]| for i in [0, n).
  */
 template <typename value_t, typename idx_t>
-void regression_metrics(const raft::handle_t& handle,
+void regression_metrics(raft::device_resources const& handle,
                         raft::device_vector_view<const value_t, idx_t> predictions,
                         raft::device_vector_view<const value_t, idx_t> ref_predictions,
                         raft::host_scalar_view<double> mean_abs_error,
@@ -92,6 +97,9 @@ void regression_metrics(const raft::handle_t& handle,
                              *mean_squared_error.data_handle(),
                              *median_abs_error.data_handle());
 }
+
+/** @} */  // end group stats_regression_metrics
+
 }  // namespace stats
 }  // namespace raft
 
diff --git a/cpp/include/raft/stats/silhouette_score.cuh b/cpp/include/raft/stats/silhouette_score.cuh
index fafddb7b23..db9db1f99a 100644
--- a/cpp/include/raft/stats/silhouette_score.cuh
+++ b/cpp/include/raft/stats/silhouette_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ namespace stats {
  */
 template <typename DataT, typename LabelT>
 DataT silhouette_score(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   DataT* X_in,
   int nRows,
   int nCols,
@@ -60,7 +60,7 @@ DataT silhouette_score(
 
 template <typename value_t, typename value_idx, typename label_idx>
 value_t silhouette_score_batched(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   value_t* X,
   value_idx n_rows,
   value_idx n_cols,
@@ -74,6 +74,11 @@ value_t silhouette_score_batched(
     handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric);
 }
 
+/**
+ * @defgroup stats_silhouette_score Silhouette Score
+ * @{
+ */
+
 /**
  * @brief main function that returns the average silhouette score for a given set of data and its
  * clusterings
@@ -93,7 +98,7 @@ value_t silhouette_score_batched(
  */
 template <typename value_t, typename label_t, typename idx_t>
 value_t silhouette_score(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> X_in,
   raft::device_vector_view<const label_t, idx_t> labels,
   std::optional<raft::device_vector_view<value_t, idx_t>> silhouette_score_per_sample,
@@ -119,26 +124,6 @@ value_t silhouette_score(
                                   metric);
 }
 
-/**
- * @brief Overload of `silhouette_score` to help the
- *   compiler find the above overload, in case users pass in
- *   `std::nullopt` for the optional arguments.
- *
- * Please see above for documentation of `silhouette_score`.
- */
-template <typename value_t, typename label_t, typename idx_t>
-value_t silhouette_score(
-  const raft::handle_t& handle,
-  raft::device_matrix_view<const value_t, idx_t, raft::row_major> X_in,
-  raft::device_vector_view<const label_t, idx_t> labels,
-  std::nullopt_t silhouette_score_per_sample,
-  idx_t n_unique_labels,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  std::optional<raft::device_vector_view<value_t, idx_t>> opt_scores = silhouette_score_per_sample;
-  return silhouette_score(handle, X_in, labels, opt_scores, n_unique_labels, metric);
-}
-
 /**
  * @brief function that returns the average silhouette score for a given set of data and its
  * clusterings
@@ -159,7 +144,7 @@ value_t silhouette_score(
  */
 template <typename value_t, typename label_t, typename idx_t>
 value_t silhouette_score_batched(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> X,
   raft::device_vector_view<const label_t, idx_t> labels,
   std::optional<raft::device_vector_view<value_t, idx_t>> silhouette_score_per_sample,
@@ -191,6 +176,28 @@ value_t silhouette_score_batched(
                                            metric);
 }
 
+/** @} */  // end group stats_silhouette_score
+
+/**
+ * @brief Overload of `silhouette_score` to help the
+ *   compiler find the above overload, in case users pass in
+ *   `std::nullopt` for the optional arguments.
+ *
+ * Please see above for documentation of `silhouette_score`.
+ */
+template <typename value_t, typename label_t, typename idx_t>
+value_t silhouette_score(
+  raft::device_resources const& handle,
+  raft::device_matrix_view<const value_t, idx_t, raft::row_major> X_in,
+  raft::device_vector_view<const label_t, idx_t> labels,
+  std::nullopt_t silhouette_score_per_sample,
+  idx_t n_unique_labels,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
+{
+  std::optional<raft::device_vector_view<value_t, idx_t>> opt_scores = silhouette_score_per_sample;
+  return silhouette_score(handle, X_in, labels, opt_scores, n_unique_labels, metric);
+}
+
 /**
  * @brief Overload of `silhouette_score_batched` to help the
  *   compiler find the above overload, in case users pass in
@@ -200,7 +207,7 @@ value_t silhouette_score_batched(
  */
 template <typename value_t, typename label_t, typename idx_t>
 value_t silhouette_score_batched(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> X,
   raft::device_vector_view<const label_t, idx_t> labels,
   std::nullopt_t silhouette_score_per_sample,
diff --git a/cpp/include/raft/stats/stats_types.hpp b/cpp/include/raft/stats/stats_types.hpp
index 5db5ef1c57..8dc7522d60 100644
--- a/cpp/include/raft/stats/stats_types.hpp
+++ b/cpp/include/raft/stats/stats_types.hpp
@@ -20,6 +20,11 @@
 
 namespace raft::stats {
 
+/**
+ * @ingroup stats_histogram
+ * @{
+ */
+
 /**
  * @brief Types of support histogram implementations
  */
@@ -54,9 +59,18 @@ enum HistType {
   HistTypeAuto
 };
 
+/** @} */
+
+/**
+ * @ingroup stats_information_criterion
+ * @{
+ */
+
 /**
  * @brief Supported types of information criteria
  */
 enum IC_Type { AIC, AICc, BIC };
 
+/** @} */
+
 };  // end namespace raft::stats
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 2747029955..0b038c85ea 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/stddev.cuh>
 
 namespace raft {
@@ -87,6 +87,11 @@ void vars(Type* var,
   detail::vars(var, data, mu, D, N, sample, rowMajor, stream);
 }
 
+/**
+ * @defgroup stats_stddev Standard Deviation
+ * @{
+ */
+
 /**
  * @brief Compute stddev of the input matrix
  *
@@ -104,7 +109,7 @@ void vars(Type* var,
  *  to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void stddev(const raft::handle_t& handle,
+void stddev(raft::device_resources const& handle,
             raft::device_matrix_view<const value_t, idx_t, layout_t> data,
             raft::device_vector_view<const value_t, idx_t> mu,
             raft::device_vector_view<value_t, idx_t> std,
@@ -127,6 +132,13 @@ void stddev(const raft::handle_t& handle,
                  handle.get_stream());
 }
 
+/** @} */  // end group stats_stddev
+
+/**
+ * @defgroup stats_variance Variance
+ * @{
+ */
+
 /**
  * @brief Compute variance of the input matrix
  *
@@ -144,7 +156,7 @@ void stddev(const raft::handle_t& handle,
  *  to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void vars(const raft::handle_t& handle,
+void vars(raft::device_resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
           raft::device_vector_view<const value_t, idx_t> mu,
           raft::device_vector_view<value_t, idx_t> var,
@@ -167,6 +179,8 @@ void vars(const raft::handle_t& handle,
                handle.get_stream());
 }
 
+/** @} */  // end group stats_variance
+
 };  // namespace stats
 };  // namespace raft
 
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 18265c5e3a..5f169b3384 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,6 +46,11 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
   detail::sum(output, input, D, N, rowMajor, stream);
 }
 
+/**
+ * @defgroup stats_sum Sum
+ * @{
+ */
+
 /**
  * @brief Compute sum of the input matrix
  *
@@ -59,7 +64,7 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
  * @param[out] output the output mean vector
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void sum(const raft::handle_t& handle,
+void sum(raft::device_resources const& handle,
          raft::device_matrix_view<const value_t, idx_t, layout_t> input,
          raft::device_vector_view<value_t, idx_t> output)
 {
@@ -77,6 +82,8 @@ void sum(const raft::handle_t& handle,
               handle.get_stream());
 }
 
+/** @} */  // end group stats_sum
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/trustworthiness_score.cuh b/cpp/include/raft/stats/trustworthiness_score.cuh
index b7b3999f77..a79cda8dfc 100644
--- a/cpp/include/raft/stats/trustworthiness_score.cuh
+++ b/cpp/include/raft/stats/trustworthiness_score.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 
 #pragma once
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/trustworthiness_score.cuh>
 
 namespace raft {
@@ -38,7 +38,7 @@ namespace stats {
  * @return[out] Trustworthiness score
  */
 template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(const raft::handle_t& h,
+double trustworthiness_score(const raft::device_resources& h,
                              const math_t* X,
                              math_t* X_embedded,
                              int n,
@@ -51,6 +51,11 @@ double trustworthiness_score(const raft::handle_t& h,
     h, X, X_embedded, n, m, d, n_neighbors, batchSize);
 }
 
+/**
+ * @defgroup stats_trustworthiness Trustworthiness
+ * @{
+ */
+
 /**
  * @brief Compute the trustworthiness score
  * @tparam value_t the data type
@@ -66,7 +71,7 @@ double trustworthiness_score(const raft::handle_t& h,
  */
 template <raft::distance::DistanceType distance_type, typename value_t, typename idx_t>
 double trustworthiness_score(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> X,
   raft::device_matrix_view<const value_t, idx_t, raft::row_major> X_embedded,
   int n_neighbors,
@@ -87,6 +92,9 @@ double trustworthiness_score(
     n_neighbors,
     batch_size);
 }
+
+/** @} */  // end group stats_trustworthiness
+
 }  // namespace stats
 }  // namespace raft
 
diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh
index c52dd35fd8..be1d83d59d 100644
--- a/cpp/include/raft/stats/v_measure.cuh
+++ b/cpp/include/raft/stats/v_measure.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 
 #pragma once
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/detail/v_measure.cuh>
 
 namespace raft {
@@ -49,6 +49,11 @@ double v_measure(const T* truthClusterArray,
     truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream, beta);
 }
 
+/**
+ * @defgroup stats_vmeasure V-Measure
+ * @{
+ */
+
 /**
  * @brief Function to calculate the v-measure between two clusters
  *
@@ -63,7 +68,7 @@ double v_measure(const T* truthClusterArray,
  * @return the v-measure between the two clusters
  */
 template <typename value_t, typename idx_t>
-double v_measure(const raft::handle_t& handle,
+double v_measure(raft::device_resources const& handle,
                  raft::device_vector_view<const value_t, idx_t> truth_cluster_array,
                  raft::device_vector_view<const value_t, idx_t> pred_cluster_array,
                  value_t lower_label_range,
@@ -84,6 +89,8 @@ double v_measure(const raft::handle_t& handle,
                            beta);
 }
 
+/** @} */  // end group stats_vmeasure
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index 30a922b243..7f061e0b45 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -93,6 +93,11 @@ void colWeightedMean(
   weightedMean(mu, data, weights, D, N, true, false, stream);
 }
 
+/**
+ * @defgroup stats_weighted_mean Weighted Mean
+ * @{
+ */
+
 /**
  * @brief Compute the weighted mean of the input matrix with a
  * vector of weights, along rows or along columns
@@ -107,7 +112,7 @@ void colWeightedMean(
  * @param[in]  along_rows whether to reduce along rows or columns
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void weighted_mean(const raft::handle_t& handle,
+void weighted_mean(raft::device_resources const& handle,
                    raft::device_matrix_view<const value_t, idx_t, layout_t> data,
                    raft::device_vector_view<const value_t, idx_t> weights,
                    raft::device_vector_view<value_t, idx_t> mu,
@@ -149,7 +154,7 @@ void weighted_mean(const raft::handle_t& handle,
  * @param[out] mu the output mean vector of size nrows
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void row_weighted_mean(const raft::handle_t& handle,
+void row_weighted_mean(raft::device_resources const& handle,
                        raft::device_matrix_view<const value_t, idx_t, layout_t> data,
                        raft::device_vector_view<const value_t, idx_t> weights,
                        raft::device_vector_view<value_t, idx_t> mu)
@@ -170,13 +175,16 @@ void row_weighted_mean(const raft::handle_t& handle,
  * @param[out] mu the output mean vector of size ncols
  */
 template <typename value_t, typename idx_t, typename layout_t>
-void col_weighted_mean(const raft::handle_t& handle,
+void col_weighted_mean(raft::device_resources const& handle,
                        raft::device_matrix_view<const value_t, idx_t, layout_t> data,
                        raft::device_vector_view<const value_t, idx_t> weights,
                        raft::device_vector_view<value_t, idx_t> mu)
 {
   weighted_mean(handle, data, weights, mu, false);
 }
+
+/** @} */  // end group stats_weighted_mean
+
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh b/cpp/include/raft/util/bitonic_sort.cuh
similarity index 68%
rename from cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh
rename to cpp/include/raft/util/bitonic_sort.cuh
index 630acab2b8..5de464b4c7 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh
+++ b/cpp/include/raft/util/bitonic_sort.cuh
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <raft/core/detail/macros.hpp>
 #include <raft/util/cuda_utils.cuh>
 
-namespace raft::spatial::knn::detail::topk {
+namespace raft::util {
 
-namespace helpers {
+namespace {
 
 template <typename T>
-__device__ __forceinline__ void swap(T& x, T& y)
+_RAFT_DEVICE _RAFT_FORCEINLINE void swap(T& x, T& y)
 {
   T t = x;
   x   = y;
@@ -31,12 +32,12 @@ __device__ __forceinline__ void swap(T& x, T& y)
 }
 
 template <typename T>
-__device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x)
+_RAFT_DEVICE _RAFT_FORCEINLINE void conditional_assign(bool cond, T& ptr, T x)
 {
   if (cond) { ptr = x; }
 }
 
-}  // namespace helpers
+}  // namespace
 
 /**
  * Warp-wide bitonic merge and sort.
@@ -59,6 +60,19 @@ __device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x)
  *   3  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63    48  49  50 ...
  * `
  *
+ * Here is a small usage example of device code. It sorts, in ascending order, arrays of length 6
+ * (= 3 * 2), where each array is spread across a pair of threads (three elements per thread):
+ * @code{.cpp}
+ *   // Fill an array of three ints in each thread of a warp.
+ *   int i = laneId();
+ *   int arr[3] = {i+1, i+5, i};
+ *   // Sort the arrays in groups of two threads.
+ *   bitonic<3>(true, 2).sort(arr);  // ascending = true, warp_width = 2
+ *   // As a result,
+ *   //  for every even thread (`i == 2j`):    arr == {2j,   2j+1, 2j+5}
+ *   //  for every odd  thread (`i == 2j+1`):  arr == {2j+1, 2j+2, 2j+6}
+ * @endcode
+ *
  * @tparam Size
  *   number of elements processed in each thread;
  *   i.e. the total data size is `Size * warp_width`.
@@ -80,7 +94,7 @@ class bitonic {
    *   the total size of the sorted data is `Size * warp_width`.
    *   Must be power-of-two, not larger than the WarpSize.
    */
-  __device__ __forceinline__ explicit bitonic(bool ascending, int warp_width = WarpSize)
+  _RAFT_DEVICE _RAFT_FORCEINLINE explicit bitonic(bool ascending, int warp_width = WarpSize)
     : ascending_(ascending), warp_width_(warp_width)
   {
   }
@@ -95,7 +109,7 @@ class bitonic {
    *
    *   1) Sort any bitonic sequence.
    *   2) Merge two halves of the input data assuming they're already sorted, and their order is
-   *      opposite (i.e. either ascending, descending or vice-versa).
+   *      opposite (i.e. either ascending+descending or descending+ascending).
    *
    * The input pointers are unique per-thread.
    * See the class description for the description of the data layout.
@@ -108,10 +122,10 @@ class bitonic {
    *   the keys; must be at least `Size` elements long.
    */
   template <typename KeyT, typename... PayloadTs>
-  __device__ __forceinline__ void merge(KeyT* __restrict__ keys,
-                                        PayloadTs* __restrict__... payloads) const
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge(KeyT* __restrict__ keys,
+                                            PayloadTs* __restrict__... payloads) const
   {
-    return bitonic<Size>::merge_(ascending_, warp_width_, keys, payloads...);
+    return bitonic<Size>::merge_impl(ascending_, warp_width_, keys, payloads...);
   }
 
   /**
@@ -127,10 +141,10 @@ class bitonic {
    *   the keys; must be at least `Size` elements long.
    */
   template <typename KeyT, typename... PayloadTs>
-  __device__ __forceinline__ void sort(KeyT* __restrict__ keys,
-                                       PayloadTs* __restrict__... payloads) const
+  _RAFT_DEVICE _RAFT_FORCEINLINE void sort(KeyT* __restrict__ keys,
+                                           PayloadTs* __restrict__... payloads) const
   {
-    return bitonic<Size>::sort_(ascending_, warp_width_, keys, payloads...);
+    return bitonic<Size>::sort_impl(ascending_, warp_width_, keys, payloads...);
   }
 
   /**
@@ -141,8 +155,8 @@ class bitonic {
    * @param payload
    */
   template <typename KeyT, typename... PayloadTs, int S = Size>
-  __device__ __forceinline__ auto merge(KeyT& __restrict__ key,
-                                        PayloadTs& __restrict__... payload) const
+  _RAFT_DEVICE _RAFT_FORCEINLINE auto merge(KeyT& __restrict__ key,
+                                            PayloadTs& __restrict__... payload) const
     -> std::enable_if_t<S == 1, void>  // SFINAE to enable this for Size == 1 only
   {
     static_assert(S == Size);
@@ -157,8 +171,8 @@ class bitonic {
    * @param payload
    */
   template <typename KeyT, typename... PayloadTs, int S = Size>
-  __device__ __forceinline__ auto sort(KeyT& __restrict__ key,
-                                       PayloadTs& __restrict__... payload) const
+  _RAFT_DEVICE _RAFT_FORCEINLINE auto sort(KeyT& __restrict__ key,
+                                           PayloadTs& __restrict__... payload) const
     -> std::enable_if_t<S == 1, void>  // SFINAE to enable this for Size == 1 only
   {
     static_assert(S == Size);
@@ -173,10 +187,10 @@ class bitonic {
   friend class bitonic;
 
   template <typename KeyT, typename... PayloadTs>
-  static __device__ __forceinline__ void merge_(bool ascending,
-                                                int warp_width,
-                                                KeyT* __restrict__ keys,
-                                                PayloadTs* __restrict__... payloads)
+  static _RAFT_DEVICE _RAFT_FORCEINLINE void merge_impl(bool ascending,
+                                                        int warp_width,
+                                                        KeyT* __restrict__ keys,
+                                                        PayloadTs* __restrict__... payloads)
   {
 #pragma unroll
     for (int size = Size; size > 1; size >>= 1) {
@@ -189,8 +203,8 @@ class bitonic {
           KeyT& key         = keys[i];
           KeyT& other       = keys[other_i];
           if (ascending ? key > other : key < other) {
-            helpers::swap(key, other);
-            (helpers::swap(payloads[i], payloads[other_i]), ...);
+            swap(key, other);
+            (swap(payloads[i], payloads[other_i]), ...);
           }
         }
       }
@@ -204,33 +218,32 @@ class bitonic {
         const KeyT other     = shfl_xor(key, stride, warp_width);
         const bool do_assign = (ascending != is_second) ? key > other : key < other;
 
-        helpers::conditional_assign(do_assign, key, other);
+        conditional_assign(do_assign, key, other);
         // NB: don't put shfl_xor in a conditional; it must be called by all threads in a warp.
-        (helpers::conditional_assign(
-           do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
+        (conditional_assign(do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
          ...);
       }
     }
   }
 
   template <typename KeyT, typename... PayloadTs>
-  static __device__ __forceinline__ void sort_(bool ascending,
-                                               int warp_width,
-                                               KeyT* __restrict__ keys,
-                                               PayloadTs* __restrict__... payloads)
+  static _RAFT_DEVICE _RAFT_FORCEINLINE void sort_impl(bool ascending,
+                                                       int warp_width,
+                                                       KeyT* __restrict__ keys,
+                                                       PayloadTs* __restrict__... payloads)
   {
     if constexpr (Size == 1) {
       const int lane = laneId();
       for (int width = 2; width < warp_width; width <<= 1) {
-        bitonic<1>::merge_(lane & width, width, keys, payloads...);
+        bitonic<1>::merge_impl(lane & width, width, keys, payloads...);
       }
     } else {
       constexpr int kSize2 = Size / 2;
-      bitonic<kSize2>::sort_(false, warp_width, keys, payloads...);
-      bitonic<kSize2>::sort_(true, warp_width, keys + kSize2, (payloads + kSize2)...);
+      bitonic<kSize2>::sort_impl(false, warp_width, keys, payloads...);
+      bitonic<kSize2>::sort_impl(true, warp_width, keys + kSize2, (payloads + kSize2)...);
     }
-    bitonic<Size>::merge_(ascending, warp_width, keys, payloads...);
+    bitonic<Size>::merge_impl(ascending, warp_width, keys, payloads...);
   }
 };
 
-}  // namespace raft::spatial::knn::detail::topk
+}  // namespace raft::util
diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh
index ccd5d1ab86..77e3ed2d6d 100644
--- a/cpp/include/raft/util/cache.cuh
+++ b/cpp/include/raft/util/cache.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ namespace raft::cache {
  * // We assume that our ML algo repeatedly calls calc, and the set of keys have
  * // an overlap. We will use the cache to avoid repeated calculations.
  *
- * // Assume we have raft::handle_t& h, and cudaStream_t stream
+ * // Assume we have raft::device_resources& h, and cudaStream_t stream
  * Cache<float> cache(h.get_device_allocator(), stream, m);
  *
  * // A buffer that we will reuse to store the cache indices.
diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh
index 5818fc21f3..5be9dc999a 100644
--- a/cpp/include/raft/util/cuda_utils.cuh
+++ b/cpp/include/raft/util/cuda_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <type_traits>
 
 #include <raft/core/cudart_utils.hpp>
+#include <raft/core/math.hpp>
+#include <raft/core/operators.hpp>
 
 #ifndef ENABLE_MEMCPY_ASYNC
 // enable memcpy_async interface by default for newer GPUs
@@ -258,12 +260,14 @@ DI double myAtomicMax(double* address, double val)
 template <typename T>
 HDI T myMax(T x, T y);
 template <>
-HDI float myMax<float>(float x, float y)
+[[deprecated("use raft::max from raft/core/math.hpp instead")]] HDI float myMax<float>(float x,
+                                                                                       float y)
 {
   return fmaxf(x, y);
 }
 template <>
-HDI double myMax<double>(double x, double y)
+[[deprecated("use raft::max from raft/core/math.hpp instead")]] HDI double myMax<double>(double x,
+                                                                                         double y)
 {
   return fmax(x, y);
 }
@@ -276,12 +280,14 @@ HDI double myMax<double>(double x, double y)
 template <typename T>
 HDI T myMin(T x, T y);
 template <>
-HDI float myMin<float>(float x, float y)
+[[deprecated("use raft::min from raft/core/math.hpp instead")]] HDI float myMin<float>(float x,
+                                                                                       float y)
 {
   return fminf(x, y);
 }
 template <>
-HDI double myMin<double>(double x, double y)
+[[deprecated("use raft::min from raft/core/math.hpp instead")]] HDI double myMin<double>(double x,
+                                                                                         double y)
 {
   return fmin(x, y);
 }
@@ -297,7 +303,7 @@ HDI double myMin<double>(double x, double y)
 template <typename T>
 DI T myAtomicMin(T* address, T val)
 {
-  myAtomicReduce(address, val, myMin<T>);
+  myAtomicReduce(address, val, raft::min_op{});
   return *address;
 }
 
@@ -311,19 +317,10 @@ DI T myAtomicMin(T* address, T val)
 template <typename T>
 DI T myAtomicMax(T* address, T val)
 {
-  myAtomicReduce(address, val, myMax<T>);
+  myAtomicReduce(address, val, raft::max_op{});
   return *address;
 }
 
-/**
- * Sign function
- */
-template <typename T>
-HDI int sgn(const T val)
-{
-  return (T(0) < val) - (val < T(0));
-}
-
 /**
  * @defgroup Exp Exponential function
  * @{
@@ -331,14 +328,14 @@ HDI int sgn(const T val)
 template <typename T>
 HDI T myExp(T x);
 template <>
-HDI float myExp(float x)
+[[deprecated("use raft::exp from raft/core/math.hpp instead")]] HDI float myExp(float x)
 {
   return expf(x);
 }
 template <>
-HDI double myExp(double x)
+[[deprecated("use raft::exp from raft/core/math.hpp instead")]] HDI double myExp(double x)
 {
-  return exp(x);
+  return ::exp(x);
 }
 /** @} */
 
@@ -367,14 +364,14 @@ inline __device__ double myInf<double>()
 template <typename T>
 HDI T myLog(T x);
 template <>
-HDI float myLog(float x)
+[[deprecated("use raft::log from raft/core/math.hpp instead")]] HDI float myLog(float x)
 {
   return logf(x);
 }
 template <>
-HDI double myLog(double x)
+[[deprecated("use raft::log from raft/core/math.hpp instead")]] HDI double myLog(double x)
 {
-  return log(x);
+  return ::log(x);
 }
 /** @} */
 
@@ -385,14 +382,14 @@ HDI double myLog(double x)
 template <typename T>
 HDI T mySqrt(T x);
 template <>
-HDI float mySqrt(float x)
+[[deprecated("use raft::sqrt from raft/core/math.hpp instead")]] HDI float mySqrt(float x)
 {
   return sqrtf(x);
 }
 template <>
-HDI double mySqrt(double x)
+[[deprecated("use raft::sqrt from raft/core/math.hpp instead")]] HDI double mySqrt(double x)
 {
-  return sqrt(x);
+  return ::sqrt(x);
 }
 /** @} */
 
@@ -403,14 +400,18 @@ HDI double mySqrt(double x)
 template <typename T>
 DI void mySinCos(T x, T& s, T& c);
 template <>
-DI void mySinCos(float x, float& s, float& c)
+[[deprecated("use raft::sincos from raft/core/math.hpp instead")]] DI void mySinCos(float x,
+                                                                                    float& s,
+                                                                                    float& c)
 {
   sincosf(x, &s, &c);
 }
 template <>
-DI void mySinCos(double x, double& s, double& c)
+[[deprecated("use raft::sincos from raft/core/math.hpp instead")]] DI void mySinCos(double x,
+                                                                                    double& s,
+                                                                                    double& c)
 {
-  sincos(x, &s, &c);
+  ::sincos(x, &s, &c);
 }
 /** @} */
 
@@ -421,14 +422,14 @@ DI void mySinCos(double x, double& s, double& c)
 template <typename T>
 DI T mySin(T x);
 template <>
-DI float mySin(float x)
+[[deprecated("use raft::sin from raft/core/math.hpp instead")]] DI float mySin(float x)
 {
   return sinf(x);
 }
 template <>
-DI double mySin(double x)
+[[deprecated("use raft::sin from raft/core/math.hpp instead")]] DI double mySin(double x)
 {
-  return sin(x);
+  return ::sin(x);
 }
 /** @} */
 
@@ -442,12 +443,12 @@ DI T myAbs(T x)
   return x < 0 ? -x : x;
 }
 template <>
-DI float myAbs(float x)
+[[deprecated("use raft::abs from raft/core/math.hpp instead")]] DI float myAbs(float x)
 {
   return fabsf(x);
 }
 template <>
-DI double myAbs(double x)
+[[deprecated("use raft::abs from raft/core/math.hpp instead")]] DI double myAbs(double x)
 {
   return fabs(x);
 }
@@ -460,14 +461,16 @@ DI double myAbs(double x)
 template <typename T>
 HDI T myPow(T x, T power);
 template <>
-HDI float myPow(float x, float power)
+[[deprecated("use raft::pow from raft/core/math.hpp instead")]] HDI float myPow(float x,
+                                                                                float power)
 {
   return powf(x, power);
 }
 template <>
-HDI double myPow(double x, double power)
+[[deprecated("use raft::pow from raft/core/math.hpp instead")]] HDI double myPow(double x,
+                                                                                 double power)
 {
-  return pow(x, power);
+  return ::pow(x, power);
 }
 /** @} */
 
@@ -478,14 +481,14 @@ HDI double myPow(double x, double power)
 template <typename T>
 HDI T myTanh(T x);
 template <>
-HDI float myTanh(float x)
+[[deprecated("use raft::tanh from raft/core/math.hpp instead")]] HDI float myTanh(float x)
 {
   return tanhf(x);
 }
 template <>
-HDI double myTanh(double x)
+[[deprecated("use raft::tanh from raft/core/math.hpp instead")]] HDI double myTanh(double x)
 {
-  return tanh(x);
+  return ::tanh(x);
 }
 /** @} */
 
@@ -496,55 +499,81 @@ HDI double myTanh(double x)
 template <typename T>
 HDI T myATanh(T x);
 template <>
-HDI float myATanh(float x)
+[[deprecated("use raft::atanh from raft/core/math.hpp instead")]] HDI float myATanh(float x)
 {
   return atanhf(x);
 }
 template <>
-HDI double myATanh(double x)
+[[deprecated("use raft::atanh from raft/core/math.hpp instead")]] HDI double myATanh(double x)
 {
-  return atanh(x);
+  return ::atanh(x);
 }
 /** @} */
 
 /**
- * @defgroup LambdaOps Lambda operations in reduction kernels
+ * @defgroup LambdaOps Legacy lambda operations (deprecated)
  * @{
  */
-// IdxType mostly to be used for MainLambda in *Reduction kernels
 template <typename Type, typename IdxType = int>
 struct Nop {
-  HDI Type operator()(Type in, IdxType i = 0) { return in; }
+  [[deprecated("Nop is deprecated. Use identity_op instead.")]] HDI Type
+  operator()(Type in, IdxType i = 0) const
+  {
+    return in;
+  }
 };
 
 template <typename Type, typename IdxType = int>
 struct SqrtOp {
-  HDI Type operator()(Type in, IdxType i = 0) { return mySqrt(in); }
+  [[deprecated("SqrtOp is deprecated. Use sqrt_op instead.")]] HDI Type
+  operator()(Type in, IdxType i = 0) const
+  {
+    return raft::sqrt(in);
+  }
 };
 
 template <typename Type, typename IdxType = int>
 struct L0Op {
-  HDI Type operator()(Type in, IdxType i = 0) { return in != Type(0) ? Type(1) : Type(0); }
+  [[deprecated("L0Op is deprecated. Use nz_op instead.")]] HDI Type operator()(Type in,
+                                                                               IdxType i = 0) const
+  {
+    return in != Type(0) ? Type(1) : Type(0);
+  }
 };
 
 template <typename Type, typename IdxType = int>
 struct L1Op {
-  HDI Type operator()(Type in, IdxType i = 0) { return myAbs(in); }
+  [[deprecated("L1Op is deprecated. Use abs_op instead.")]] HDI Type operator()(Type in,
+                                                                                IdxType i = 0) const
+  {
+    return raft::abs(in);
+  }
 };
 
 template <typename Type, typename IdxType = int>
 struct L2Op {
-  HDI Type operator()(Type in, IdxType i = 0) { return in * in; }
+  [[deprecated("L2Op is deprecated. Use sq_op instead.")]] HDI Type operator()(Type in,
+                                                                               IdxType i = 0) const
+  {
+    return in * in;
+  }
 };
 
-template <typename Type>
+template <typename InT, typename OutT = InT>
 struct Sum {
-  HDI Type operator()(Type a, Type b) { return a + b; }
+  [[deprecated("Sum is deprecated. Use add_op instead.")]] HDI OutT operator()(InT a, InT b) const
+  {
+    return a + b;
+  }
 };
 
 template <typename Type>
 struct Max {
-  HDI Type operator()(Type a, Type b) { return myMax(a, b); }
+  [[deprecated("Max is deprecated. Use max_op instead.")]] HDI Type operator()(Type a, Type b) const
+  {
+    if (b > a) { return b; }
+    return a;
+  }
 };
 /** @} */
 
@@ -939,7 +968,7 @@ DI T warpReduce(T val, ReduceLambda reduce_op)
 template <typename T>
 DI T warpReduce(T val)
 {
-  return warpReduce(val, raft::Sum<T>{});
+  return warpReduce(val, raft::add_op{});
 }
 
 /**
diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp
index 68a95da587..1c9793eb0a 100644
--- a/cpp/include/raft/util/cudart_utils.hpp
+++ b/cpp/include/raft/util/cudart_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,9 +41,6 @@
 #include <memory>
 #include <mutex>
 
-///@todo: enable once logging has been enabled in raft
-//#include "logger.hpp"
-
 namespace raft {
 
 /**
@@ -379,7 +376,12 @@ std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t strea
 
   ss << name << " = [ ";
   for (int i = 0; i < size; i++) {
-    ss << std::setw(width) << arr_h[i];
+    typedef
+      typename std::conditional_t<std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>, int, T>
+        CastT;
+
+    auto val = static_cast<CastT>(arr_h[i]);
+    ss << std::setw(width) << val;
 
     if (i < size - 1) ss << ", ";
   }
diff --git a/cpp/include/raft/util/device_atomics.cuh b/cpp/include/raft/util/device_atomics.cuh
index a79981124f..14856bed8e 100644
--- a/cpp/include/raft/util/device_atomics.cuh
+++ b/cpp/include/raft/util/device_atomics.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -242,7 +242,7 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 
 // -------------------------------------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is
+// `atomicAdd` supports int, unsigned int, unsigned long long int, float, double (long long int is
 // not supported.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
 // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int
 
@@ -519,7 +519,7 @@ __forceinline__ __device__ T atomicAdd(T* address, T val)
  * performed in one atomic transaction.
  *
  * The supported types for `atomicMin` are: integers are floating point numbers.
- * CUDA natively supports `int`, `unsigend int`, `unsigned long long int`.
+ * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`.
  *
  * @param[in] address The address of old value in global or shared memory
  * @param[in] val The value to be computed
@@ -540,7 +540,7 @@ __forceinline__ __device__ T atomicMin(T* address, T val)
  * performed in one atomic transaction.
  *
  * The supported types for `atomicMax` are: integers are floating point numbers.
- * CUDA natively supports `int`, `unsigend int`, `unsigned long long int`.
+ * CUDA natively supports `int`, `unsigned int`, `unsigned long long int`.
  *
  * @param[in] address The address of old value in global or shared memory
  * @param[in] val The value to be computed
diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp
index e893ff0904..3b0d9d44ae 100644
--- a/cpp/include/raft/util/integer_utils.hpp
+++ b/cpp/include/raft/util/integer_utils.hpp
@@ -23,6 +23,7 @@
  *
  */
 
+#include <limits>
 #include <stdexcept>
 #include <type_traits>
 
@@ -112,18 +113,37 @@ constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
  * approach of using (dividend + divisor - 1) / divisor
  */
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, I> div_rounding_up_safe(
-  I dividend, I divisor) noexcept
+constexpr inline auto div_rounding_up_safe(I dividend, I divisor) noexcept
+  -> std::enable_if_t<std::is_integral<I>::value, I>
 {
   using i_is_a_signed_type = std::integral_constant<bool, std::is_signed<I>::value>;
   return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor);
 }
 
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_of_two(
-  I val) noexcept
+constexpr inline auto is_a_power_of_two(I val) noexcept
+  -> std::enable_if_t<std::is_integral<I>::value, bool>
 {
-  return ((val - 1) & val) == 0;
+  return (val != 0) && (((val - 1) & val) == 0);
+}
+
+/**
+ * Given an integer `x`, return the smallest `y` such that `x <= y` and `is_a_power_of_two(y)`.
+ * If such `y` does not exist in `T`, return zero.
+ */
+template <typename T>
+constexpr inline auto bound_by_power_of_two(T x) noexcept
+  -> std::enable_if_t<std::is_integral<T>::value, T>
+{
+  if (is_a_power_of_two(x)) { return x; }
+  constexpr T kMaxUnsafe = std::numeric_limits<T>::max();
+  constexpr T kMaxSafe   = is_a_power_of_two(kMaxUnsafe) ? kMaxUnsafe : (kMaxUnsafe >> 1);
+  const T limited        = (x < kMaxSafe) ? x : kMaxSafe;  // clamp so the shift cannot overflow
+  T bound                = T{1};
+  while (bound < limited) {
+    bound <<= 1;
+  }
+  return bound < x ? T{0} : bound;
+}
 
 /**
@@ -150,13 +170,13 @@ constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_o
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T val)
+constexpr inline auto absolute_value(T val) -> std::enable_if_t<std::is_signed<T>::value, T>
 {
   return std::abs(val);
 }
 // Unsigned type just returns itself.
 template <typename T>
-std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T val)
+constexpr inline auto absolute_value(T val) -> std::enable_if_t<!std::is_signed<T>::value, T>
 {
   return val;
 }
diff --git a/cpp/include/raft/util/scatter.cuh b/cpp/include/raft/util/scatter.cuh
index c20afa5454..e69be36ad9 100644
--- a/cpp/include/raft/util/scatter.cuh
+++ b/cpp/include/raft/util/scatter.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/detail/scatter.cuh>
 
@@ -37,13 +38,13 @@ namespace raft {
  * will be applied to every element before scattering it to the right location.
  * The second param in this method will be the destination index.
  */
-template <typename DataT, typename IdxT, typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
+template <typename DataT, typename IdxT, typename Lambda = raft::identity_op, int TPB = 256>
 void scatter(DataT* out,
              const DataT* in,
              const IdxT* idx,
              IdxT len,
              cudaStream_t stream,
-             Lambda op = raft::Nop<DataT, IdxT>())
+             Lambda op = raft::identity_op())
 {
   if (len <= 0) return;
   constexpr size_t DataSize   = sizeof(DataT);
diff --git a/cpp/include/raft_distance/random/rmat_rectangular_generator.hpp b/cpp/include/raft_distance/random/rmat_rectangular_generator.hpp
deleted file mode 100644
index 5dfad2af3e..0000000000
--- a/cpp/include/raft_distance/random/rmat_rectangular_generator.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include <raft/core/handle.hpp>
-#include <raft/random/rng_state.hpp>
-
-namespace raft::random::runtime {
-
-#define FUNC_DECL(IdxT, ProbT)                            \
-  void rmat_rectangular_gen(raft::handle_t const& handle, \
-                            IdxT* out,                    \
-                            IdxT* out_src,                \
-                            IdxT* out_dst,                \
-                            const ProbT* theta,           \
-                            IdxT r_scale,                 \
-                            IdxT c_scale,                 \
-                            IdxT n_edges,                 \
-                            raft::random::RngState& r)
-
-FUNC_DECL(int, float);
-FUNC_DECL(int64_t, float);
-FUNC_DECL(int, double);
-FUNC_DECL(int64_t, double);
-
-#undef FUNC_DECL
-
-}  // namespace raft::random::runtime
diff --git a/cpp/include/raft_distance/kmeans.hpp b/cpp/include/raft_runtime/cluster/kmeans.hpp
similarity index 54%
rename from cpp/include/raft_distance/kmeans.hpp
rename to cpp/include/raft_runtime/cluster/kmeans.hpp
index a56021b110..3386774414 100644
--- a/cpp/include/raft_distance/kmeans.hpp
+++ b/cpp/include/raft_runtime/cluster/kmeans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,12 +14,21 @@
  * limitations under the License.
  */
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/distance/distance_types.hpp>
 
-namespace raft::cluster::kmeans::runtime {
+#include <raft/cluster/kmeans_types.hpp>
 
-void update_centroids(raft::handle_t const& handle,
+namespace raft::runtime::cluster::kmeans {
+
+/**
+ * @defgroup kmeans_runtime Kmeans Runtime API
+ * @{
+ */
+
+void update_centroids(raft::device_resources const& handle,
                       const float* X,
                       int n_samples,
                       int n_features,
@@ -30,7 +39,7 @@ void update_centroids(raft::handle_t const& handle,
                       float* new_centroids,
                       float* weight_per_cluster);
 
-void update_centroids(raft::handle_t const& handle,
+void update_centroids(raft::device_resources const& handle,
                       const double* X,
                       int n_samples,
                       int n_features,
@@ -41,7 +50,23 @@ void update_centroids(raft::handle_t const& handle,
                       double* new_centroids,
                       double* weight_per_cluster);
 
-void cluster_cost(raft::handle_t const& handle,
+void fit(raft::device_resources const& handle,
+         const raft::cluster::kmeans::KMeansParams& params,
+         raft::device_matrix_view<const float, int, row_major> X,
+         std::optional<raft::device_vector_view<const float, int>> sample_weight,
+         raft::device_matrix_view<float, int, row_major> centroids,
+         raft::host_scalar_view<float, int> inertia,
+         raft::host_scalar_view<int, int> n_iter);
+
+void fit(raft::device_resources const& handle,
+         const raft::cluster::kmeans::KMeansParams& params,
+         raft::device_matrix_view<const double, int, row_major> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int, row_major> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter);
+
+void cluster_cost(raft::device_resources const& handle,
                   const float* X,
                   int n_samples,
                   int n_features,
@@ -49,11 +74,14 @@ void cluster_cost(raft::handle_t const& handle,
                   const float* centroids,
                   float* cost);
 
-void cluster_cost(raft::handle_t const& handle,
+void cluster_cost(raft::device_resources const& handle,
                   const double* X,
                   int n_samples,
                   int n_features,
                   int n_clusters,
                   const double* centroids,
                   double* cost);
-}  // namespace raft::cluster::kmeans::runtime
+
+/** @} */  // end group kmeans_runtime
+
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_runtime/distance/fused_l2_nn.hpp
similarity index 81%
rename from cpp/include/raft_distance/fused_l2_min_arg.hpp
rename to cpp/include/raft_runtime/distance/fused_l2_nn.hpp
index f7d3748666..bdac3723e2 100644
--- a/cpp/include/raft_distance/fused_l2_min_arg.hpp
+++ b/cpp/include/raft_runtime/distance/fused_l2_nn.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,15 @@
  * limitations under the License.
  */
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 
-namespace raft::distance::runtime {
+namespace raft::runtime::distance {
+
+/**
+ * @defgroup fused_l2_nn_min_arg_runtime Fused L2 1NN Runtime API
+ * @{
+ */
 
 /**
  * @brief Wrapper around fusedL2NN with minimum reduction operators.
@@ -37,7 +42,7 @@ namespace raft::distance::runtime {
  * @param[in]  k             gemm k
  * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
  */
-void fused_l2_nn_min_arg(raft::handle_t const& handle,
+void fused_l2_nn_min_arg(raft::device_resources const& handle,
                          int* min,
                          const float* x,
                          const float* y,
@@ -46,7 +51,7 @@ void fused_l2_nn_min_arg(raft::handle_t const& handle,
                          int k,
                          bool sqrt);
 
-void fused_l2_nn_min_arg(raft::handle_t const& handle,
+void fused_l2_nn_min_arg(raft::device_resources const& handle,
                          int* min,
                          const double* x,
                          const double* y,
@@ -55,4 +60,6 @@ void fused_l2_nn_min_arg(raft::handle_t const& handle,
                          int k,
                          bool sqrt);
 
-}  // end namespace raft::distance::runtime
\ No newline at end of file
+/** @} */  // end group fused_l2_nn_min_arg_runtime
+
+}  // end namespace raft::runtime::distance
diff --git a/cpp/include/raft_distance/pairwise_distance.hpp b/cpp/include/raft_runtime/distance/pairwise_distance.hpp
similarity index 76%
rename from cpp/include/raft_distance/pairwise_distance.hpp
rename to cpp/include/raft_runtime/distance/pairwise_distance.hpp
index e91ef5de20..751f821ffb 100644
--- a/cpp/include/raft_distance/pairwise_distance.hpp
+++ b/cpp/include/raft_runtime/distance/pairwise_distance.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,14 @@
 
 #include <raft/distance/distance_types.hpp>
 
-namespace raft::distance::runtime {
-void pairwise_distance(raft::handle_t const& handle,
+namespace raft::runtime::distance {
+
+/**
+ * @defgroup pairwise_distance_runtime Pairwise Distances Runtime API
+ * @{
+ */
+
+void pairwise_distance(raft::device_resources const& handle,
                        float* x,
                        float* y,
                        float* dists,
@@ -28,7 +34,7 @@ void pairwise_distance(raft::handle_t const& handle,
                        bool isRowMajor,
                        float metric_arg);
 
-void pairwise_distance(raft::handle_t const& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        double* x,
                        double* y,
                        double* dists,
@@ -38,4 +44,7 @@ void pairwise_distance(raft::handle_t const& handle,
                        raft::distance::DistanceType metric,
                        bool isRowMajor,
                        float metric_arg);
-}  // namespace raft::distance::runtime
\ No newline at end of file
+
+/** @} */  // end group pairwise_distance_runtime
+
+}  // namespace raft::runtime::distance
diff --git a/cpp/include/raft_runtime/neighbors/ivf_pq.hpp b/cpp/include/raft_runtime/neighbors/ivf_pq.hpp
new file mode 100644
index 0000000000..59d0b59128
--- /dev/null
+++ b/cpp/include/raft_runtime/neighbors/ivf_pq.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_INST_SEARCH(T, IdxT)                            \
+  void search(raft::device_resources const&,                 \
+              const raft::neighbors::ivf_pq::search_params&, \
+              const raft::neighbors::ivf_pq::index<IdxT>&,   \
+              const T*,                                      \
+              uint32_t,                                      \
+              uint32_t,                                      \
+              IdxT*,                                         \
+              float*,                                        \
+              rmm::mr::device_memory_resource*);
+
+RAFT_INST_SEARCH(float, uint64_t);
+RAFT_INST_SEARCH(int8_t, uint64_t);
+RAFT_INST_SEARCH(uint8_t, uint64_t);
+
+#undef RAFT_INST_SEARCH
+
+// We define overloads for build and extend with void return type. These are used in the Cython
+// wrappers, where exception handling is not compatible with a return type that has a nontrivial
+// constructor.
+#define RAFT_INST_BUILD_EXTEND(T, IdxT)                               \
+  auto build(raft::device_resources const& handle,                    \
+             const raft::neighbors::ivf_pq::index_params& params,     \
+             const T* dataset,                                        \
+             IdxT n_rows,                                             \
+             uint32_t dim)                                            \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                           \
+                                                                      \
+  auto extend(raft::device_resources const& handle,                   \
+              const raft::neighbors::ivf_pq::index<IdxT>& orig_index, \
+              const T* new_vectors,                                   \
+              const IdxT* new_indices,                                \
+              IdxT n_rows)                                            \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                           \
+                                                                      \
+  void build(raft::device_resources const& handle,                    \
+             const raft::neighbors::ivf_pq::index_params& params,     \
+             const T* dataset,                                        \
+             IdxT n_rows,                                             \
+             uint32_t dim,                                            \
+             raft::neighbors::ivf_pq::index<IdxT>* idx);              \
+                                                                      \
+  void extend(raft::device_resources const& handle,                   \
+              raft::neighbors::ivf_pq::index<IdxT>* idx,              \
+              const T* new_vectors,                                   \
+              const IdxT* new_indices,                                \
+              IdxT n_rows);
+
+RAFT_INST_BUILD_EXTEND(float, uint64_t)
+RAFT_INST_BUILD_EXTEND(int8_t, uint64_t)
+RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t)
+
+#undef RAFT_INST_BUILD_EXTEND
+
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the filename for saving the index
+ * @param[in] index IVF-PQ index
+ *
+ */
+void serialize(raft::device_resources const& handle,
+               const std::string& filename,
+               const raft::neighbors::ivf_pq::index<uint64_t>& index);
+
+/**
+ * Load index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index IVF-PQ index
+ *
+ */
+void deserialize(raft::device_resources const& handle,
+                 const std::string& filename,
+                 raft::neighbors::ivf_pq::index<uint64_t>* index);
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/specializations/refine.hpp b/cpp/include/raft_runtime/neighbors/refine.hpp
similarity index 52%
rename from cpp/include/raft/neighbors/specializations/refine.hpp
rename to cpp/include/raft_runtime/neighbors/refine.hpp
index 80b128327d..e779d17ded 100644
--- a/cpp/include/raft/neighbors/specializations/refine.hpp
+++ b/cpp/include/raft_runtime/neighbors/refine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,26 +17,26 @@
 #pragma once
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 //#include <raft/core/host_mdspan.hpp>
 
-namespace raft::neighbors {
+namespace raft::runtime::neighbors {
 
-#define RAFT_INST_REFINE(IDX_T, DATA_T)                                                 \
-  void refine(raft::handle_t const& handle,                                             \
-              raft::device_matrix_view<DATA_T, uint64_t, row_major> dataset,            \
-              raft::device_matrix_view<DATA_T, uint64_t, row_major> queries,            \
-              raft::device_matrix_view<IDX_T, uint64_t, row_major> neighbor_candidates, \
-              raft::device_matrix_view<IDX_T, uint64_t, row_major> indices,             \
-              raft::device_matrix_view<float, uint64_t, row_major> distances,           \
-              distance::DistanceType metric);                                           \
-                                                                                        \
-  void refine(raft::handle_t const& handle,                                             \
-              raft::host_matrix_view<DATA_T, uint64_t, row_major> dataset,              \
-              raft::host_matrix_view<DATA_T, uint64_t, row_major> queries,              \
-              raft::host_matrix_view<IDX_T, uint64_t, row_major> neighbor_candidates,   \
-              raft::host_matrix_view<IDX_T, uint64_t, row_major> indices,               \
-              raft::host_matrix_view<float, uint64_t, row_major> distances,             \
+#define RAFT_INST_REFINE(IDX_T, DATA_T)                                                       \
+  void refine(raft::device_resources const& handle,                                           \
+              raft::device_matrix_view<const DATA_T, uint64_t, row_major> dataset,            \
+              raft::device_matrix_view<const DATA_T, uint64_t, row_major> queries,            \
+              raft::device_matrix_view<const IDX_T, uint64_t, row_major> neighbor_candidates, \
+              raft::device_matrix_view<IDX_T, uint64_t, row_major> indices,                   \
+              raft::device_matrix_view<float, uint64_t, row_major> distances,                 \
+              distance::DistanceType metric);                                                 \
+                                                                                              \
+  void refine(raft::device_resources const& handle,                                           \
+              raft::host_matrix_view<const DATA_T, uint64_t, row_major> dataset,              \
+              raft::host_matrix_view<const DATA_T, uint64_t, row_major> queries,              \
+              raft::host_matrix_view<const IDX_T, uint64_t, row_major> neighbor_candidates,   \
+              raft::host_matrix_view<IDX_T, uint64_t, row_major> indices,                     \
+              raft::host_matrix_view<float, uint64_t, row_major> distances,                   \
               distance::DistanceType metric);
 
 RAFT_INST_REFINE(uint64_t, float);
@@ -45,4 +45,4 @@ RAFT_INST_REFINE(uint64_t, int8_t);
 
 #undef RAFT_INST_REFINE
 
-}  // namespace raft::neighbors
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/include/raft_runtime/random/rmat_rectangular_generator.hpp b/cpp/include/raft_runtime/random/rmat_rectangular_generator.hpp
new file mode 100644
index 0000000000..8f18fd1388
--- /dev/null
+++ b/cpp/include/raft_runtime/random/rmat_rectangular_generator.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/random/rng_state.hpp>
+
+namespace raft::runtime::random {
+
+/**
+ * @defgroup rmat_runtime RMAT Runtime API
+ * @{
+ */
+
+// Declares one non-template `rmat_rectangular_gen` overload for the types `IdxT` and `ProbT`.
+// Only declarations live in this header; the overloads are declared below for `int`/`int64_t`
+// as `IdxT` combined with `float`/`double` as `ProbT`.
+#define FUNC_DECL(IdxT, ProbT)                                    \
+  void rmat_rectangular_gen(raft::device_resources const& handle, \
+                            IdxT* out,                            \
+                            IdxT* out_src,                        \
+                            IdxT* out_dst,                        \
+                            const ProbT* theta,                   \
+                            IdxT r_scale,                         \
+                            IdxT c_scale,                         \
+                            IdxT n_edges,                         \
+                            raft::random::RngState& r)
+
+FUNC_DECL(int, float);
+FUNC_DECL(int64_t, float);
+FUNC_DECL(int, double);
+FUNC_DECL(int64_t, double);
+
+#undef FUNC_DECL
+
+/** @} */  // end group rmat_runtime
+
+}  // namespace raft::runtime::random
diff --git a/python/raft-dask/cmake/thirdparty/get_nccl.cmake b/cpp/internal/CMakeLists.txt
similarity index 62%
rename from python/raft-dask/cmake/thirdparty/get_nccl.cmake
rename to cpp/internal/CMakeLists.txt
index bb4b0e4dae..4d5c585c01 100644
--- a/python/raft-dask/cmake/thirdparty/get_nccl.cmake
+++ b/cpp/internal/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -12,23 +12,10 @@
 # the License.
 # =============================================================================
 
-include(rapids-find)
-function(find_and_configure_nccl)
-
-  if(TARGET NCCL::NCCL)
-    return()
-  endif()
-
-  rapids_find_generate_module(
-    NCCL
-    HEADER_NAMES nccl.h
-    LIBRARY_NAMES nccl
+if(BUILD_TESTS OR BUILD_BENCH)
+  add_library(raft_internal INTERFACE)
+  target_include_directories(
+    raft_internal INTERFACE "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/internal>"
   )
-
-  # Currently NCCL has no CMake build-system so we require it built and installed on the machine
-  # already
-  rapids_find_package(NCCL REQUIRED)
-
-endfunction()
-
-find_and_configure_nccl()
+  target_compile_features(raft_internal INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
+endif()
diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh
new file mode 100644
index 0000000000..205149b821
--- /dev/null
+++ b/cpp/internal/raft_internal/matrix/select_k.cuh
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/matrix/detail/select_radix.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/matrix/select_k.cuh>
+
+#include <raft/core/device_resources.hpp>
+
+#include <cstddef>
+#include <optional>
+#include <ostream>
+
+namespace raft::matrix::select {
+
+/**
+ * Plain aggregate bundling the shape and options of one select-k run. `batch_size`, `len`, `k`
+ * and `select_min` share their names with the arguments of `select_k_impl` below;
+ * `use_index_input` is only read by the stream operator in this header.
+ */
+struct params {
+  size_t batch_size;
+  size_t len;
+  int k;
+  bool select_min;
+  bool use_index_input = true;
+};
+
+/** Streams `ss` in the form `params{batch_size: 4, len: 10, k: 2, asc}`. */
+inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream&
+{
+  os << "params{batch_size: " << ss.batch_size;
+  os << ", len: " << ss.len;
+  os << ", k: " << ss.k;
+  os << (ss.select_min ? ", asc" : ", dsc");
+  os << (ss.use_index_input ? "}" : ", no-input-index}");
+  return os;
+}
+
+/** Identifies the select-k implementation that `select_k_impl` dispatches to. */
+enum class Algo {
+  kPublicApi,
+  kRadix8bits,
+  kRadix11bits,
+  kWarpAuto,
+  kWarpImmediate,
+  kWarpFiltered,
+  kWarpDistributed,
+  kWarpDistributedShm
+};
+
+/** Streams the enumerator name of `algo`, or "unknown enum value" for anything else. */
+inline auto operator<<(std::ostream& os, const Algo& algo) -> std::ostream&
+{
+  switch (algo) {
+    case Algo::kPublicApi: return os << "kPublicApi";
+    case Algo::kRadix8bits: return os << "kRadix8bits";
+    case Algo::kRadix11bits: return os << "kRadix11bits";
+    case Algo::kWarpAuto: return os << "kWarpAuto";
+    case Algo::kWarpImmediate: return os << "kWarpImmediate";
+    case Algo::kWarpFiltered: return os << "kWarpFiltered";
+    case Algo::kWarpDistributed: return os << "kWarpDistributed";
+    case Algo::kWarpDistributedShm: return os << "kWarpDistributedShm";
+    default: return os << "unknown enum value";
+  }
+}
+
+/**
+ * Runs select-k with the implementation chosen by `algo`.
+ *
+ * `Algo::kPublicApi` wraps the raw pointers in row-major mdspans and calls
+ * `matrix::select_k`; every other value forwards the raw pointers to the matching
+ * `detail::select` routine together with the stream obtained from `handle`.
+ *
+ * @param[in]  handle      resources object; supplies the stream
+ * @param[in]  algo        implementation to dispatch to
+ * @param[in]  in          input values, viewed as a [batch_size, len] row-major matrix
+ * @param[in]  in_idx      input indices with the same extents as `in`; when nullptr, the public
+ *                         API is called with `std::nullopt` instead
+ * @param[in]  batch_size  number of rows of the input and the output
+ * @param[in]  len         number of columns of the input
+ * @param[in]  k           number of columns of the output
+ * @param[out] out         output values, viewed as [batch_size, k]
+ * @param[out] out_idx     output indices, viewed as [batch_size, k]
+ * @param[in]  select_min  passed unchanged to the selected implementation
+ */
+template <typename T, typename IdxT>
+void select_k_impl(const device_resources& handle,
+                   const Algo& algo,
+                   const T* in,
+                   const IdxT* in_idx,
+                   size_t batch_size,
+                   size_t len,
+                   int k,
+                   T* out,
+                   IdxT* out_idx,
+                   bool select_min)
+{
+  auto stream = handle.get_stream();
+  switch (algo) {
+    case Algo::kPublicApi: {
+      auto in_extent   = make_extents<size_t>(batch_size, len);
+      auto out_extent  = make_extents<size_t>(batch_size, k);
+      auto in_span     = make_mdspan<const T, size_t, row_major, false, true>(in, in_extent);
+      auto in_idx_span = make_mdspan<const IdxT, size_t, row_major, false, true>(in_idx, in_extent);
+      auto out_span    = make_mdspan<T, size_t, row_major, false, true>(out, out_extent);
+      auto out_idx_span = make_mdspan<IdxT, size_t, row_major, false, true>(out_idx, out_extent);
+      if (in_idx == nullptr) {
+        // NB: std::nullopt prevents automatic inference of the template parameters.
+        return matrix::select_k<T, IdxT>(
+          handle, in_span, std::nullopt, out_span, out_idx_span, select_min);
+      } else {
+        return matrix::select_k(
+          handle, in_span, std::make_optional(in_idx_span), out_span, out_idx_span, select_min);
+      }
+    }
+    case Algo::kRadix8bits:
+      return detail::select::radix::select_k<T, IdxT, 8, 512>(
+        in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kRadix11bits:
+      return detail::select::radix::select_k<T, IdxT, 11, 512>(
+        in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kWarpAuto:
+      return detail::select::warpsort::select_k<T, IdxT>(
+        in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kWarpImmediate:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_immediate>(
+          in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kWarpFiltered:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_filtered>(
+          in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kWarpDistributed:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed>(
+          in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+    case Algo::kWarpDistributedShm:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
+          in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
+  }
+}
+
+}  // namespace raft::matrix::select
diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
new file mode 100644
index 0000000000..3ad055272b
--- /dev/null
+++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/error.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+namespace raft::neighbors {
+
+/**
+ * Computes all pairwise distances between the rows of `x` and the rows of `y`.
+ *
+ * The x-dimension of the launch grid covers the rows of `x` (one thread per row); the
+ * y-dimension strides over the rows of `y`.
+ *
+ * Metric handling: `InnerProduct` accumulates the products of the components, the four L2
+ * variants accumulate squared differences (with a final square root for the two `L2Sqrt*`
+ * variants); for any other metric the accumulator stays zero.
+ *
+ * @param[out] dist   row-major [m, n] distance matrix
+ * @param[in]  x      row-major [m, k] matrix
+ * @param[in]  y      row-major [n, k] matrix
+ * @param[in]  m      number of rows in `x`
+ * @param[in]  n      number of rows in `y`
+ * @param[in]  k      number of components per row
+ * @param[in]  metric distance to evaluate
+ */
+template <typename EvalT, typename DataT, typename IdxT>
+__global__ void naive_distance_kernel(EvalT* dist,
+                                      const DataT* x,
+                                      const DataT* y,
+                                      IdxT m,
+                                      IdxT n,
+                                      IdxT k,
+                                      raft::distance::DistanceType metric)
+{
+  IdxT midx = IdxT(threadIdx.x) + IdxT(blockIdx.x) * IdxT(blockDim.x);
+  if (midx >= m) return;
+  IdxT grid_size = IdxT(blockDim.y) * IdxT(gridDim.y);
+  for (IdxT nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n; nidx += grid_size) {
+    EvalT acc = EvalT(0);
+    for (IdxT i = 0; i < k; ++i) {
+      IdxT xidx = i + midx * k;
+      IdxT yidx = i + nidx * k;
+      auto xv   = EvalT(x[xidx]);
+      auto yv   = EvalT(y[yidx]);
+      switch (metric) {
+        case raft::distance::DistanceType::InnerProduct: {
+          acc += xv * yv;
+        } break;
+        case raft::distance::DistanceType::L2SqrtExpanded:
+        case raft::distance::DistanceType::L2SqrtUnexpanded:
+        case raft::distance::DistanceType::L2Expanded:
+        case raft::distance::DistanceType::L2Unexpanded: {
+          auto diff = xv - yv;
+          acc += diff * diff;
+        } break;
+        default: break;
+      }
+    }
+    switch (metric) {
+      case raft::distance::DistanceType::L2SqrtExpanded:
+      case raft::distance::DistanceType::L2SqrtUnexpanded: {
+        acc = raft::sqrt(acc);
+      } break;
+      default: break;
+    }
+    dist[midx * n + nidx] = acc;
+  }
+}
+
+/**
+ * Naive, but flexible bruteforce KNN search.
+ *
+ * TODO: either replace this with brute_force_knn or with distance+select_k
+ *       when either distance or brute_force_knn support 8-bit int inputs.
+ *
+ * The distance matrix is evaluated in batches of rows of `x` so that the temporary buffer stays
+ * at about 2^27 elements at most; the stream is synchronized before returning.
+ *
+ * @param[out] dist_topk     row-major [n_inputs, k] distances of the selected neighbors
+ * @param[out] indices_topk  row-major [n_inputs, k] indices of the selected neighbors
+ * @param[in]  x             row-major [n_inputs, dim] matrix
+ * @param[in]  y             row-major [input_len, dim] matrix; `input_len` must be positive
+ * @param[in]  n_inputs      number of rows in `x`
+ * @param[in]  input_len     number of rows in `y`
+ * @param[in]  dim           number of components per row
+ * @param[in]  k             number of neighbors to select per row of `x`
+ * @param[in]  type          metric; the largest values are selected for `InnerProduct`, the
+ *                           smallest ones for every other metric
+ * @param[in]  stream        stream used for the allocation, the kernel and the selection
+ */
+template <typename EvalT, typename DataT, typename IdxT>
+void naive_knn(EvalT* dist_topk,
+               IdxT* indices_topk,
+               const DataT* x,
+               const DataT* y,
+               size_t n_inputs,
+               size_t input_len,
+               size_t dim,
+               uint32_t k,
+               raft::distance::DistanceType type,
+               rmm::cuda_stream_view stream)
+{
+  RAFT_EXPECTS(input_len > 0, "naive_knn: the dataset `y` must contain at least one row");
+
+  rmm::mr::device_memory_resource* mr = nullptr;
+  auto pool_guard                     = raft::get_pool_memory_resource(mr, 1024 * 1024);
+
+  dim3 block_dim(16, 32, 1);
+  // maximum reasonable grid size in `y` direction
+  auto grid_y =
+    static_cast<uint16_t>(std::min<size_t>(raft::ceildiv<size_t>(input_len, block_dim.y), 32768));
+
+  // bound the memory used by this function
+  size_t max_batch_size =
+    std::min<size_t>(n_inputs, raft::ceildiv<size_t>(size_t(1) << size_t(27), input_len));
+  rmm::device_uvector<EvalT> dist(max_batch_size * input_len, stream, mr);
+
+  for (size_t offset = 0; offset < n_inputs; offset += max_batch_size) {
+    size_t batch_size = std::min(max_batch_size, n_inputs - offset);
+    dim3 grid_dim(raft::ceildiv<size_t>(batch_size, block_dim.x), grid_y, 1);
+
+    naive_distance_kernel<EvalT, DataT, IdxT><<<grid_dim, block_dim, 0, stream>>>(
+      dist.data(), x + offset * dim, y, batch_size, input_len, dim, type);
+    // Report launch failures here instead of at a later, unrelated CUDA call.
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    matrix::detail::select_k<EvalT, IdxT>(dist.data(),
+                                          nullptr,
+                                          batch_size,
+                                          input_len,
+                                          static_cast<int>(k),
+                                          dist_topk + offset * k,
+                                          indices_topk + offset * k,
+                                          type != raft::distance::DistanceType::InnerProduct,
+                                          stream,
+                                          mr);
+  }
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+}
+
+}  // namespace raft::neighbors
diff --git a/cpp/test/neighbors/refine_helper.cuh b/cpp/internal/raft_internal/neighbors/refine_helper.cuh
similarity index 72%
rename from cpp/test/neighbors/refine_helper.cuh
rename to cpp/internal/raft_internal/neighbors/refine_helper.cuh
index 3c69a8f5b7..5e26222827 100644
--- a/cpp/test/neighbors/refine_helper.cuh
+++ b/cpp/internal/raft_internal/neighbors/refine_helper.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,19 +15,20 @@
  */
 #pragma once
 
-#include "ann_utils.cuh"
+#include <raft_internal/neighbors/naive_knn.cuh>
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/random/rng.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
-namespace raft::neighbors::detail {
+namespace raft::neighbors {
 
 template <typename IdxT>
 struct RefineInputs {
@@ -44,7 +45,7 @@ struct RefineInputs {
 template <typename DataT, typename DistanceT, typename IdxT>
 class RefineHelper {
  public:
-  RefineHelper(const raft::handle_t& handle, RefineInputs<IdxT> params)
+  RefineHelper(const raft::device_resources& handle, RefineInputs<IdxT> params)
     : handle_(handle), stream_(handle.get_stream()), p(params)
   {
     raft::random::Rng r(1234ULL);
@@ -66,16 +67,16 @@ class RefineHelper {
     {
       candidates = raft::make_device_matrix<IdxT, IdxT>(handle_, p.n_queries, p.k0);
       rmm::device_uvector<DistanceT> distances_tmp(p.n_queries * p.k0, stream_);
-      raft::neighbors::naiveBfKnn<DistanceT, DataT, IdxT>(distances_tmp.data(),
-                                                          candidates.data_handle(),
-                                                          queries.data_handle(),
-                                                          dataset.data_handle(),
-                                                          p.n_queries,
-                                                          p.n_rows,
-                                                          p.dim,
-                                                          p.k0,
-                                                          p.metric,
-                                                          stream_);
+      naive_knn<DistanceT, DataT, IdxT>(distances_tmp.data(),
+                                        candidates.data_handle(),
+                                        queries.data_handle(),
+                                        dataset.data_handle(),
+                                        p.n_queries,
+                                        p.n_rows,
+                                        p.dim,
+                                        p.k0,
+                                        p.metric,
+                                        stream_);
       handle_.sync_stream(stream_);
     }
 
@@ -98,16 +99,16 @@ class RefineHelper {
     {
       rmm::device_uvector<DistanceT> distances_dev(p.n_queries * p.k, stream_);
       rmm::device_uvector<IdxT> indices_dev(p.n_queries * p.k, stream_);
-      raft::neighbors::naiveBfKnn<DistanceT, DataT, IdxT>(distances_dev.data(),
-                                                          indices_dev.data(),
-                                                          queries.data_handle(),
-                                                          dataset.data_handle(),
-                                                          p.n_queries,
-                                                          p.n_rows,
-                                                          p.dim,
-                                                          p.k,
-                                                          p.metric,
-                                                          stream_);
+      naive_knn<DistanceT, DataT, IdxT>(distances_dev.data(),
+                                        indices_dev.data(),
+                                        queries.data_handle(),
+                                        dataset.data_handle(),
+                                        p.n_queries,
+                                        p.n_rows,
+                                        p.dim,
+                                        p.k,
+                                        p.metric,
+                                        stream_);
       true_refined_distances_host.resize(p.n_queries * p.k);
       true_refined_indices_host.resize(p.n_queries * p.k);
       raft::copy(true_refined_indices_host.data(), indices_dev.data(), indices_dev.size(), stream_);
@@ -119,7 +120,7 @@ class RefineHelper {
 
  public:
   RefineInputs<IdxT> p;
-  const raft::handle_t& handle_;
+  const raft::device_resources& handle_;
   rmm::cuda_stream_view stream_;
 
   raft::device_matrix<DataT, IdxT, row_major> dataset;
@@ -137,4 +138,4 @@ class RefineHelper {
   std::vector<IdxT> true_refined_indices_host;
   std::vector<DistanceT> true_refined_distances_host;
 };
-}  // namespace raft::neighbors::detail
\ No newline at end of file
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/cluster_cost.cuh b/cpp/src/distance/cluster/cluster_cost.cuh
similarity index 75%
rename from cpp/src/distance/cluster_cost.cuh
rename to cpp/src/distance/cluster/cluster_cost.cuh
index 344673830b..be7fa521aa 100644
--- a/cpp/src/distance/cluster_cost.cuh
+++ b/cpp/src/distance/cluster/cluster_cost.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
-#include <raft/handle.hpp>
+#include <raft/util/cuda_utils.cuh>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 template <typename ElementType, typename IndexType>
-void cluster_cost(const raft::handle_t& handle,
+void cluster_cost(raft::device_resources const& handle,
                   const ElementType* X,
                   IndexType n_samples,
                   IndexType n_features,
@@ -59,21 +61,19 @@ void cluster_cost(const raft::handle_t& handle,
                                      handle.get_stream());
 
   auto distances = raft::make_device_vector<ElementType, IndexType>(handle, n_samples);
-  thrust::transform(
-    handle.get_thrust_policy(),
-    min_cluster_distance.data_handle(),
-    min_cluster_distance.data_handle() + n_samples,
-    distances.data_handle(),
-    [] __device__(const raft::KeyValuePair<IndexType, ElementType>& a) { return a.value; });
+  thrust::transform(handle.get_thrust_policy(),
+                    min_cluster_distance.data_handle(),
+                    min_cluster_distance.data_handle() + n_samples,
+                    distances.data_handle(),
+                    raft::value_op{});
 
   rmm::device_scalar<ElementType> device_cost(0, handle.get_stream());
-  raft::cluster::kmeans::cluster_cost(
-    handle,
-    distances.view(),
-    workspace,
-    make_device_scalar_view<ElementType>(device_cost.data()),
-    [] __device__(const ElementType& a, const ElementType& b) { return a + b; });
+  raft::cluster::kmeans::cluster_cost(handle,
+                                      distances.view(),
+                                      workspace,
+                                      make_device_scalar_view<ElementType>(device_cost.data()),
+                                      raft::add_op{});
 
   raft::update_host(cost, device_cost.data(), 1, handle.get_stream());
 }
-}  // namespace raft::cluster::kmeans::runtime
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/distance/cluster_cost_double.cu b/cpp/src/distance/cluster/cluster_cost_double.cu
similarity index 80%
rename from cpp/src/distance/cluster_cost_double.cu
rename to cpp/src/distance/cluster/cluster_cost_double.cu
index b811b0bf8d..6feb3076dd 100644
--- a/cpp/src/distance/cluster_cost_double.cu
+++ b/cpp/src/distance/cluster/cluster_cost_double.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@
  */
 
 #include "cluster_cost.cuh"
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 
-void cluster_cost(const raft::handle_t& handle,
+void cluster_cost(raft::device_resources const& handle,
                   const double* X,
                   int n_samples,
                   int n_features,
@@ -31,4 +31,4 @@ void cluster_cost(const raft::handle_t& handle,
 {
   cluster_cost<double, int>(handle, X, n_samples, n_features, n_clusters, centroids, cost);
 }
-}  // namespace raft::cluster::kmeans::runtime
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/distance/cluster_cost_float.cu b/cpp/src/distance/cluster/cluster_cost_float.cu
similarity index 80%
rename from cpp/src/distance/cluster_cost_float.cu
rename to cpp/src/distance/cluster/cluster_cost_float.cu
index d78ea446da..24af5efb25 100644
--- a/cpp/src/distance/cluster_cost_float.cu
+++ b/cpp/src/distance/cluster/cluster_cost_float.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@
  */
 
 #include "cluster_cost.cuh"
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 
-void cluster_cost(const raft::handle_t& handle,
+void cluster_cost(raft::device_resources const& handle,
                   const float* X,
                   int n_samples,
                   int n_features,
@@ -31,4 +31,4 @@ void cluster_cost(const raft::handle_t& handle,
 {
   cluster_cost<float, int>(handle, X, n_samples, n_features, n_clusters, centroids, cost);
 }
-}  // namespace raft::cluster::kmeans::runtime
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/distance/cluster/kmeans_fit_double.cu b/cpp/src/distance/cluster/kmeans_fit_double.cu
new file mode 100644
index 0000000000..cbc9fa45cb
--- /dev/null
+++ b/cpp/src/distance/cluster/kmeans_fit_double.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <optional>
+
+#include <raft/cluster/kmeans.cuh>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/distance/specializations.cuh>
+
+namespace raft::runtime::cluster::kmeans {
+
+// Non-template overload for double data with int indexing: forwards all arguments unchanged to
+// the templated `raft::cluster::kmeans::fit<double, int>`.
+void fit(raft::device_resources const& handle,
+         const raft::cluster::kmeans::KMeansParams& params,
+         raft::device_matrix_view<const double, int> X,
+         std::optional<raft::device_vector_view<const double, int>> sample_weight,
+         raft::device_matrix_view<double, int> centroids,
+         raft::host_scalar_view<double, int> inertia,
+         raft::host_scalar_view<int, int> n_iter)
+{
+  raft::cluster::kmeans::fit<double, int>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/distance/cluster/kmeans_fit_float.cu b/cpp/src/distance/cluster/kmeans_fit_float.cu
new file mode 100644
index 0000000000..6dcbd73b8d
--- /dev/null
+++ b/cpp/src/distance/cluster/kmeans_fit_float.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <optional>
+
+#include <raft/cluster/kmeans.cuh>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/distance/specializations.cuh>
+
+namespace raft::runtime::cluster::kmeans {
+
+// Non-template overload for float data with int indexing: forwards all arguments unchanged to
+// the templated `raft::cluster::kmeans::fit<float, int>`.
+void fit(raft::device_resources const& handle,
+         const raft::cluster::kmeans::KMeansParams& params,
+         raft::device_matrix_view<const float, int> X,
+         std::optional<raft::device_vector_view<const float, int>> sample_weight,
+         raft::device_matrix_view<float, int> centroids,
+         raft::host_scalar_view<float, int> inertia,
+         raft::host_scalar_view<int, int> n_iter)
+{
+  raft::cluster::kmeans::fit<float, int>(
+    handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/distance/update_centroids.cuh b/cpp/src/distance/cluster/update_centroids.cuh
similarity index 92%
rename from cpp/src/distance/update_centroids.cuh
rename to cpp/src/distance/cluster/update_centroids.cuh
index 91f3e1e2a3..e0d3bd8487 100644
--- a/cpp/src/distance/update_centroids.cuh
+++ b/cpp/src/distance/cluster/update_centroids.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,15 +15,15 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 #include <raft/linalg/norm.cuh>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 
 template <typename DataT, typename IndexT>
-void update_centroids(raft::handle_t const& handle,
+void update_centroids(raft::device_resources const& handle,
                       const DataT* X,
                       int n_samples,
                       int n_features,
@@ -68,4 +68,4 @@ void update_centroids(raft::handle_t const& handle,
                                                          weight_per_cluster_view,
                                                          new_centroids_view);
 }
-}  // namespace raft::cluster::kmeans::runtime
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
diff --git a/cpp/src/distance/update_centroids_double.cu b/cpp/src/distance/cluster/update_centroids_double.cu
similarity index 86%
rename from cpp/src/distance/update_centroids_double.cu
rename to cpp/src/distance/cluster/update_centroids_double.cu
index fe741ddb78..cb63de2ca5 100644
--- a/cpp/src/distance/update_centroids_double.cu
+++ b/cpp/src/distance/cluster/update_centroids_double.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@
  */
 
 #include "update_centroids.cuh"
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 
-void update_centroids(raft::handle_t const& handle,
+void update_centroids(raft::device_resources const& handle,
                       const double* X,
                       int n_samples,
                       int n_features,
@@ -44,4 +44,4 @@ void update_centroids(raft::handle_t const& handle,
                                 weight_per_cluster);
 }
 
-}  // namespace raft::cluster::kmeans::runtime
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
diff --git a/cpp/src/distance/update_centroids_float.cu b/cpp/src/distance/cluster/update_centroids_float.cu
similarity index 86%
rename from cpp/src/distance/update_centroids_float.cu
rename to cpp/src/distance/cluster/update_centroids_float.cu
index ebb06376ff..7ce74b584c 100644
--- a/cpp/src/distance/update_centroids_float.cu
+++ b/cpp/src/distance/cluster/update_centroids_float.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,13 @@
  */
 
 #include "update_centroids.cuh"
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 
-namespace raft::cluster::kmeans::runtime {
+namespace raft::runtime::cluster::kmeans {
 
-void update_centroids(raft::handle_t const& handle,
+void update_centroids(raft::device_resources const& handle,
                       const float* X,
                       int n_samples,
                       int n_features,
@@ -44,4 +44,4 @@ void update_centroids(raft::handle_t const& handle,
                                weight_per_cluster);
 }
 
-}  // namespace raft::cluster::kmeans::runtime
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/distance/fused_l2_min_arg.cu
similarity index 74%
rename from cpp/src/distance/fused_l2_min_arg.cu
rename to cpp/src/distance/distance/fused_l2_min_arg.cu
index c722b5a566..b682446cc2 100644
--- a/cpp/src/distance/fused_l2_min_arg.cu
+++ b/cpp/src/distance/distance/fused_l2_min_arg.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
@@ -23,7 +23,7 @@
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
 
-namespace raft::distance::runtime {
+namespace raft::runtime::distance {
 
 template <typename IndexT, typename DataT>
 struct KeyValueIndexOp {
@@ -35,7 +35,7 @@ struct KeyValueIndexOp {
 };
 
 template <typename value_t, typename idx_t>
-void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
+void compute_fused_l2_nn_min_arg(raft::device_resources const& handle,
                                  idx_t* min,
                                  const value_t* x,
                                  const value_t* y,
@@ -52,18 +52,18 @@ void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
   raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream());
   raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream());
 
-  fusedL2NNMinReduce(kvp.data_handle(),
-                     x,
-                     y,
-                     x_norms.data(),
-                     y_norms.data(),
-                     m,
-                     n,
-                     k,
-                     (void*)workspace.data(),
-                     sqrt,
-                     true,
-                     handle.get_stream());
+  raft::distance::fusedL2NNMinReduce(kvp.data_handle(),
+                                     x,
+                                     y,
+                                     x_norms.data(),
+                                     y_norms.data(),
+                                     m,
+                                     n,
+                                     k,
+                                     (void*)workspace.data(),
+                                     sqrt,
+                                     true,
+                                     handle.get_stream());
 
   KeyValueIndexOp<idx_t, value_t> conversion_op;
   thrust::transform(
@@ -71,7 +71,7 @@ void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
   handle.sync_stream();
 }
 
-void fused_l2_nn_min_arg(raft::handle_t const& handle,
+void fused_l2_nn_min_arg(raft::device_resources const& handle,
                          int* min,
                          const float* x,
                          const float* y,
@@ -83,7 +83,7 @@ void fused_l2_nn_min_arg(raft::handle_t const& handle,
   compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
 }
 
-void fused_l2_nn_min_arg(raft::handle_t const& handle,
+void fused_l2_nn_min_arg(raft::device_resources const& handle,
                          int* min,
                          const double* x,
                          const double* y,
@@ -95,4 +95,4 @@ void fused_l2_nn_min_arg(raft::handle_t const& handle,
   compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
 }
 
-}  // end namespace raft::distance::runtime
\ No newline at end of file
+}  // end namespace raft::runtime::distance
\ No newline at end of file
diff --git a/cpp/src/distance/pairwise_distance.cu b/cpp/src/distance/distance/pairwise_distance.cu
similarity index 84%
rename from cpp/src/distance/pairwise_distance.cu
rename to cpp/src/distance/distance/pairwise_distance.cu
index 71133c5f84..dfdfa553e9 100644
--- a/cpp/src/distance/pairwise_distance.cu
+++ b/cpp/src/distance/distance/pairwise_distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
+#include <raft/core/device_resources.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/specializations.cuh>
-#include <raft/handle.hpp>
 
-namespace raft::distance::runtime {
+namespace raft::runtime::distance {
 
-void pairwise_distance(raft::handle_t const& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        float* x,
                        float* y,
                        float* dists,
@@ -36,7 +36,7 @@ void pairwise_distance(raft::handle_t const& handle,
     handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg);
 }
 
-void pairwise_distance(raft::handle_t const& handle,
+void pairwise_distance(raft::device_resources const& handle,
                        double* x,
                        double* y,
                        double* dists,
@@ -50,4 +50,4 @@ void pairwise_distance(raft::handle_t const& handle,
   raft::distance::pairwise_distance<double, int>(
     handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg);
 }
-}  // namespace raft::distance::runtime
\ No newline at end of file
+}  // namespace raft::runtime::distance
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/canberra.cu b/cpp/src/distance/distance/specializations/detail/canberra.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/canberra.cu
rename to cpp/src/distance/distance/specializations/detail/canberra.cu
index b2dd993ab7..3a81b35a46 100644
--- a/cpp/src/distance/specializations/detail/canberra.cu
+++ b/cpp/src/distance/distance/specializations/detail/canberra.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/chebyshev.cu b/cpp/src/distance/distance/specializations/detail/chebyshev.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/chebyshev.cu
rename to cpp/src/distance/distance/specializations/detail/chebyshev.cu
index ab310515bd..7406265e97 100644
--- a/cpp/src/distance/specializations/detail/chebyshev.cu
+++ b/cpp/src/distance/distance/specializations/detail/chebyshev.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/correlation.cu b/cpp/src/distance/distance/specializations/detail/correlation.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/correlation.cu
rename to cpp/src/distance/distance/specializations/detail/correlation.cu
index 04b9e5bf69..0e2e55ecfb 100644
--- a/cpp/src/distance/specializations/detail/correlation.cu
+++ b/cpp/src/distance/distance/specializations/detail/correlation.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/cosine.cu b/cpp/src/distance/distance/specializations/detail/cosine.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/cosine.cu
rename to cpp/src/distance/distance/specializations/detail/cosine.cu
index bc19599511..efbdfbcd36 100644
--- a/cpp/src/distance/specializations/detail/cosine.cu
+++ b/cpp/src/distance/distance/specializations/detail/cosine.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/hamming_unexpanded.cu b/cpp/src/distance/distance/specializations/detail/hamming_unexpanded.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/hamming_unexpanded.cu
rename to cpp/src/distance/distance/specializations/detail/hamming_unexpanded.cu
index e5e66e85bd..ec46ae4383 100644
--- a/cpp/src/distance/specializations/detail/hamming_unexpanded.cu
+++ b/cpp/src/distance/distance/specializations/detail/hamming_unexpanded.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/hellinger_expanded.cu b/cpp/src/distance/distance/specializations/detail/hellinger_expanded.cu
similarity index 97%
rename from cpp/src/distance/specializations/detail/hellinger_expanded.cu
rename to cpp/src/distance/distance/specializations/detail/hellinger_expanded.cu
index fa9b8e14d6..8bf8ed8b5a 100644
--- a/cpp/src/distance/specializations/detail/hellinger_expanded.cu
+++ b/cpp/src/distance/distance/specializations/detail/hellinger_expanded.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_double.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_double.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_float.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_float.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
rename to cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu
diff --git a/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l1_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l1_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l1_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l1_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l1_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l1_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/l1_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/l1_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l1_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/l1_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu
diff --git a/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu
diff --git a/cpp/src/distance/specializations/detail/russel_rao_float_float_float_uint32.cu b/cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_uint32.cu
similarity index 100%
rename from cpp/src/distance/specializations/detail/russel_rao_float_float_float_uint32.cu
rename to cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_uint32.cu
diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int.cu b/cpp/src/distance/distance/specializations/fused_l2_nn_double_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/fused_l2_nn_double_int.cu
rename to cpp/src/distance/distance/specializations/fused_l2_nn_double_int.cu
diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu b/cpp/src/distance/distance/specializations/fused_l2_nn_double_int64.cu
similarity index 100%
rename from cpp/src/distance/specializations/fused_l2_nn_double_int64.cu
rename to cpp/src/distance/distance/specializations/fused_l2_nn_double_int64.cu
diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int.cu b/cpp/src/distance/distance/specializations/fused_l2_nn_float_int.cu
similarity index 100%
rename from cpp/src/distance/specializations/fused_l2_nn_float_int.cu
rename to cpp/src/distance/distance/specializations/fused_l2_nn_float_int.cu
diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu b/cpp/src/distance/distance/specializations/fused_l2_nn_float_int64.cu
similarity index 100%
rename from cpp/src/distance/specializations/fused_l2_nn_float_int64.cu
rename to cpp/src/distance/distance/specializations/fused_l2_nn_float_int64.cu
diff --git a/cpp/src/distance/neighbors/ivfpq_build.cu b/cpp/src/distance/neighbors/ivfpq_build.cu
new file mode 100644
index 0000000000..7e6b12fb80
--- /dev/null
+++ b/cpp/src/distance/neighbors/ivfpq_build.cu
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+/**
+ * Emits the runtime `build` and `extend` entry points for one (T, IdxT) pair.
+ *
+ * Four functions are generated per instantiation:
+ *  - `build` returning a new index by value,
+ *  - `extend` taking a const index reference and returning an index by value,
+ *  - `build` writing its result through the `idx` out-pointer,
+ *  - `extend` passing the `idx` pointer on to the underlying call.
+ * Each forwards to the same-named raft::neighbors::ivf_pq template; the pointer
+ * overloads fail via RAFT_FAIL when `idx` is null.
+ */
+#define RAFT_INST_BUILD_EXTEND(T, IdxT)                                                      \
+  auto build(raft::device_resources const& handle,                                           \
+             const raft::neighbors::ivf_pq::index_params& params,                            \
+             const T* dataset,                                                               \
+             IdxT n_rows,                                                                    \
+             uint32_t dim)                                                                   \
+    ->raft::neighbors::ivf_pq::index<IdxT>                                                   \
+  {                                                                                          \
+    return raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset, n_rows, dim);    \
+  }                                                                                          \
+  auto extend(raft::device_resources const& handle,                                          \
+              const raft::neighbors::ivf_pq::index<IdxT>& orig_index,                        \
+              const T* new_vectors,                                                          \
+              const IdxT* new_indices,                                                       \
+              IdxT n_rows)                                                                   \
+    ->raft::neighbors::ivf_pq::index<IdxT>                                                   \
+  {                                                                                          \
+    return raft::neighbors::ivf_pq::extend<T, IdxT>(                                         \
+      handle, orig_index, new_vectors, new_indices, n_rows);                                 \
+  }                                                                                          \
+                                                                                             \
+  void build(raft::device_resources const& handle,                                           \
+             const raft::neighbors::ivf_pq::index_params& params,                            \
+             const T* dataset,                                                               \
+             IdxT n_rows,                                                                    \
+             uint32_t dim,                                                                   \
+             raft::neighbors::ivf_pq::index<IdxT>* idx)                                      \
+  {                                                                                          \
+    if (!idx) { RAFT_FAIL("Invalid index pointer"); }                                        \
+    *idx = raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset, n_rows, dim);    \
+  }                                                                                          \
+  void extend(raft::device_resources const& handle,                                          \
+              raft::neighbors::ivf_pq::index<IdxT>* idx,                                     \
+              const T* new_vectors,                                                          \
+              const IdxT* new_indices,                                                       \
+              IdxT n_rows)                                                                   \
+  {                                                                                          \
+    if (!idx) { RAFT_FAIL("Invalid index pointer"); }                                        \
+    raft::neighbors::ivf_pq::extend<T, IdxT>(handle, idx, new_vectors, new_indices, n_rows); \
+  }
+
+RAFT_INST_BUILD_EXTEND(float, uint64_t);
+RAFT_INST_BUILD_EXTEND(int8_t, uint64_t);
+RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t);
+
+#undef RAFT_INST_BUILD_EXTEND
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/distance/neighbors/ivfpq_deserialize.cu b/cpp/src/distance/neighbors/ivfpq_deserialize.cu
new file mode 100644
index 0000000000..e7ad77eef2
--- /dev/null
+++ b/cpp/src/distance/neighbors/ivfpq_deserialize.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+/**
+ * Runtime entry point that assigns the result of
+ * detail::deserialize<uint64_t>(handle, filename) to `*index`.
+ *
+ * @param[in]  handle   resources forwarded to the underlying deserialize call
+ * @param[in]  filename forwarded unchanged to the underlying deserialize call
+ * @param[out] index    destination overwritten with the deserialized index
+ *
+ * Fails via RAFT_FAIL when `index` is null.
+ */
+void deserialize(raft::device_resources const& handle,
+                 const std::string& filename,
+                 raft::neighbors::ivf_pq::index<uint64_t>* index)
+{
+  if (!index) { RAFT_FAIL("Invalid index pointer"); }
+  // NOTE(review): this reaches into a `detail` namespace; confirm no public equivalent exists.
+  *index = raft::spatial::knn::ivf_pq::detail::deserialize<uint64_t>(handle, filename);
+}
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/distance/neighbors/ivfpq_search_float_uint64_t.cu b/cpp/src/distance/neighbors/ivfpq_search_float_uint64_t.cu
new file mode 100644
index 0000000000..c463aa9845
--- /dev/null
+++ b/cpp/src/distance/neighbors/ivfpq_search_float_uint64_t.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_SEARCH_INST(T, IdxT)                                            \
+  void search(raft::device_resources const& handle,                          \
+              const raft::neighbors::ivf_pq::search_params& params,          \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,               \
+              const T* queries,                                              \
+              uint32_t n_queries,                                            \
+              uint32_t k,                                                    \
+              IdxT* neighbors,                                               \
+              float* distances,                                              \
+              rmm::mr::device_memory_resource* mr)                           \
+  {                                                                          \
+    raft::neighbors::ivf_pq::search<T, IdxT>(                                \
+      handle, params, idx, queries, n_queries, k, neighbors, distances, mr); \
+  }
+
+RAFT_SEARCH_INST(float, uint64_t);  // float queries, uint64_t neighbor ids
+
+#undef RAFT_SEARCH_INST  // must name the macro defined above so it does not leak
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/distance/neighbors/ivfpq_search_int8_t_uint64_t.cu b/cpp/src/distance/neighbors/ivfpq_search_int8_t_uint64_t.cu
new file mode 100644
index 0000000000..ab0dd576b9
--- /dev/null
+++ b/cpp/src/distance/neighbors/ivfpq_search_int8_t_uint64_t.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_SEARCH_INST(T, IdxT)                                            \
+  void search(raft::device_resources const& handle,                          \
+              const raft::neighbors::ivf_pq::search_params& params,          \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,               \
+              const T* queries,                                              \
+              uint32_t n_queries,                                            \
+              uint32_t k,                                                    \
+              IdxT* neighbors,                                               \
+              float* distances,                                              \
+              rmm::mr::device_memory_resource* mr)                           \
+  {                                                                          \
+    raft::neighbors::ivf_pq::search<T, IdxT>(                                \
+      handle, params, idx, queries, n_queries, k, neighbors, distances, mr); \
+  }
+
+RAFT_SEARCH_INST(int8_t, uint64_t);  // int8_t queries, uint64_t neighbor ids
+
+#undef RAFT_SEARCH_INST  // must name the macro defined above so it does not leak
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search.cu b/cpp/src/distance/neighbors/ivfpq_search_uint8_t_uint64_t.cu
similarity index 59%
rename from cpp/src/nn/specializations/detail/ivfpq_search.cu
rename to cpp/src/distance/neighbors/ivfpq_search_uint8_t_uint64_t.cu
index 80bf589803..2a745eb37d 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search.cu
+++ b/cpp/src/distance/neighbors/ivfpq_search_uint8_t_uint64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,28 +16,27 @@
 
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations/ivf_pq_specialization.hpp>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
 
-namespace raft::neighbors::ivf_pq {
+namespace raft::runtime::neighbors::ivf_pq {
 
-#define RAFT_SEARCH_INST(T, IdxT)                                                          \
-  void search(const handle_t& handle,                                                      \
-              const search_params& params,                                                 \
-              const index<IdxT>& idx,                                                      \
-              const T* queries,                                                            \
-              uint32_t n_queries,                                                          \
-              uint32_t k,                                                                  \
-              IdxT* neighbors,                                                             \
-              float* distances,                                                            \
-              rmm::mr::device_memory_resource* mr)                                         \
-  {                                                                                        \
-    search<T, IdxT>(handle, params, idx, queries, n_queries, k, neighbors, distances, mr); \
+#define RAFT_SEARCH_INST(T, IdxT)                                            \
+  void search(raft::device_resources const& handle,                          \
+              const raft::neighbors::ivf_pq::search_params& params,          \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,               \
+              const T* queries,                                              \
+              uint32_t n_queries,                                            \
+              uint32_t k,                                                    \
+              IdxT* neighbors,                                               \
+              float* distances,                                              \
+              rmm::mr::device_memory_resource* mr)                           \
+  {                                                                          \
+    raft::neighbors::ivf_pq::search<T, IdxT>(                                \
+      handle, params, idx, queries, n_queries, k, neighbors, distances, mr); \
   }
 
-RAFT_SEARCH_INST(float, uint64_t);
-RAFT_SEARCH_INST(int8_t, uint64_t);
 RAFT_SEARCH_INST(uint8_t, uint64_t);
 
 #undef RAFT_INST_SEARCH
 
-}  // namespace raft::neighbors::ivf_pq
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/distance/neighbors/ivfpq_serialize.cu b/cpp/src/distance/neighbors/ivfpq_serialize.cu
new file mode 100644
index 0000000000..706c344993
--- /dev/null
+++ b/cpp/src/distance/neighbors/ivfpq_serialize.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+void serialize(raft::device_resources const& handle,  // writes `index` out under `filename`
+               const std::string& filename,
+               const raft::neighbors::ivf_pq::index<uint64_t>& index)
+{
+  raft::spatial::knn::ivf_pq::detail::serialize(handle, filename, index);
+}
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/distance/neighbors/refine_d_uint64_t_float.cu b/cpp/src/distance/neighbors/refine_d_uint64_t_float.cu
new file mode 100644
index 0000000000..d7b460180a
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_d_uint64_t_float.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // device-view overload, float data
+            raft::device_matrix_view<const float, uint64_t, row_major> dataset,
+            raft::device_matrix_view<const float, uint64_t, row_major> queries,
+            raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::device_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, float, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/distance/neighbors/refine_d_uint64_t_int8_t.cu b/cpp/src/distance/neighbors/refine_d_uint64_t_int8_t.cu
new file mode 100644
index 0000000000..3db07f0cdb
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_d_uint64_t_int8_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // device-view overload, int8_t data
+            raft::device_matrix_view<const int8_t, uint64_t, row_major> dataset,
+            raft::device_matrix_view<const int8_t, uint64_t, row_major> queries,
+            raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::device_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, int8_t, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/distance/neighbors/refine_d_uint64_t_uint8_t.cu b/cpp/src/distance/neighbors/refine_d_uint64_t_uint8_t.cu
new file mode 100644
index 0000000000..2ce43d5800
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_d_uint64_t_uint8_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // device-view overload, uint8_t data
+            raft::device_matrix_view<const uint8_t, uint64_t, row_major> dataset,
+            raft::device_matrix_view<const uint8_t, uint64_t, row_major> queries,
+            raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::device_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, uint8_t, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/distance/neighbors/refine_h_uint64_t_float.cu b/cpp/src/distance/neighbors/refine_h_uint64_t_float.cu
new file mode 100644
index 0000000000..8549d65dc5
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_h_uint64_t_float.cu
@@ -0,0 +1,35 @@
+
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // host-view overload, float data
+            raft::host_matrix_view<const float, uint64_t, row_major> dataset,
+            raft::host_matrix_view<const float, uint64_t, row_major> queries,
+            raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::host_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, float, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/distance/neighbors/refine_h_uint64_t_int8_t.cu b/cpp/src/distance/neighbors/refine_h_uint64_t_int8_t.cu
new file mode 100644
index 0000000000..cf6d7a397a
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_h_uint64_t_int8_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // host-view overload, int8_t data
+            raft::host_matrix_view<const int8_t, uint64_t, row_major> dataset,
+            raft::host_matrix_view<const int8_t, uint64_t, row_major> queries,
+            raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::host_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, int8_t, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/distance/neighbors/refine_h_uint64_t_uint8_t.cu b/cpp/src/distance/neighbors/refine_h_uint64_t_uint8_t.cu
new file mode 100644
index 0000000000..e9c4345e97
--- /dev/null
+++ b/cpp/src/distance/neighbors/refine_h_uint64_t_uint8_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+#include <raft/neighbors/specializations/refine.cuh>
+
+namespace raft::runtime::neighbors {
+
+void refine(raft::device_resources const& handle,  // host-view overload, uint8_t data
+            raft::host_matrix_view<const uint8_t, uint64_t, row_major> dataset,
+            raft::host_matrix_view<const uint8_t, uint64_t, row_major> queries,
+            raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+            raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+            raft::host_matrix_view<float, uint64_t, row_major> distances,
+            distance::DistanceType metric)
+{
+  raft::neighbors::refine<uint64_t, uint8_t, float, uint64_t>(  // arguments forwarded as-is
+    handle, dataset, queries, neighbor_candidates, indices, distances, metric);
+}
+
+}  // namespace raft::runtime::neighbors
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_fast.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_fast.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_fast.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_fast.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
similarity index 100%
rename from cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_int64_t.cu
similarity index 90%
rename from cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_int64_t.cu
index a32147b2b1..34825b253b 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
+++ b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
-template void search<float, int64_t>(const handle_t&,
+template void search<float, int64_t>(const raft::device_resources&,
                                      const search_params&,
                                      const index<int64_t>&,
                                      const float*,
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint32_t.cu
similarity index 90%
rename from cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint32_t.cu
index f3e80206e4..ec746dc434 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
+++ b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint32_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
-template void search<float, uint32_t>(const handle_t&,
+template void search<float, uint32_t>(const raft::device_resources&,
                                       const search_params&,
                                       const index<uint32_t>&,
                                       const float*,
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint64_t.cu
similarity index 90%
rename from cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
rename to cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint64_t.cu
index e732646f99..ea18fac24d 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
+++ b/cpp/src/distance/neighbors/specializations/detail/ivfpq_search_float_uint64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
-template void search<float, uint64_t>(const handle_t&,
+template void search<float, uint64_t>(const raft::device_resources&,
                                       const search_params&,
                                       const index<uint64_t>&,
                                       const float*,
diff --git a/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_float.cu b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_float.cu
new file mode 100644
index 0000000000..6bb1985d94
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_float.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+template void refine<uint64_t, float, float, uint64_t>(  // explicit instantiation, device views
+  raft::device_resources const& handle,
+  raft::device_matrix_view<const float, uint64_t, row_major> dataset,
+  raft::device_matrix_view<const float, uint64_t, row_major> queries,
+  raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::device_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_int8_t.cu b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_int8_t.cu
new file mode 100644
index 0000000000..7e70ee5e29
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_int8_t.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+template void refine<uint64_t, int8_t, float, uint64_t>(  // explicit instantiation, device views
+  raft::device_resources const& handle,
+  raft::device_matrix_view<const int8_t, uint64_t, row_major> dataset,
+  raft::device_matrix_view<const int8_t, uint64_t, row_major> queries,
+  raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::device_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_uint8_t.cu b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_uint8_t.cu
new file mode 100644
index 0000000000..53de106ef9
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_d_uint64_t_uint8_t.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+template void refine<uint64_t, uint8_t, float, uint64_t>(  // explicit instantiation, device views
+  raft::device_resources const& handle,
+  raft::device_matrix_view<const uint8_t, uint64_t, row_major> dataset,
+  raft::device_matrix_view<const uint8_t, uint64_t, row_major> queries,
+  raft::device_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::device_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::device_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_float.cu b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_float.cu
new file mode 100644
index 0000000000..b473924741
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_float.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+template void refine<uint64_t, float, float, uint64_t>(
+  raft::device_resources const& handle,
+  raft::host_matrix_view<const float, uint64_t, row_major> dataset,
+  raft::host_matrix_view<const float, uint64_t, row_major> queries,
+  raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::host_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_int8_t.cu b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_int8_t.cu
new file mode 100644
index 0000000000..c8b0e4c1c2
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_int8_t.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+template void refine<uint64_t, int8_t, float, uint64_t>(
+  raft::device_resources const& handle,
+  raft::host_matrix_view<const int8_t, uint64_t, row_major> dataset,
+  raft::host_matrix_view<const int8_t, uint64_t, row_major> queries,
+  raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::host_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_uint8_t.cu b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_uint8_t.cu
new file mode 100644
index 0000000000..b9e0f58ef6
--- /dev/null
+++ b/cpp/src/distance/neighbors/specializations/refine_h_uint64_t_uint8_t.cu
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors {
+
+template void refine<uint64_t, uint8_t, float, uint64_t>(
+  raft::device_resources const& handle,
+  raft::host_matrix_view<const uint8_t, uint64_t, row_major> dataset,
+  raft::host_matrix_view<const uint8_t, uint64_t, row_major> queries,
+  raft::host_matrix_view<const uint64_t, uint64_t, row_major> neighbor_candidates,
+  raft::host_matrix_view<uint64_t, uint64_t, row_major> indices,
+  raft::host_matrix_view<float, uint64_t, row_major> distances,
+  distance::DistanceType metric);
+
+}  // namespace raft::neighbors
diff --git a/cpp/src/random/specializations/common.cuh b/cpp/src/distance/random/common.cuh
similarity index 89%
rename from cpp/src/random/specializations/common.cuh
rename to cpp/src/distance/random/common.cuh
index d854087714..69b507b07b 100644
--- a/cpp/src/random/specializations/common.cuh
+++ b/cpp/src/distance/random/common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,10 @@
  */
 
 #include <raft/random/rmat_rectangular_generator.cuh>
-#include <raft_distance/random/rmat_rectangular_generator.hpp>
+#include <raft_runtime/random/rmat_rectangular_generator.hpp>
 
 #define FUNC_DEF(IdxT, ProbT)                                                           \
-  void rmat_rectangular_gen(raft::handle_t const& handle,                               \
+  void rmat_rectangular_gen(raft::device_resources const& handle,                       \
                             IdxT* out,                                                  \
                             IdxT* out_src,                                              \
                             IdxT* out_dst,                                              \
diff --git a/cpp/src/random/specializations/rmat_rectangular_generator_int64_double.cu b/cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu
similarity index 90%
rename from cpp/src/random/specializations/rmat_rectangular_generator_int64_double.cu
rename to cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu
index 4985a64927..1b8fb8bd6d 100644
--- a/cpp/src/random/specializations/rmat_rectangular_generator_int64_double.cu
+++ b/cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu
@@ -16,8 +16,8 @@
 
 #include "common.cuh"
 
-namespace raft::random::runtime {
+namespace raft::runtime::random {
 
 FUNC_DEF(int64_t, double);
 
-}  // namespace raft::random::runtime
+}  // namespace raft::runtime::random
diff --git a/cpp/src/random/specializations/rmat_rectangular_generator_int64_float.cu b/cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu
similarity index 90%
rename from cpp/src/random/specializations/rmat_rectangular_generator_int64_float.cu
rename to cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu
index f42e039bae..249e8c2ffb 100644
--- a/cpp/src/random/specializations/rmat_rectangular_generator_int64_float.cu
+++ b/cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu
@@ -16,8 +16,8 @@
 
 #include "common.cuh"
 
-namespace raft::random::runtime {
+namespace raft::runtime::random {
 
 FUNC_DEF(int64_t, float);
 
-}  // namespace raft::random::runtime
+}  // namespace raft::runtime::random
diff --git a/cpp/src/random/specializations/rmat_rectangular_generator_int_double.cu b/cpp/src/distance/random/rmat_rectangular_generator_int_double.cu
similarity index 90%
rename from cpp/src/random/specializations/rmat_rectangular_generator_int_double.cu
rename to cpp/src/distance/random/rmat_rectangular_generator_int_double.cu
index c29d140f09..3333b87983 100644
--- a/cpp/src/random/specializations/rmat_rectangular_generator_int_double.cu
+++ b/cpp/src/distance/random/rmat_rectangular_generator_int_double.cu
@@ -16,8 +16,8 @@
 
 #include "common.cuh"
 
-namespace raft::random::runtime {
+namespace raft::runtime::random {
 
 FUNC_DEF(int, double);
 
-}  // namespace raft::random::runtime
+}  // namespace raft::runtime::random
diff --git a/cpp/src/random/specializations/rmat_rectangular_generator_int_float.cu b/cpp/src/distance/random/rmat_rectangular_generator_int_float.cu
similarity index 90%
rename from cpp/src/random/specializations/rmat_rectangular_generator_int_float.cu
rename to cpp/src/distance/random/rmat_rectangular_generator_int_float.cu
index b07ede6a58..db8d024c04 100644
--- a/cpp/src/random/specializations/rmat_rectangular_generator_int_float.cu
+++ b/cpp/src/distance/random/rmat_rectangular_generator_int_float.cu
@@ -16,8 +16,8 @@
 
 #include "common.cuh"
 
-namespace raft::random::runtime {
+namespace raft::runtime::random {
 
 FUNC_DEF(int, float);
 
-}  // namespace raft::random::runtime
+}  // namespace raft::runtime::random
diff --git a/cpp/src/nn/specializations/ball_cover_all_knn_query.cu b/cpp/src/nn/specializations/ball_cover_all_knn_query.cu
new file mode 100644
index 0000000000..da5cd8de4f
--- /dev/null
+++ b/cpp/src/nn/specializations/ball_cover_all_knn_query.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ball_cover.cuh>
+#include <raft/neighbors/ball_cover_types.hpp>
+
+// Ignore upstream specializations to avoid unnecessary recompiling
+#ifdef RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
+#include <raft/neighbors/specializations/fused_l2_knn.cuh>
+#include <raft/neighbors/specializations/knn.cuh>
+
+#include <cstdint>
+
+namespace raft::neighbors::ball_cover {
+template void all_knn_query<std::int64_t, float, std::uint32_t>(
+  raft::device_resources const& handle,
+  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
+  std::uint32_t k,
+  std::int64_t* inds,
+  float* dists,
+  bool perform_post_filtering,
+  float weight);
+
+};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/nn/specializations/ball_cover.cu b/cpp/src/nn/specializations/ball_cover_build_index.cu
similarity index 67%
rename from cpp/src/nn/specializations/ball_cover.cu
rename to cpp/src/nn/specializations/ball_cover_build_index.cu
index b608a1a865..70fcbec356 100644
--- a/cpp/src/nn/specializations/ball_cover.cu
+++ b/cpp/src/nn/specializations/ball_cover_build_index.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,27 +33,7 @@ template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
 template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
 
 template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
 
-template void knn_query<std::int64_t, float, std::uint32_t>(
-  const raft::handle_t& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  const float* query,
-  std::uint32_t n_query_pts,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
-template void all_knn_query<std::int64_t, float, std::uint32_t>(
-  const raft::handle_t& handle,
-  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
 };  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/nn/specializations/ball_cover_knn_query.cu b/cpp/src/nn/specializations/ball_cover_knn_query.cu
new file mode 100644
index 0000000000..d5ca1cbc1c
--- /dev/null
+++ b/cpp/src/nn/specializations/ball_cover_knn_query.cu
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ball_cover.cuh>
+#include <raft/neighbors/ball_cover_types.hpp>
+
+// Ignore upstream specializations to avoid unnecessary recompiling
+#ifdef RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
+#include <raft/neighbors/specializations/fused_l2_knn.cuh>
+#include <raft/neighbors/specializations/knn.cuh>
+
+#include <cstdint>
+
+namespace raft::neighbors::ball_cover {
+template void knn_query<std::int64_t, float, std::uint32_t>(
+  raft::device_resources const& handle,
+  const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
+  std::uint32_t k,
+  const float* query,
+  std::uint32_t n_query_pts,
+  std::int64_t* inds,
+  float* dists,
+  bool perform_post_filtering,
+  float weight);
+
+};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu b/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu
new file mode 100644
index 0000000000..b08bcfbc79
--- /dev/null
+++ b/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/spatial/knn/knn.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+template void brute_force_knn<long, float, int>(raft::device_resources const& handle,
+                                                std::vector<float*>& input,
+                                                std::vector<int>& sizes,
+                                                int D,
+                                                float* search_items,
+                                                int n,
+                                                long* res_I,
+                                                float* res_D,
+                                                int k,
+                                                bool rowMajorIndex,
+                                                bool rowMajorQuery,
+                                                std::vector<long>* translations,
+                                                distance::DistanceType metric,
+                                                float metric_arg);
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu b/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu
new file mode 100644
index 0000000000..78cb92bb38
--- /dev/null
+++ b/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/spatial/knn/knn.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+template void brute_force_knn<long, float, unsigned int>(raft::device_resources const& handle,
+                                                         std::vector<float*>& input,
+                                                         std::vector<unsigned int>& sizes,
+                                                         unsigned int D,
+                                                         float* search_items,
+                                                         unsigned int n,
+                                                         long* res_I,
+                                                         float* res_D,
+                                                         unsigned int k,
+                                                         bool rowMajorIndex,
+                                                         bool rowMajorQuery,
+                                                         std::vector<long>* translations,
+                                                         distance::DistanceType metric,
+                                                         float metric_arg);
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
new file mode 100644
index 0000000000..0082a30796
--- /dev/null
+++ b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/spatial/knn/knn.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+template void brute_force_knn<uint32_t, float, int>(raft::device_resources const& handle,
+                                                    std::vector<float*>& input,
+                                                    std::vector<int>& sizes,
+                                                    int D,
+                                                    float* search_items,
+                                                    int n,
+                                                    uint32_t* res_I,
+                                                    float* res_D,
+                                                    int k,
+                                                    bool rowMajorIndex,
+                                                    bool rowMajorQuery,
+                                                    std::vector<uint32_t>* translations,
+                                                    distance::DistanceType metric,
+                                                    float metric_arg);
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu
new file mode 100644
index 0000000000..b2a1af2cf0
--- /dev/null
+++ b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/spatial/knn/knn.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+template void brute_force_knn<uint32_t, float, unsigned int>(raft::device_resources const& handle,
+                                                             std::vector<float*>& input,
+                                                             std::vector<unsigned int>& sizes,
+                                                             unsigned int D,
+                                                             float* search_items,
+                                                             unsigned int n,
+                                                             uint32_t* res_I,
+                                                             float* res_D,
+                                                             unsigned int k,
+                                                             bool rowMajorIndex,
+                                                             bool rowMajorQuery,
+                                                             std::vector<uint32_t>* translations,
+                                                             distance::DistanceType metric,
+                                                             float metric_arg);
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
index 961af0b89c..cff83ad3cf 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace knn {
 namespace detail {
 
 template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
index daa509b5b1..1a1c17b29f 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace knn {
 namespace detail {
 
 template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
@@ -37,19 +37,6 @@ template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>(
   float weight,
   std::uint32_t* dists_counter);
 
-template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 3>(
-  const raft::handle_t& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* post_dists_counter);
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
index 9487641945..7e784cb4d8 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace knn {
 namespace detail {
 
 template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 2>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
index c07ed45427..e650c7ed37 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace knn {
 namespace detail {
 
 template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 3>(
-  const raft::handle_t& handle,
+  raft::device_resources const& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
   const float* query,
   const std::uint32_t n_query_rows,
diff --git a/cpp/src/nn/specializations/detail/ivfpq_build.cu b/cpp/src/nn/specializations/detail/ivfpq_build.cu
deleted file mode 100644
index 9ff22a3729..0000000000
--- a/cpp/src/nn/specializations/detail/ivfpq_build.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations/ivf_pq_specialization.hpp>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_INST_BUILD_EXTEND(T, IdxT)                                           \
-  auto build(const handle_t& handle,                                              \
-             const index_params& params,                                          \
-             const T* dataset,                                                    \
-             IdxT n_rows,                                                         \
-             uint32_t dim)                                                        \
-    ->index<IdxT>                                                                 \
-  {                                                                               \
-    return build<T, IdxT>(handle, params, dataset, n_rows, dim);                  \
-  }                                                                               \
-  auto extend(const handle_t& handle,                                             \
-              const index<IdxT>& orig_index,                                      \
-              const T* new_vectors,                                               \
-              const IdxT* new_indices,                                            \
-              IdxT n_rows)                                                        \
-    ->index<IdxT>                                                                 \
-  {                                                                               \
-    return extend<T, IdxT>(handle, orig_index, new_vectors, new_indices, n_rows); \
-  }                                                                               \
-                                                                                  \
-  void build(const handle_t& handle,                                              \
-             const index_params& params,                                          \
-             const T* dataset,                                                    \
-             IdxT n_rows,                                                         \
-             uint32_t dim,                                                        \
-             index<IdxT>* idx)                                                    \
-  {                                                                               \
-    *idx = build<T, IdxT>(handle, params, dataset, n_rows, dim);                  \
-  }                                                                               \
-  void extend(const handle_t& handle,                                             \
-              index<IdxT>* idx,                                                   \
-              const T* new_vectors,                                               \
-              const IdxT* new_indices,                                            \
-              IdxT n_rows)                                                        \
-  {                                                                               \
-    extend<T, IdxT>(handle, idx, new_vectors, new_indices, n_rows);               \
-  }
-
-RAFT_INST_BUILD_EXTEND(float, uint64_t);
-RAFT_INST_BUILD_EXTEND(int8_t, uint64_t);
-RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t);
-
-#undef RAFT_INST_BUILD_EXTEND
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/nn/specializations/knn.cu b/cpp/src/nn/specializations/knn.cu
deleted file mode 100644
index 4e0a821c24..0000000000
--- a/cpp/src/nn/specializations/knn.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-template void brute_force_knn<long, float, int>(raft::handle_t const& handle,
-                                                std::vector<float*>& input,
-                                                std::vector<int>& sizes,
-                                                int D,
-                                                float* search_items,
-                                                int n,
-                                                long* res_I,
-                                                float* res_D,
-                                                int k,
-                                                bool rowMajorIndex,
-                                                bool rowMajorQuery,
-                                                std::vector<long>* translations,
-                                                distance::DistanceType metric,
-                                                float metric_arg);
-
-template void brute_force_knn<long, float, unsigned int>(raft::handle_t const& handle,
-                                                         std::vector<float*>& input,
-                                                         std::vector<unsigned int>& sizes,
-                                                         unsigned int D,
-                                                         float* search_items,
-                                                         unsigned int n,
-                                                         long* res_I,
-                                                         float* res_D,
-                                                         unsigned int k,
-                                                         bool rowMajorIndex,
-                                                         bool rowMajorQuery,
-                                                         std::vector<long>* translations,
-                                                         distance::DistanceType metric,
-                                                         float metric_arg);
-
-template void brute_force_knn<uint32_t, float, int>(raft::handle_t const& handle,
-                                                    std::vector<float*>& input,
-                                                    std::vector<int>& sizes,
-                                                    int D,
-                                                    float* search_items,
-                                                    int n,
-                                                    uint32_t* res_I,
-                                                    float* res_D,
-                                                    int k,
-                                                    bool rowMajorIndex,
-                                                    bool rowMajorQuery,
-                                                    std::vector<uint32_t>* translations,
-                                                    distance::DistanceType metric,
-                                                    float metric_arg);
-
-template void brute_force_knn<uint32_t, float, unsigned int>(raft::handle_t const& handle,
-                                                             std::vector<float*>& input,
-                                                             std::vector<unsigned int>& sizes,
-                                                             unsigned int D,
-                                                             float* search_items,
-                                                             unsigned int n,
-                                                             uint32_t* res_I,
-                                                             float* res_D,
-                                                             unsigned int k,
-                                                             bool rowMajorIndex,
-                                                             bool rowMajorQuery,
-                                                             std::vector<uint32_t>* translations,
-                                                             distance::DistanceType metric,
-                                                             float metric_arg);
-
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/nn/specializations/refine.cu b/cpp/src/nn/specializations/refine.cu
deleted file mode 100644
index 3dbb218705..0000000000
--- a/cpp/src/nn/specializations/refine.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-
-namespace raft::neighbors {
-
-#define RAFT_INST_REFINE(IDX_T, DATA_T)                                                 \
-  void refine(raft::handle_t const& handle,                                             \
-              raft::device_matrix_view<DATA_T, uint64_t, row_major> dataset,            \
-              raft::device_matrix_view<DATA_T, uint64_t, row_major> queries,            \
-              raft::device_matrix_view<IDX_T, uint64_t, row_major> neighbor_candidates, \
-              raft::device_matrix_view<IDX_T, uint64_t, row_major> indices,             \
-              raft::device_matrix_view<float, uint64_t, row_major> distances,           \
-              distance::DistanceType metric)                                            \
-  {                                                                                     \
-    detail::refine_device<IDX_T, DATA_T, float, uint64_t>(                              \
-      handle, dataset, queries, neighbor_candidates, indices, distances, metric);       \
-  }                                                                                     \
-                                                                                        \
-  void refine(raft::handle_t const& handle,                                             \
-              raft::host_matrix_view<DATA_T, uint64_t, row_major> dataset,              \
-              raft::host_matrix_view<DATA_T, uint64_t, row_major> queries,              \
-              raft::host_matrix_view<IDX_T, uint64_t, row_major> neighbor_candidates,   \
-              raft::host_matrix_view<IDX_T, uint64_t, row_major> indices,               \
-              raft::host_matrix_view<float, uint64_t, row_major> distances,             \
-              distance::DistanceType metric)                                            \
-  {                                                                                     \
-    detail::refine_host<IDX_T, DATA_T, float, uint64_t>(                                \
-      dataset, queries, neighbor_candidates, indices, distances, metric);               \
-  }
-
-RAFT_INST_REFINE(uint64_t, float);
-RAFT_INST_REFINE(uint64_t, uint8_t);
-RAFT_INST_REFINE(uint64_t, int8_t);
-
-#undef RAFT_INST_REFINE
-
-}  // namespace raft::neighbors
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index dae0f6f6b1..3c41621274 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -32,6 +32,7 @@ function(ConfigureTest)
   target_link_libraries(
     ${TEST_NAME}
     PRIVATE raft::raft
+            raft_internal
             $<$<BOOL:${ConfigureTest_DIST}>:raft::distance>
             $<$<BOOL:${ConfigureTest_NN}>:raft::nn>
             GTest::gtest
@@ -77,23 +78,28 @@ endfunction()
 
 if(BUILD_TESTS)
   ConfigureTest(
-    NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster_solvers.cu test/cluster/linkage.cu
-    OPTIONAL DIST NN
+    NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster/kmeans_balanced.cu
+    test/cluster/cluster_solvers.cu test/cluster/linkage.cu OPTIONAL DIST NN
   )
 
   ConfigureTest(
     NAME
     CORE_TEST
     PATH
-    test/common/logger.cpp
-    test/handle.cpp
-    test/interruptible.cu
-    test/nvtx.cpp
-    test/mdarray.cu
-    test/mdspan_utils.cu
-    test/memory_type.cpp
-    test/span.cpp
-    test/span.cu
+    test/core/logger.cpp
+    test/core/math_device.cu
+    test/core/math_host.cpp
+    test/core/operators_device.cu
+    test/core/operators_host.cpp
+    test/core/handle.cpp
+    test/core/interruptible.cu
+    test/core/nvtx.cpp
+    test/core/mdarray.cu
+    test/core/mdspan_utils.cu
+    test/core/numpy_serializer.cu
+    test/core/memory_type.cpp
+    test/core/span.cpp
+    test/core/span.cu
     test/test.cpp
   )
 
@@ -175,9 +181,10 @@ if(BUILD_TESTS)
     test/matrix/matrix.cu
     test/matrix/norm.cu
     test/matrix/reverse.cu
+    test/matrix/select_k.cu
     test/matrix/slice.cu
     test/matrix/triangular.cu
-    test/spectral_matrix.cu
+    test/sparse/spectral_matrix.cu
   )
 
   ConfigureTest(
@@ -189,14 +196,15 @@ if(BUILD_TESTS)
     test/random/multi_variable_gaussian.cu
     test/random/permute.cu
     test/random/rng.cu
+    test/random/rng_discrete.cu
     test/random/rng_int.cu
     test/random/rmat_rectangular_generator.cu
     test/random/sample_without_replacement.cu
   )
 
   ConfigureTest(
-    NAME SOLVERS_TEST PATH test/cluster_solvers_deprecated.cu test/eigen_solvers.cu test/lap/lap.cu
-    test/mst.cu OPTIONAL DIST
+    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
+    test/lap/lap.cu test/sparse/mst.cu OPTIONAL DIST
   )
 
   ConfigureTest(
@@ -287,7 +295,7 @@ if(BUILD_TESTS)
   )
 
   ConfigureTest(
-    NAME UTILS_TEST PATH test/common/seive.cu test/cudart_utils.cpp test/device_atomics.cu
-    test/integer_utils.cpp test/pow2_utils.cu
+    NAME UTILS_TEST PATH test/core/seive.cu test/util/bitonic_sort.cu test/util/cudart_utils.cpp
+    test/util/device_atomics.cu test/util/integer_utils.cpp test/util/pow2_utils.cu
   )
 endif()
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster/cluster_solvers.cu
similarity index 93%
rename from cpp/test/cluster_solvers.cu
rename to cpp/test/cluster/cluster_solvers.cu
index 26fbfec011..5121cdf139 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster/cluster_solvers.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #include <gtest/gtest.h>
 #include <iostream>
 #include <memory>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED
 #include <raft/spectral/specializations.cuh>
@@ -35,7 +35,7 @@ TEST(Raft, ClusterSolvers)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
+  raft::device_resources h;
 
   index_type maxiter{100};
   value_type tol{1.0e-10};
@@ -65,13 +65,8 @@ TEST(Raft, ModularitySolvers)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
-  ASSERT_EQ(0,
-            h.
-
-            get_device()
-
-  );
+  raft::device_resources h;
+  ASSERT_EQ(0, h.get_device());
 
   index_type neigvs{10};
   index_type maxiter{100};
diff --git a/cpp/test/cluster_solvers_deprecated.cu b/cpp/test/cluster/cluster_solvers_deprecated.cu
similarity index 92%
rename from cpp/test/cluster_solvers_deprecated.cu
rename to cpp/test/cluster/cluster_solvers_deprecated.cu
index 167a710b34..dbafbd15d6 100644
--- a/cpp/test/cluster_solvers_deprecated.cu
+++ b/cpp/test/cluster/cluster_solvers_deprecated.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #include <gtest/gtest.h>
 #include <iostream>
 #include <memory>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <raft/spectral/cluster_solvers_deprecated.cuh>
 
@@ -30,7 +30,7 @@ TEST(Raft, ClusterSolvers)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
+  raft::device_resources h;
 
   index_type maxiter{100};
   value_type tol{1.0e-10};
diff --git a/cpp/test/cluster/kmeans.cu b/cpp/test/cluster/kmeans.cu
index 698d23ac27..685bd1f965 100644
--- a/cpp/test/cluster/kmeans.cu
+++ b/cpp/test/cluster/kmeans.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,22 +14,23 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <optional>
 #include <vector>
 
 #include <raft/cluster/kmeans.cuh>
 #include <raft/core/cudart_utils.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/random/make_blobs.cuh>
 #include <raft/stats/adjusted_rand_index.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <rmm/device_uvector.hpp>
 #include <thrust/fill.h>
 
-#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED
-#include <raft/cluster/specializations.cuh>
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
 #endif
 
 namespace raft {
@@ -44,28 +45,23 @@ struct KmeansInputs {
 };
 
 template <typename DataT, typename IndexT>
-void run_cluster_cost(const raft::handle_t& handle,
+void run_cluster_cost(const raft::device_resources& handle,
                       raft::device_vector_view<DataT, IndexT> minClusterDistance,
                       rmm::device_uvector<char>& workspace,
                       raft::device_scalar_view<DataT> clusterCost)
 {
   raft::cluster::kmeans::cluster_cost(
-    handle,
-    minClusterDistance,
-    workspace,
-    clusterCost,
-    [] __device__(const DataT& a, const DataT& b) { return a + b; });
+    handle, minClusterDistance, workspace, clusterCost, raft::add_op{});
 }
 
 template <typename T>
 class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
  protected:
   KmeansTest()
-    : stream(handle.get_stream()),
-      d_labels(0, stream),
-      d_labels_ref(0, stream),
-      d_centroids(0, stream),
-      d_sample_weight(0, stream)
+    : d_labels(0, handle.get_stream()),
+      d_labels_ref(0, handle.get_stream()),
+      d_centroids(0, handle.get_stream()),
+      d_sample_weight(0, handle.get_stream())
   {
   }
 
@@ -73,6 +69,7 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
   {
     testparams = ::testing::TestWithParam<KmeansInputs<T>>::GetParam();
 
+    auto stream                = handle.get_stream();
     int n_samples              = testparams.n_row;
     int n_features             = testparams.n_col;
     params.n_clusters          = testparams.n_clusters;
@@ -115,8 +112,7 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
     rmm::device_uvector<char> workspace(0, stream);
     rmm::device_uvector<T> L2NormBuf_OR_DistBuf(0, stream);
     rmm::device_uvector<T> inRankCp(0, stream);
-    auto X_view =
-      raft::make_device_matrix_view<const T, int>(X.data_handle(), X.extent(0), X.extent(1));
+    auto X_view = raft::make_const_mdspan(X.view());
     auto centroids_view =
       raft::make_device_matrix_view<T, int>(d_centroids.data(), params.n_clusters, n_features);
     auto miniX = raft::make_device_matrix<T, int>(handle, n_samples / 4, n_features);
@@ -129,12 +125,8 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
       miniX.extent(0),
       params.rng_state.seed);
 
-    raft::cluster::kmeans::init_plus_plus(handle,
-                                          params,
-                                          raft::make_device_matrix_view<const T, int>(
-                                            miniX.data_handle(), miniX.extent(0), miniX.extent(1)),
-                                          centroids_view,
-                                          workspace);
+    raft::cluster::kmeans::init_plus_plus(
+      handle, params, raft::make_const_mdspan(miniX.view()), centroids_view, workspace);
 
     auto minClusterDistance = raft::make_device_vector<T, int>(handle, n_samples);
     auto minClusterAndDistance =
@@ -252,6 +244,7 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
 
     auto X      = raft::make_device_matrix<T, int>(handle, n_samples, n_features);
     auto labels = raft::make_device_vector<int, int>(handle, n_samples);
+    auto stream = handle.get_stream();
 
     raft::random::make_blobs<T, int>(X.data_handle(),
                                      labels.data_handle(),
@@ -287,10 +280,9 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
 
     raft::copy(d_labels_ref.data(), labels.data_handle(), n_samples, stream);
 
-    T inertia  = 0;
-    int n_iter = 0;
-    auto X_view =
-      raft::make_device_matrix_view<const T, int>(X.data_handle(), X.extent(0), X.extent(1));
+    T inertia   = 0;
+    int n_iter  = 0;
+    auto X_view = raft::make_const_mdspan(X.view());
 
     raft::cluster::kmeans_fit_predict<T, int>(
       handle,
@@ -325,8 +317,7 @@ class KmeansTest : public ::testing::TestWithParam<KmeansInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
-  cudaStream_t stream;
+  raft::device_resources handle;
   KmeansInputs<T> testparams;
   rmm::device_uvector<int> d_labels;
   rmm::device_uvector<int> d_labels_ref;
diff --git a/cpp/test/cluster/kmeans_balanced.cu b/cpp/test/cluster/kmeans_balanced.cu
new file mode 100644
index 0000000000..028819563e
--- /dev/null
+++ b/cpp/test/cluster/kmeans_balanced.cu
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <optional>
+#include <vector>
+
+#include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/cudart_utils.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/stats/adjusted_rand_index.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <rmm/device_uvector.hpp>
+#include <thrust/fill.h>
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+/* This test takes advantage of the fact that make_blobs generates balanced clusters.
+ * It doesn't currently test whether the algorithm can make balanced clusters with an imbalanced
+ * dataset.
+ */
+
+namespace raft {
+
+template <typename MathT, typename IdxT>
+struct KmeansBalancedInputs {
+  IdxT n_rows;
+  IdxT n_cols;
+  IdxT n_clusters;
+  raft::cluster::kmeans_balanced_params kb_params;
+  MathT tol;
+};
+
+// Prints a parameter set (e.g. in gtest output); the distance metric is shown as its integer value.
+template <typename MathT, typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const KmeansBalancedInputs<MathT, IdxT>& p)
+{
+  return os << "{ " << p.n_rows << ", " << p.n_cols << ", " << p.n_clusters << ", "
+            << p.kb_params.n_iters << ", " << static_cast<int>(p.kb_params.metric) << "}\n";
+}
+
+template <typename DataT, typename MathT, typename LabelT, typename IdxT, typename MappingOpT>
+class KmeansBalancedTest : public ::testing::TestWithParam<KmeansBalancedInputs<MathT, IdxT>> {
+ protected:
+  KmeansBalancedTest()
+    : stream(handle.get_stream()),
+      d_labels(0, stream),
+      d_labels_ref(0, stream),
+      d_centroids(0, stream)
+  {
+  }
+
+  void basicTest()
+  {
+    MappingOpT op{};
+
+    auto p = ::testing::TestWithParam<KmeansBalancedInputs<MathT, IdxT>>::GetParam();
+
+    auto X           = raft::make_device_matrix<DataT, IdxT>(handle, p.n_rows, p.n_cols);
+    auto blob_labels = raft::make_device_vector<IdxT, IdxT>(handle, p.n_rows);
+
+    MathT* blobs_ptr;
+    rmm::device_uvector<MathT> blobs(0, stream);
+    if constexpr (!std::is_same_v<DataT, MathT>) {
+      blobs.resize(p.n_rows * p.n_cols, stream);
+      blobs_ptr = blobs.data();
+    } else {
+      blobs_ptr = X.data_handle();
+    }
+
+    raft::random::make_blobs<MathT, IdxT>(blobs_ptr,
+                                          blob_labels.data_handle(),
+                                          p.n_rows,
+                                          p.n_cols,
+                                          p.n_clusters,
+                                          stream,
+                                          true,
+                                          nullptr,
+                                          nullptr,
+                                          MathT{0.1},
+                                          true,
+                                          MathT{-1},
+                                          MathT{1},
+                                          (uint64_t)1234);
+
+    // Convert blobs dataset to DataT if necessary
+    if constexpr (!std::is_same_v<DataT, MathT>) {
+      raft::linalg::unaryOp(
+        X.data_handle(), blobs.data(), p.n_rows * p.n_cols, op.reverse_op, stream);
+    }
+
+    d_labels.resize(p.n_rows, stream);
+    d_labels_ref.resize(p.n_rows, stream);
+    d_centroids.resize(p.n_clusters * p.n_cols, stream);
+
+    raft::linalg::unaryOp(
+      d_labels_ref.data(), blob_labels.data_handle(), p.n_rows, raft::cast_op<LabelT>(), stream);
+
+    auto X_view =
+      raft::make_device_matrix_view<const DataT, IdxT>(X.data_handle(), X.extent(0), X.extent(1));
+    auto d_centroids_view =
+      raft::make_device_matrix_view<MathT, IdxT>(d_centroids.data(), p.n_clusters, p.n_cols);
+    auto d_labels_view = raft::make_device_vector_view<LabelT, IdxT>(d_labels.data(), p.n_rows);
+
+    raft::cluster::kmeans_balanced::fit_predict(
+      handle, p.kb_params, X_view, d_centroids_view, d_labels_view, op);
+
+    handle.sync_stream(stream);
+
+    score = raft::stats::adjusted_rand_index(
+      d_labels_ref.data(), d_labels.data(), p.n_rows, handle.get_stream());
+
+    if (score < 1.0) {
+      std::stringstream ss;
+      ss << "Expected: " << raft::arr2Str(d_labels_ref.data(), 25, "d_labels_ref", stream);
+      std::cout << (ss.str().c_str()) << '\n';
+      ss.str(std::string());
+      ss << "Actual: " << raft::arr2Str(d_labels.data(), 25, "d_labels", stream);
+      std::cout << (ss.str().c_str()) << '\n';
+      std::cout << "Score = " << score << '\n';
+    }
+  }
+
+  void SetUp() override { basicTest(); }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+  rmm::device_uvector<LabelT> d_labels;
+  rmm::device_uvector<LabelT> d_labels_ref;
+  rmm::device_uvector<MathT> d_centroids;
+  double score;
+};
+
+template <typename MathT, typename IdxT>
+std::vector<KmeansBalancedInputs<MathT, IdxT>> get_kmeans_balanced_inputs()
+{
+  std::vector<KmeansBalancedInputs<MathT, IdxT>> out;
+  KmeansBalancedInputs<MathT, IdxT> p;
+  p.kb_params.n_iters = 20;
+  p.kb_params.metric  = raft::distance::DistanceType::L2Expanded;
+  p.tol               = MathT{0.0001};
+  std::vector<std::tuple<size_t, size_t, size_t>> row_cols_k = {{1000, 32, 5},
+                                                                {1000, 100, 20},
+                                                                {10000, 32, 10},
+                                                                {10000, 100, 50},
+                                                                {10000, 500, 100},
+                                                                {1000000, 128, 10}};
+  for (auto& rck : row_cols_k) {
+    p.n_rows     = static_cast<IdxT>(std::get<0>(rck));
+    p.n_cols     = static_cast<IdxT>(std::get<1>(rck));
+    p.n_clusters = static_cast<IdxT>(std::get<2>(rck));
+    out.push_back(p);
+  }
+  return out;
+}
+
+const auto inputsf_i32 = get_kmeans_balanced_inputs<float, int>();
+const auto inputsd_i32 = get_kmeans_balanced_inputs<double, int>();
+const auto inputsf_i64 = get_kmeans_balanced_inputs<float, int64_t>();
+const auto inputsd_i64 = get_kmeans_balanced_inputs<double, int64_t>();
+
+#define KB_TEST(test_type, test_name, test_inputs)         \
+  typedef RAFT_DEPAREN(test_type) test_name;               \
+  TEST_P(test_name, Result) { ASSERT_TRUE(score == 1.0); } \
+  INSTANTIATE_TEST_CASE_P(KmeansBalancedTests, test_name, ::testing::ValuesIn(test_inputs))
+
+/*
+ * First set of tests: no conversion
+ */
+
+KB_TEST((KmeansBalancedTest<float, float, uint32_t, int, raft::identity_op>),
+        KmeansBalancedTestFFU32I32,
+        inputsf_i32);
+KB_TEST((KmeansBalancedTest<double, double, uint32_t, int, raft::identity_op>),
+        KmeansBalancedTestDDU32I32,
+        inputsd_i32);
+KB_TEST((KmeansBalancedTest<float, float, uint32_t, int64_t, raft::identity_op>),
+        KmeansBalancedTestFFU32I64,
+        inputsf_i64);
+KB_TEST((KmeansBalancedTest<double, double, uint32_t, int64_t, raft::identity_op>),
+        KmeansBalancedTestDDU32I64,
+        inputsd_i64);
+KB_TEST((KmeansBalancedTest<float, float, int, int, raft::identity_op>),
+        KmeansBalancedTestFFI32I32,
+        inputsf_i32);
+KB_TEST((KmeansBalancedTest<float, float, int, int64_t, raft::identity_op>),
+        KmeansBalancedTestFFI32I64,
+        inputsf_i64);
+KB_TEST((KmeansBalancedTest<float, float, int64_t, int, raft::identity_op>),
+        KmeansBalancedTestFFI64I32,
+        inputsf_i32);
+KB_TEST((KmeansBalancedTest<float, float, int64_t, int64_t, raft::identity_op>),
+        KmeansBalancedTestFFI64I64,
+        inputsf_i64);
+
+/*
+ * Second set of tests: integer dataset with conversion
+ */
+
+template <typename DataT, typename MathT>
+struct i2f_scaler {
+  // Note: with a scaling factor of 42, and generating blobs with centers between -1 and 1 with a
+  // standard deviation of 0.1, it's statistically very unlikely that we'd overflow
+  const raft::compose_op<raft::div_const_op<MathT>, raft::cast_op<MathT>> op{
+    raft::div_const_op<MathT>{42}, raft::cast_op<MathT>{}};
+  const raft::compose_op<raft::cast_op<DataT>, raft::mul_const_op<MathT>> reverse_op{
+    raft::cast_op<DataT>{}, raft::mul_const_op<MathT>{42}};
+  // Calling the scaler forwards to `op`; the test uses `reverse_op` to build the DataT dataset.
+  RAFT_INLINE_FUNCTION auto operator()(const DataT& x) const { return op(x); }
+};
+
+KB_TEST((KmeansBalancedTest<int8_t, float, uint32_t, int, i2f_scaler<int8_t, float>>),
+        KmeansBalancedTestFI8U32I32,
+        inputsf_i32);
+KB_TEST((KmeansBalancedTest<int8_t, double, uint32_t, int, i2f_scaler<int8_t, double>>),
+        KmeansBalancedTestDI8U32I32,
+        inputsd_i32);
+
+}  // namespace raft
diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu
index 5533f552bd..20f2952e7d 100644
--- a/cpp/test/cluster/linkage.cu
+++ b/cpp/test/cluster/linkage.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/transpose.cuh>
@@ -49,6 +49,8 @@ struct LinkageInputs {
 
   int n_clusters;
 
+  bool use_knn;
+
   int c;
 };
 
@@ -162,15 +164,18 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
  public:
   LinkageTest()
     : params(::testing::TestWithParam<LinkageInputs<T, IdxT>>::GetParam()),
-      stream(handle.get_stream()),
-      labels(params.n_row, stream),
-      labels_ref(params.n_row, stream)
+      labels(0, handle.get_stream()),
+      labels_ref(0, handle.get_stream())
   {
   }
 
  protected:
   void basicTest()
   {
+    auto stream = handle.get_stream();
+
+    labels.resize(params.n_row, stream);
+    labels_ref.resize(params.n_row, stream);
     rmm::device_uvector<T> data(params.n_row * params.n_col, stream);
 
     raft::copy(data.data(), params.data.data(), data.size(), stream);
@@ -178,23 +183,34 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
 
     rmm::device_uvector<IdxT> out_children(params.n_row * 2, stream);
 
-    raft::handle_t handle;
-
     auto data_view = raft::make_device_matrix_view<const T, IdxT, row_major>(
       data.data(), params.n_row, params.n_col);
     auto dendrogram_view =
       raft::make_device_matrix_view<IdxT, IdxT, row_major>(out_children.data(), params.n_row, 2);
     auto labels_view = raft::make_device_vector_view<IdxT, IdxT>(labels.data(), params.n_row);
 
-    raft::cluster::hierarchy::
-      single_linkage<T, IdxT, raft::cluster::hierarchy::LinkageDistance::KNN_GRAPH>(
-        handle,
-        data_view,
-        dendrogram_view,
-        labels_view,
-        raft::distance::DistanceType::L2SqrtExpanded,
-        params.n_clusters,
-        std::make_optional<int>(params.c));
+    if (params.use_knn) {
+      raft::cluster::hierarchy::
+        single_linkage<T, IdxT, raft::cluster::hierarchy::LinkageDistance::KNN_GRAPH>(
+          handle,
+          data_view,
+          dendrogram_view,
+          labels_view,
+          raft::distance::DistanceType::L2SqrtExpanded,
+          params.n_clusters,
+          std::make_optional<int>(params.c));
+
+    } else {
+      raft::cluster::hierarchy::
+        single_linkage<T, IdxT, raft::cluster::hierarchy::LinkageDistance::PAIRWISE>(
+          handle,
+          data_view,
+          dendrogram_view,
+          labels_view,
+          raft::distance::DistanceType::L2SqrtExpanded,
+          params.n_clusters,
+          std::make_optional<int>(params.c));
+    }
 
     handle.sync_stream(stream);
 
@@ -204,8 +220,7 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
   void SetUp() override { basicTest(); }
 
  protected:
-  raft::handle_t handle;
-  cudaStream_t stream;
+  raft::device_resources handle;
 
   LinkageInputs<T, IdxT> params;
   rmm::device_uvector<IdxT> labels, labels_ref;
@@ -225,6 +240,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
     0.76166195, 0.66613745},
    {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
    10,
+   true,
    -1},
   //  // Test outlier points
   {9,
@@ -232,6 +248,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
    {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5},
    {6, 0, 5, 0, 0, 4, 3, 2, 1},
    7,
+   true,
    -1},
 
   // Test n_clusters == (n_points / 2)
@@ -246,6 +263,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
     0.76166195, 0.66613745},
    {1, 0, 4, 0, 0, 3, 2, 0, 2, 1},
    5,
+   true,
    -1},
 
   // Test n_points == 100
@@ -425,7 +443,224 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
     0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    10,
-   -4}};
+   true,
+   -4},
+  {10,
+   5,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+    0.76166195, 0.66613745},
+   {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+   10,
+   false,
+   5},
+  // Test outlier points
+  {9,
+   2,
+   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5},
+   {6, 0, 5, 0, 0, 4, 3, 2, 1},
+   7,
+   false,
+   5},
+
+  // Test n_clusters == (n_points / 2)
+  {10,
+   5,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+    0.76166195, 0.66613745},
+   {1, 0, 4, 0, 0, 3, 2, 0, 2, 1},
+   5,
+   false,
+   10},
+
+  // Test n_points == 100
+  {100,
+   10,
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
+
+   },
+   {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+   10,
+   false,
+   5}};
 
 typedef LinkageTest<float, int> LinkageTestF_Int;
 TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); }
diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp
new file mode 100644
index 0000000000..9f416d3ae8
--- /dev/null
+++ b/cpp/test/core/handle.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <unordered_map>
+
+namespace raft {
+
+using namespace comms;
+// Test double for comms_iface: get_size() returns the value given at construction,
+// get_rank() is always 0, sync_stream() always returns SUCCESS, and every other
+// communication call has an empty body.
+class mock_comms : public comms_iface {
+ public:
+  // `n` becomes the value returned by get_size().
+  mock_comms(int n) : size_(n) {}
+  ~mock_comms() = default;
+
+  int get_size() const override { return size_; }
+
+  int get_rank() const override { return 0; }
+
+  // Returns a brand-new mock constructed with size 0; `color` and `key` are not consulted.
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
+    return std::make_unique<mock_comms>(0);
+  }
+
+  void barrier() const {}
+
+  void get_request_id(request_t* req) const {}
+
+  // Host-side point-to-point calls: arguments are accepted and ignored.
+  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const {}
+
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const {}
+
+  void waitall(int count, request_t array_of_requests[]) const {}
+
+  // Collectives: each one below has an empty body, so no buffer is read or written and
+  // nothing is enqueued on `stream`.
+  void allreduce(const void* sendbuff,
+                 void* recvbuff,
+                 size_t count,
+                 datatype_t datatype,
+                 op_t op,
+                 cudaStream_t stream) const {}
+
+  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const {}
+
+  // Out-of-place overload (separate send and receive buffers); also a no-op.
+  void bcast(const void* sendbuff,
+             void* recvbuff,
+             size_t count,
+             datatype_t datatype,
+             int root,
+             cudaStream_t stream) const {}
+
+  // No-op; `root` is ignored.
+  void reduce(const void* sendbuff,
+              void* recvbuff,
+              size_t count,
+              datatype_t datatype,
+              op_t op,
+              int root,
+              cudaStream_t stream) const {}
+
+  // No-op.
+  void allgather(const void* sendbuff,
+                 void* recvbuff,
+                 size_t sendcount,
+                 datatype_t datatype,
+                 cudaStream_t stream) const {}
+
+  // No-op; `recvcounts` and `displs` are never dereferenced.
+  void allgatherv(const void* sendbuf,
+                  void* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  datatype_t datatype,
+                  cudaStream_t stream) const {}
+
+  // No-op; `root` is ignored.
+  void gather(const void* sendbuff,
+              void* recvbuff,
+              size_t sendcount,
+              datatype_t datatype,
+              int root,
+              cudaStream_t stream) const {}
+
+  // No-op; `recvcounts` and `displs` are never dereferenced.
+  void gatherv(const void* sendbuff,
+               void* recvbuff,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               datatype_t datatype,
+               int root,
+               cudaStream_t stream) const {}
+
+  // No-op.
+  void reducescatter(const void* sendbuff,
+                     void* recvbuff,
+                     size_t recvcount,
+                     datatype_t datatype,
+                     op_t op,
+                     cudaStream_t stream) const {}
+
+  // Reports success unconditionally; `stream` is not touched.
+  status_t sync_stream(cudaStream_t stream) const { return status_t::SUCCESS; }
+
+  // Device-side point-to-point calls, all no-ops in this mock. Usage note kept from the
+  // interface: if a thread is sending & receiving at the same time, use device_sendrecv
+  // (rather than device_send plus device_recv) to avoid deadlock.
+  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const {}
+
+  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const {}
+
+  // No-op combined send/receive.
+  void device_sendrecv(const void* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       void* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const {}
+
+  // No-op; none of the size, offset or peer vectors are inspected.
+  void device_multicast_sendrecv(const void* sendbuf,
+                                 std::vector<size_t> const& sendsizes,
+                                 std::vector<size_t> const& sendoffsets,
+                                 std::vector<int> const& dests,
+                                 void* recvbuf,
+                                 std::vector<size_t> const& recvsizes,
+                                 std::vector<size_t> const& recvoffsets,
+                                 std::vector<int> const& sources,
+                                 cudaStream_t stream) const {}
+
+  // Grouping markers: empty bodies.
+  void group_start() const {}
+
+  void group_end() const {}
+
+ private:
+  int size_;
+};
+
+// Asserts that two handles have the same main stream and identical stream pools, slot by slot.
+void assert_handles_equal(raft::handle_t& handle_one, raft::handle_t& handle_two)
+{
+  ASSERT_EQ(handle_one.get_stream().value(), handle_two.get_stream().value());
+  ASSERT_EQ(handle_one.get_stream_pool_size(), handle_two.get_stream_pool_size());
+
+  auto const& pool_one = handle_one.get_stream_pool();
+  auto const& pool_two = handle_two.get_stream_pool();
+
+  // Sanity check: slots 0 and 1 must differ, otherwise the per-slot comparison proves nothing.
+  ASSERT_NE(pool_one.get_stream(0).value(), pool_two.get_stream(1).value());
+  for (size_t slot = 0; slot < handle_one.get_stream_pool_size(); ++slot) {
+    ASSERT_EQ(pool_one.get_stream(slot).value(), pool_two.get_stream(slot).value());
+  }
+}
+
+TEST(Raft, HandleDefault)
+{
+  raft::handle_t default_handle;  // default-constructed: no stream or pool passed in
+  ASSERT_EQ(0, default_handle.get_device());
+  ASSERT_EQ(rmm::cuda_stream_per_thread, default_handle.get_stream());
+  ASSERT_NE(nullptr, default_handle.get_cublas_handle());  // each library handle is non-null
+  ASSERT_NE(nullptr, default_handle.get_cusolver_dn_handle());
+  ASSERT_NE(nullptr, default_handle.get_cusolver_sp_handle());
+  ASSERT_NE(nullptr, default_handle.get_cusparse_handle());
+}
+
+TEST(Raft, Handle)
+{
+  // A handle built with a pool of n_streams must report exactly that pool size.
+  constexpr std::size_t n_streams = 4;
+  auto stream_pool                = std::make_shared<rmm::cuda_stream_pool>(n_streams);
+  raft::handle_t h(rmm::cuda_stream_default, stream_pool);
+  ASSERT_EQ(n_streams, h.get_stream_pool_size());
+
+  // A handle built on a non-default stream must hand that same stream back.
+  // rmm::cuda_stream owns the stream (RAII), so it is destroyed even when an ASSERT_* below
+  // returns early, and only after `handle`, which is declared later, has been torn down.
+  rmm::cuda_stream owned_stream;
+  rmm::cuda_stream_view stream_view = owned_stream.view();
+  raft::handle_t handle(stream_view);
+  ASSERT_EQ(stream_view, handle.get_stream());
+  handle.sync_stream(stream_view);
+}
+
+TEST(Raft, DefaultConstructor)
+{
+  raft::handle_t handle;
+
+  // With no pool supplied, the pool-wide wait/sync calls must still complete without failing.
+  handle.wait_stream_pool_on_stream();
+  handle.sync_stream_pool();
+
+  // Both "next usable stream" overloads are expected to compare equal to the main stream.
+  auto const next_usable    = handle.get_next_usable_stream();
+  auto const main_stream    = handle.get_stream();
+  auto const indexed_usable = handle.get_next_usable_stream(5);
+
+  ASSERT_EQ(next_usable, main_stream);
+  ASSERT_EQ(main_stream, indexed_usable);
+  ASSERT_EQ(0, handle.get_stream_pool_size());
+}
+
+TEST(Raft, GetHandleFromPool)
+{
+  constexpr std::size_t pool_size = 4;
+  auto stream_pool                = std::make_shared<rmm::cuda_stream_pool>(pool_size);
+  raft::handle_t parent(rmm::cuda_stream_default, stream_pool);
+
+  // A child handle built on pool stream `idx` must report that same stream as its main stream.
+  for (std::size_t idx = 0; idx < pool_size; ++idx) {
+    raft::handle_t child(parent.get_stream_from_stream_pool(idx));
+    ASSERT_EQ(parent.get_stream_from_stream_pool(idx), child.get_stream());
+  }
+
+  parent.wait_stream_pool_on_stream();
+}
+
+TEST(Raft, Comms)
+{
+  raft::handle_t handle;
+  // Install a mock communicator of size 2, then read that size back through the handle.
+  handle.set_comms(std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new mock_comms(2))));
+
+  ASSERT_EQ(handle.get_comms().get_size(), 2);
+}
+
+TEST(Raft, SubComms)
+{
+  // Builds the shared comms_t wrapper around a mock that reports `n_ranks` as its size.
+  auto make_mock = [](int n_ranks) {
+    return std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new mock_comms(n_ranks)));
+  };
+  raft::handle_t handle;
+  handle.set_subcomm("key1", make_mock(1));
+  handle.set_subcomm("key2", make_mock(2));
+  ASSERT_EQ(handle.get_subcomm("key1").get_size(), 1);
+  ASSERT_EQ(handle.get_subcomm("key2").get_size(), 2);
+}
+
+TEST(Raft, WorkspaceResource)
+{
+  using pool_mr_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+
+  // A default-constructed handle must use the current device resource, not a pool, as workspace.
+  raft::handle_t handle;
+  ASSERT_TRUE(dynamic_cast<const pool_mr_t*>(handle.get_workspace_resource()) == nullptr);
+  ASSERT_EQ(rmm::mr::get_current_device_resource(), handle.get_workspace_resource());
+
+  // Owned via unique_ptr and declared before handle2: the pool is released even when an
+  // ASSERT_* returns early, and only after handle2 (which was handed its raw pointer) is gone.
+  auto pool_mr = std::make_unique<pool_mr_t>(rmm::mr::get_current_device_resource());
+  std::shared_ptr<rmm::cuda_stream_pool> pool = {nullptr};
+  raft::handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr.get());
+
+  ASSERT_TRUE(dynamic_cast<const pool_mr_t*>(handle2.get_workspace_resource()) != nullptr);
+  ASSERT_EQ(pool_mr.get(), handle2.get_workspace_resource());
+}
+
+TEST(Raft, WorkspaceResourceCopy)
+{
+  using pool_mr_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+
+  auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(10);
+  handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
+
+  // Owned by a unique_ptr declared before copied_handle, so the pool is freed at scope exit
+  // (instead of leaking) and only after copied_handle has been destroyed.
+  auto pool_mr = std::make_unique<pool_mr_t>(rmm::mr::get_current_device_resource());
+  handle_t copied_handle(handle, pool_mr.get());
+
+  assert_handles_equal(handle, copied_handle);
+
+  // The copy must use the pool resource, while the source handle keeps its non-pool resource.
+  ASSERT_TRUE(dynamic_cast<const pool_mr_t*>(handle.get_workspace_resource()) == nullptr);
+
+  ASSERT_TRUE(dynamic_cast<const pool_mr_t*>(copied_handle.get_workspace_resource()) != nullptr);
+}
+
+TEST(Raft, HandleCopy)
+{
+  // A copy-constructed handle must expose the same main stream and stream pool as its source.
+  auto const pool = std::make_shared<rmm::cuda_stream_pool>(10);
+  handle_t source(rmm::cuda_stream_per_thread, pool);
+  handle_t duplicate(source);
+
+  assert_handles_equal(source, duplicate);
+}
+
+TEST(Raft, HandleAssign)
+{
+  // `handle_t b = a;` is copy-initialisation, so this covers the `=` spelling of a handle copy.
+  auto const pool = std::make_shared<rmm::cuda_stream_pool>(10);
+  handle_t source(rmm::cuda_stream_per_thread, pool);
+  handle_t duplicate = source;
+
+  assert_handles_equal(source, duplicate);
+}
+
+}  // namespace raft
diff --git a/cpp/test/interruptible.cu b/cpp/test/core/interruptible.cu
similarity index 98%
rename from cpp/test/interruptible.cu
rename to cpp/test/core/interruptible.cu
index 92adfabd55..f54bb6f859 100644
--- a/cpp/test/interruptible.cu
+++ b/cpp/test/core/interruptible.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/common/logger.cpp b/cpp/test/core/logger.cpp
similarity index 98%
rename from cpp/test/common/logger.cpp
rename to cpp/test/core/logger.cpp
index a8460e45ca..3f29c9f12c 100644
--- a/cpp/test/common/logger.cpp
+++ b/cpp/test/core/logger.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
new file mode 100644
index 0000000000..ff4b343d9e
--- /dev/null
+++ b/cpp/test/core/math_device.cu
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../test_utils.h"
+#include <raft/core/math.hpp>
+#include <rmm/cuda_stream.hpp>
+#include <rmm/device_scalar.hpp>
+
+template <typename OutT, typename OpT, typename... Args>
+__global__ void math_eval_kernel(OutT* out, OpT op, Args... args)
+{
+  out[0] = op(std::forward<Args>(args)...);  // store the single result of op(args...) in out[0]
+}
+
+template <typename OpT, typename... Args>
+auto math_eval(OpT op, Args&&... args)
+{
+  typedef decltype(op(args...)) OutT;  // result type of invoking op on the given arguments
+  auto stream = rmm::cuda_stream_default;
+  rmm::device_scalar<OutT> result(stream);  // device-side storage the kernel writes into
+  math_eval_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward<Args>(args)...);
+  return result.value(stream);  // hand the computed value back to the host caller
+}
+
+struct abs_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::abs(in);  // thin functor so raft::abs can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Abs)
+{
+  // Integer abs: int8_t, int, long and long long inputs, checked via raft::Compare
+  ASSERT_TRUE(
+    raft::match(int8_t{123}, math_eval(abs_test_op{}, int8_t{-123}), raft::Compare<int8_t>()));
+  ASSERT_TRUE(raft::match(12345, math_eval(abs_test_op{}, -12345), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(12345l, math_eval(abs_test_op{}, -12345l), raft::Compare<long int>()));
+  ASSERT_TRUE(raft::match(123451234512345ll,
+                          math_eval(abs_test_op{}, -123451234512345ll),
+                          raft::Compare<long long int>()));
+  // Floating-point abs: float and double inputs, checked via raft::CompareApprox
+  ASSERT_TRUE(
+    raft::match(12.34f, math_eval(abs_test_op{}, -12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(12.34, math_eval(abs_test_op{}, -12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct acos_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::acos(in);  // thin functor so raft::acos can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Acos)  // device raft::acos vs. host std::acos (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::acos(0.123f), math_eval(acos_test_op{}, 0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::acos(0.123), math_eval(acos_test_op{}, 0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+struct asin_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::asin(in);  // thin functor so raft::asin can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Asin)  // device raft::asin vs. host std::asin (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::asin(0.123f), math_eval(asin_test_op{}, 0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::asin(0.123), math_eval(asin_test_op{}, 0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+struct atanh_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::atanh(in);  // thin functor so raft::atanh can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Atanh)  // device raft::atanh vs. host std::atanh (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::atanh(0.123f), math_eval(atanh_test_op{}, 0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::atanh(0.123), math_eval(atanh_test_op{}, 0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+struct cos_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::cos(in);  // thin functor so raft::cos can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Cos)  // device raft::cos vs. host std::cos (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::cos(12.34), math_eval(cos_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct exp_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::exp(in);  // thin functor so raft::exp can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Exp)  // device raft::exp vs. host std::exp (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct log_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::log(in);  // thin functor so raft::log can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Log)  // device raft::log vs. host std::log (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::log(12.34), math_eval(log_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct max_test_op {
+  template <typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    return raft::max(std::forward<Args>(args)...);  // variadic: used with 2 and 3 arguments
+  }
+};
+
+TEST(MathDevice, Max2)  // two-argument raft::max on device, incl. mixed float/double
+{
+  ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, -1234, 1234), raft::Compare<int>()));
+  ASSERT_TRUE(
+    raft::match(1234u, math_eval(max_test_op{}, 1234u, 123u), raft::Compare<unsigned int>()));
+  ASSERT_TRUE(
+    raft::match(1234ll, math_eval(max_test_op{}, -1234ll, 1234ll), raft::Compare<long long int>()));
+  ASSERT_TRUE(raft::match(
+    1234ull, math_eval(max_test_op{}, 1234ull, 123ull), raft::Compare<unsigned long long int>()));
+
+  ASSERT_TRUE(
+    raft::match(12.34f, math_eval(max_test_op{}, -12.34f, 12.34f), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34, math_eval(max_test_op{}, -12.34, 12.34), raft::Compare<double>()));
+  ASSERT_TRUE(raft::match(
+    12.34, math_eval(max_test_op{}, -12.34f, 12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    12.34, math_eval(max_test_op{}, -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathDevice, Max3)  // three-argument raft::max on device, maximum in each position
+{
+  ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, 1234, 0, -1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, -1234, 1234, 0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1234, math_eval(max_test_op{}, 0, -1234, 1234), raft::Compare<int>()));
+
+  ASSERT_TRUE(raft::match(
+    12.34, math_eval(max_test_op{}, 12.34f, 0., -12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    12.34, math_eval(max_test_op{}, -12.34, 12.34f, 0.), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    12.34, math_eval(max_test_op{}, 0., -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+struct min_test_op {
+  template <typename... Args>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
+  {
+    return raft::min(std::forward<Args>(args)...);  // variadic: used with 2 and 3 arguments
+  }
+};
+
+TEST(MathDevice, Min2)  // two-argument raft::min on device, incl. mixed float/double
+{
+  ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, -1234, 1234), raft::Compare<int>()));
+  ASSERT_TRUE(
+    raft::match(123u, math_eval(min_test_op{}, 1234u, 123u), raft::Compare<unsigned int>()));
+  ASSERT_TRUE(raft::match(
+    -1234ll, math_eval(min_test_op{}, -1234ll, 1234ll), raft::Compare<long long int>()));
+  ASSERT_TRUE(raft::match(
+    123ull, math_eval(min_test_op{}, 1234ull, 123ull), raft::Compare<unsigned long long int>()));
+
+  ASSERT_TRUE(
+    raft::match(-12.34f, math_eval(min_test_op{}, -12.34f, 12.34f), raft::Compare<float>()));
+  ASSERT_TRUE(
+    raft::match(-12.34, math_eval(min_test_op{}, -12.34, 12.34), raft::Compare<double>()));
+  ASSERT_TRUE(raft::match(
+    -12.34, math_eval(min_test_op{}, -12.34f, 12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    -12.34, math_eval(min_test_op{}, -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathDevice, Min3)  // three-argument raft::min on device, minimum in each position
+{
+  ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, 1234, 0, -1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, -1234, 1234, 0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1234, math_eval(min_test_op{}, 0, -1234, 1234), raft::Compare<int>()));
+
+  ASSERT_TRUE(raft::match(
+    -12.34, math_eval(min_test_op{}, 12.34f, 0., -12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    -12.34, math_eval(min_test_op{}, -12.34, 12.34f, 0.), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(
+    -12.34, math_eval(min_test_op{}, 0., -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+struct pow_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& x, const Type& y) const
+  {
+    return raft::pow(x, y);  // thin functor so raft::pow can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Pow)  // device raft::pow vs. host std::pow (float, double)
+{
+  ASSERT_TRUE(raft::match(std::pow(12.34f, 2.f),
+                          math_eval(pow_test_op{}, 12.34f, 2.f),
+                          raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(std::pow(12.34, 2.),
+                          math_eval(pow_test_op{}, 12.34, 2.),
+                          raft::CompareApprox<double>(0.000001)));
+}
+
+struct sgn_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::sgn(in);  // thin functor so raft::sgn can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Sgn)  // raft::sgn on device: negative, zero and positive int/float inputs
+{
+  ASSERT_TRUE(raft::match(-1, math_eval(sgn_test_op{}, -1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(0, math_eval(sgn_test_op{}, 0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1, math_eval(sgn_test_op{}, -12.34f), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(0, math_eval(sgn_test_op{}, 0.f), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare<int>()));
+}
+
+struct sin_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::sin(in);  // thin functor so raft::sin can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Sin)  // device raft::sin vs. host std::sin (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::sin(12.34), math_eval(sin_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct sincos_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& x, Type* s, Type* c) const
+  {
+    raft::sincos(x, s, c);
+    return x;  // unused; returned only so the generic math_eval helper can be reused
+  }
+};
+
+TEST(MathDevice, SinCos)  // device raft::sincos vs. host std::sin / std::cos
+{
+  auto stream = rmm::cuda_stream_default;
+  float xf    = 12.34f;
+  rmm::device_scalar<float> sf(stream);
+  rmm::device_scalar<float> cf(stream);
+  math_eval(sincos_test_op{}, xf, sf.data(), cf.data());
+  ASSERT_TRUE(raft::match(std::sin(12.34f), sf.value(stream), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(std::cos(12.34f), cf.value(stream), raft::CompareApprox<float>(0.0001f)));
+  double xd = 12.34;  // double literal, so the input matches std::sin(12.34) / std::cos(12.34)
+  rmm::device_scalar<double> sd(stream);
+  rmm::device_scalar<double> cd(stream);
+  math_eval(sincos_test_op{}, xd, sd.data(), cd.data());
+  ASSERT_TRUE(raft::match(std::sin(12.34), sd.value(stream), raft::CompareApprox<double>(1e-6)));
+  ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox<double>(1e-6)));
+}
+
+struct sqrt_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::sqrt(in);  // thin functor so raft::sqrt can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Sqrt)  // device raft::sqrt vs. host std::sqrt (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::sqrt(12.34), math_eval(sqrt_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+struct tanh_test_op {
+  template <typename Type>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const
+  {
+    return raft::tanh(in);  // thin functor so raft::tanh can be passed to math_eval
+  }
+};
+
+TEST(MathDevice, Tanh)  // device raft::tanh vs. host std::tanh (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::tanh(12.34f), math_eval(tanh_test_op{}, 12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(
+    std::tanh(12.34), math_eval(tanh_test_op{}, 12.34), raft::CompareApprox<double>(0.000001)));
+}
diff --git a/cpp/test/core/math_host.cpp b/cpp/test/core/math_host.cpp
new file mode 100644
index 0000000000..5808905713
--- /dev/null
+++ b/cpp/test/core/math_host.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../test_utils.h"
+#include <raft/core/math.hpp>
+
+TEST(MathHost, Abs)
+{
+  // Integer abs: int8_t, int, long and long long inputs, checked via raft::Compare
+  ASSERT_TRUE(raft::match(int8_t{123}, raft::abs(int8_t{-123}), raft::Compare<int8_t>()));
+  ASSERT_TRUE(raft::match(12345, raft::abs(-12345), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(12345l, raft::abs(-12345l), raft::Compare<long int>()));
+  ASSERT_TRUE(
+    raft::match(123451234512345ll, raft::abs(-123451234512345ll), raft::Compare<long long int>()));
+  // Floating-point abs: float and double inputs, checked via raft::CompareApprox
+  ASSERT_TRUE(raft::match(12.34f, raft::abs(-12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(12.34, raft::abs(-12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Acos)  // host raft::acos vs. std::acos (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::acos(0.123f), raft::acos(0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::acos(0.123), raft::acos(0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Asin)  // host raft::asin vs. std::asin (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::asin(0.123f), raft::asin(0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::asin(0.123), raft::asin(0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Atanh)  // host raft::atanh vs. std::atanh (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::atanh(0.123f), raft::atanh(0.123f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::atanh(0.123), raft::atanh(0.123), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Cos)  // host raft::cos vs. std::cos (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::cos(12.34f), raft::cos(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::cos(12.34), raft::cos(12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Exp)  // host raft::exp vs. std::exp (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::exp(12.34f), raft::exp(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::exp(12.34), raft::exp(12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Log)  // host raft::log vs. std::log (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::log(12.34f), raft::log(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::log(12.34), raft::log(12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Max2)  // two-argument raft::max on host, incl. mixed float/double
+{
+  ASSERT_TRUE(raft::match(1234, raft::max(-1234, 1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1234u, raft::max(1234u, 123u), raft::Compare<unsigned int>()));
+  ASSERT_TRUE(raft::match(1234ll, raft::max(-1234ll, 1234ll), raft::Compare<long long int>()));
+  ASSERT_TRUE(
+    raft::match(1234ull, raft::max(1234ull, 123ull), raft::Compare<unsigned long long int>()));
+
+  ASSERT_TRUE(raft::match(12.34f, raft::max(-12.34f, 12.34f), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34, raft::max(-12.34, 12.34), raft::Compare<double>()));
+  ASSERT_TRUE(raft::match(12.34, raft::max(-12.34f, 12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(12.34, raft::max(-12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Max3)  // three-argument raft::max on host, maximum in each position
+{
+  ASSERT_TRUE(raft::match(1234, raft::max(1234, 0, -1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1234, raft::max(-1234, 1234, 0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1234, raft::max(0, -1234, 1234), raft::Compare<int>()));
+
+  ASSERT_TRUE(
+    raft::match(12.34, raft::max(12.34f, 0., -12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(
+    raft::match(12.34, raft::max(-12.34, 12.34f, 0.), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(
+    raft::match(12.34, raft::max(0., -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Min2)  // two-argument raft::min on host, incl. mixed float/double
+{
+  ASSERT_TRUE(raft::match(-1234, raft::min(-1234, 1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(123u, raft::min(1234u, 123u), raft::Compare<unsigned int>()));
+  ASSERT_TRUE(raft::match(-1234ll, raft::min(-1234ll, 1234ll), raft::Compare<long long int>()));
+  ASSERT_TRUE(
+    raft::match(123ull, raft::min(1234ull, 123ull), raft::Compare<unsigned long long int>()));
+
+  ASSERT_TRUE(raft::match(-12.34f, raft::min(-12.34f, 12.34f), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(-12.34, raft::min(-12.34, 12.34), raft::Compare<double>()));
+  ASSERT_TRUE(
+    raft::match(-12.34, raft::min(-12.34f, 12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(
+    raft::match(-12.34, raft::min(-12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Min3)  // three-argument raft::min on host, minimum in each position
+{
+  ASSERT_TRUE(raft::match(-1234, raft::min(1234, 0, -1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1234, raft::min(-1234, 1234, 0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1234, raft::min(0, -1234, 1234), raft::Compare<int>()));
+
+  ASSERT_TRUE(
+    raft::match(-12.34, raft::min(12.34f, 0., -12.34), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(
+    raft::match(-12.34, raft::min(-12.34, 12.34f, 0.), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(
+    raft::match(-12.34, raft::min(0., -12.34, 12.34f), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Pow)  // host raft::pow vs. std::pow (float, double)
+{
+  ASSERT_TRUE(raft::match(
+    std::pow(12.34f, 2.f), raft::pow(12.34f, 2.f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::pow(12.34, 2.), raft::pow(12.34, 2.), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Sgn)  // raft::sgn on host: negative, zero and positive int/float inputs
+{
+  ASSERT_TRUE(raft::match(-1, raft::sgn(-1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(0, raft::sgn(0), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1, raft::sgn(1234), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(-1, raft::sgn(-12.34f), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(0, raft::sgn(0.f), raft::Compare<int>()));
+  ASSERT_TRUE(raft::match(1, raft::sgn(12.34f), raft::Compare<int>()));
+}
+
+TEST(MathHost, Sin)  // host raft::sin vs. std::sin (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::sin(12.34f), raft::sin(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::sin(12.34), raft::sin(12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, SinCos)  // host raft::sincos vs. std::sin / std::cos
+{
+  float xf = 12.34f;
+  float sf, cf;
+  raft::sincos(xf, &sf, &cf);
+  ASSERT_TRUE(raft::match(std::sin(12.34f), sf, raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(std::cos(12.34f), cf, raft::CompareApprox<float>(0.0001f)));
+  double xd = 12.34;  // double literal, so the input matches std::sin(12.34) / std::cos(12.34)
+  double sd, cd;
+  raft::sincos(xd, &sd, &cd);
+  ASSERT_TRUE(raft::match(std::sin(12.34), sd, raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(std::cos(12.34), cd, raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Sqrt)  // host raft::sqrt vs. std::sqrt (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::sqrt(12.34f), raft::sqrt(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::sqrt(12.34), raft::sqrt(12.34), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(MathHost, Tanh)  // host raft::tanh vs. std::tanh (float, double)
+{
+  ASSERT_TRUE(
+    raft::match(std::tanh(12.34f), raft::tanh(12.34f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(
+    raft::match(std::tanh(12.34), raft::tanh(12.34), raft::CompareApprox<double>(0.000001)));
+}
diff --git a/cpp/test/mdarray.cu b/cpp/test/core/mdarray.cu
similarity index 99%
rename from cpp/test/mdarray.cu
rename to cpp/test/core/mdarray.cu
index c292feb894..018b8a4e5a 100644
--- a/cpp/test/mdarray.cu
+++ b/cpp/test/core/mdarray.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -340,7 +340,7 @@ void test_factory_methods()
     ASSERT_EQ(h_vec.extent(0), n);
   }
   {
-    raft::handle_t handle{};
+    raft::device_resources handle{};
     // device mdarray
     auto d_matrix = make_device_matrix<float>(handle, n, n);
     ASSERT_EQ(d_matrix.extent(0), n);
@@ -353,7 +353,7 @@ void test_factory_methods()
   }
 
   {
-    raft::handle_t handle{};
+    raft::device_resources handle{};
     // device scalar
     auto d_scalar = make_device_scalar<double>(handle, 17.0);
     static_assert(d_scalar.rank() == 1);
@@ -385,7 +385,7 @@ void test_factory_methods()
 
   // managed
   {
-    raft::handle_t handle{};
+    raft::device_resources handle{};
     auto mda = make_device_vector<int>(handle, 10);
 
     auto mdv = make_managed_mdspan(mda.data_handle(), raft::vector_extent<int>{10});
@@ -416,7 +416,7 @@ void check_matrix_layout(device_matrix_view<T, Index, LayoutPolicy> in)
 
 TEST(MDArray, FuncArg)
 {
-  raft::handle_t handle{};
+  raft::device_resources handle{};
   {
     auto d_matrix = make_device_matrix<float>(handle, 10, 10);
     check_matrix_layout(d_matrix.view());
@@ -918,7 +918,7 @@ void test_mdarray_unravel()
   }
 
   {
-    handle_t handle;
+    raft::device_resources handle;
     auto m   = make_device_matrix<float, size_t>(handle, 7, 6);
     auto m_v = m.view();
     thrust::for_each_n(handle.get_thrust_policy(),
diff --git a/cpp/test/mdspan_utils.cu b/cpp/test/core/mdspan_utils.cu
similarity index 86%
rename from cpp/test/mdspan_utils.cu
rename to cpp/test/core/mdspan_utils.cu
index 7f1efb78bb..448391fa95 100644
--- a/cpp/test/mdspan_utils.cu
+++ b/cpp/test/core/mdspan_utils.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -118,7 +118,7 @@ TEST(MDArray, HostFlatten) { test_host_flatten(); }
 
 void test_device_flatten()
 {
-  raft::handle_t handle{};
+  raft::device_resources handle{};
   // flatten 3d device mdspan
   {
     using three_d_extents = extents<int, dynamic_extent, dynamic_extent, dynamic_extent>;
@@ -179,7 +179,7 @@ void test_reshape()
 
   // reshape 4d device array to 2d
   {
-    raft::handle_t handle{};
+    raft::device_resources handle{};
     using four_d_extents =
       extents<int, dynamic_extent, dynamic_extent, dynamic_extent, dynamic_extent>;
     using four_d_mdarray = device_mdarray<int, four_d_extents>;
@@ -214,4 +214,31 @@ void test_reshape()
 
 TEST(MDArray, Reshape) { test_reshape(); }
 
-}  // namespace raft
\ No newline at end of file
+void test_const_mdspan()
+{
+  // 2d host array (static 5x5 extents)
+  {
+    using two_d_extents = extents<int, 5, 5>;
+    using two_d_mdarray = host_mdarray<float, two_d_extents>;
+
+    typename two_d_mdarray::mapping_type layout{two_d_extents{}};
+    typename two_d_mdarray::container_policy_type policy;
+    two_d_mdarray mda{layout, policy};
+
+    auto const_mda = make_const_mdspan(mda.view());  // view under test, compared to mda below
+
+    static_assert(std::is_same_v<const float, typename decltype(const_mda)::element_type>,
+                  "elements not the same");
+    static_assert(std::is_same_v<typename decltype(mda)::extents_type,
+                                 typename decltype(const_mda)::extents_type>,
+                  "extents not the same");
+    static_assert(std::is_same_v<typename decltype(mda)::layout_type,
+                                 typename decltype(const_mda)::layout_type>,
+                  "layouts not the same");
+    ASSERT_EQ(mda.size(), const_mda.size());
+  }
+}
+
+TEST(MDSpan, ConstMDSpan) { test_const_mdspan(); }  // gtest wrapper around test_const_mdspan()
+
+}  // namespace raft
diff --git a/cpp/test/memory_type.cpp b/cpp/test/core/memory_type.cpp
similarity index 96%
rename from cpp/test/memory_type.cpp
rename to cpp/test/core/memory_type.cpp
index 57d44ceefe..02aa8caa6c 100644
--- a/cpp/test/memory_type.cpp
+++ b/cpp/test/core/memory_type.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/core/numpy_serializer.cu b/cpp/test/core/numpy_serializer.cu
new file mode 100644
index 0000000000..4131a33171
--- /dev/null
+++ b/cpp/test/core/numpy_serializer.cu
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/serialize.hpp>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/universal_vector.h>
+
+#include <complex>
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace {
+
+template <class IndexType, std::size_t Rank>
+using dextents = std::experimental::dextents<IndexType, Rank>;  // file-local shorthand
+
+}  // anonymous namespace
+
+namespace raft {
+
+template <typename MDSpanType, typename VectorType, typename... Args>
+void test_mdspan_roundtrip(const raft::device_resources& handle, VectorType& vec, Args... dims)
+{
+  VectorType vec2(vec.size());  // destination buffer, same size as the source
+
+  auto span = MDSpanType(thrust::raw_pointer_cast(vec.data()), dims...);
+  std::ostringstream oss;
+  serialize_mdspan(handle, oss, span);  // write the source view into an in-memory stream
+
+  auto span2 = MDSpanType(thrust::raw_pointer_cast(vec2.data()), dims...);
+  std::istringstream iss(oss.str());
+  deserialize_mdspan(handle, iss, span2);  // read it back into the view over vec2
+  EXPECT_EQ(vec, vec2);
+}
+
+template <typename T>
+void run_roundtrip_test_mdspan_serializer()
+{
+  raft::device_resources handle{};
+  thrust::host_vector<T> vec = std::vector<T>{1, 2, 3, 4, 5, 6, 7, 8};
+
+  using mdspan_matrix2d_c_layout =
+    raft::host_mdspan<T, dextents<std::size_t, 2>, raft::layout_c_contiguous>;
+  using mdspan_matrix2d_f_layout =
+    raft::host_mdspan<T, dextents<std::size_t, 2>, raft::layout_f_contiguous>;
+
+  test_mdspan_roundtrip<mdspan_matrix2d_c_layout>(handle, vec, 2, 4);  // host views, 2x4
+  test_mdspan_roundtrip<mdspan_matrix2d_f_layout>(handle, vec, 2, 4);
+
+  using device_mdspan_matrix3d_c_layout =
+    raft::device_mdspan<T, dextents<std::size_t, 3>, raft::layout_c_contiguous>;
+  using device_mdspan_matrix3d_f_layout =
+    raft::device_mdspan<T, dextents<std::size_t, 3>, raft::layout_f_contiguous>;
+
+  thrust::device_vector<T> d_vec(vec);  // device copy of the same eight elements
+  test_mdspan_roundtrip<device_mdspan_matrix3d_c_layout>(handle, d_vec, 2, 2, 2);
+  test_mdspan_roundtrip<device_mdspan_matrix3d_f_layout>(handle, d_vec, 2, 2, 2);
+}
+
+TEST(NumPySerializerMDSpan, E2ERoundTrip)  // host + device round trip for five element types
+{
+  run_roundtrip_test_mdspan_serializer<float>();
+  run_roundtrip_test_mdspan_serializer<double>();
+  run_roundtrip_test_mdspan_serializer<std::int32_t>();
+  run_roundtrip_test_mdspan_serializer<std::uint32_t>();
+  run_roundtrip_test_mdspan_serializer<std::complex<float>>();
+}
+
+TEST(NumPySerializerMDSpan, HeaderRoundTrip)  // write_header -> read_header over a parameter grid
+{
+  char byteorder = RAFT_NUMPY_HOST_ENDIAN_CHAR;
+  for (char kind : std::vector<char>{'f', 'i', 'u', 'c'}) {
+    for (unsigned int itemsize : std::vector<unsigned int>{1, 2, 4, 8, 16}) {
+      for (bool fortran_order : std::vector<bool>{true, false}) {
+        for (const auto& shape : std::vector<std::vector<detail::numpy_serializer::ndarray_len_t>>{
+               {10}, {2, 2}, {10, 30, 100}, {}}) {
+          detail::numpy_serializer::dtype_t dtype{byteorder, kind, itemsize};
+          detail::numpy_serializer::header_t header{dtype, fortran_order, shape};
+          std::ostringstream oss;
+          detail::numpy_serializer::write_header(oss, header);
+          std::istringstream iss(oss.str());
+          auto header2 = detail::numpy_serializer::read_header(iss);
+          EXPECT_EQ(header, header2);  // parsed header must equal the one written
+        }
+      }
+    }
+  }
+}
+
+TEST(NumPySerializerMDSpan, ManagedMDSpan)  // round trip of a rank-3 (2x2x2) managed mdspan
+{
+  raft::device_resources handle{};
+  thrust::universal_vector<float> vec = std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8};
+  using managed_mdspan_matrix3d_c_layout =
+    raft::managed_mdspan<float, dextents<std::size_t, 3>, raft::layout_c_contiguous>;
+  test_mdspan_roundtrip<managed_mdspan_matrix3d_c_layout>(handle, vec, 2, 2, 2);
+}
+
+TEST(NumPySerializerMDSpan, Tuple2String)  // tuple_to_string for 0, 1, 2 and 4 elements
+{
+  {
+    std::vector<int> tuple{};
+    EXPECT_EQ(detail::numpy_serializer::tuple_to_string(tuple), "()");
+  }
+  {
+    std::vector<int> tuple{2};
+    EXPECT_EQ(detail::numpy_serializer::tuple_to_string(tuple), "(2,)");
+  }
+  {
+    std::vector<int> tuple{2, 3};
+    EXPECT_EQ(detail::numpy_serializer::tuple_to_string(tuple), "(2, 3)");
+  }
+  {
+    std::vector<int> tuple{2, 3, 10, 20};
+    EXPECT_EQ(detail::numpy_serializer::tuple_to_string(tuple), "(2, 3, 10, 20)");
+  }
+}
+
+TEST(NumPySerializerMDSpan, NumPyDType)  // get_numpy_dtype<T>() for several scalar types
+{
+  const char expected_endian_char = RAFT_SYSTEM_LITTLE_ENDIAN ? '<' : '>';
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{
+      expected_endian_char, 'f', sizeof(float)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<float>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{
+      expected_endian_char, 'f', sizeof(long double)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<long double>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{'|', 'i', sizeof(signed char)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<signed char>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{
+      expected_endian_char, 'i', sizeof(std::int64_t)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<std::int64_t>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{'|', 'u', sizeof(unsigned char)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<unsigned char>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{
+      expected_endian_char, 'u', sizeof(std::uint64_t)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<std::uint64_t>(), expected_dtype);
+  }
+  {
+    const detail::numpy_serializer::dtype_t expected_dtype{
+      expected_endian_char, 'c', sizeof(std::complex<double>)};
+    EXPECT_EQ(detail::numpy_serializer::get_numpy_dtype<std::complex<double>>(), expected_dtype);
+  }
+}
+
+TEST(NumPySerializerMDSpan, WriteHeader)  // exact bytes produced by write_header for one header
+{
+  using namespace std::string_literals;
+  std::ostringstream oss;
+  detail::numpy_serializer::header_t header{{'<', 'f', 8}, false, {2, 10, 5}};
+  detail::numpy_serializer::write_header(oss, header);
+  EXPECT_EQ(oss.str(),
+            "\x93NUMPY\x01\x00"s  // magic string + version (1.0)
+            "\x76\x00"s           // HEADER_LEN = 118, in little endian
+            "{'descr': '<f8', 'fortran_order': False, 'shape': (2, 10, 5)}"s  // header
+            "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"s                       // padding
+            "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"s
+            "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"s
+            "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"s
+            "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"s
+            "\x20\x20\x20\x20\x20\x20\n"s);
+}
+
+TEST(NumPySerializerMDSpan, ParsePyDict)  // parse_pydict on a flat dict with four keys
+{
+  std::string dict{"{'apple': 2, 'pie': 'is', 'delicious': True, 'piece of': 'cake'}"};
+  auto parse =
+    detail::numpy_serializer::parse_pydict(dict, {"apple", "pie", "delicious", "piece of"});
+  auto expected_parse = std::map<std::string, std::string>{
+    {"apple", "2"}, {"pie", "'is'"}, {"delicious", "True"}, {"piece of", "'cake'"}};
+  EXPECT_EQ(parse, expected_parse);
+}
+
+TEST(NumPySerializerMDSpan, ParsePyString)  // parse_pystring on a single-quoted string
+{
+  EXPECT_EQ(detail::numpy_serializer::parse_pystring("'foobar'"), "foobar");
+}
+
+TEST(NumPySerializerMDSpan, ParsePyTuple)  // parse_pytuple for 1, 2 and 4 elements
+{
+  {
+    std::string tuple_str{"(2,)"};
+    std::vector<std::string> expected_parse{"2"};
+    EXPECT_EQ(detail::numpy_serializer::parse_pytuple(tuple_str), expected_parse);
+  }
+  {
+    std::string tuple_str{"(2, 3)"};
+    std::vector<std::string> expected_parse{"2", "3"};
+    EXPECT_EQ(detail::numpy_serializer::parse_pytuple(tuple_str), expected_parse);
+  }
+  {
+    std::string tuple_str{"(2, 3, 10, 20)"};
+    std::vector<std::string> expected_parse{"2", "3", "10", "20"};
+    EXPECT_EQ(detail::numpy_serializer::parse_pytuple(tuple_str), expected_parse);
+  }
+}
+
+template <typename T>
+void run_roundtrip_test_scalar_serializer(T scalar)
+{
+  std::ostringstream oss;
+  detail::numpy_serializer::serialize_scalar(oss, scalar);
+  std::istringstream iss(oss.str());
+  T tmp = detail::numpy_serializer::deserialize_scalar<T>(iss);
+  EXPECT_EQ(scalar, tmp);  // the value read back must equal the value written
+}
+
+TEST(NumPySerializerScalar, E2ERoundTrip)  // scalar round trip for five scalar types
+{
+  using namespace std::complex_literals;
+  run_roundtrip_test_scalar_serializer<float>(2.0f);
+  run_roundtrip_test_scalar_serializer<double>(-2.0);
+  run_roundtrip_test_scalar_serializer<std::int8_t>(-2);
+  run_roundtrip_test_scalar_serializer<std::uint32_t>(0x4FFFFFF);
+  run_roundtrip_test_scalar_serializer<std::complex<double>>(1.0 - 2.0i);
+}
+
+template <typename T>
+void check_header_scalar_serializer(T scalar)
+{
+  std::ostringstream oss;
+  detail::numpy_serializer::serialize_scalar(oss, scalar);
+  std::istringstream iss(oss.str());
+  detail::numpy_serializer::header_t header = detail::numpy_serializer::read_header(iss);
+  EXPECT_TRUE(header.shape.empty());  // a serialized scalar is expected to carry an empty shape
+  EXPECT_EQ(header.dtype.to_string(), detail::numpy_serializer::get_numpy_dtype<T>().to_string());
+}
+
+TEST(NumPySerializerScalar, HeaderCheck)  // header of a serialized scalar: shape and dtype
+{
+  using namespace std::complex_literals;
+  check_header_scalar_serializer<float>(2.0f);
+  check_header_scalar_serializer<double>(-2.0);
+  check_header_scalar_serializer<std::int8_t>(-2);
+  check_header_scalar_serializer<std::uint32_t>(0x4FFFFFF);
+  check_header_scalar_serializer<std::complex<double>>(1.0 - 2.0i);
+}
+
+}  // namespace raft
diff --git a/cpp/test/nvtx.cpp b/cpp/test/core/nvtx.cpp
similarity index 96%
rename from cpp/test/nvtx.cpp
rename to cpp/test/core/nvtx.cpp
index 635fe55012..e6c29fa3d8 100644
--- a/cpp/test/nvtx.cpp
+++ b/cpp/test/core/nvtx.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/core/operators_device.cu b/cpp/test/core/operators_device.cu
new file mode 100644
index 0000000000..1697a09fcf
--- /dev/null
+++ b/cpp/test/core/operators_device.cu
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cmath>
+
+#include <gtest/gtest.h>
+
+#include "../test_utils.cuh"
+#include <raft/core/kvp.hpp>
+#include <raft/core/operators.hpp>
+#include <rmm/cuda_stream.hpp>
+#include <rmm/device_scalar.hpp>
+
+template <typename OutT, typename OpT, typename... Args>
+__global__ void eval_op_on_device_kernel(OutT* out, OpT op, Args... args)
+{
+  *out = op(std::forward<Args>(args)...);  // one evaluation, stored in the single output slot
+}
+
+template <typename OpT, typename... Args>
+auto eval_op_on_device(OpT op, Args&&... args)  // run op(args...) on the GPU, return the result
+{
+  auto stream = rmm::cuda_stream_default;
+  rmm::device_scalar<decltype(op(args...))> result(stream);  // typed after the op's return type
+  eval_op_on_device_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward<Args>(args)...);
+  EXPECT_EQ(cudaSuccess, cudaPeekAtLastError());  // a failed launch would leave `result` unset
+  return result.value(stream);
+}
+
+TEST(OperatorsDevice, IdentityOp)
+{
+  const auto got = eval_op_on_device(raft::identity_op(), 12.34f, 0);
+  ASSERT_TRUE(raft::match(12.34f, got, raft::Compare<float>()));  // input must come back as-is
+}
+
+TEST(OperatorsDevice, CastOp)
+{
+  raft::cast_op<float> to_float;
+  const auto got = eval_op_on_device(to_float, 1234, 0);  // int input, float result expected
+  ASSERT_TRUE(raft::match(1234.0f, got, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, KeyOp)
+{
+  raft::key_op key_fn;
+  raft::KeyValuePair<int, float> entry(12, 3.4f);  // key = 12, value = 3.4f
+  ASSERT_TRUE(raft::match(12, eval_op_on_device(key_fn, entry, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, ValueOp)
+{
+  raft::value_op value_fn;
+  raft::KeyValuePair<int, float> entry(12, 3.4f);  // key = 12, value = 3.4f
+  const auto got = eval_op_on_device(value_fn, entry, 0);
+  ASSERT_TRUE(raft::match(3.4f, got, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, SqrtOpF)
+{
+  raft::sqrt_op sqrt_fn;
+  const auto got_f32 = eval_op_on_device(sqrt_fn, 12.34f, 0);  // single precision
+  ASSERT_TRUE(raft::match(std::sqrt(12.34f), got_f32, raft::CompareApprox<float>(0.0001f)));
+  const auto got_f64 = eval_op_on_device(sqrt_fn, 12.34, 0);  // double precision
+  ASSERT_TRUE(raft::match(std::sqrt(12.34), got_f64, raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsDevice, NZOp)
+{
+  raft::nz_op nz_fn;
+  const auto got_zero = eval_op_on_device(nz_fn, 0.0f, 0);  // zero input -> 0 expected
+  ASSERT_TRUE(raft::match(0.0f, got_zero, raft::CompareApprox<float>(0.00001f)));
+  const auto got_nonzero = eval_op_on_device(nz_fn, 12.34f, 0);  // non-zero input -> 1 expected
+  ASSERT_TRUE(raft::match(1.0f, got_nonzero, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, AbsOp)
+{
+  raft::abs_op abs_fn;
+  const auto got_f32 = eval_op_on_device(abs_fn, -12.34f, 0);  // float input
+  ASSERT_TRUE(raft::match(12.34f, got_f32, raft::CompareApprox<float>(0.00001f)));
+  const auto got_f64 = eval_op_on_device(abs_fn, -12.34, 0);  // double input
+  ASSERT_TRUE(raft::match(12.34, got_f64, raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(1234, eval_op_on_device(abs_fn, -1234, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, SqOp)
+{
+  raft::sq_op sq_fn;
+  const auto got_f32 = eval_op_on_device(sq_fn, 12.34f, 0);  // 12.34 * 12.34 = 152.2756
+  ASSERT_TRUE(raft::match(152.2756f, got_f32, raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(289, eval_op_on_device(sq_fn, -17, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, AddOp)
+{
+  raft::add_op add_fn;
+  const auto got_f32 = eval_op_on_device(add_fn, 12.0f, 0.34f);  // float operands
+  ASSERT_TRUE(raft::match(12.34f, got_f32, raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(1234, eval_op_on_device(add_fn, 1200, 34), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, SubOp)
+{
+  raft::sub_op sub_fn;
+  const auto got_f32 = eval_op_on_device(sub_fn, 13.0f, 0.66f);  // float operands
+  ASSERT_TRUE(raft::match(12.34f, got_f32, raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(1234, eval_op_on_device(sub_fn, 1300, 66), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, MulOp)
+{
+  raft::mul_op mul_fn;
+  const auto got = eval_op_on_device(mul_fn, 2.0f, 6.17f);  // 2 * 6.17 = 12.34
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, DivOp)
+{
+  raft::div_op div_fn;
+  const auto got = eval_op_on_device(div_fn, 37.02f, 3.0f);  // 37.02 / 3 = 12.34
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, DivCheckZeroOp)
+{
+  raft::div_checkzero_op safe_div_fn;
+  const auto got_regular = eval_op_on_device(safe_div_fn, 37.02f, 3.0f);  // ordinary quotient
+  ASSERT_TRUE(raft::match(12.34f, got_regular, raft::CompareApprox<float>(0.00001f)));
+  const auto got_by_zero = eval_op_on_device(safe_div_fn, 37.02f, 0.0f);  // zero divisor -> 0
+  ASSERT_TRUE(raft::match(0.0f, got_by_zero, raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsDevice, PowOp)
+{
+  raft::pow_op pow_fn;
+  const auto got_f32 = eval_op_on_device(pow_fn, 10.0f, 3.0f);  // single precision
+  ASSERT_TRUE(raft::match(1000.0f, got_f32, raft::CompareApprox<float>(0.0001f)));
+  const auto got_f64 = eval_op_on_device(pow_fn, 10.0, 3.0);  // double precision
+  ASSERT_TRUE(raft::match(1000.0, got_f64, raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsDevice, MinOp)
+{
+  raft::min_op min_fn;
+  const auto got_f32 = eval_op_on_device(min_fn, 3.0f, 5.0f);  // smaller operand passed first
+  ASSERT_TRUE(raft::match(3.0f, got_f32, raft::CompareApprox<float>(0.0001f)));
+  const auto got_f64 = eval_op_on_device(min_fn, 5.0, 3.0);  // smaller operand passed second
+  ASSERT_TRUE(raft::match(3.0, got_f64, raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(3, eval_op_on_device(min_fn, 3, 5), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, MaxOp)
+{
+  raft::max_op max_fn;
+  const auto got_f32 = eval_op_on_device(max_fn, 3.0f, 5.0f);  // larger operand passed second
+  ASSERT_TRUE(raft::match(5.0f, got_f32, raft::CompareApprox<float>(0.0001f)));
+  const auto got_f64 = eval_op_on_device(max_fn, 5.0, 3.0);  // larger operand passed first
+  ASSERT_TRUE(raft::match(5.0, got_f64, raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(5, eval_op_on_device(max_fn, 3, 5), raft::Compare<int>()));
+}
+
+TEST(OperatorsDevice, SqDiffOp)
+{
+  raft::sqdiff_op sqdiff_fn;
+  const auto got_f32 = eval_op_on_device(sqdiff_fn, 3.0f, 5.0f);  // (3 - 5)^2 = 4
+  ASSERT_TRUE(raft::match(4.0f, got_f32, raft::CompareApprox<float>(0.0001f)));
+  const auto got_f64 = eval_op_on_device(sqdiff_fn, 5.0, 3.0);  // (5 - 3)^2 = 4
+  ASSERT_TRUE(raft::match(4.0, got_f64, raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsDevice, ArgminOp)
+{
+  using kvp_t = raft::KeyValuePair<int, float>;
+  raft::argmin_op argmin;
+  kvp_t kvp_a(0, 1.2f);
+  kvp_t kvp_b(0, 3.4f);
+  kvp_t kvp_c(1, 1.2f);
+  // a vs b: same key, different values. The pair holding the smaller value is expected,
+  // regardless of the argument order.
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmin, kvp_a, kvp_b), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmin, kvp_b, kvp_a), raft::Compare<kvp_t>()));
+  // a vs c: same value, different keys. The pair holding the smaller key is expected.
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmin, kvp_a, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmin, kvp_c, kvp_a), raft::Compare<kvp_t>()));
+  // b vs c: key and value both differ. The smaller value is expected to win even though it
+  // comes with the larger key.
+  ASSERT_TRUE(raft::match(kvp_c, eval_op_on_device(argmin, kvp_b, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_c, eval_op_on_device(argmin, kvp_c, kvp_b), raft::Compare<kvp_t>()));
+}
+
+TEST(OperatorsDevice, ArgmaxOp)
+{
+  using kvp_t = raft::KeyValuePair<int, float>;
+  raft::argmax_op argmax;
+  kvp_t kvp_a(0, 1.2f);
+  kvp_t kvp_b(0, 3.4f);
+  kvp_t kvp_c(1, 1.2f);
+  // a vs b: same key, different values. The pair holding the larger value is expected,
+  // regardless of the argument order.
+  ASSERT_TRUE(raft::match(kvp_b, eval_op_on_device(argmax, kvp_a, kvp_b), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_b, eval_op_on_device(argmax, kvp_b, kvp_a), raft::Compare<kvp_t>()));
+  // a vs c: same value, different keys. The pair holding the smaller key is expected.
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmax, kvp_a, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, eval_op_on_device(argmax, kvp_c, kvp_a), raft::Compare<kvp_t>()));
+  // b vs c: key and value both differ. The larger value is expected to win
+  // (in this data set it also carries the smaller key).
+  ASSERT_TRUE(raft::match(kvp_b, eval_op_on_device(argmax, kvp_b, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_b, eval_op_on_device(argmax, kvp_c, kvp_b), raft::Compare<kvp_t>()));
+}
+
+TEST(OperatorsDevice, ConstOp)
+{
+  raft::const_op const_fn(12.34f);  // must yield 12.34f for zero, one or two arguments
+  ASSERT_TRUE(raft::match(12.34f, eval_op_on_device(const_fn), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34f, eval_op_on_device(const_fn, 42), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34f, eval_op_on_device(const_fn, 13, 37.0f), raft::Compare<float>()));
+}
+
+template <typename T>
+struct trinary_add {
+  const T c;  // constant third addend, fixed at construction
+  constexpr explicit trinary_add(const T& addend) : c{addend} {}
+  constexpr RAFT_INLINE_FUNCTION auto operator()(T a, T b) const { return (a + b) + c; }
+};
+
+TEST(OperatorsDevice, PlugConstOp)
+{
+  // Case 1: the wrapped op (add_op) is default-constructible, so only the constant is passed.
+  {
+    raft::plug_const_op<float, raft::add_op> plugged(0.34f);
+    const auto got = eval_op_on_device(plugged, 12.0f);
+    ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+
+  // Case 2: the wrapped op needs a constructor argument, so an instance is passed explicitly.
+  {
+    auto plugged = raft::plug_const_op(10.0f, trinary_add<float>(2.0f));
+    const auto got = eval_op_on_device(plugged, 0.34f);
+    ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsDevice, AddConstOp)
+{
+  raft::add_const_op<float> add_fn(0.34f);
+  const auto got = eval_op_on_device(add_fn, 12.0f);  // 12.0 + 0.34 expected
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsDevice, SubConstOp)
+{
+  raft::sub_const_op<float> sub_fn(0.66f);
+  const auto got = eval_op_on_device(sub_fn, 13.0f);  // 13.0 - 0.66 expected
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsDevice, MulConstOp)
+{
+  raft::mul_const_op<float> mul_fn(2.0f);
+  const auto got = eval_op_on_device(mul_fn, 6.17f);  // 6.17 * 2 expected
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsDevice, DivConstOp)
+{
+  raft::div_const_op<float> div_fn(3.0f);
+  const auto got = eval_op_on_device(div_fn, 37.02f);  // 37.02 / 3 expected
+  ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsDevice, DivCheckZeroConstOp)
+{
+  // Regular case: a divisor of 3 must give the ordinary quotient.
+  {
+    raft::div_checkzero_const_op<float> div_by_three(3.0f);
+    const auto got = eval_op_on_device(div_by_three, 37.02f);
+    ASSERT_TRUE(raft::match(12.34f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+  // Guarded case: a zero divisor is expected to produce 0.
+  {
+    raft::div_checkzero_const_op<float> div_by_zero(0.0f);
+    const auto got = eval_op_on_device(div_by_zero, 37.02f);
+    ASSERT_TRUE(raft::match(0.0f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsDevice, PowConstOp)
+{
+  raft::pow_const_op<float> cube_fn(3.0f);
+  const auto got = eval_op_on_device(cube_fn, 10.0f);  // 10 to the power 3 expected
+  ASSERT_TRUE(raft::match(1000.0f, got, raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsDevice, ComposeOp)
+{
+  // Chain declared from the type list alone: every stage is default-constructible.
+  {
+    raft::compose_op<raft::sqrt_op, raft::abs_op, raft::cast_op<float>> chain;
+    const auto got = eval_op_on_device(chain, -42, 0);  // -42 is expected to map to sqrt(42)
+    ASSERT_TRUE(raft::match(std::sqrt(42.0f), got, raft::CompareApprox<float>(0.0001f)));
+  }
+  // Chain built from instances, because add_const_op takes a constructor argument.
+  {
+    auto chain = raft::compose_op(
+      raft::sqrt_op(), raft::abs_op(), raft::add_const_op<float>(8.0f), raft::cast_op<float>());
+    const auto got = eval_op_on_device(chain, -50, 0);  // -50 is expected to map to sqrt(42)
+    ASSERT_TRUE(raft::match(std::sqrt(42.0f), got, raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsDevice, MapArgsOp)
+{
+  // Mapper declared from the type list alone: every op is default-constructible.
+  {
+    raft::map_args_op<raft::add_op, raft::sq_op, raft::abs_op> mapped;
+    const auto got = eval_op_on_device(mapped, 5.0f, -17.0f);  // expected: 5 * 5 + 17 = 42
+    ASSERT_TRUE(raft::match(42.0f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+  // Mapper built from instances, because the *_const_op stages take constructor arguments.
+  {
+    auto mapped = raft::map_args_op(
+      raft::add_op(), raft::pow_const_op<float>(2.0f), raft::mul_const_op<float>(-1.0f));
+    const auto got = eval_op_on_device(mapped, 5.0f, -17.0f);  // expected: 5^2 + 17 = 42
+    ASSERT_TRUE(raft::match(42.0f, got, raft::CompareApprox<float>(0.0001f)));
+  }
+}
diff --git a/cpp/test/core/operators_host.cpp b/cpp/test/core/operators_host.cpp
new file mode 100644
index 0000000000..de66fda919
--- /dev/null
+++ b/cpp/test/core/operators_host.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cmath>
+
+#include <gtest/gtest.h>
+
+#include "../test_utils.h"
+#include <raft/core/kvp.hpp>
+#include <raft/core/operators.hpp>
+
+TEST(OperatorsHost, IdentityOp)
+{
+  raft::identity_op identity_fn;
+  ASSERT_TRUE(raft::match(12.34f, identity_fn(12.34f, 0), raft::Compare<float>()));  // unchanged
+}
+
+TEST(OperatorsHost, CastOp)
+{
+  raft::cast_op<float> to_float;  // int input, float result expected
+  ASSERT_TRUE(raft::match(1234.0f, to_float(1234, 0), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, KeyOp)
+{
+  raft::key_op key_fn;
+  raft::KeyValuePair<int, float> entry(12, 3.4f);  // key = 12, value = 3.4f
+  ASSERT_TRUE(raft::match(12, key_fn(entry, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, ValueOp)
+{
+  raft::value_op value_fn;
+  raft::KeyValuePair<int, float> entry(12, 3.4f);  // key = 12, value = 3.4f
+  ASSERT_TRUE(raft::match(3.4f, value_fn(entry, 0), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, SqrtOpF)
+{
+  raft::sqrt_op root;  // checked against std::sqrt in single and double precision
+  ASSERT_TRUE(raft::match(std::sqrt(12.34f), root(12.34f, 0), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(std::sqrt(12.34), root(12.34, 0), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsHost, NZOp)
+{
+  raft::nz_op nz_fn;  // expected: 0 for a zero input, 1 for a non-zero input
+  ASSERT_TRUE(raft::match(0.0f, nz_fn(0.0f, 0), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(1.0f, nz_fn(12.34f, 0), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, AbsOp)
+{
+  raft::abs_op abs_fn;  // exercised with float, double and int inputs
+  ASSERT_TRUE(raft::match(12.34f, abs_fn(-12.34f, 0), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(12.34, abs_fn(-12.34, 0), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(1234, abs_fn(-1234, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, SqOp)
+{
+  raft::sq_op sq_fn;  // 12.34 * 12.34 = 152.2756 and (-17) * (-17) = 289
+  ASSERT_TRUE(raft::match(152.2756f, sq_fn(12.34f, 0), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(289, sq_fn(-17, 0), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, AddOp)
+{
+  raft::add_op add_fn;  // exercised with float and int operands
+  ASSERT_TRUE(raft::match(12.34f, add_fn(12.0f, 0.34f), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(1234, add_fn(1200, 34), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, SubOp)
+{
+  raft::sub_op sub_fn;  // exercised with float and int operands
+  ASSERT_TRUE(raft::match(12.34f, sub_fn(13.0f, 0.66f), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(1234, sub_fn(1300, 66), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, MulOp)
+{
+  raft::mul_op mul_fn;  // 2 * 6.17 = 12.34
+  ASSERT_TRUE(raft::match(12.34f, mul_fn(2.0f, 6.17f), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, DivOp)
+{
+  raft::div_op div_fn;  // 37.02 / 3 = 12.34
+  ASSERT_TRUE(raft::match(12.34f, div_fn(37.02f, 3.0f), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, DivCheckZeroOp)
+{
+  raft::div_checkzero_op safe_div_fn;  // ordinary quotient first, then a zero divisor -> 0
+  ASSERT_TRUE(raft::match(12.34f, safe_div_fn(37.02f, 3.0f), raft::CompareApprox<float>(0.00001f)));
+  ASSERT_TRUE(raft::match(0.0f, safe_div_fn(37.02f, 0.0f), raft::CompareApprox<float>(0.00001f)));
+}
+
+TEST(OperatorsHost, PowOp)
+{
+  raft::pow_op pow_fn;  // 10 to the power 3, in single and double precision
+  ASSERT_TRUE(raft::match(1000.0f, pow_fn(10.0f, 3.0f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(1000.0, pow_fn(10.0, 3.0), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsHost, MinOp)
+{
+  raft::min_op min_fn;  // smaller operand is passed first, then second, then as ints
+  ASSERT_TRUE(raft::match(3.0f, min_fn(3.0f, 5.0f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(3.0, min_fn(5.0, 3.0), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(3, min_fn(3, 5), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, MaxOp)
+{
+  raft::max_op max_fn;  // larger operand is passed second, then first, then as ints
+  ASSERT_TRUE(raft::match(5.0f, max_fn(3.0f, 5.0f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(5.0, max_fn(5.0, 3.0), raft::CompareApprox<double>(0.000001)));
+  ASSERT_TRUE(raft::match(5, max_fn(3, 5), raft::Compare<int>()));
+}
+
+TEST(OperatorsHost, SqDiffOp)
+{
+  raft::sqdiff_op sqdiff_fn;  // (3 - 5)^2 and (5 - 3)^2 are both 4
+  ASSERT_TRUE(raft::match(4.0f, sqdiff_fn(3.0f, 5.0f), raft::CompareApprox<float>(0.0001f)));
+  ASSERT_TRUE(raft::match(4.0, sqdiff_fn(5.0, 3.0), raft::CompareApprox<double>(0.000001)));
+}
+
+TEST(OperatorsHost, ArgminOp)
+{
+  using kvp_t = raft::KeyValuePair<int, float>;
+  raft::argmin_op argmin;
+  kvp_t kvp_a(0, 1.2f);
+  kvp_t kvp_b(0, 3.4f);
+  kvp_t kvp_c(1, 1.2f);
+  // a vs b: same key, different values. The pair holding the smaller value is expected,
+  // regardless of the argument order.
+  ASSERT_TRUE(raft::match(kvp_a, argmin(kvp_a, kvp_b), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, argmin(kvp_b, kvp_a), raft::Compare<kvp_t>()));
+  // a vs c: same value, different keys. The pair holding the smaller key is expected.
+  ASSERT_TRUE(raft::match(kvp_a, argmin(kvp_a, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, argmin(kvp_c, kvp_a), raft::Compare<kvp_t>()));
+  // b vs c: key and value both differ. The smaller value is expected to win even though it
+  // comes with the larger key.
+  ASSERT_TRUE(raft::match(kvp_c, argmin(kvp_b, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_c, argmin(kvp_c, kvp_b), raft::Compare<kvp_t>()));
+}
+
+TEST(OperatorsHost, ArgmaxOp)
+{
+  using kvp_t = raft::KeyValuePair<int, float>;
+  raft::argmax_op argmax;
+  kvp_t kvp_a(0, 1.2f);
+  kvp_t kvp_b(0, 3.4f);
+  kvp_t kvp_c(1, 1.2f);
+  // a vs b: same key, different values. The pair holding the larger value is expected,
+  // regardless of the argument order.
+  ASSERT_TRUE(raft::match(kvp_b, argmax(kvp_a, kvp_b), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_b, argmax(kvp_b, kvp_a), raft::Compare<kvp_t>()));
+  // a vs c: same value, different keys. The pair holding the smaller key is expected.
+  ASSERT_TRUE(raft::match(kvp_a, argmax(kvp_a, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_a, argmax(kvp_c, kvp_a), raft::Compare<kvp_t>()));
+  // b vs c: key and value both differ. The larger value is expected to win
+  // (in this data set it also carries the smaller key).
+  ASSERT_TRUE(raft::match(kvp_b, argmax(kvp_b, kvp_c), raft::Compare<kvp_t>()));
+  ASSERT_TRUE(raft::match(kvp_b, argmax(kvp_c, kvp_b), raft::Compare<kvp_t>()));
+}
+
+TEST(OperatorsHost, ConstOp)
+{
+  raft::const_op const_fn(12.34f);  // must yield 12.34f for zero, one or two arguments
+  ASSERT_TRUE(raft::match(12.34f, const_fn(), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34f, const_fn(42), raft::Compare<float>()));
+  ASSERT_TRUE(raft::match(12.34f, const_fn(13, 37.0f), raft::Compare<float>()));
+}
+
+template <typename T>
+struct trinary_add {
+  const T c;  // constant third addend, fixed at construction
+  constexpr explicit trinary_add(const T& addend) : c{addend} {}
+  constexpr RAFT_INLINE_FUNCTION auto operator()(T a, T b) const { return (a + b) + c; }
+};
+
+TEST(OperatorsHost, PlugConstOp)
+{
+  // Case 1: the wrapped op (add_op) is default-constructible, so only the constant is passed.
+  {
+    raft::plug_const_op<float, raft::add_op> plugged(0.34f);
+    ASSERT_TRUE(raft::match(12.34f, plugged(12.0f), raft::CompareApprox<float>(0.0001f)));
+  }
+
+  // Case 2: the wrapped op needs a constructor argument, so an instance is passed explicitly.
+  {
+    auto plugged = raft::plug_const_op(10.0f, trinary_add<float>(2.0f));
+    ASSERT_TRUE(raft::match(12.34f, plugged(0.34f), raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsHost, AddConstOp)
+{
+  raft::add_const_op<float> add_fn(0.34f);  // 12.0 + 0.34 expected
+  ASSERT_TRUE(raft::match(12.34f, add_fn(12.0f), raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsHost, SubConstOp)
+{
+  raft::sub_const_op<float> sub_fn(0.66f);  // 13.0 - 0.66 expected
+  ASSERT_TRUE(raft::match(12.34f, sub_fn(13.0f), raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsHost, MulConstOp)
+{
+  raft::mul_const_op<float> mul_fn(2.0f);  // 6.17 * 2 expected
+  ASSERT_TRUE(raft::match(12.34f, mul_fn(6.17f), raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsHost, DivConstOp)
+{
+  raft::div_const_op<float> div_fn(3.0f);  // 37.02 / 3 expected
+  ASSERT_TRUE(raft::match(12.34f, div_fn(37.02f), raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsHost, DivCheckZeroConstOp)
+{
+  // Regular case: a divisor of 3 must give the ordinary quotient.
+  {
+    raft::div_checkzero_const_op<float> div_by_three(3.0f);
+    ASSERT_TRUE(raft::match(12.34f, div_by_three(37.02f), raft::CompareApprox<float>(0.0001f)));
+  }
+  // Guarded case: a zero divisor is expected to produce 0.
+  {
+    raft::div_checkzero_const_op<float> div_by_zero(0.0f);
+    ASSERT_TRUE(raft::match(0.0f, div_by_zero(37.02f), raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsHost, PowConstOp)
+{
+  raft::pow_const_op<float> cube_fn(3.0f);  // 10 to the power 3 expected
+  ASSERT_TRUE(raft::match(1000.0f, cube_fn(10.0f), raft::CompareApprox<float>(0.0001f)));
+}
+
+TEST(OperatorsHost, ComposeOp)
+{
+  // Chain declared from the type list alone: every stage is default-constructible.
+  {
+    raft::compose_op<raft::sqrt_op, raft::abs_op, raft::cast_op<float>> chain;
+    ASSERT_TRUE(raft::match(std::sqrt(42.0f), chain(-42, 0), raft::CompareApprox<float>(0.0001f)));
+  }
+  // Chain built from instances, because add_const_op takes a constructor argument.
+  {
+    auto chain = raft::compose_op(
+      raft::sqrt_op(), raft::abs_op(), raft::add_const_op<float>(8.0f), raft::cast_op<float>());
+    ASSERT_TRUE(raft::match(std::sqrt(42.0f), chain(-50, 0), raft::CompareApprox<float>(0.0001f)));
+  }
+}
+
+TEST(OperatorsHost, MapArgsOp)
+{
+  // Mapper declared from the type list alone: every op is default-constructible.
+  {
+    raft::map_args_op<raft::add_op, raft::sq_op, raft::abs_op> mapped;
+    ASSERT_TRUE(raft::match(42.0f, mapped(5.0f, -17.0f), raft::CompareApprox<float>(0.0001f)));
+  }
+  // Mapper built from instances, because the *_const_op stages take constructor arguments.
+  {
+    auto mapped = raft::map_args_op(
+      raft::add_op(), raft::pow_const_op<float>(2.0f), raft::mul_const_op<float>(-1.0f));
+    ASSERT_TRUE(raft::match(42.0f, mapped(5.0f, -17.0f), raft::CompareApprox<float>(0.0001f)));
+  }
+}
diff --git a/cpp/test/common/seive.cu b/cpp/test/core/seive.cu
similarity index 95%
rename from cpp/test/common/seive.cu
rename to cpp/test/core/seive.cu
index 54a59d6251..8634abf3be 100644
--- a/cpp/test/common/seive.cu
+++ b/cpp/test/core/seive.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/span.cpp b/cpp/test/core/span.cpp
similarity index 99%
rename from cpp/test/span.cpp
rename to cpp/test/core/span.cpp
index f8d9345a12..1a21b5ff47 100644
--- a/cpp/test/span.cpp
+++ b/cpp/test/core/span.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/span.cu b/cpp/test/core/span.cu
similarity index 99%
rename from cpp/test/span.cu
rename to cpp/test/core/span.cu
index e9af9b857f..f16a18332b 100644
--- a/cpp/test/span.cu
+++ b/cpp/test/core/span.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/test_span.hpp b/cpp/test/core/test_span.hpp
similarity index 99%
rename from cpp/test/test_span.hpp
rename to cpp/test/core/test_span.hpp
index 254c89f91c..27c50e9695 100644
--- a/cpp/test/test_span.hpp
+++ b/cpp/test/core/test_span.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index f3f36b4576..bbd06042c3 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/distance/distance.cuh>
 #include <raft/random/rng.cuh>
@@ -156,7 +156,7 @@ class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataTy
   // memory consumption if we use uint8_t instead of bool.
   rmm::device_uvector<uint8_t> dist_ref;
   rmm::device_uvector<uint8_t> dist;
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 };
 
diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu
index 1f368fbee8..db5555d9c8 100644
--- a/cpp/test/distance/dist_canberra.cu
+++ b/cpp/test/distance/dist_canberra.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu
index 8f506601ca..abad828de7 100644
--- a/cpp/test/distance/dist_chebyshev.cu
+++ b/cpp/test/distance/dist_chebyshev.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu
index 77d770b4d1..0e3f0ee0b5 100644
--- a/cpp/test/distance/dist_correlation.cu
+++ b/cpp/test/distance/dist_correlation.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu
index 900a71e514..9faf7651f7 100644
--- a/cpp/test/distance/dist_cos.cu
+++ b/cpp/test/distance/dist_cos.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu
index 5371b8a3e2..567e279691 100644
--- a/cpp/test/distance/dist_euc_exp.cu
+++ b/cpp/test/distance/dist_euc_exp.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu
index 81e6be7116..311ad190e2 100644
--- a/cpp/test/distance/dist_euc_unexp.cu
+++ b/cpp/test/distance/dist_euc_unexp.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_eucsqrt_exp.cu b/cpp/test/distance/dist_eucsqrt_exp.cu
index c4f2dc80c2..d717158649 100644
--- a/cpp/test/distance/dist_eucsqrt_exp.cu
+++ b/cpp/test/distance/dist_eucsqrt_exp.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu
index 616ce8f729..1eef9fba4e 100644
--- a/cpp/test/distance/dist_hamming.cu
+++ b/cpp/test/distance/dist_hamming.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu
index d6f994aaf6..85a157aa31 100644
--- a/cpp/test/distance/dist_hellinger.cu
+++ b/cpp/test/distance/dist_hellinger.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu
index 43e4f3aa0f..a1e2f9f38c 100644
--- a/cpp/test/distance/dist_jensen_shannon.cu
+++ b/cpp/test/distance/dist_jensen_shannon.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu
index 6a5fe8d7ac..94330d9450 100644
--- a/cpp/test/distance/dist_kl_divergence.cu
+++ b/cpp/test/distance/dist_kl_divergence.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu
index 322fb52d5c..dc6bcf72b7 100644
--- a/cpp/test/distance/dist_l1.cu
+++ b/cpp/test/distance/dist_l1.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu
index 3e0a2ead92..af2661da3a 100644
--- a/cpp/test/distance/dist_minkowski.cu
+++ b/cpp/test/distance/dist_minkowski.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu
index e92a01c70a..3c5124c31f 100644
--- a/cpp/test/distance/dist_russell_rao.cu
+++ b/cpp/test/distance/dist_russell_rao.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "distance_base.cuh"
 
 namespace raft {
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index 19d449c18b..be7b2b1de8 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/common/nvtx.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/mdarray.hpp>
 #include <raft/util/cuda_utils.cuh>
@@ -51,7 +52,7 @@ __global__ void naiveDistanceKernel(DataType* dist,
   }
   if (type == raft::distance::DistanceType::L2SqrtExpanded ||
       type == raft::distance::DistanceType::L2SqrtUnexpanded)
-    acc = raft::mySqrt(acc);
+    acc = raft::sqrt(acc);
   int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
@@ -78,9 +79,9 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist,
     auto b    = y[yidx];
     auto diff = (a > b) ? (a - b) : (b - a);
     if (type == raft::distance::DistanceType::Linf) {
-      acc = raft::myMax(acc, diff);
+      acc = raft::max(acc, diff);
     } else if (type == raft::distance::DistanceType::Canberra) {
-      const auto add = raft::myAbs(a) + raft::myAbs(b);
+      const auto add = raft::abs(a) + raft::abs(b);
       // deal with potential for 0 in denominator by
       // forcing 1/0 instead
       acc += ((add != 0) * diff / (add + (add == 0)));
@@ -118,7 +119,7 @@ __global__ void naiveCosineDistanceKernel(
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Use 1.0 - (cosine similarity) to calc the distance
-  dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
+  dist[outidx] = (DataType)1.0 - acc_ab / (raft::sqrt(acc_a) * raft::sqrt(acc_b));
 }
 
 template <typename DataType>
@@ -136,7 +137,7 @@ __global__ void naiveHellingerDistanceKernel(
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
     auto a   = x[xidx];
     auto b   = y[yidx];
-    acc_ab += raft::mySqrt(a) * raft::mySqrt(b);
+    acc_ab += raft::sqrt(a) * raft::sqrt(b);
   }
 
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
@@ -144,7 +145,7 @@ __global__ void naiveHellingerDistanceKernel(
   // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
   acc_ab         = 1 - acc_ab;
   auto rectifier = (!signbit(acc_ab));
-  dist[outidx]   = raft::mySqrt(rectifier * acc_ab);
+  dist[outidx]   = raft::sqrt(rectifier * acc_ab);
 }
 
 template <typename DataType>
@@ -166,11 +167,11 @@ __global__ void naiveLpUnexpDistanceKernel(DataType* dist,
     int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
     auto a    = x[xidx];
     auto b    = y[yidx];
-    auto diff = raft::L1Op<DataType>()(a - b);
-    acc += raft::myPow(diff, p);
+    auto diff = raft::abs(a - b);
+    acc += raft::pow(diff, p);
   }
   auto one_over_p = 1 / p;
-  acc             = raft::myPow(acc, one_over_p);
+  acc             = raft::pow(acc, one_over_p);
   int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx]    = acc;
 }
@@ -221,7 +222,7 @@ __global__ void naiveJensenShannonDistanceKernel(
 
     acc += (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero)));
   }
-  acc          = raft::mySqrt(0.5f * acc);
+  acc          = raft::sqrt(0.5f * acc);
   int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
@@ -296,7 +297,7 @@ __global__ void naiveCorrelationDistanceKernel(
   auto Q_denom = k * a_sq_norm - (a_norm * a_norm);
   auto R_denom = k * b_sq_norm - (b_norm * b_norm);
 
-  acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
+  acc = 1 - (numer / raft::sqrt(Q_denom * R_denom));
 
   int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
@@ -396,7 +397,7 @@ void distanceLauncher(DataType* x,
                       cudaStream_t stream,
                       DataType metric_arg = 2.0f)
 {
-  raft::handle_t handle(stream);
+  raft::device_resources handle(stream);
 
   auto x_v    = make_device_matrix_view<DataType, int, layout>(x, m, k);
   auto y_v    = make_device_matrix_view<DataType, int, layout>(y, n, k);
@@ -482,7 +483,7 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   DistanceInputs<DataType> params;
@@ -518,10 +519,10 @@ class BigMatrixDistanceTest : public ::testing::Test {
   }
 
  protected:
+  raft::device_resources handle;
   int m = 48000;
   int n = 48000;
   int k = 1;
-  raft::handle_t handle;
   rmm::device_uvector<float> x, dist;
 };
 }  // end namespace distance
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 800f45c7fc..8b9681b9d3 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/kvp.hpp>
 #include <raft/distance/detail/fused_l2_nn.cuh>
@@ -60,7 +60,7 @@ __global__ void naiveKernel(raft::KeyValuePair<int, DataT>* min,
     auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  if (Sqrt) { acc = raft::mySqrt(acc); }
+  if (Sqrt) { acc = raft::sqrt(acc); }
   ReduceOpT redOp;
   typedef cub::WarpReduce<raft::KeyValuePair<int, DataT>> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp[NWARPS];
@@ -158,6 +158,8 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
   }
 
  protected:
+  raft::device_resources handle;
+  cudaStream_t stream;
   Inputs<DataT> params;
   rmm::device_uvector<DataT> x;
   rmm::device_uvector<DataT> y;
@@ -166,8 +168,6 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
   rmm::device_uvector<raft::KeyValuePair<int, DataT>> min;
   rmm::device_uvector<raft::KeyValuePair<int, DataT>> min_ref;
   rmm::device_uvector<char> workspace;
-  raft::handle_t handle;
-  cudaStream_t stream;
 
   virtual void generateGoldenResult()
   {
@@ -208,8 +208,8 @@ struct CompareApproxAbsKVP {
   CompareApproxAbsKVP(T eps_) : eps(eps_) {}
   bool operator()(const KVP& a, const KVP& b) const
   {
-    T diff  = raft::abs(raft::abs(a.value) - raft::abs(b.value));
-    T m     = std::max(raft::abs(a.value), raft::abs(b.value));
+    T diff  = std::abs(std::abs(a.value) - std::abs(b.value));
+    T m     = std::max(std::abs(a.value), std::abs(b.value));
     T ratio = m >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -380,7 +380,7 @@ class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
   void TearDown() override { FusedL2NNTest<DataT, Sqrt>::TearDown(); }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<raft::KeyValuePair<int, DataT>> min1;
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
index 168e3d93f8..a2f0e2385c 100644
--- a/cpp/test/distance/gram.cu
+++ b/cpp/test/distance/gram.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 #include <raft/distance/specializations.cuh>
 #endif
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <memory>
@@ -170,7 +170,7 @@ class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
       gram_host.data(), gram.data(), gram.size(), raft::CompareApprox<math_t>(1e-6f)));
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
   GramMatrixInputs params;
 
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
deleted file mode 100644
index 2ebc38d03a..0000000000
--- a/cpp/test/handle.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstddef>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <memory>
-#include <raft/core/handle.hpp>
-
-namespace raft {
-
-TEST(Raft, HandleDefault)
-{
-  handle_t h;
-  ASSERT_EQ(0, h.get_device());
-  ASSERT_EQ(rmm::cuda_stream_per_thread, h.get_stream());
-  ASSERT_NE(nullptr, h.get_cublas_handle());
-  ASSERT_NE(nullptr, h.get_cusolver_dn_handle());
-  ASSERT_NE(nullptr, h.get_cusolver_sp_handle());
-  ASSERT_NE(nullptr, h.get_cusparse_handle());
-}
-
-TEST(Raft, Handle)
-{
-  // test stream pool creation
-  constexpr std::size_t n_streams = 4;
-  auto stream_pool                = std::make_shared<rmm::cuda_stream_pool>(n_streams);
-  handle_t h(rmm::cuda_stream_default, stream_pool);
-  ASSERT_EQ(n_streams, h.get_stream_pool_size());
-
-  // test non default stream handle
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  rmm::cuda_stream_view stream_view(stream);
-  handle_t handle(stream_view);
-  ASSERT_EQ(stream_view, handle.get_stream());
-  handle.sync_stream(stream);
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-TEST(Raft, GetHandleFromPool)
-{
-  constexpr std::size_t n_streams = 4;
-  auto stream_pool                = std::make_shared<rmm::cuda_stream_pool>(n_streams);
-  handle_t parent(rmm::cuda_stream_default, stream_pool);
-
-  for (std::size_t i = 0; i < n_streams; i++) {
-    auto worker_stream = parent.get_stream_from_stream_pool(i);
-    handle_t child(worker_stream);
-    ASSERT_EQ(parent.get_stream_from_stream_pool(i), child.get_stream());
-  }
-}
-
-}  // namespace raft
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 02b3191c4d..bda87d423c 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -18,7 +18,7 @@
 
 #include <raft/label/classlabels.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index 184ab4922f..5107015652 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #include <gtest/gtest.h>
 #include <raft/label/merge_labels.cuh>
 
-#include "../test_utils.h"
-#include <raft/core/handle.hpp>
+#include "../test_utils.cuh"
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
@@ -69,7 +69,7 @@ class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MergeLabelsInputs<Index_> params;
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 58fd94f343..f26e41456f 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  * Copyright 2020 KETAN DATE & RAKESH NAGI
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -66,7 +66,7 @@ void hungarian_test(int problemsize,
                     weight_t epsilon,
                     bool verbose = false)
 {
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize];
 
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index c73791086b..3836f714cb 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "add.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/add.cuh>
@@ -62,7 +62,7 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   AddInputs<InT, OutT> params;
diff --git a/cpp/test/linalg/axpy.cu b/cpp/test/linalg/axpy.cu
index f6cabae012..5fd7676792 100644
--- a/cpp/test/linalg/axpy.cu
+++ b/cpp/test/linalg/axpy.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #include <raft/linalg/axpy.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -45,7 +45,7 @@ struct AxpyInputs {
 template <typename T, typename IndexType = int>
 class AxpyTest : public ::testing::TestWithParam<AxpyInputs<T>> {
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   AxpyInputs<T, IndexType> params;
   rmm::device_uvector<T> refy;
   rmm::device_uvector<T> y_device_alpha;
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index b92fa09427..9936e665ba 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "binary_op.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -29,15 +30,17 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType, typename OutType>
-void binaryOpLaunch(
-  const raft::handle_t& handle, OutType* out, const InType* in1, const InType* in2, IdxType len)
+void binaryOpLaunch(const raft::device_resources& handle,
+                    OutType* out,       // output buffer, viewed as a vector of len elements
+                    const InType* in1,  // first operand, viewed as a vector of len elements
+                    const InType* in2,  // second operand, viewed as a vector of len elements
+                    IdxType len)
 {
   auto out_view = raft::make_device_vector_view(out, len);
   auto in1_view = raft::make_device_vector_view(in1, len);
   auto in2_view = raft::make_device_vector_view(in2, len);
 
-  binary_op(
-    handle, in1_view, in2_view, out_view, [] __device__(InType a, InType b) { return a + b; });
+  binary_op(handle, in1_view, in2_view, out_view, raft::add_op{});  // (a, b) -> a + b
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
@@ -66,7 +69,7 @@ class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxT
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   BinaryOpInputs<InType, IdxType, OutType> params;
@@ -139,15 +142,10 @@ class BinaryOpAlignment : public ::testing::Test {
     RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream));
     raft::linalg::binaryOp(
-      z.data() + 9,
-      x.data() + 137,
-      y.data() + 19,
-      256,
-      [] __device__(math_t x, math_t y) { return x + y; },
-      handle.get_stream());
+      z.data() + 9, x.data() + 137, y.data() + 19, 256, raft::add_op{}, handle.get_stream());
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
 };
 typedef ::testing::Types<float, double> FloatTypes;
 TYPED_TEST_CASE(BinaryOpAlignment, FloatTypes);
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh
index 62820ddb97..8b0bc609d2 100644
--- a/cpp/test/linalg/binary_op.cuh
+++ b/cpp/test/linalg/binary_op.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/linalg/binary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 134b7645ff..fba885957f 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,14 @@
  */
 
 #include <gtest/gtest.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/cholesky_r1_update.cuh>
 #include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <sstream>
 #include <vector>
 namespace raft {
@@ -115,7 +115,7 @@ class CholeskyR1Test : public ::testing::Test {
     }
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   cusolverDnHandle_t solver_handle;
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index 791537b430..1309d4c9c1 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "reduce.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/coalesced_reduction.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -42,13 +43,16 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T>
-void coalescedReductionLaunch(
-  const raft::handle_t& handle, T* dots, const T* data, int cols, int rows, bool inplace = false)
+void coalescedReductionLaunch(const raft::device_resources& handle,
+                              T* dots,        // output, viewed as a vector of rows elements
+                              const T* data,  // input, viewed as a rows x cols matrix
+                              int cols,
+                              int rows,
+                              bool inplace = false)
 {
   auto dots_view = raft::make_device_vector_view(dots, rows);
   auto data_view = raft::make_device_matrix_view(data, rows, cols);
-  coalesced_reduction(
-    handle, data_view, dots_view, (T)0, inplace, [] __device__(T in, int i) { return in * in; });
+  coalesced_reduction(handle, data_view, dots_view, (T)0, inplace, raft::sq_op{});  // x -> x * x
 }
 
 template <typename T>
@@ -80,9 +84,9 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
                             stream,
                             T(0),
                             false,
-                            raft::L2Op<T, int>{},
-                            raft::Sum<T>{},
-                            raft::Nop<T>{});
+                            raft::sq_op{},
+                            raft::add_op{},
+                            raft::identity_op{});
     naiveCoalescedReduction(dots_exp.data(),
                             data.data(),
                             cols,
@@ -90,9 +94,9 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
                             stream,
                             T(0),
                             true,
-                            raft::L2Op<T, int>{},
-                            raft::Sum<T>{},
-                            raft::Nop<T>{});
+                            raft::sq_op{},
+                            raft::add_op{},
+                            raft::identity_op{});
 
     coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows);
     coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows, true);
@@ -101,7 +105,7 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   coalescedReductionInputs<T> params;
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 4e2e5cdba7..6188e891d5 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "unary_op.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/divide.cuh>
@@ -67,7 +67,7 @@ class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   UnaryOpInputs<T> params;
diff --git a/cpp/test/linalg/dot.cu b/cpp/test/linalg/dot.cu
index b5007aea32..8b8ca374d7 100644
--- a/cpp/test/linalg/dot.cu
+++ b/cpp/test/linalg/dot.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #include <raft/linalg/dot.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -57,7 +57,7 @@ class DotTest : public ::testing::TestWithParam<DotInputs<T>> {
   {
     params = ::testing::TestWithParam<DotInputs<T>>::GetParam();
 
-    raft::handle_t handle;
+    raft::device_resources handle;
     cudaStream_t stream = handle.get_stream();
 
     raft::random::RngState r(params.seed);
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index a913b14fcb..5229e99d20 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/eig.cuh>
 #include <raft/random/rng.cuh>
@@ -141,7 +141,7 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   EigInputs<T> params;
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 9d57c4fa0a..24e8e83832 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #if CUDART_VERSION >= 10010
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/eig.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -99,7 +99,7 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   EigSelInputs<T> params;
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/linalg/eigen_solvers.cu
similarity index 95%
rename from cpp/test/eigen_solvers.cu
rename to cpp/test/linalg/eigen_solvers.cu
index 68b431b894..1f29d7e275 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/linalg/eigen_solvers.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <raft/common/nvtx.hpp>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/spectral/eigen_solvers.cuh>
 #include <raft/spectral/partition.cuh>
 
@@ -35,7 +35,7 @@ TEST(Raft, EigenSolvers)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
+  raft::device_resources h;
   ASSERT_EQ(0,
             h.
 
@@ -81,7 +81,7 @@ TEST(Raft, SpectralSolvers)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
+  raft::device_resources h;
   ASSERT_EQ(0,
             h.
 
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index 07ded5ec79..d8c72991c3 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/eltwise.cuh>
 #include <raft/random/rng.cuh>
@@ -80,7 +80,7 @@ class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   ScalarMultiplyInputs<T> params;
@@ -168,7 +168,7 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   EltwiseAddInputs<T> params;
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu
index dbe10ab4cc..47b7e22d5d 100644
--- a/cpp/test/linalg/gemm_layout.cu
+++ b/cpp/test/linalg/gemm_layout.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/gemm.cuh>
 #include <raft/random/rng.cuh>
@@ -63,7 +63,7 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
   {
     params = ::testing::TestWithParam<GemmLayoutInputs<T>>::GetParam();
 
-    raft::handle_t handle;
+    raft::device_resources handle;
     cudaStream_t stream = handle.get_stream();
 
     raft::random::RngState r(params.seed);
diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu
index 2bd9abc200..b4f338fdd1 100644
--- a/cpp/test/linalg/gemv.cu
+++ b/cpp/test/linalg/gemv.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/gemv.cuh>
 #include <raft/random/rng.cuh>
@@ -85,7 +85,7 @@ class GemvTest : public ::testing::TestWithParam<GemvInputs<T>> {
   {
     params = ::testing::TestWithParam<GemvInputs<T>>::GetParam();
 
-    raft::handle_t handle;
+    raft::device_resources handle;
     cudaStream_t stream = handle.get_stream();
 
     raft::random::RngState r(params.seed);
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 95a2aff130..5b52374789 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
+#include "unary_op.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/eltwise.cuh>
 #include <raft/linalg/map.cuh>
@@ -33,7 +34,7 @@ void mapLaunch(OutType* out,
                IdxType len,
                cudaStream_t stream)
 {
-  raft::handle_t handle{stream};
+  raft::device_resources handle{stream};
   auto out_view = raft::make_device_vector_view(out, len);
   auto in1_view = raft::make_device_vector_view(in1, len);
   map(
@@ -99,7 +100,7 @@ class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutTy
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MapInputs<InType, IdxType, OutType> params;
@@ -107,52 +108,70 @@ class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutTy
   rmm::device_uvector<OutType> out_ref, out;
 };
 
+// Parameterized fixture: fills out_ref via naiveScale and out via map_offset for comparison.
+template <typename OutType, typename IdxType>
+class MapOffsetTest : public ::testing::TestWithParam<MapInputs<OutType, IdxType, OutType>> {
+ public:
+  MapOffsetTest()
+    : stream(handle.get_stream()),  // initializers follow member declaration order
+      params(::testing::TestWithParam<MapInputs<OutType, IdxType, OutType>>::GetParam()),
+      out_ref(params.len, stream),
+      out(params.len, stream)
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    IdxType len    = params.len;
+    OutType scalar = params.scalar;
+    naiveScale(out_ref.data(), (OutType*)nullptr, scalar, len, stream);  // reference, no input
+    // Output under test: map_offset writes into out using the composed cast/multiply operator.
+    auto out_view = raft::make_device_vector_view(out.data(), len);
+    map_offset(handle,
+               out_view,
+               raft::compose_op(raft::cast_op<OutType>(), raft::mul_const_op<OutType>(scalar)));
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  raft::device_resources handle;
+  cudaStream_t stream;
+
+  MapInputs<OutType, IdxType, OutType> params;
+  rmm::device_uvector<OutType> out_ref, out;
+};
+// Defines typedef test_name, a TEST_P comparing out_ref to out, and its MapTests instantiation.
+#define MAP_TEST(test_type, test_name, inputs)                       \
+  typedef RAFT_DEPAREN(test_type) test_name;                         \
+  TEST_P(test_name, Result)                                          \
+  {                                                                  \
+    ASSERT_TRUE(devArrMatch(this->out_ref.data(),                    \
+                            this->out.data(),                        \
+                            this->params.len,                        \
+                            CompareApprox(this->params.tolerance))); \
+  }                                                                  \
+  INSTANTIATE_TEST_SUITE_P(MapTests, test_name, ::testing::ValuesIn(inputs))
+
 const std::vector<MapInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}};
-typedef MapTest<float, int> MapTestF_i32;
-TEST_P(MapTestF_i32, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32));
+MAP_TEST((MapTest<float, int>), MapTestF_i32, inputsf_i32);
+MAP_TEST((MapOffsetTest<float, int>), MapOffsetTestF_i32, inputsf_i32);
 
 const std::vector<MapInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}};
-typedef MapTest<float, size_t> MapTestF_i64;
-TEST_P(MapTestF_i64, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64));
+MAP_TEST((MapTest<float, size_t>), MapTestF_i64, inputsf_i64);
+MAP_TEST((MapOffsetTest<float, size_t>), MapOffsetTestF_i64, inputsf_i64);
 
 const std::vector<MapInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL, 5.9}};
-typedef MapTest<float, int, double> MapTestF_i32_D;
-TEST_P(MapTestF_i32_D, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
+MAP_TEST((MapTest<float, int, double>), MapTestF_i32_D, inputsf_i32_d);
 
 const std::vector<MapInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}};
-typedef MapTest<double, int> MapTestD_i32;
-TEST_P(MapTestD_i32, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32));
+MAP_TEST((MapTest<double, int>), MapTestD_i32, inputsd_i32);
+MAP_TEST((MapOffsetTest<double, int>), MapOffsetTestD_i32, inputsd_i32);
 
 const std::vector<MapInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL, 5.2}};
-typedef MapTest<double, size_t> MapTestD_i64;
-TEST_P(MapTestD_i64, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64));
+MAP_TEST((MapTest<double, size_t>), MapTestD_i64, inputsd_i64);
+MAP_TEST((MapOffsetTest<double, size_t>), MapOffsetTestD_i64, inputsd_i64);
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index adf784f601..ae5058ef3e 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <limits>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
@@ -63,9 +65,8 @@ template <typename InType, typename OutType>
 void mapReduceLaunch(
   OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream)
 {
-  auto op = [] __device__(InType in) { return in; };
-  naiveMapReduce(out_ref, in, len, op, stream);
-  mapThenSumReduce(out, len, op, 0, in);
+  naiveMapReduce(out_ref, in, len, raft::identity_op{}, stream);
+  mapThenSumReduce(out, len, raft::identity_op{}, 0, in);
 }
 
 template <typename InType, typename OutType>
@@ -92,7 +93,7 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MapReduceInputs<InType> params;
@@ -150,29 +151,27 @@ class MapGenericReduceTest : public ::testing::Test {
 
   void testMin()
   {
-    auto op          = [] __device__(InType in) { return in; };
     OutType neutral  = std::numeric_limits<InType>::max();
     auto output_view = raft::make_device_scalar_view(output.data());
     auto input_view  = raft::make_device_vector_view<const InType>(
       input.data(), static_cast<std::uint32_t>(input.size()));
-    map_reduce(handle, input_view, output_view, neutral, op, cub::Min());
+    map_reduce(handle, input_view, output_view, neutral, raft::identity_op{}, cub::Min());
     EXPECT_TRUE(raft::devArrMatch(
       OutType(1), output.data(), 1, raft::Compare<OutType>(), handle.get_stream()));
   }
   void testMax()
   {
-    auto op          = [] __device__(InType in) { return in; };
-    OutType neutral  = std::numeric_limits<InType>::min();
+    OutType neutral  = std::numeric_limits<OutType>::lowest();  // min() is > 0 for floats
     auto output_view = raft::make_device_scalar_view(output.data());
     auto input_view  = raft::make_device_vector_view<const InType>(
       input.data(), static_cast<std::uint32_t>(input.size()));
-    map_reduce(handle, input_view, output_view, neutral, op, cub::Max());
+    map_reduce(handle, input_view, output_view, neutral, raft::identity_op{}, cub::Max());
     EXPECT_TRUE(raft::devArrMatch(
       OutType(5), output.data(), 1, raft::Compare<OutType>(), handle.get_stream()));
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   int n = 1237;
diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/test/linalg/matrix_vector.cu
index f103b5918b..602d01f60c 100644
--- a/cpp/test/linalg/matrix_vector.cu
+++ b/cpp/test/linalg/matrix_vector.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "matrix_vector_op.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/matrix_vector.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 namespace raft {
@@ -44,7 +46,7 @@ template <typename T, typename IdxType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T, typename IdxType>
-void matrix_vector_op_launch(const raft::handle_t& handle,
+void matrix_vector_op_launch(const raft::device_resources& handle,
                              T* in,
                              const T* vec1,
                              IdxType D,
@@ -96,7 +98,7 @@ void matrix_vector_op_launch(const raft::handle_t& handle,
 }
 
 template <typename T, typename IdxType>
-void naive_matrix_vector_op_launch(const raft::handle_t& handle,
+void naive_matrix_vector_op_launch(const raft::device_resources& handle,
                                    T* in,
                                    const T* vec1,
                                    IdxType D,
@@ -113,34 +115,25 @@ void naive_matrix_vector_op_launch(const raft::handle_t& handle,
       return mat_element;
     }
   };
-  auto operation_div = [] __device__(T mat_element, T vec_element) {
-    return mat_element / vec_element;
-  };
   auto operation_bin_div_skip_zero = [] __device__(T mat_element, T vec_element) {
-    if (raft::myAbs(vec_element) < T(1e-10))
+    if (raft::abs(vec_element) < T(1e-10))
       return T(0);
     else
       return mat_element / vec_element;
   };
-  auto operation_bin_add = [] __device__(T mat_element, T vec_element) {
-    return mat_element + vec_element;
-  };
-  auto operation_bin_sub = [] __device__(T mat_element, T vec_element) {
-    return mat_element - vec_element;
-  };
 
   if (operation_type == 0) {
     naiveMatVec(
       in, in, vec1, D, N, row_major, bcast_along_rows, operation_bin_mult_skip_zero, stream);
   } else if (operation_type == 1) {
-    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, operation_div, stream);
+    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, raft::div_op{}, stream);
   } else if (operation_type == 2) {
     naiveMatVec(
       in, in, vec1, D, N, row_major, bcast_along_rows, operation_bin_div_skip_zero, stream);
   } else if (operation_type == 3) {
-    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, operation_bin_add, stream);
+    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, raft::add_op{}, stream);
   } else if (operation_type == 4) {
-    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, operation_bin_sub, stream);
+    naiveMatVec(in, in, vec1, D, N, row_major, bcast_along_rows, raft::sub_op{}, stream);
   } else {
     THROW("Unknown operation type '%d'!", (int)operation_type);
   }
@@ -190,7 +183,7 @@ class MatrixVectorTest : public ::testing::TestWithParam<MatrixVectorInputs<T, I
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MatrixVectorInputs<T, IdxType> params;
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 1c96c3fc74..5ba178e212 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "matrix_vector_op.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/matrix_vector_op.cuh>
@@ -41,7 +41,10 @@ template <typename IdxType>
 }
 
 template <typename T, typename LenT>
-inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rng, T* ptr, LenT len)
+inline void gen_uniform(const raft::device_resources& handle,
+                        raft::random::RngState& rng,
+                        T* ptr,
+                        LenT len)
 {
   if constexpr (std::is_integral_v<T>) {
     raft::random::uniformInt(handle, rng, ptr, len, (T)0, (T)100);
@@ -54,7 +57,7 @@ inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rn
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename OpT, typename MatT, typename IdxType, typename Vec1T, typename Vec2T>
-void matrixVectorOpLaunch(const raft::handle_t& handle,
+void matrixVectorOpLaunch(const raft::device_resources& handle,
                           MatT* out,
                           const MatT* in,
                           const Vec1T* vec1,
@@ -156,7 +159,7 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MatVecOpInputs<IdxType> params;
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh
index 602d05d153..cf316ef111 100644
--- a/cpp/test/linalg/matrix_vector_op.cuh
+++ b/cpp/test/linalg/matrix_vector_op.cuh
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
diff --git a/cpp/test/linalg/mean_squared_error.cu b/cpp/test/linalg/mean_squared_error.cu
index 795f831417..aa1c314e68 100644
--- a/cpp/test/linalg/mean_squared_error.cu
+++ b/cpp/test/linalg/mean_squared_error.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #include <raft/linalg/mean_squared_error.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -49,7 +49,7 @@ class MeanSquaredErrorTest : public ::testing::TestWithParam<MeanSquaredErrorInp
  protected:
   MeanSquaredErrorInputs<T> params;
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   rmm::device_scalar<T> output;
   rmm::device_scalar<T> refoutput;
 
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 1d6446c5c0..b8af7515e0 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "unary_op.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/multiply.cuh>
@@ -52,7 +52,7 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   UnaryOpInputs<T> params;
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index f0b8d3bb55..6dfeced6e0 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
 
@@ -54,10 +56,10 @@ __global__ void naiveRowNormKernel(
       if (type == L2Norm) {
         acc += data[rowStart * D + i] * data[rowStart * D + i];
       } else {
-        acc += raft::myAbs(data[rowStart * D + i]);
+        acc += raft::abs(data[rowStart * D + i]);
       }
     }
-    dots[rowStart] = do_sqrt ? raft::mySqrt(acc) : acc;
+    dots[rowStart] = do_sqrt ? raft::sqrt(acc) : acc;
   }
 }
 
@@ -95,11 +97,10 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T, IdxT>> {
     auto input_col_major = raft::make_device_matrix_view<const T, IdxT, raft::col_major>(
       data.data(), params.rows, params.cols);
     if (params.do_sqrt) {
-      auto fin_op = [] __device__(const T in) { return raft::mySqrt(in); };
       if (params.rowMajor) {
-        norm(handle, input_row_major, output_view, params.type, Apply::ALONG_ROWS, fin_op);
+        norm(handle, input_row_major, output_view, params.type, Apply::ALONG_ROWS, raft::sqrt_op{});
       } else {
-        norm(handle, input_col_major, output_view, params.type, Apply::ALONG_ROWS, fin_op);
+        norm(handle, input_col_major, output_view, params.type, Apply::ALONG_ROWS, raft::sqrt_op{});
       }
     } else {
       if (params.rowMajor) {
@@ -112,7 +113,7 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T, IdxT>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   NormInputs<T, IdxT> params;
@@ -130,10 +131,10 @@ __global__ void naiveColNormKernel(
   Type acc = 0;
   for (IdxT i = 0; i < N; i++) {
     Type v = data[colID + i * D];
-    acc += type == L2Norm ? v * v : raft::myAbs(v);
+    acc += type == L2Norm ? v * v : raft::abs(v);
   }
 
-  dots[colID] = do_sqrt ? raft::mySqrt(acc) : acc;
+  dots[colID] = do_sqrt ? raft::sqrt(acc) : acc;
 }
 
 template <typename Type, typename IdxT>
@@ -171,11 +172,12 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T, IdxT>> {
     auto input_col_major = raft::make_device_matrix_view<const T, IdxT, raft::col_major>(
       data.data(), params.rows, params.cols);
     if (params.do_sqrt) {
-      auto fin_op = [] __device__(const T in) { return raft::mySqrt(in); };
       if (params.rowMajor) {
-        norm(handle, input_row_major, output_view, params.type, Apply::ALONG_COLUMNS, fin_op);
+        norm(
+          handle, input_row_major, output_view, params.type, Apply::ALONG_COLUMNS, raft::sqrt_op{});
       } else {
-        norm(handle, input_col_major, output_view, params.type, Apply::ALONG_COLUMNS, fin_op);
+        norm(
+          handle, input_col_major, output_view, params.type, Apply::ALONG_COLUMNS, raft::sqrt_op{});
       }
     } else {
       if (params.rowMajor) {
@@ -188,7 +190,7 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T, IdxT>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   NormInputs<T, IdxT> params;
diff --git a/cpp/test/linalg/normalize.cu b/cpp/test/linalg/normalize.cu
index cb949b6a5d..24f83a0d0a 100644
--- a/cpp/test/linalg/normalize.cu
+++ b/cpp/test/linalg/normalize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/linalg/normalize.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
 
@@ -48,20 +50,13 @@ void rowNormalizeRef(
 {
   rmm::device_uvector<T> norm(rows, stream);
   if (norm_type == raft::linalg::L2Norm) {
-    raft::linalg::rowNorm(norm.data(), in, cols, rows, norm_type, true, stream, raft::SqrtOp<T>());
+    raft::linalg::rowNorm(norm.data(), in, cols, rows, norm_type, true, stream, raft::sqrt_op());
   } else {
-    raft::linalg::rowNorm(norm.data(), in, cols, rows, norm_type, true, stream, raft::Nop<T>());
+    raft::linalg::rowNorm(
+      norm.data(), in, cols, rows, norm_type, true, stream, raft::identity_op());
   }
   raft::linalg::matrixVectorOp(
-    out,
-    in,
-    norm.data(),
-    cols,
-    rows,
-    true,
-    false,
-    [] __device__(T a, T b) { return a / b; },
-    stream);
+    out, in, norm.data(), cols, rows, true, false, raft::div_op{}, stream);
 }
 
 template <typename T, typename IdxT>
@@ -95,7 +90,7 @@ class RowNormalizeTest : public ::testing::TestWithParam<RowNormalizeInputs<T, I
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RowNormalizeInputs<T, IdxT> params;
diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
index bdab49d5c8..20b1fa0e45 100644
--- a/cpp/test/linalg/power.cu
+++ b/cpp/test/linalg/power.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/power.cuh>
 #include <raft/random/rng.cuh>
@@ -27,7 +27,7 @@ template <typename Type>
 __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); }
+  if (idx < len) { out[idx] = raft::pow(in1[idx], in2[idx]); }
 }
 
 template <typename Type>
@@ -43,7 +43,7 @@ template <typename Type>
 __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); }
+  if (idx < len) { out[idx] = raft::pow(in1[idx], in2); }
 }
 
 template <typename Type>
@@ -113,7 +113,7 @@ class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   PowerInputs<T> params;
   rmm::device_uvector<T> in1, in2, out_ref, out;
   int device_count = 0;
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 00f3810d28..8cdeab5a94 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,11 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "reduce.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/reduce.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -73,7 +74,7 @@ void reduceLaunch(OutType* dots,
   auto input_view_col_major =
     raft::make_device_matrix_view<const InType, IdxType, raft::col_major>(data, rows, cols);
 
-  raft::handle_t handle{stream};
+  raft::device_resources handle{stream};
 
   if (rowMajor) {
     reduce(handle,
@@ -101,9 +102,9 @@ void reduceLaunch(OutType* dots,
 template <typename InType,
           typename OutType,
           typename IdxType,
-          typename MainLambda   = raft::L2Op<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::SqrtOp<InType>>
+          typename MainLambda   = raft::sq_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::sqrt_op>
 class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType, IdxType>> {
  public:
   ReduceTest()
@@ -183,7 +184,7 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType,
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   ReduceInputs<InType, OutType, IdxType> params;
@@ -301,7 +302,7 @@ REDUCE_TEST((ReduceTest<short,
                         int,
                         ValueToKVP<short, int>,
                         ArgMaxOp<int, short>,
-                        raft::Nop<raft::KeyValuePair<int, short>, int>>),
+                        raft::identity_op>),
             ReduceTestKVPISI32,
             inputs_kvpis_i32);
 REDUCE_TEST((ReduceTest<float,
@@ -309,7 +310,7 @@ REDUCE_TEST((ReduceTest<float,
                         int,
                         ValueToKVP<float, int>,
                         ArgMaxOp<int, float>,
-                        raft::Nop<raft::KeyValuePair<int, float>, int>>),
+                        raft::identity_op>),
             ReduceTestKVPIFI32,
             inputs_kvpif_i32);
 REDUCE_TEST((ReduceTest<double,
@@ -317,7 +318,7 @@ REDUCE_TEST((ReduceTest<double,
                         int,
                         ValueToKVP<double, int>,
                         ArgMaxOp<int, double>,
-                        raft::Nop<raft::KeyValuePair<int, double>, int>>),
+                        raft::identity_op>),
             ReduceTestKVPIDI32,
             inputs_kvpid_i32);
 
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index 0dcffd3f41..17e91ce202 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cublas_v2.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -61,9 +62,9 @@ __global__ void naiveCoalescedReductionKernel(OutType* dots,
 template <typename InType,
           typename OutType,
           typename IdxType,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<InType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void naiveCoalescedReduction(OutType* dots,
                              const InType* data,
                              IdxType D,
@@ -71,9 +72,9 @@ void naiveCoalescedReduction(OutType* dots,
                              cudaStream_t stream,
                              OutType init,
                              bool inplace           = false,
-                             MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                             ReduceLambda reduce_op = raft::Sum<OutType>(),
-                             FinalLambda fin_op     = raft::Nop<InType>())
+                             MainLambda main_op     = raft::identity_op(),
+                             ReduceLambda reduce_op = raft::add_op(),
+                             FinalLambda fin_op     = raft::identity_op())
 {
   static const IdxType TPB = 64;
   IdxType nblks            = raft::ceildiv(N, TPB);
@@ -115,9 +116,9 @@ __global__ void naiveStridedReductionKernel(OutType* dots,
 template <typename InType,
           typename OutType,
           typename IdxType,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<InType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void naiveStridedReduction(OutType* dots,
                            const InType* data,
                            IdxType D,
@@ -125,9 +126,9 @@ void naiveStridedReduction(OutType* dots,
                            cudaStream_t stream,
                            OutType init,
                            bool inplace           = false,
-                           MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                           ReduceLambda reduce_op = raft::Sum<OutType>(),
-                           FinalLambda fin_op     = raft::Nop<InType>())
+                           MainLambda main_op     = raft::identity_op(),
+                           ReduceLambda reduce_op = raft::add_op(),
+                           FinalLambda fin_op     = raft::identity_op())
 {
   static const IdxType TPB = 64;
   IdxType nblks            = raft::ceildiv(D, TPB);
@@ -139,9 +140,9 @@ void naiveStridedReduction(OutType* dots,
 template <typename InType,
           typename OutType,
           typename IdxType,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
-          typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<InType>>
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
 void naiveReduction(OutType* dots,
                     const InType* data,
                     IdxType D,
@@ -151,9 +152,9 @@ void naiveReduction(OutType* dots,
                     cudaStream_t stream,
                     OutType init,
                     bool inplace           = false,
-                    MainLambda main_op     = raft::Nop<InType, IdxType>(),
-                    ReduceLambda reduce_op = raft::Sum<OutType>(),
-                    FinalLambda fin_op     = raft::Nop<InType>())
+                    MainLambda main_op     = raft::identity_op(),
+                    ReduceLambda reduce_op = raft::add_op(),
+                    FinalLambda fin_op     = raft::identity_op())
 {
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream, init, inplace, main_op, reduce_op, fin_op);
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu
index 63afbe2fed..037a6a86e0 100644
--- a/cpp/test/linalg/reduce_cols_by_key.cu
+++ b/cpp/test/linalg/reduce_cols_by_key.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,33 +14,34 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/interruptible.hpp>
 #include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/itertools.hpp>
 
 namespace raft {
 namespace linalg {
 
-template <typename T>
+template <typename T, typename KeyT, typename IdxT>
 void naiveReduceColsByKey(const T* in,
-                          const uint32_t* keys,
+                          const KeyT* keys,
                           T* out_ref,
-                          uint32_t nrows,
-                          uint32_t ncols,
-                          uint32_t nkeys,
+                          IdxT nrows,
+                          IdxT ncols,
+                          IdxT nkeys,
                           cudaStream_t stream)
 {
-  std::vector<uint32_t> h_keys(ncols, 0u);
+  std::vector<KeyT> h_keys(ncols, 0u);
   raft::copy(&(h_keys[0]), keys, ncols, stream);
   std::vector<T> h_in(nrows * ncols);
   raft::copy(&(h_in[0]), in, nrows * ncols, stream);
   raft::interruptible::synchronize(stream);
   std::vector<T> out(nrows * nkeys, T(0));
-  for (uint32_t i = 0; i < nrows; ++i) {
-    for (uint32_t j = 0; j < ncols; ++j) {
+  for (IdxT i = 0; i < nrows; ++i) {
+    for (IdxT j = 0; j < ncols; ++j) {
       out[i * nkeys + h_keys[j]] += h_in[i * ncols + j];
     }
   }
@@ -48,31 +49,33 @@ void naiveReduceColsByKey(const T* in,
   raft::interruptible::synchronize(stream);
 }
 
-template <typename T>
+template <typename T, typename IdxT>
 struct ReduceColsInputs {
   T tolerance;
-  uint32_t rows;
-  uint32_t cols;
-  uint32_t nkeys;
+  IdxT rows;
+  IdxT cols;
+  IdxT nkeys;
   unsigned long long int seed;
 };
 
-template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs<T>& dims)
+template <typename T, typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs<T, IdxT>& p)
 {
+  os << "{" << p.tolerance << "," << p.rows << "," << p.cols << "," << p.nkeys << "," << p.seed
+     << "}";
   return os;
 }
 
-template <typename T>
-class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T>> {
+template <typename T, typename KeyT, typename IdxT>
+class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T, IdxT>> {
  protected:
   ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {}
 
   void SetUp() override
   {
-    params = ::testing::TestWithParam<ReduceColsInputs<T>>::GetParam();
+    params = ::testing::TestWithParam<ReduceColsInputs<T, IdxT>>::GetParam();
     raft::random::RngState r(params.seed);
-    raft::handle_t handle;
+    raft::device_resources handle;
     auto stream = handle.get_stream();
     auto nrows  = params.rows;
     auto ncols  = params.cols;
@@ -82,45 +85,53 @@ class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T>> {
     out_ref.resize(nrows * nkeys, stream);
     out.resize(nrows * nkeys, stream);
     uniform(handle, r, in.data(), nrows * ncols, T(-1.0), T(1.0));
-    uniformInt(handle, r, keys.data(), ncols, 0u, params.nkeys);
+    uniformInt(handle, r, keys.data(), ncols, KeyT{0}, static_cast<KeyT>(params.nkeys));
     naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream);
     auto input_view  = raft::make_device_matrix_view<const T>(in.data(), nrows, ncols);
     auto output_view = raft::make_device_matrix_view(out.data(), nrows, nkeys);
-    auto keys_view   = raft::make_device_vector_view<const uint32_t>(keys.data(), ncols);
+    auto keys_view   = raft::make_device_vector_view<const KeyT>(keys.data(), ncols);
     reduce_cols_by_key(handle, input_view, keys_view, output_view, nkeys);
     raft::interruptible::synchronize(stream);
   }
 
  protected:
   cudaStream_t stream = 0;
-  ReduceColsInputs<T> params;
+  ReduceColsInputs<T, IdxT> params;
   rmm::device_uvector<T> in, out_ref, out;
-  rmm::device_uvector<uint32_t> keys;
+  rmm::device_uvector<KeyT> keys;
 };
 
-const std::vector<ReduceColsInputs<float>> inputsf = {{0.0001f, 128, 32, 6, 1234ULL},
-                                                      {0.0005f, 121, 63, 10, 1234ULL}};
-typedef ReduceColsTest<float> ReduceColsTestF;
-TEST_P(ReduceColsTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                out.data(),
-                                params.rows * params.nkeys,
-                                raft::CompareApprox<float>(params.tolerance)));
-}
-INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf));
+#define RCBK_TEST(test_type, test_name, test_inputs)                       \
+  typedef RAFT_DEPAREN(test_type) test_name;                               \
+  TEST_P(test_name, Result)                                                \
+  {                                                                        \
+    ASSERT_TRUE(raft::devArrMatch(out_ref.data(),                          \
+                                  out.data(),                              \
+                                  params.rows* params.nkeys,               \
+                                  raft::CompareApprox(params.tolerance))); \
+  }                                                                        \
+  INSTANTIATE_TEST_SUITE_P(ReduceColsTests, test_name, ::testing::ValuesIn(test_inputs))
 
-const std::vector<ReduceColsInputs<double>> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL},
-                                                        {0.0000001, 121, 63, 10, 1234ULL}};
-typedef ReduceColsTest<double> ReduceColsTestD;
-TEST_P(ReduceColsTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                out.data(),
-                                params.rows * params.nkeys,
-                                raft::CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestD, ::testing::ValuesIn(inputsd2));
+const std::vector<ReduceColsInputs<float, int>> inputsf_i32 =
+  raft::util::itertools::product<ReduceColsInputs<float, int>>(
+    {0.001f}, {1, 9, 63, 1024}, {1234, 9999, 101010}, {7, 42, 127, 515, 2022}, {1234ULL});
+const std::vector<ReduceColsInputs<double, int>> inputsd_i32 =
+  raft::util::itertools::product<ReduceColsInputs<double, int>>(
+    {0.000001}, {1, 9, 63, 1024}, {1234, 9999, 101010}, {7, 42, 127, 515, 2022}, {1234ULL});
+const std::vector<ReduceColsInputs<float, uint32_t>> inputsf_u32 =
+  raft::util::itertools::product<ReduceColsInputs<float, uint32_t>>({0.001f},
+                                                                    {1u, 9u, 63u, 1024u},
+                                                                    {1234u, 9999u, 101010u},
+                                                                    {7u, 42u, 127u, 515u, 2022u},
+                                                                    {1234ULL});
+const std::vector<ReduceColsInputs<float, int64_t>> inputsf_i64 =
+  raft::util::itertools::product<ReduceColsInputs<float, int64_t>>(
+    {0.001f}, {1, 9, 63, 1024}, {1234, 9999, 101010}, {7, 42, 127, 515, 2022}, {1234ULL});
+
+RCBK_TEST((ReduceColsTest<float, uint32_t, int>), ReduceColsTestFU32I32, inputsf_i32);
+RCBK_TEST((ReduceColsTest<double, uint32_t, int>), ReduceColsTestDU32I32, inputsd_i32);
+RCBK_TEST((ReduceColsTest<float, int, uint32_t>), ReduceColsTestFI32U32, inputsf_u32);
+RCBK_TEST((ReduceColsTest<float, uint32_t, int64_t>), ReduceColsTestFU32I64, inputsf_i64);
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu
index 7b124cb7bb..69bacb0631 100644
--- a/cpp/test/linalg/reduce_rows_by_key.cu
+++ b/cpp/test/linalg/reduce_rows_by_key.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/linalg/reduce_rows_by_key.cuh>
@@ -112,7 +112,7 @@ class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
     rmm::device_uvector<T> weight(0, stream);
     if (params.weighted) {
       weight.resize(nobs, stream);
-      raft::random::RngState r(params.seed, raft::random::GeneratorType::GenPhilox);
+      raft::random::RngState r(params.seed);
       uniform(handle, r, weight.data(), nobs, T(1), params.max_weight);
     }
 
@@ -145,7 +145,7 @@ class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
 
  protected:
   ReduceRowsInputs<T> params;
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
 
   int device_count = 0;
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index f774d59631..ba2572b5a9 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/rsvd.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -64,7 +64,7 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
   void SetUp() override
   {
-    raft::handle_t handle;
+    raft::device_resources handle;
     stream = handle.get_stream();
 
     params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
@@ -272,7 +272,7 @@ TEST_P(RsvdSanityCheckRightVecD, Result)
 typedef RsvdTest<float> RsvdTestSquareMatrixNormF;
 TEST_P(RsvdTestSquareMatrixNormF, Result)
 {
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
                                                 A.data(),
@@ -289,7 +289,7 @@ TEST_P(RsvdTestSquareMatrixNormF, Result)
 typedef RsvdTest<double> RsvdTestSquareMatrixNormD;
 TEST_P(RsvdTestSquareMatrixNormD, Result)
 {
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
                                                 A.data(),
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
index ed57e94914..7ee31da874 100644
--- a/cpp/test/linalg/sqrt.cu
+++ b/cpp/test/linalg/sqrt.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/sqrt.cuh>
 #include <raft/random/rng.cuh>
@@ -27,7 +27,7 @@ template <typename Type>
 __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); }
+  if (idx < len) { out[idx] = raft::sqrt(in1[idx]); }
 }
 
 template <typename Type>
@@ -82,7 +82,7 @@ class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   SqrtInputs<T> params;
   rmm::device_uvector<T> in1, out_ref, out;
   int device_count = 0;
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index 77ca585ea5..c9b32c3585 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,11 +14,13 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "reduce.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/strided_reduction.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 namespace raft {
@@ -35,10 +37,10 @@ template <typename T>
 void stridedReductionLaunch(
   T* dots, const T* data, int cols, int rows, bool inplace, cudaStream_t stream)
 {
-  raft::handle_t handle{stream};
+  raft::device_resources handle{stream};
   auto dots_view = raft::make_device_vector_view(dots, cols);
   auto data_view = raft::make_device_matrix_view(data, rows, cols);
-  strided_reduction(handle, data_view, dots_view, (T)0, inplace, raft::L2Op<T, int>{});
+  strided_reduction(handle, data_view, dots_view, (T)0, inplace, raft::sq_op{});
 }
 
 template <typename T>
@@ -70,9 +72,9 @@ class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInp
                           stream,
                           T(0),
                           false,
-                          raft::L2Op<T, int>{},
-                          raft::Sum<T>{},
-                          raft::Nop<T>{});
+                          raft::sq_op{},
+                          raft::add_op{},
+                          raft::identity_op{});
     naiveStridedReduction(dots_exp.data(),
                           data.data(),
                           cols,
@@ -80,16 +82,16 @@ class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInp
                           stream,
                           T(0),
                           true,
-                          raft::L2Op<T, int>{},
-                          raft::Sum<T>{},
-                          raft::Nop<T>{});
+                          raft::sq_op{},
+                          raft::add_op{},
+                          raft::identity_op{});
     stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, false, stream);
     stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, true, stream);
     handle.sync_stream(stream);
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   stridedReductionInputs<T> params;
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index 3904f9f33f..222e64fc3c 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/subtract.cuh>
 #include <raft/random/rng.cuh>
@@ -108,7 +108,7 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SubtractInputs<T> params;
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index c18417dc9e..9eee0f538e 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/svd.cuh>
 #include <raft/matrix/matrix.cuh>
@@ -98,7 +98,7 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SvdInputs<T> params;
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu
index e172d771cd..3eadae95ae 100644
--- a/cpp/test/linalg/ternary_op.cu
+++ b/cpp/test/linalg/ternary_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/linalg/ternary_op.cuh>
 #include <raft/random/rng.cuh>
@@ -77,7 +77,7 @@ class ternaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<T>> {
 
  protected:
   BinaryOpInputs<T> params;
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
 
   rmm::device_uvector<T> out_add_ref, out_add, out_mul_ref, out_mul;
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index 6a05317f49..9644ee53db 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/linalg/transpose.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -71,7 +71,7 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   TranposeInputs<T> params;
@@ -133,7 +133,7 @@ namespace {
  * @return The transposed matrix.
  */
 template <typename T, typename IndexType, typename LayoutPolicy>
-[[nodiscard]] auto transpose(handle_t const& handle,
+[[nodiscard]] auto transpose(raft::device_resources const& handle,
                              device_matrix_view<T, IndexType, LayoutPolicy> in)
   -> std::enable_if_t<std::is_floating_point_v<T> &&
                         (std::is_same_v<LayoutPolicy, layout_c_contiguous> ||
@@ -158,7 +158,7 @@ template <typename T, typename IndexType, typename LayoutPolicy>
  * @return The transposed matrix.
  */
 template <typename T, typename IndexType>
-[[nodiscard]] auto transpose(handle_t const& handle,
+[[nodiscard]] auto transpose(raft::device_resources const& handle,
                              device_matrix_view<T, IndexType, layout_stride> in)
   -> std::enable_if_t<std::is_floating_point_v<T>, device_matrix<T, IndexType, layout_stride>>
 {
@@ -188,7 +188,7 @@ template <typename T, typename IndexType>
 template <typename T, typename LayoutPolicy>
 void test_transpose_with_mdspan()
 {
-  handle_t handle;
+  raft::device_resources handle;
   auto v = make_device_matrix<T, size_t, LayoutPolicy>(handle, 32, 3);
   T k{0};
   for (size_t i = 0; i < v.extent(0); ++i) {
@@ -223,7 +223,7 @@ namespace {
 template <typename T, typename LayoutPolicy>
 void test_transpose_submatrix()
 {
-  handle_t handle;
+  raft::device_resources handle;
   auto v = make_device_matrix<T, size_t, LayoutPolicy>(handle, 32, 33);
   T k{0};
   size_t row_beg{3}, row_end{13}, col_beg{2}, col_end{11};
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 57b009a0ac..278eac348b 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "unary_op.cuh"
 #include <gtest/gtest.h>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -24,27 +25,6 @@
 namespace raft {
 namespace linalg {
 
-// Or else, we get the following compilation error
-// for an extended __device__ lambda cannot have private or protected access
-// within its class
-template <typename InType, typename IdxType = int, typename OutType = InType>
-void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
-  raft::handle_t handle{stream};
-  auto out_view = raft::make_device_vector_view(out, len);
-  auto in_view  = raft::make_device_vector_view<const InType>(in, len);
-  if (in == nullptr) {
-    auto op = [scalar] __device__(OutType * ptr, IdxType idx) {
-      *ptr = static_cast<OutType>(scalar * idx);
-    };
-
-    write_only_unary_op(handle, out_view, op);
-  } else {
-    auto op = [scalar] __device__(InType in) { return static_cast<OutType>(in * scalar); };
-    unary_op(handle, in_view, out_view, op);
-  }
-}
-
 template <typename InType, typename IdxType, typename OutType = InType>
 class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
  public:
@@ -71,14 +51,18 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
     auto len    = params.len;
     auto scalar = params.scalar;
     naiveScale(out_ref.data(), in.data(), scalar, len, stream);
-    unaryOpLaunch(out.data(), in.data(), scalar, len, stream);
+
+    auto in_view  = raft::make_device_vector_view<const InType>(in.data(), len);
+    auto out_view = raft::make_device_vector_view(out.data(), len);
+    unary_op(handle,
+             in_view,
+             out_view,
+             raft::compose_op(raft::cast_op<OutType>(), raft::mul_const_op<InType>(scalar)));
     handle.sync_stream(stream);
-    ASSERT_TRUE(devArrMatch(
-      out_ref.data(), out.data(), params.len, CompareApprox<OutType>(params.tolerance)));
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   UnaryOpInputs<InType, IdxType, OutType> params;
@@ -86,6 +70,22 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
   rmm::device_uvector<OutType> out_ref, out;
 };
 
+// This free function hosts the extended __device__ lambda; inside DoTest() compilation fails:
+// "The enclosing parent function ("DoTest") for an extended __device__ lambda cannot have
+// private or protected access within its class".
+template <typename InType, typename IdxType, typename OutType>
+void launchWriteOnlyUnaryOp(const raft::device_resources& handle,
+                            OutType* out,
+                            InType scalar,
+                            IdxType len)
+{
+  auto out_view = raft::make_device_vector_view(out, len);
+  auto op       = [scalar] __device__(OutType * ptr, IdxType idx) {
+    *ptr = static_cast<OutType>(scalar * idx);
+  };
+  write_only_unary_op(handle, out_view, op);
+}
+
 template <typename OutType, typename IdxType>
 class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
  protected:
@@ -94,50 +94,46 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
     auto len    = this->params.len;
     auto scalar = this->params.scalar;
     naiveScale(this->out_ref.data(), (OutType*)nullptr, scalar, len, this->stream);
-    unaryOpLaunch(this->out.data(), (OutType*)nullptr, scalar, len, this->stream);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(this->stream));
-    ASSERT_TRUE(devArrMatch(this->out_ref.data(),
-                            this->out.data(),
-                            this->params.len,
-                            CompareApprox<OutType>(this->params.tolerance)));
+
+    launchWriteOnlyUnaryOp(this->handle, this->out.data(), scalar, len);
+    this->handle.sync_stream(this->stream);
   }
 };
 
-#define UNARY_OP_TEST(Name, inputs)  \
-  TEST_P(Name, Result) { DoTest(); } \
-  INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs))
+#define UNARY_OP_TEST(test_type, test_name, inputs)                  \
+  typedef RAFT_DEPAREN(test_type) test_name;                         \
+  TEST_P(test_name, Result)                                          \
+  {                                                                  \
+    DoTest();                                                        \
+    ASSERT_TRUE(devArrMatch(this->out_ref.data(),                    \
+                            this->out.data(),                        \
+                            this->params.len,                        \
+                            CompareApprox(this->params.tolerance))); \
+  }                                                                  \
+  INSTANTIATE_TEST_SUITE_P(UnaryOpTests, test_name, ::testing::ValuesIn(inputs))
 
 const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-typedef UnaryOpTest<float, int> UnaryOpTestF_i32;
-UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32);
-typedef WriteOnlyUnaryOpTest<float, int> WriteOnlyUnaryOpTestF_i32;
-UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i32, inputsf_i32);
+UNARY_OP_TEST((UnaryOpTest<float, int>), UnaryOpTestF_i32, inputsf_i32);
+UNARY_OP_TEST((WriteOnlyUnaryOpTest<float, int>), WriteOnlyUnaryOpTestF_i32, inputsf_i32);
 
 const std::vector<UnaryOpInputs<float, size_t>> inputsf_i64 = {
   {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-typedef UnaryOpTest<float, size_t> UnaryOpTestF_i64;
-UNARY_OP_TEST(UnaryOpTestF_i64, inputsf_i64);
-typedef WriteOnlyUnaryOpTest<float, size_t> WriteOnlyUnaryOpTestF_i64;
-UNARY_OP_TEST(WriteOnlyUnaryOpTestF_i64, inputsf_i64);
+UNARY_OP_TEST((UnaryOpTest<float, size_t>), UnaryOpTestF_i64, inputsf_i64);
+UNARY_OP_TEST((WriteOnlyUnaryOpTest<float, size_t>), WriteOnlyUnaryOpTestF_i64, inputsf_i64);
 
 const std::vector<UnaryOpInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-typedef UnaryOpTest<float, int, double> UnaryOpTestF_i32_D;
-UNARY_OP_TEST(UnaryOpTestF_i32_D, inputsf_i32_d);
+UNARY_OP_TEST((UnaryOpTest<float, int, double>), UnaryOpTestF_i32_D, inputsf_i32_d);
 
 const std::vector<UnaryOpInputs<double, int>> inputsd_i32 = {
   {0.00000001, 1024 * 1024, 2.0, 1234ULL}};
-typedef UnaryOpTest<double, int> UnaryOpTestD_i32;
-UNARY_OP_TEST(UnaryOpTestD_i32, inputsd_i32);
-typedef WriteOnlyUnaryOpTest<double, int> WriteOnlyUnaryOpTestD_i32;
-UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i32, inputsd_i32);
+UNARY_OP_TEST((UnaryOpTest<double, int>), UnaryOpTestD_i32, inputsd_i32);
+UNARY_OP_TEST((WriteOnlyUnaryOpTest<double, int>), WriteOnlyUnaryOpTestD_i32, inputsd_i32);
 
 const std::vector<UnaryOpInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 2.0, 1234ULL}};
-typedef UnaryOpTest<double, size_t> UnaryOpTestD_i64;
-UNARY_OP_TEST(UnaryOpTestD_i64, inputsd_i64);
-typedef WriteOnlyUnaryOpTest<double, size_t> WriteOnlyUnaryOpTestD_i64;
-UNARY_OP_TEST(WriteOnlyUnaryOpTestD_i64, inputsd_i64);
+UNARY_OP_TEST((UnaryOpTest<double, size_t>), UnaryOpTestD_i64, inputsd_i64);
+UNARY_OP_TEST((WriteOnlyUnaryOpTest<double, size_t>), WriteOnlyUnaryOpTestD_i64, inputsd_i64);
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh
index 190d531a9f..9d2bd6f7c9 100644
--- a/cpp/test/linalg/unary_op.cuh
+++ b/cpp/test/linalg/unary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
@@ -29,7 +29,7 @@ __global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar,
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
   if (idx < len) {
     if (in == nullptr) {
-      // used for testing writeOnlyUnaryOp
+      // used for testing write_only_unary_op
       out[idx] = static_cast<OutType>(scalar * idx);
     } else {
       out[idx] = static_cast<OutType>(scalar * in[idx]);
diff --git a/cpp/test/matrix/argmax.cu b/cpp/test/matrix/argmax.cu
index 0219eb1aff..ec27b530d7 100644
--- a/cpp/test/matrix/argmax.cu
+++ b/cpp/test/matrix/argmax.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cstdint>
 #include <gtest/gtest.h>
 #include <raft/core/device_mdarray.hpp>
@@ -67,7 +67,7 @@ class ArgMaxTest : public ::testing::TestWithParam<ArgMaxInputs<T, IdxT>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   ArgMaxInputs<T, IdxT> params;
 
   raft::device_matrix<T, std::uint32_t, row_major> input;
diff --git a/cpp/test/matrix/argmin.cu b/cpp/test/matrix/argmin.cu
index bdf178cd8a..73f6123167 100644
--- a/cpp/test/matrix/argmin.cu
+++ b/cpp/test/matrix/argmin.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cstdint>
 #include <gtest/gtest.h>
 #include <raft/core/device_mdarray.hpp>
@@ -67,7 +67,7 @@ class ArgMinTest : public ::testing::TestWithParam<ArgMinInputs<T, IdxT>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   ArgMinInputs<T, IdxT> params;
 
   raft::device_matrix<T, std::uint32_t, row_major> input;
diff --git a/cpp/test/matrix/columnSort.cu b/cpp/test/matrix/columnSort.cu
index aba1c4e1f0..2292772b1a 100644
--- a/cpp/test/matrix/columnSort.cu
+++ b/cpp/test/matrix/columnSort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <numeric>
@@ -116,10 +116,10 @@ class ColumnSort : public ::testing::TestWithParam<columnSort<T>> {
   }
 
  protected:
+  raft::device_resources handle;
   columnSort<T> params;
   rmm::device_uvector<T> keyIn, keySorted, keySortGolden;
   rmm::device_uvector<int> valueOut, goldenValOut;  // valueOut are indexes
-  raft::handle_t handle;
 };
 
 const std::vector<columnSort<float>> inputsf1 = {{0.000001f, 503, 2000, false},
diff --git a/cpp/test/matrix/diagonal.cu b/cpp/test/matrix/diagonal.cu
index e1ad9e144b..118aa7988f 100644
--- a/cpp/test/matrix/diagonal.cu
+++ b/cpp/test/matrix/diagonal.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -74,7 +74,7 @@ class DiagonalTest : public ::testing::TestWithParam<DiagonalInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   DiagonalInputs<T> params;
 
   int diag_size;
diff --git a/cpp/test/matrix/gather.cu b/cpp/test/matrix/gather.cu
index 4b3244913b..37c2067c77 100644
--- a/cpp/test/matrix/gather.cu
+++ b/cpp/test/matrix/gather.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,54 +14,76 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
+#include <raft/util/itertools.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
 
-template <typename MatrixIteratorT, typename MapIteratorT>
-void naiveGatherImpl(
-  MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out)
+template <bool Conditional,
+          bool MapTransform,
+          typename InputIteratorT,
+          typename MapIteratorT,
+          typename StencilIteratorT,
+          typename UnaryPredicateOp,
+          typename MapTransformOp,
+          typename OutputIteratorT,
+          typename IdxT>
+void naiveGather(InputIteratorT in,
+                 IdxT D,
+                 IdxT N,
+                 MapIteratorT map,
+                 StencilIteratorT stencil,
+                 IdxT map_length,
+                 OutputIteratorT out,
+                 UnaryPredicateOp pred_op,
+                 MapTransformOp transform_op)
 {
-  for (int outRow = 0; outRow < map_length; ++outRow) {
+  for (IdxT outRow = 0; outRow < map_length; ++outRow) {
+    if constexpr (Conditional) {
+      auto stencil_val = stencil[outRow];
+      if (!pred_op(stencil_val)) continue;
+    }
     typename std::iterator_traits<MapIteratorT>::value_type map_val = map[outRow];
-    int inRowStart                                                  = map_val * D;
-    int outRowStart                                                 = outRow * D;
-    for (int i = 0; i < D; ++i) {
+    IdxT transformed_val;
+    if constexpr (MapTransform) {
+      transformed_val = transform_op(map_val);
+    } else {
+      transformed_val = map_val;
+    }
+    IdxT inRowStart  = transformed_val * D;
+    IdxT outRowStart = outRow * D;
+    for (IdxT i = 0; i < D; ++i) {
       out[outRowStart + i] = in[inRowStart + i];
     }
   }
 }
 
-template <typename MatrixIteratorT, typename MapIteratorT>
-void naiveGather(
-  MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out)
-{
-  naiveGatherImpl(in, D, N, map, map_length, out);
-}
-
+template <typename IdxT>
 struct GatherInputs {
-  uint32_t nrows;
-  uint32_t ncols;
-  uint32_t map_length;
+  IdxT nrows;
+  IdxT ncols;
+  IdxT map_length;
   unsigned long long int seed;
 };
 
-template <typename MatrixT, typename MapT>
-class GatherTest : public ::testing::TestWithParam<GatherInputs> {
+template <bool Conditional, bool MapTransform, typename MatrixT, typename MapT, typename IdxT>
+class GatherTest : public ::testing::TestWithParam<GatherInputs<IdxT>> {
  protected:
   GatherTest()
     : stream(handle.get_stream()),
-      params(::testing::TestWithParam<GatherInputs>::GetParam()),
+      params(::testing::TestWithParam<GatherInputs<IdxT>>::GetParam()),
       d_in(0, stream),
       d_out_exp(0, stream),
       d_out_act(0, stream),
+      d_stencil(0, stream),
       d_map(0, stream)
   {
   }
@@ -71,86 +93,118 @@ class GatherTest : public ::testing::TestWithParam<GatherInputs> {
     raft::random::RngState r(params.seed);
     raft::random::RngState r_int(params.seed);
 
-    uint32_t nrows      = params.nrows;
-    uint32_t ncols      = params.ncols;
-    uint32_t map_length = params.map_length;
-    uint32_t len        = nrows * ncols;
+    IdxT map_length = params.map_length;
+    IdxT len        = params.nrows * params.ncols;
 
     // input matrix setup
-    d_in.resize(nrows * ncols, stream);
-    h_in.resize(nrows * ncols);
+    d_in.resize(params.nrows * params.ncols, stream);
+    h_in.resize(params.nrows * params.ncols);
     raft::random::uniform(handle, r, d_in.data(), len, MatrixT(-1.0), MatrixT(1.0));
     raft::update_host(h_in.data(), d_in.data(), len, stream);
 
     // map setup
     d_map.resize(map_length, stream);
     h_map.resize(map_length);
-    raft::random::uniformInt(handle, r_int, d_map.data(), map_length, (MapT)0, nrows);
+    raft::random::uniformInt(handle, r_int, d_map.data(), map_length, (MapT)0, (MapT)params.nrows);
     raft::update_host(h_map.data(), d_map.data(), map_length, stream);
 
-    // expected and actual output matrix setup
-    h_out.resize(map_length * ncols);
-    d_out_exp.resize(map_length * ncols, stream);
-    d_out_act.resize(map_length * ncols, stream);
+    // stencil setup
+    if (Conditional) {
+      d_stencil.resize(map_length, stream);
+      h_stencil.resize(map_length);
+      raft::random::uniform(handle, r, d_stencil.data(), map_length, MatrixT(-1.0), MatrixT(1.0));
+      raft::update_host(h_stencil.data(), d_stencil.data(), map_length, stream);
+    }
 
-    // launch gather on the host and copy the results to device
-    naiveGather(h_in.data(), ncols, nrows, h_map.data(), map_length, h_out.data());
-    raft::update_device(d_out_exp.data(), h_out.data(), map_length * ncols, stream);
+    // unary predicate op (used only when Conditional is true)
+    auto pred_op = raft::plug_const_op(MatrixT(0.0), raft::greater_op());
 
-    auto in_view = raft::make_device_matrix_view<const MatrixT, std::uint32_t, row_major>(
-      d_in.data(), nrows, ncols);
-    auto out_view =
-      raft::make_device_matrix_view<MatrixT, std::uint32_t>(d_out_act.data(), map_length, ncols);
-    auto map_view =
-      raft::make_device_vector_view<const MapT, std::uint32_t, row_major>(d_map.data(), map_length);
+    // map transform op (used only when MapTransform is true)
+    auto transform_op =
+      raft::compose_op(raft::mod_const_op<IdxT>(params.nrows), raft::add_const_op<IdxT>(10));
 
-    raft::matrix::gather(handle, in_view, map_view, out_view);
+    // expected and actual output matrix setup
+    h_out.resize(map_length * params.ncols);
+    d_out_exp.resize(map_length * params.ncols, stream);
+    d_out_act.resize(map_length * params.ncols, stream);
 
-    //      // launch device version of the kernel
-    //    gatherLaunch(
-    //      handle, d_in.data(), ncols, nrows, d_map.data(), map_length, d_out_act.data(), stream);
+    // launch gather on the host and copy the results to device
+    naiveGather<Conditional, MapTransform>(h_in.data(),
+                                           params.ncols,
+                                           params.nrows,
+                                           h_map.data(),
+                                           h_stencil.data(),
+                                           map_length,
+                                           h_out.data(),
+                                           pred_op,
+                                           transform_op);
+    raft::update_device(d_out_exp.data(), h_out.data(), map_length * params.ncols, stream);
+
+    auto in_view = raft::make_device_matrix_view<const MatrixT, IdxT, row_major>(
+      d_in.data(), params.nrows, params.ncols);
+    auto out_view = raft::make_device_matrix_view<MatrixT, IdxT, row_major>(
+      d_out_act.data(), map_length, params.ncols);
+    auto map_view = raft::make_device_vector_view<const MapT, IdxT>(d_map.data(), map_length);
+    auto stencil_view =
+      raft::make_device_vector_view<const MatrixT, IdxT>(d_stencil.data(), map_length);
+
+    if (Conditional && MapTransform) {
+      raft::matrix::gather_if(
+        handle, in_view, out_view, map_view, stencil_view, pred_op, transform_op);
+    } else if (Conditional) {
+      raft::matrix::gather_if(handle, in_view, out_view, map_view, stencil_view, pred_op);
+    } else if (MapTransform) {
+      raft::matrix::gather(handle, in_view, map_view, out_view, transform_op);
+    } else {
+      raft::matrix::gather(handle, in_view, map_view, out_view);
+    }
 
     handle.sync_stream(stream);
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
-  GatherInputs params;
-  std::vector<MatrixT> h_in, h_out;
+  GatherInputs<IdxT> params;
+  std::vector<MatrixT> h_in, h_out, h_stencil;
   std::vector<MapT> h_map;
-  rmm::device_uvector<MatrixT> d_in, d_out_exp, d_out_act;
+  rmm::device_uvector<MatrixT> d_in, d_out_exp, d_out_act, d_stencil;
   rmm::device_uvector<MapT> d_map;
 };
 
-const std::vector<GatherInputs> inputs = {{1024, 32, 128, 1234ULL},
-                                          {1024, 32, 256, 1234ULL},
-                                          {1024, 32, 512, 1234ULL},
-                                          {1024, 32, 1024, 1234ULL},
-                                          {1024, 64, 128, 1234ULL},
-                                          {1024, 64, 256, 1234ULL},
-                                          {1024, 64, 512, 1234ULL},
-                                          {1024, 64, 1024, 1234ULL},
-                                          {1024, 128, 128, 1234ULL},
-                                          {1024, 128, 256, 1234ULL},
-                                          {1024, 128, 512, 1234ULL},
-                                          {1024, 128, 1024, 1234ULL}};
-
-typedef GatherTest<float, uint32_t> GatherTestF;
-TEST_P(GatherTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare<float>()));
-}
-
-typedef GatherTest<double, uint32_t> GatherTestD;
-TEST_P(GatherTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare<double>()));
-}
-
-INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestF, ::testing::ValuesIn(inputs));
-INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestD, ::testing::ValuesIn(inputs));
+#define GATHER_TEST(test_type, test_name, test_inputs) /* typedef + TEST_P + instantiation */ \
+  typedef RAFT_DEPAREN(test_type) test_name; /* test_type is parenthesized: it contains commas */ \
+  TEST_P(test_name, Result)                                                                     \
+  { /* compare with the fixture's own element type so double suites are not narrowed to float */ \
+    ASSERT_TRUE(devArrMatch(d_out_exp.data(),                                                   \
+                            d_out_act.data(),                                                   \
+                            params.map_length* params.ncols,                                    \
+                            raft::Compare<decltype(d_out_exp)::value_type>()));                 \
+  }                                                                                             \
+  INSTANTIATE_TEST_CASE_P(GatherTests, test_name, ::testing::ValuesIn(test_inputs))
+
+const std::vector<GatherInputs<int>> inputs_i32 =  // presumably {nrows} x {ncols} x {map_length} x {seed} — confirm
+  raft::util::itertools::product<GatherInputs<int>>({25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL});
+const std::vector<GatherInputs<int64_t>> inputs_i64 =  // same value lists, 64-bit index type
+  raft::util::itertools::product<GatherInputs<int64_t>>(
+    {25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL});
+
+GATHER_TEST((GatherTest<false, false, float, uint32_t, int>), GatherTestFU32I32, inputs_i32);
+GATHER_TEST((GatherTest<false, true, float, uint32_t, int>),  // <Conditional, MapTransform, MatrixT, MapT, IdxT>
+            GatherTransformTestFU32I32,
+            inputs_i32);
+GATHER_TEST((GatherTest<true, false, float, uint32_t, int>), GatherIfTestFU32I32, inputs_i32);
+GATHER_TEST((GatherTest<true, true, float, uint32_t, int>),
+            GatherIfTransformTestFU32I32,
+            inputs_i32);
+GATHER_TEST((GatherTest<true, true, double, uint32_t, int>),  // the only double-precision suite
+            GatherIfTransformTestDU32I32,
+            inputs_i32);
+GATHER_TEST((GatherTest<true, true, float, uint32_t, int64_t>),  // 64-bit index type
+            GatherIfTransformTestFU32I64,
+            inputs_i64);
+GATHER_TEST((GatherTest<true, true, float, int64_t, int64_t>),  // 64-bit map and index types
+            GatherIfTransformTestFI64I64,
+            inputs_i64);
 
 }  // end namespace raft
\ No newline at end of file
diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu
index 2e3d54dcf5..04a8a91b01 100644
--- a/cpp/test/matrix/linewise_op.cu
+++ b/cpp/test/matrix/linewise_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */
 
 #include "../linalg/matrix_vector_op.cuh"
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cuda_profiler_api.h>
 #include <gtest/gtest.h>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/matrix/linewise_op.cuh>
 #include <raft/random/rng.cuh>
@@ -42,8 +43,8 @@ struct LinewiseTestParams {
 
 template <typename T, typename I, typename ParamsReader>
 struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Params> {
+  const raft::device_resources handle;
   const LinewiseTestParams params;
-  const raft::handle_t handle;
   rmm::cuda_stream_view stream;
 
   LinewiseTest()
@@ -58,7 +59,6 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
   template <typename layout>
   void runLinewiseSum(T* out, const T* in, const I lineLen, const I nLines, const T* vec)
   {
-    auto f                  = [] __device__(T a, T b) -> T { return a + b; };
     constexpr auto rowmajor = std::is_same_v<layout, row_major>;
 
     I m = rowmajor ? lineLen : nLines;
@@ -68,7 +68,8 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
     auto out_view = raft::make_device_matrix_view<T, I, layout>(out, n, m);
 
     auto vec_view = raft::make_device_vector_view<const T>(vec, lineLen);
-    matrix::linewise_op(handle, in_view, out_view, raft::is_row_major(in_view), f, vec_view);
+    matrix::linewise_op(
+      handle, in_view, out_view, raft::is_row_major(in_view), raft::add_op{}, vec_view);
   }
 
   template <typename layout>
@@ -107,9 +108,8 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
                             const bool alongLines,
                             const T* vec)
   {
-    auto f        = [] __device__(T a, T b) -> T { return a + b; };
     auto vec_view = raft::make_device_vector_view<const T, I>(vec, alongLines ? lineLen : nLines);
-    matrix::linewise_op(handle, in, out, alongLines, f, vec_view);
+    matrix::linewise_op(handle, in, out, alongLines, raft::add_op{}, vec_view);
   }
 
   /**
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 684b550dfc..cd3d865d80 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 
 #include <raft/core/device_mdspan.hpp>
@@ -51,7 +51,7 @@ template <typename Type>
 __global__ void naiveSqrtKernel(Type* in, Type* out, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = std::sqrt(in[idx]); }
+  if (idx < len) { out[idx] = raft::sqrt(in[idx]); }
 }
 
 template <typename Type>
@@ -207,7 +207,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MathInputs<T> params;
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 78391d5ff2..10105203f7 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/core/device_mdarray.hpp>
 
@@ -80,7 +80,7 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MatrixInputs<T> params;
@@ -161,7 +161,7 @@ class MatrixCopyRowsTest : public ::testing::Test {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   int n_rows     = 10;
diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu
index 38fdd409eb..ed1c393c4f 100644
--- a/cpp/test/matrix/norm.cu
+++ b/cpp/test/matrix/norm.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/norm.cuh>
 #include <raft/random/rng.cuh>
@@ -74,7 +74,7 @@ class NormTest : public ::testing::TestWithParam<NormInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   NormInputs<T> params;
diff --git a/cpp/test/matrix/reverse.cu b/cpp/test/matrix/reverse.cu
index c905b8711e..f3929c582b 100644
--- a/cpp/test/matrix/reverse.cu
+++ b/cpp/test/matrix/reverse.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/reverse.cuh>
 #include <raft/random/rng.cuh>
@@ -118,7 +118,7 @@ class ReverseTest : public ::testing::TestWithParam<ReverseInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   ReverseInputs<T> params;
diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu
new file mode 100644
index 0000000000..344e5b5748
--- /dev/null
+++ b/cpp/test/matrix/select_k.cu
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft_internal/matrix/select_k.cuh>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <algorithm>
+#include <numeric>
+
+namespace raft::matrix {
+
+template <typename IdxT>
+auto gen_simple_ids(int batch_size, int len) -> std::vector<IdxT>
+{  // Returns batch_size * len ids produced by sparse::iota_fill, copied back to the host.
+  auto stream = rmm::cuda_stream_default;
+  std::vector<IdxT> ids_h(batch_size * len);
+  rmm::device_uvector<IdxT> ids_d(ids_h.size(), stream);
+  sparse::iota_fill(ids_d.data(), IdxT(batch_size), IdxT(len), stream);
+  update_host(ids_h.data(), ids_d.data(), ids_h.size(), stream);
+  stream.synchronize();  // the host copy must have finished before the vector is returned
+  return ids_h;
+}
+
+template <typename KeyT, typename IdxT>
+struct io_simple {  // stores hand-written inputs and expected outputs; runs no selection itself
+ public:
+  bool not_supported = false;  // never set to true by this struct
+
+  io_simple(const select::params& spec,
+            const std::vector<KeyT>& in_dists,
+            const std::vector<KeyT>& out_dists,
+            const std::vector<IdxT>& out_ids)
+    : in_dists_(in_dists),
+      in_ids_(gen_simple_ids<IdxT>(spec.batch_size, spec.len)),  // input ids are generated, not passed in
+      out_dists_(out_dists),
+      out_ids_(out_ids)
+  {
+  }
+
+  auto get_in_dists() -> std::vector<KeyT>& { return in_dists_; }
+  auto get_in_ids() -> std::vector<IdxT>& { return in_ids_; }
+  auto get_out_dists() -> std::vector<KeyT>& { return out_dists_; }
+  auto get_out_ids() -> std::vector<IdxT>& { return out_ids_; }
+
+ private:
+  std::vector<KeyT> in_dists_;   // input keys
+  std::vector<IdxT> in_ids_;     // generated input ids
+  std::vector<KeyT> out_dists_;  // expected output keys
+  std::vector<IdxT> out_ids_;    // expected output ids
+};
+
+template <typename KeyT, typename IdxT>
+struct io_computed {  // runs select_k_impl with `algo` on the given input and keeps the sorted result
+ public:
+  bool not_supported = false;  // set when `algo` is a warp-sort variant and k exceeds its capacity
+
+  io_computed(const select::params& spec,
+              const select::Algo& algo,
+              const std::vector<KeyT>& in_dists,
+              const std::optional<std::vector<IdxT>>& in_ids = std::nullopt)
+    : in_dists_(in_dists),  // ids below are generated only if absent; value_or() would always build them
+      in_ids_(in_ids.has_value() ? *in_ids : gen_simple_ids<IdxT>(spec.batch_size, spec.len)),
+      out_dists_(spec.batch_size * spec.k),
+      out_ids_(spec.batch_size * spec.k)
+  {
+    // check if the size is supported by the algorithm
+    switch (algo) {
+      case select::Algo::kWarpAuto:
+      case select::Algo::kWarpImmediate:
+      case select::Algo::kWarpFiltered:
+      case select::Algo::kWarpDistributed:
+      case select::Algo::kWarpDistributedShm: {
+        if (spec.k > raft::matrix::detail::select::warpsort::kMaxCapacity) {
+          not_supported = true;
+          return;  // nothing is computed; the caller checks not_supported and skips
+        }
+      } break;
+      default: break;
+    }
+
+    device_resources handle{};
+    auto stream = handle.get_stream();
+
+    rmm::device_uvector<KeyT> in_dists_d(in_dists_.size(), stream);
+    rmm::device_uvector<IdxT> in_ids_d(in_ids_.size(), stream);
+    rmm::device_uvector<KeyT> out_dists_d(out_dists_.size(), stream);
+    rmm::device_uvector<IdxT> out_ids_d(out_ids_.size(), stream);
+
+    update_device(in_dists_d.data(), in_dists_.data(), in_dists_.size(), stream);
+    update_device(in_ids_d.data(), in_ids_.data(), in_ids_.size(), stream);
+
+    select::select_k_impl<KeyT, IdxT>(handle,
+                                      algo,
+                                      in_dists_d.data(),
+                                      spec.use_index_input ? in_ids_d.data() : nullptr,
+                                      spec.batch_size,
+                                      spec.len,
+                                      spec.k,
+                                      out_dists_d.data(),
+                                      out_ids_d.data(),
+                                      spec.select_min);
+
+    update_host(out_dists_.data(), out_dists_d.data(), out_dists_.size(), stream);
+    update_host(out_ids_.data(), out_ids_d.data(), out_ids_.size(), stream);
+
+    interruptible::synchronize(stream);  // the host copies must be complete before sorting below
+
+    auto p = topk_sort_permutation(out_dists_, out_ids_, spec.k, spec.select_min);
+    apply_permutation(out_dists_, p);  // keys and ids are reordered with the same permutation
+    apply_permutation(out_ids_, p);
+  }
+
+  auto get_in_dists() -> std::vector<KeyT>& { return in_dists_; }
+  auto get_in_ids() -> std::vector<IdxT>& { return in_ids_; }
+  auto get_out_dists() -> std::vector<KeyT>& { return out_dists_; }
+  auto get_out_ids() -> std::vector<IdxT>& { return out_ids_; }
+
+ private:
+  std::vector<KeyT> in_dists_;
+  std::vector<IdxT> in_ids_;
+  std::vector<KeyT> out_dists_;
+  std::vector<IdxT> out_ids_;
+
+  auto topk_sort_permutation(const std::vector<KeyT>& vec,  // sort order: row, then key, then id
+                             const std::vector<IdxT>& inds,
+                             int k,
+                             bool select_min) -> std::vector<IdxT>
+  {
+    std::vector<IdxT> p(vec.size());
+    std::iota(p.begin(), p.end(), 0);
+    if (select_min) {
+      std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) {
+        const IdxT ik = i / k;  // index / k is the row (group of k outputs) the element belongs to
+        const IdxT jk = j / k;
+        if (ik == jk) {
+          if (vec[i] == vec[j]) { return inds[i] < inds[j]; }
+          return vec[i] < vec[j];  // ascending keys when selecting the minimum
+        }
+        return ik < jk;
+      });
+    } else {
+      std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) {
+        const IdxT ik = i / k;
+        const IdxT jk = j / k;
+        if (ik == jk) {
+          if (vec[i] == vec[j]) { return inds[i] < inds[j]; }
+          return vec[i] > vec[j];  // descending keys when selecting the maximum
+        }
+        return ik < jk;
+      });
+    }
+    return p;
+  }
+
+  template <typename T>
+  void apply_permutation(std::vector<T>& vec, const std::vector<IdxT>& p)  // NOLINT
+  {
+    for (auto i = IdxT(vec.size()) - 1; i > 0; i--) {  // walks backwards, swapping in place
+      auto j = p[i];
+      while (j > i)  // follow p until the source index is not above the current position
+        j = p[j];
+      std::swap(vec[j], vec[i]);
+    }
+  }
+};
+
+template <typename InOut>  // InOut: the reference I/O holder type produced by a ParamsReader
+using Params = std::tuple<select::params, select::Algo, InOut>;
+
+template <typename KeyT, typename IdxT, template <typename, typename> typename ParamsReader>
+struct SelectK  // NOLINT
+  : public testing::TestWithParam<typename ParamsReader<KeyT, IdxT>::params_t> {
+  const select::params spec;
+  const select::Algo algo;
+  typename ParamsReader<KeyT, IdxT>::io_t ref;  // declared before res: res is built from ref's inputs
+  io_computed<KeyT, IdxT> res;                  // result of running `algo` on the same inputs
+
+  explicit SelectK(Params<typename ParamsReader<KeyT, IdxT>::io_t> ps)
+    : spec(std::get<0>(ps)),
+      algo(std::get<1>(ps)),                                 // NOLINT
+      ref(std::get<2>(ps)),                                  // NOLINT
+      res(spec, algo, ref.get_in_dists(), ref.get_in_ids())  // NOLINT
+  {
+  }
+
+  explicit SelectK(typename ParamsReader<KeyT, IdxT>::params_t ps)
+    : SelectK(ParamsReader<KeyT, IdxT>::read(ps))
+  {
+  }
+
+  SelectK()
+    : SelectK(testing::TestWithParam<typename ParamsReader<KeyT, IdxT>::params_t>::GetParam())
+  {
+  }
+
+  void run()
+  {
+    if (ref.not_supported || res.not_supported) { GTEST_SKIP(); }
+    ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare<KeyT>()));
+
+    // If the dists (keys) are the same, different corresponding ids may end up in the selection due
+    // to the non-deterministic nature of some implementations.
+    auto& in_ids     = ref.get_in_ids();
+    auto& in_dists   = ref.get_in_dists();
+    auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) {
+      if (i == j) return true;
+      auto ix_i = size_t(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin());
+      auto ix_j = size_t(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin());
+      if (ix_i >= in_ids.size() || ix_j >= in_ids.size()) return false;  // id not among the inputs
+      auto dist_i = in_dists[ix_i];  // NOTE(review): find() yields the first match only; if ids
+      auto dist_j = in_dists[ix_j];  // repeat across batch rows this reads the first row — confirm
+      if (dist_i == dist_j) return true;  // different ids are accepted when their keys tie
+      std::cout << "ERROR: ref[" << ix_i << "] = " << dist_i << " != "
+                << "res[" << ix_j << "] = " << dist_j << std::endl;
+      return false;
+    };
+    ASSERT_TRUE(hostVecMatch(ref.get_out_ids(), res.get_out_ids(), compare_ids));
+  }
+};
+
+template <typename KeyT, typename IdxT>
+struct params_simple {
+  using io_t = io_simple<KeyT, IdxT>;
+  using input_t =
+    std::tuple<select::params, std::vector<KeyT>, std::vector<KeyT>, std::vector<IdxT>>;
+  using params_t = std::tuple<input_t, select::Algo>;
+
+  static auto read(params_t ps) -> Params<io_t>
+  {
+    // Split the test parameter into the hand-written input tuple and the algorithm under test,
+    // then unpack the input into spec / input keys / expected keys / expected ids.
+    auto [input, algo]                        = ps;
+    auto [spec, in_dists, out_dists, out_ids] = input;
+
+    // The reference I/O object only stores the hand-written expectations.
+    return std::make_tuple(spec, algo, io_simple<KeyT, IdxT>(spec, in_dists, out_dists, out_ids));
+  }
+};
+
+auto inputs_simple_f = testing::Values(  // each entry: {spec, input keys, expected keys, expected ids}
+  params_simple<float, int>::input_t(
+    {5, 5, 5, true, true},
+    {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
+     1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0,
+     4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0},
+    {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}),
+  params_simple<float, int>::input_t(
+    {5, 5, 3, true, true},
+    {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
+     1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0},
+    {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}),
+  params_simple<float, int>::input_t(
+    {5, 5, 5, true, false},
+    {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
+     1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0,
+     4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0},
+    {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}),
+  params_simple<float, int>::input_t(
+    {5, 5, 3, true, false},
+    {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0,
+     1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0},
+    {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0},
+    {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}),
+  params_simple<float, int>::input_t(
+    {5, 7, 3, true, true},
+    {5.0, 4.0, 3.0, 2.0, 1.3, 7.5, 19.0, 9.0, 2.0, 3.0, 3.0, 5.0, 6.0, 4.0, 2.0, 3.0, 5.0, 1.0,
+     4.0, 1.0, 1.0, 5.0, 7.0, 2.5, 4.0,  7.0, 8.0, 8.0, 1.0, 3.0, 2.0, 5.0, 4.0, 1.1, 1.2},
+    {1.3, 2.0, 3.0, 2.0, 3.0, 3.0, 1.0, 1.0, 1.0, 2.5, 4.0, 5.0, 1.0, 1.1, 1.2},
+    {4, 3, 2, 1, 2, 3, 3, 5, 6, 2, 3, 0, 0, 5, 6}),
+  params_simple<float, int>::input_t(
+    {1, 7, 3, true, true}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {3, 5, 6}),
+  params_simple<float, int>::input_t(
+    {1, 7, 3, false, false}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {5.0, 4.0, 3.0}, {2, 4, 1}),
+  params_simple<float, int>::input_t(
+    {1, 7, 3, false, true}, {2.0, 3.0, 5.0, 9.0, 4.0, 9.0, 9.0}, {9.0, 9.0, 9.0}, {3, 5, 6}),
+  params_simple<float, int>::input_t(
+    {1, 130, 5, false, true},
+    {19, 1, 0, 1, 0, 1,  0,  1,  0,  1,  0,  1,  0,  1,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+     0,  1, 0, 1, 0, 1,  0,  1,  0,  1,  0,  1,  0,  1,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+     0,  1, 0, 1, 0, 1,  0,  1,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+     1,  2, 1, 2, 1, 2,  1,  2,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4,
+     5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20},
+    {20, 19, 18, 17, 16},
+    {129, 0, 117, 116, 115}),
+  params_simple<float, int>::input_t(
+    {1, 130, 15, false, true},
+    {19, 1, 0, 1, 0, 1,  0,  1,  0,  1,  0,  1,  0,  1,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+     0,  1, 0, 1, 0, 1,  0,  1,  0,  1,  0,  1,  0,  1,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+     0,  1, 0, 1, 0, 1,  0,  1,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+     1,  2, 1, 2, 1, 2,  1,  2,  1,  2,  1,  2,  1,  2,  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4,
+     5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20},
+    {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6},
+    {129, 0, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105}));
+
+using SimpleFloatInt = SelectK<float, int, params_simple>;  // hand-written cases from inputs_simple_f
+TEST_P(SimpleFloatInt, Run) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(                // NOLINT
+  SelectK,
+  SimpleFloatInt,
+  testing::Combine(inputs_simple_f,  // every hand-written input is paired with every algorithm below
+                   testing::Values(select::Algo::kPublicApi,
+                                   select::Algo::kRadix8bits,
+                                   select::Algo::kRadix11bits,
+                                   select::Algo::kWarpImmediate,
+                                   select::Algo::kWarpFiltered,
+                                   select::Algo::kWarpDistributed)));
+
+template <select::Algo RefAlgo>
+struct with_ref {
+  template <typename KeyT, typename IdxT>
+  struct params_random {
+    using io_t     = io_computed<KeyT, IdxT>;
+    using params_t = std::tuple<select::params, select::Algo>;
+
+    static auto read(params_t ps) -> Params<io_t>
+    {
+      auto [spec, algo] = ps;
+      // Host buffer for the random input keys: one value per element of the whole batch.
+      std::vector<KeyT> keys_h(spec.len * spec.batch_size);
+
+      raft::device_resources handle;
+      {
+        auto stream = handle.get_stream();
+        rmm::device_uvector<KeyT> keys_d(spec.len * spec.batch_size, stream);
+        raft::random::RngState rng(42);
+        normal(handle, rng, keys_d.data(), keys_d.size(), KeyT(10.0), KeyT(100.0));
+        update_host(keys_h.data(), keys_d.data(), keys_d.size(), stream);
+        stream.synchronize();
+      }
+
+      return std::make_tuple(spec, algo, io_computed<KeyT, IdxT>(spec, RefAlgo, keys_h));
+    }
+  };
+};
+
+auto inputs_random_longlist = testing::Values(select::params{1, 130, 15, false},  // presumably {batch_size, len, k, select_min[, use_index_input]} — confirm
+                                              select::params{1, 128, 15, false},
+                                              select::params{20, 700, 1, true},
+                                              select::params{20, 700, 2, true},
+                                              select::params{20, 700, 3, true},
+                                              select::params{20, 700, 4, true},
+                                              select::params{20, 700, 5, true},
+                                              select::params{20, 700, 6, true},
+                                              select::params{20, 700, 7, true},
+                                              select::params{20, 700, 8, true},
+                                              select::params{20, 700, 9, true},
+                                              select::params{20, 700, 10, true, false},
+                                              select::params{20, 700, 11, true},
+                                              select::params{20, 700, 12, true},
+                                              select::params{20, 700, 16, true},
+                                              select::params{100, 1700, 17, true},
+                                              select::params{100, 1700, 31, true, false},
+                                              select::params{100, 1700, 32, false},
+                                              select::params{100, 1700, 33, false},
+                                              select::params{100, 1700, 63, false},
+                                              select::params{100, 1700, 64, false, false},
+                                              select::params{100, 1700, 65, false},
+                                              select::params{100, 1700, 255, true},
+                                              select::params{100, 1700, 256, true},
+                                              select::params{100, 1700, 511, false},
+                                              select::params{100, 1700, 512, true},
+                                              select::params{100, 1700, 1023, false, false},
+                                              select::params{100, 1700, 1024, true},
+                                              select::params{100, 1700, 1700, true});
+
+auto inputs_random_largesize = testing::Values(select::params{100, 100000, 1, true},
+                                               select::params{100, 100000, 2, true},
+                                               select::params{100, 100000, 3, true, false},
+                                               select::params{100, 100000, 7, true},
+                                               select::params{100, 100000, 16, true},
+                                               select::params{100, 100000, 31, true},
+                                               select::params{100, 100000, 32, true, false},
+                                               select::params{100, 100000, 60, true},
+                                               select::params{100, 100000, 100, true, false},
+                                               select::params{100, 100000, 200, true},
+                                               select::params{100000, 100, 100, false},
+                                               select::params{1, 1000000000, 1, true},
+                                               select::params{1, 1000000000, 16, false, false},
+                                               select::params{1, 1000000000, 64, false},
+                                               select::params{1, 1000000000, 128, true, false},
+                                               select::params{1, 1000000000, 256, false, false});
+
+auto inputs_random_largek = testing::Values(select::params{100, 100000, 1000, true},
+                                            select::params{100, 100000, 2000, true},
+                                            select::params{100, 100000, 100000, true, false},
+                                            select::params{100, 100000, 2048, false},
+                                            select::params{100, 100000, 1237, true});
+
+using ReferencedRandomFloatInt =
+  SelectK<float, int, with_ref<select::Algo::kPublicApi>::params_random>;
+TEST_P(ReferencedRandomFloatInt, Run) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(                          // NOLINT
+  SelectK,
+  ReferencedRandomFloatInt,
+  testing::Combine(inputs_random_longlist,
+                   testing::Values(select::Algo::kRadix8bits,
+                                   select::Algo::kRadix11bits,
+                                   select::Algo::kWarpImmediate,
+                                   select::Algo::kWarpFiltered,
+                                   select::Algo::kWarpDistributed,
+                                   select::Algo::kWarpDistributedShm)));
+
+using ReferencedRandomDoubleSizeT =
+  SelectK<double, size_t, with_ref<select::Algo::kPublicApi>::params_random>;
+TEST_P(ReferencedRandomDoubleSizeT, Run) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(                             // NOLINT
+  SelectK,
+  ReferencedRandomDoubleSizeT,
+  testing::Combine(inputs_random_longlist,
+                   testing::Values(select::Algo::kRadix8bits,
+                                   select::Algo::kRadix11bits,
+                                   select::Algo::kWarpImmediate,
+                                   select::Algo::kWarpFiltered,
+                                   select::Algo::kWarpDistributed,
+                                   select::Algo::kWarpDistributedShm)));
+
+using ReferencedRandomDoubleInt =
+  SelectK<double, int, with_ref<select::Algo::kRadix11bits>::params_random>;
+TEST_P(ReferencedRandomDoubleInt, LargeSize) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(                                 // NOLINT
+  SelectK,
+  ReferencedRandomDoubleInt,
+  testing::Combine(inputs_random_largesize, testing::Values(select::Algo::kWarpAuto)));
+
+using ReferencedRandomFloatSizeT =
+  SelectK<float, size_t, with_ref<select::Algo::kRadix8bits>::params_random>;
+TEST_P(ReferencedRandomFloatSizeT, LargeK) { run(); }  // NOLINT
+INSTANTIATE_TEST_CASE_P(SelectK,                       // NOLINT
+                        ReferencedRandomFloatSizeT,
+                        testing::Combine(inputs_random_largek,
+                                         testing::Values(select::Algo::kRadix11bits)));
+
+}  // namespace raft::matrix
diff --git a/cpp/test/matrix/slice.cu b/cpp/test/matrix/slice.cu
index 5faf672d13..58f849a87c 100644
--- a/cpp/test/matrix/slice.cu
+++ b/cpp/test/matrix/slice.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/slice.cuh>
 #include <raft/random/rng.cuh>
@@ -94,7 +94,7 @@ class SliceTest : public ::testing::TestWithParam<SliceInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SliceInputs<T> params;
diff --git a/cpp/test/matrix/triangular.cu b/cpp/test/matrix/triangular.cu
index 9af3defb5d..82b01181f5 100644
--- a/cpp/test/matrix/triangular.cu
+++ b/cpp/test/matrix/triangular.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/init.cuh>
 #include <raft/matrix/triangular.cuh>
@@ -91,7 +91,7 @@ class TriangularTest : public ::testing::TestWithParam<TriangularInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   TriangularInputs<T> params;
diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu
index 735d569318..98cc11c24e 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cu
+++ b/cpp/test/neighbors/ann_ivf_flat.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "ann_utils.cuh"
 
+#include <raft_internal/neighbors/naive_knn.cuh>
+
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
@@ -78,16 +80,16 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     {
       rmm::device_uvector<T> distances_naive_dev(queries_size, stream_);
       rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
-      naiveBfKnn<T, DataT, IdxT>(distances_naive_dev.data(),
-                                 indices_naive_dev.data(),
-                                 search_queries.data(),
-                                 database.data(),
-                                 ps.num_queries,
-                                 ps.num_db_vecs,
-                                 ps.dim,
-                                 ps.k,
-                                 ps.metric,
-                                 stream_);
+      naive_knn<T, DataT, IdxT>(distances_naive_dev.data(),
+                                indices_naive_dev.data(),
+                                search_queries.data(),
+                                database.data(),
+                                ps.num_queries,
+                                ps.num_db_vecs,
+                                ps.dim,
+                                ps.k,
+                                ps.metric,
+                                stream_);
       update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
       update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
       handle_.sync_stream(stream_);
@@ -107,8 +109,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
         ivfParams.nprobe = ps.nprobe;
         ivfParams.nlist  = ps.nlist;
         raft::spatial::knn::knnIndex index;
-        index.index   = nullptr;
-        index.gpu_res = nullptr;
 
         approx_knn_build_index(handle_,
                                &index,
@@ -118,6 +118,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                database.data(),
                                ps.num_db_vecs,
                                ps.dim);
+
         handle_.sync_stream(stream_);
         approx_knn_search(handle_,
                           distances_ivfflat_dev.data(),
@@ -187,8 +188,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
           indices_ivfflat_dev.data(), ps.num_queries, ps.k);
         auto dists_out_view = raft::make_device_matrix_view<T, IdxT>(
           distances_ivfflat_dev.data(), ps.num_queries, ps.k);
+        raft::spatial::knn::ivf_flat::detail::serialize(handle_, "ivf_flat_index", index_2);
+
+        auto index_loaded =
+          raft::spatial::knn::ivf_flat::detail::deserialize<DataT, IdxT>(handle_, "ivf_flat_index");
+
         ivf_flat::search(handle_,
-                         index_2,
+                         index_loaded,
                          search_queries_view,
                          indices_out_view,
                          dists_out_view,
@@ -273,7 +279,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   }
 
  private:
-  raft::handle_t handle_;
+  raft::device_resources handle_;
   rmm::cuda_stream_view stream_;
   AnnIvfFlatInputs<IdxT> ps;
   rmm::device_uvector<DataT> database;
@@ -288,6 +294,8 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
   {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
   {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
   {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true},
 
   // test dims that do not fit into kernel shared memory limits
   {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
@@ -326,16 +334,16 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
    10000,
    16,
    10,
-   raft::spatial::knn::detail::topk::kMaxCapacity * 2,
-   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 2,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
    raft::distance::DistanceType::L2Expanded,
    false},
   {1000,
    10000,
    16,
    10,
-   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
-   raft::spatial::knn::detail::topk::kMaxCapacity * 4,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
    raft::distance::DistanceType::InnerProduct,
    false}};
 
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 9d6ad11ccb..31261871c1 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,11 @@
  */
 #pragma once
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "ann_utils.cuh"
 
+#include <raft_internal/neighbors/naive_knn.cuh>
+
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/ivf_pq.cuh>
@@ -42,15 +44,18 @@
 #include <algorithm>
 #include <cstddef>
 #include <iostream>
+#include <optional>
 #include <vector>
 
 namespace raft::neighbors::ivf_pq {
 
 struct ivf_pq_inputs {
-  uint32_t num_db_vecs = 4096;
-  uint32_t num_queries = 1024;
-  uint32_t dim         = 64;
-  uint32_t k           = 32;
+  uint32_t num_db_vecs             = 4096;
+  uint32_t num_queries             = 1024;
+  uint32_t dim                     = 64;
+  uint32_t k                       = 32;
+  std::optional<double> min_recall = std::nullopt;
+
   ivf_pq::index_params index_params;
   ivf_pq::search_params search_params;
 
@@ -91,6 +96,7 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
   PRINT_DIFF(.num_queries);
   PRINT_DIFF(.dim);
   PRINT_DIFF(.k);
+  PRINT_DIFF_V(.min_recall, p.min_recall.value_or(0));
   PRINT_DIFF_V(.index_params.metric, print_metric{p.index_params.metric});
   PRINT_DIFF(.index_params.metric_arg);
   PRINT_DIFF(.index_params.add_data_on_build);
@@ -100,6 +106,7 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
   PRINT_DIFF(.index_params.pq_bits);
   PRINT_DIFF(.index_params.pq_dim);
   PRINT_DIFF(.index_params.codebook_kind);
+  PRINT_DIFF(.index_params.force_random_rotation);
   PRINT_DIFF(.search_params.n_probes);
   PRINT_DIFF_V(.search_params.lut_dtype, print_dtype{p.search_params.lut_dtype});
   PRINT_DIFF_V(.search_params.internal_distance_dtype,
@@ -109,8 +116,9 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
 }
 
 template <typename IdxT>
-auto min_output_size(const handle_t& handle, const ivf_pq::index<IdxT>& index, uint32_t n_probes)
-  -> IdxT
+auto min_output_size(const raft::device_resources& handle,
+                     const ivf_pq::index<IdxT>& index,
+                     uint32_t n_probes) -> IdxT
 {
   uint32_t skip = index.n_nonempty_lists() > n_probes ? index.n_nonempty_lists() - n_probes : 0;
   auto map_type = [] __device__(uint32_t x) { return IdxT(x); };
@@ -134,8 +142,8 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
  protected:
   void gen_data()
   {
-    database.resize(ps.num_db_vecs * ps.dim, stream_);
-    search_queries.resize(ps.num_queries * ps.dim, stream_);
+    database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_);
+    search_queries.resize(size_t{ps.num_queries} * size_t{ps.dim}, stream_);
 
     raft::random::Rng r(1234ULL);
     if constexpr (std::is_same<DataT, float>{}) {
@@ -150,19 +158,19 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
 
   void calc_ref()
   {
-    size_t queries_size = ps.num_queries * ps.k;
+    size_t queries_size = size_t{ps.num_queries} * size_t{ps.k};
     rmm::device_uvector<EvalT> distances_naive_dev(queries_size, stream_);
     rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
-    naiveBfKnn<EvalT, DataT, IdxT>(distances_naive_dev.data(),
-                                   indices_naive_dev.data(),
-                                   search_queries.data(),
-                                   database.data(),
-                                   ps.num_queries,
-                                   ps.num_db_vecs,
-                                   ps.dim,
-                                   ps.k,
-                                   ps.index_params.metric,
-                                   stream_);
+    naive_knn<EvalT, DataT, IdxT>(distances_naive_dev.data(),
+                                  indices_naive_dev.data(),
+                                  search_queries.data(),
+                                  database.data(),
+                                  ps.num_queries,
+                                  ps.num_db_vecs,
+                                  ps.dim,
+                                  ps.k,
+                                  ps.index_params.metric,
+                                  stream_);
     distances_ref.resize(queries_size);
     update_host(distances_ref.data(), distances_naive_dev.data(), queries_size, stream_);
     indices_ref.resize(queries_size);
@@ -205,7 +213,11 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   template <typename BuildIndex>
   void run(BuildIndex build_index)
   {
-    auto index = build_index();
+    {
+      auto index = build_index();
+      raft::spatial::knn::ivf_pq::detail::serialize<IdxT>(handle_, "ivf_pq_index", index);
+    }
+    auto index = raft::spatial::knn::ivf_pq::detail::deserialize<IdxT>(handle_, "ivf_pq_index");
 
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<IdxT> indices_ivf_pq(queries_size);
@@ -227,12 +239,16 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     update_host(indices_ivf_pq.data(), indices_ivf_pq_dev.data(), queries_size, stream_);
     handle_.sync_stream(stream_);
 
-    // Using very dense, small codebooks results in large errors in the distance calculation
-    double low_precision_factor =
-      static_cast<double>(index.pq_dim() * index.pq_bits()) / static_cast<double>(ps.dim * 8);
     // A very conservative lower bound on recall
-    double min_recall = low_precision_factor * static_cast<double>(ps.search_params.n_probes) /
-                        static_cast<double>(ps.index_params.n_lists);
+    double min_recall =
+      static_cast<double>(ps.search_params.n_probes) / static_cast<double>(ps.index_params.n_lists);
+    double low_precision_factor =
+      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
+    // Using a heuristic to lower the required recall due to code-packing errors
+    min_recall =
+      std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall);
+    // Use explicit per-test min recall value if provided.
+    min_recall = ps.min_recall.value_or(min_recall);
 
     ASSERT_TRUE(eval_neighbours(indices_ref,
                                 indices_ivf_pq,
@@ -240,8 +256,9 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                                 distances_ivf_pq,
                                 ps.num_queries,
                                 ps.k,
-                                0.001 / low_precision_factor,
-                                min_recall));
+                                0.0001 * low_precision_factor,
+                                min_recall))
+      << ps;
 
     // Test a few extra invariants
     IdxT min_results = min_output_size(handle_, index, ps.search_params.n_probes);
@@ -292,7 +309,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   }
 
  private:
-  raft::handle_t handle_;
+  raft::device_resources handle_;
   rmm::cuda_stream_view stream_;
   ivf_pq_inputs ps;                           // NOLINT
   rmm::device_uvector<DataT> database;        // NOLINT
@@ -346,9 +363,16 @@ inline auto small_dims_per_cluster() -> test_cases_t
 
 inline auto big_dims() -> test_cases_t
 {
-  return with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144});
-  // return with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144, 8192, 12288,
-  // 16384});
+  // with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144, 8192, 12288, 16384});
+  auto xs = with_dims({512, 513, 1023, 1024, 1025, 2048, 2049, 2050, 2053, 6144});
+  return map<ivf_pq_inputs>(xs, [](const ivf_pq_inputs& x) {
+    ivf_pq_inputs y(x);
+    uint32_t pq_len       = 2;
+    y.index_params.pq_dim = div_rounding_up_safe(x.dim, pq_len);
+    // This comes from pure experimentation, also the recall depends a lot on pq_len.
+    y.min_recall = 0.48 + 0.028 * std::log2(x.dim);
+    return y;
+  });
 }
 
 /** These will surely trigger no-smem-lut kernel.  */
@@ -356,8 +380,11 @@ inline auto big_dims_moderate_lut() -> test_cases_t
 {
   return map<ivf_pq_inputs>(big_dims(), [](const ivf_pq_inputs& x) {
     ivf_pq_inputs y(x);
+    uint32_t pq_len           = 2;
+    y.index_params.pq_dim     = round_up_safe(div_rounding_up_safe(x.dim, pq_len), 4u);
     y.index_params.pq_bits    = 6;
     y.search_params.lut_dtype = CUDA_R_16F;
+    y.min_recall              = 0.69;
     return y;
   });
 }
@@ -367,9 +394,11 @@ inline auto big_dims_small_lut() -> test_cases_t
 {
   return map<ivf_pq_inputs>(big_dims(), [](const ivf_pq_inputs& x) {
     ivf_pq_inputs y(x);
-    y.index_params.pq_dim     = raft::round_up_safe(y.dim / 8u, 64u);
+    uint32_t pq_len           = 8;
+    y.index_params.pq_dim     = round_up_safe(div_rounding_up_safe(x.dim, pq_len), 4u);
     y.index_params.pq_bits    = 6;
     y.search_params.lut_dtype = CUDA_R_8U;
+    y.min_recall              = 0.21;
     return y;
   });
 }
@@ -386,30 +415,68 @@ inline auto enum_variety() -> test_cases_t
     ([](ivf_pq_inputs & x) f)(xs[xs.size() - 1]); \
   } while (0);
 
-  ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER; });
-  ADD_CASE({ x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; });
+  ADD_CASE({
+    x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER;
+    x.min_recall                 = 0.86;
+  });
+  ADD_CASE({
+    x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE;
+    x.min_recall                 = 0.86;
+  });
   ADD_CASE({
     x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER;
     x.index_params.pq_bits       = 4;
+    x.min_recall                 = 0.79;
   });
   ADD_CASE({
     x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER;
     x.index_params.pq_bits       = 5;
+    x.min_recall                 = 0.83;
   });
 
-  ADD_CASE({ x.index_params.pq_bits = 6; });
-  ADD_CASE({ x.index_params.pq_bits = 7; });
-  ADD_CASE({ x.index_params.pq_bits = 8; });
+  ADD_CASE({
+    x.index_params.pq_bits = 6;
+    x.min_recall           = 0.84;
+  });
+  ADD_CASE({
+    x.index_params.pq_bits = 7;
+    x.min_recall           = 0.85;
+  });
+  ADD_CASE({
+    x.index_params.pq_bits = 8;
+    x.min_recall           = 0.86;
+  });
 
-  ADD_CASE({ x.index_params.force_random_rotation = true; });
-  ADD_CASE({ x.index_params.force_random_rotation = false; });
+  ADD_CASE({
+    x.index_params.force_random_rotation = true;
+    x.min_recall                         = 0.86;
+  });
+  ADD_CASE({
+    x.index_params.force_random_rotation = false;
+    x.min_recall                         = 0.86;
+  });
 
-  ADD_CASE({ x.search_params.lut_dtype = CUDA_R_32F; });
-  ADD_CASE({ x.search_params.lut_dtype = CUDA_R_16F; });
-  ADD_CASE({ x.search_params.lut_dtype = CUDA_R_8U; });
+  ADD_CASE({
+    x.search_params.lut_dtype = CUDA_R_32F;
+    x.min_recall              = 0.86;
+  });
+  ADD_CASE({
+    x.search_params.lut_dtype = CUDA_R_16F;
+    x.min_recall              = 0.86;
+  });
+  ADD_CASE({
+    x.search_params.lut_dtype = CUDA_R_8U;
+    x.min_recall              = 0.84;
+  });
 
-  ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_32F; });
-  ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_16F; });
+  ADD_CASE({
+    x.search_params.internal_distance_dtype = CUDA_R_32F;
+    x.min_recall                            = 0.86;
+  });
+  ADD_CASE({
+    x.search_params.internal_distance_dtype = CUDA_R_16F;
+    x.min_recall                            = 0.86;
+  });
 
   return xs;
 }
@@ -427,11 +494,31 @@ inline auto enum_variety_ip() -> test_cases_t
 {
   return map<ivf_pq_inputs>(enum_variety(), [](const ivf_pq_inputs& x) {
     ivf_pq_inputs y(x);
+    if (y.min_recall.has_value()) {
+      if (y.search_params.lut_dtype == CUDA_R_8U) {
+        // InnerProduct score is signed,
+        // thus we're forced to use a signed 8-bit representation,
+        // thus we have one bit less precision
+        y.min_recall = y.min_recall.value() * 0.90;
+      } else {
+        // In other cases it seems to perform a little bit better, still worse than L2
+        y.min_recall = y.min_recall.value() * 0.94;
+      }
+    }
     y.index_params.metric = distance::DistanceType::InnerProduct;
     return y;
   });
 }
 
+/** The enum_variety() cases with the metric replaced by L2SqrtExpanded. */
+inline auto enum_variety_l2sqrt() -> test_cases_t
+{
+  return map<ivf_pq_inputs>(enum_variety(), [](ivf_pq_inputs y) {
+    y.index_params.metric = distance::DistanceType::L2SqrtExpanded;
+    return y;
+  });
+}
+
 /**
  * Try different number of n_probes, some of which may trigger the non-fused version of the search
  * kernel.
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu
index ecb2faa6a0..db42b1ee6a 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ using f32_f32_i64 = ivf_pq_test<float, float, int64_t>;
 
 TEST_BUILD_SEARCH(f32_f32_i64)
 TEST_BUILD_EXTEND_SEARCH(f32_f32_i64)
-INSTANTIATE(f32_f32_i64, enum_variety_l2() + enum_variety_ip() + big_dims_small_lut());
+INSTANTIATE(f32_f32_i64,
+            enum_variety_l2() + enum_variety_ip() + big_dims_small_lut() + enum_variety_l2sqrt());
 
 }  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 07ef410d36..4b07db32f4 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,15 +17,15 @@
 #pragma once
 
 #include <raft/distance/distance_types.hpp>
+#include <raft/matrix/detail/select_k.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
-#include <raft/spatial/knn/detail/topk.cuh>
 #include <raft/util/cuda_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 
 namespace raft::neighbors {
@@ -103,90 +103,6 @@ inline auto operator<<(std::ostream& os, const print_metric& p) -> std::ostream&
   return os;
 }
 
-template <typename EvalT, typename DataT, typename IdxT>
-__global__ void naive_distance_kernel(EvalT* dist,
-                                      const DataT* x,
-                                      const DataT* y,
-                                      IdxT m,
-                                      IdxT n,
-                                      IdxT k,
-                                      raft::distance::DistanceType type)
-{
-  IdxT midx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (midx >= m) return;
-  for (IdxT nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n;
-       nidx += blockDim.y * gridDim.y) {
-    EvalT acc = EvalT(0);
-    for (IdxT i = 0; i < k; ++i) {
-      IdxT xidx = i + midx * k;
-      IdxT yidx = i + nidx * k;
-      EvalT xv  = (EvalT)x[xidx];
-      EvalT yv  = (EvalT)y[yidx];
-      if (type == raft::distance::DistanceType::InnerProduct) {
-        acc += xv * yv;
-      } else {
-        EvalT diff = xv - yv;
-        acc += diff * diff;
-      }
-    }
-    if (type == raft::distance::DistanceType::L2SqrtExpanded ||
-        type == raft::distance::DistanceType::L2SqrtUnexpanded)
-      acc = raft::mySqrt(acc);
-    dist[midx * n + nidx] = acc;
-  }
-}
-
-/**
- * TODO: either replace this with brute_force_knn or with distance+select_k
- *       when either distance or brute_force_knn support 8-bit int inputs.
- */
-template <typename EvalT, typename DataT, typename IdxT>
-void naiveBfKnn(EvalT* dist_topk,
-                IdxT* indices_topk,
-                const DataT* x,
-                const DataT* y,
-                size_t n_inputs,
-                size_t input_len,
-                size_t dim,
-                uint32_t k,
-                raft::distance::DistanceType type,
-                rmm::cuda_stream_view stream)
-{
-  rmm::mr::device_memory_resource* mr = nullptr;
-  auto pool_guard                     = raft::get_pool_memory_resource(mr, 1024 * 1024);
-
-  dim3 block_dim(16, 32, 1);
-  // maximum reasonable grid size in `y` direction
-  auto grid_y =
-    static_cast<uint16_t>(std::min<size_t>(raft::ceildiv<size_t>(input_len, block_dim.y), 32768));
-
-  // bound the memory used by this function
-  size_t max_batch_size =
-    std::min<size_t>(n_inputs, raft::ceildiv<size_t>(size_t(1) << size_t(27), input_len));
-  rmm::device_uvector<EvalT> dist(max_batch_size * input_len, stream, mr);
-
-  for (size_t offset = 0; offset < n_inputs; offset += max_batch_size) {
-    size_t batch_size = std::min(max_batch_size, n_inputs - offset);
-    dim3 grid_dim(raft::ceildiv<size_t>(batch_size, block_dim.x), grid_y, 1);
-
-    naive_distance_kernel<EvalT, DataT, IdxT><<<grid_dim, block_dim, 0, stream>>>(
-      dist.data(), x + offset * dim, y, batch_size, input_len, dim, type);
-
-    spatial::knn::detail::select_topk<EvalT, IdxT>(
-      dist.data(),
-      nullptr,
-      batch_size,
-      input_len,
-      static_cast<int>(k),
-      dist_topk + offset * k,
-      indices_topk + offset * k,
-      type != raft::distance::DistanceType::InnerProduct,
-      stream,
-      mr);
-  }
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-}
-
 template <typename IdxT, typename DistT, typename CompareDist>
 struct idx_dist_pair {
   IdxT idx;
@@ -232,18 +148,18 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
     }
   }
   double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
-  RAFT_LOG_INFO("Recall = %f (%zu/%zu)", actual_recall, match_count, total_count);
+  double error_margin  = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
+  RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",
+                actual_recall,
+                match_count,
+                total_count,
+                std::abs(error_margin * 100.0),
+                error_margin < 0 ? "above" : "below",
+                eps);
   if (actual_recall < min_recall - eps) {
-    if (actual_recall < min_recall * min_recall - eps) {
-      RAFT_LOG_ERROR("Recall is much lower than the minimum (%f < %f)", actual_recall, min_recall);
-    } else {
-      RAFT_LOG_WARN("Recall is suspiciously too low (%f < %f)", actual_recall, min_recall);
-    }
-    if (match_count == 0 || actual_recall < min_recall * std::min(min_recall, 0.5) - eps) {
-      return testing::AssertionFailure()
-             << "actual recall (" << actual_recall
-             << ") is much smaller than the minimum expected recall (" << min_recall << ").";
-    }
+    return testing::AssertionFailure()
+           << "actual recall (" << actual_recall << ") is lower than the minimum expected recall ("
+           << min_recall << "); eps = " << eps << ". ";
   }
   return testing::AssertionSuccess();
 }
diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu
index 47030b0d62..a97df7df75 100644
--- a/cpp/test/neighbors/ball_cover.cu
+++ b/cpp/test/neighbors/ball_cover.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "spatial_data.h"
 #include <raft/core/device_mdspan.hpp>
 #include <raft/distance/distance_types.hpp>
@@ -101,7 +101,7 @@ uint32_t count_discrepancies(value_idx* actual_idx,
 }
 
 template <typename value_t>
-void compute_bfknn(const raft::handle_t& handle,
+void compute_bfknn(const raft::device_resources& handle,
                    const value_t* X1,
                    const value_t* X2,
                    uint32_t n_rows,
@@ -152,7 +152,7 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs<va
   void basicTest()
   {
     params = ::testing::TestWithParam<BallCoverInputs<value_int>>::GetParam();
-    raft::handle_t handle;
+    raft::device_resources handle;
 
     uint32_t k         = params.k;
     uint32_t n_centers = 25;
@@ -252,7 +252,7 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs<valu
   void basicTest()
   {
     params = ::testing::TestWithParam<BallCoverInputs<value_int>>::GetParam();
-    raft::handle_t handle;
+    raft::device_resources handle;
 
     uint32_t k         = params.k;
     uint32_t n_centers = 25;
diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/test/neighbors/epsilon_neighborhood.cu
index c83817f6f8..c78a15dd2d 100644
--- a/cpp/test/neighbors/epsilon_neighborhood.cu
+++ b/cpp/test/neighbors/epsilon_neighborhood.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <memory>
 #include <raft/core/device_mdspan.hpp>
@@ -72,13 +72,13 @@ class EpsNeighTest : public ::testing::TestWithParam<EpsInputs<T, IdxT>> {
                                 false);
   }
 
+  const raft::device_resources handle;
   EpsInputs<T, IdxT> param;
   cudaStream_t stream = 0;
   rmm::device_uvector<T> data;
   rmm::device_uvector<bool> adj;
   rmm::device_uvector<IdxT> labels, vd;
   IdxT batchSize;
-  const raft::handle_t handle;
 };  // class EpsNeighTest
 
 const std::vector<EpsInputs<float, int>> inputsfi = {
diff --git a/cpp/test/neighbors/faiss_mr.cu b/cpp/test/neighbors/faiss_mr.cu
index 91ba1cc94c..5f0bcae933 100644
--- a/cpp/test/neighbors/faiss_mr.cu
+++ b/cpp/test/neighbors/faiss_mr.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <faiss/gpu/GpuResources.h>
 #include <raft/distance/distance_types.hpp>
@@ -73,7 +73,7 @@ class FAISS_MR_Test : public ::testing::TestWithParam<AllocInputs> {
     ASSERT_TRUE(free_after_alloc <= free_before - params_.size);
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
   AllocInputs params_;
 };
diff --git a/cpp/test/neighbors/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu
index 8df193d53d..6ab0671229 100644
--- a/cpp/test/neighbors/fused_l2_knn.cu
+++ b/cpp/test/neighbors/fused_l2_knn.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,16 +14,12 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
-
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/StandardGpuResources.h>
+#include "../test_utils.cuh"
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/brute_force.cuh>
 #include <raft/random/rng.cuh>
-#include <raft/spatial/knn/detail/common_faiss.h>
 #include <raft/spatial/knn/knn.cuh>
 
 #if defined RAFT_NN_COMPILED
@@ -112,8 +108,8 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
       search_queries(params_.num_queries * params_.dim, stream_),
       raft_indices_(params_.num_queries * params_.k, stream_),
       raft_distances_(params_.num_queries * params_.k, stream_),
-      faiss_indices_(params_.num_queries * params_.k, stream_),
-      faiss_distances_(params_.num_queries * params_.k, stream_)
+      ref_indices_(params_.num_queries * params_.k, stream_),
+      ref_distances_(params_.num_queries * params_.k, stream_)
   {
     RAFT_CUDA_TRY(cudaMemsetAsync(database.data(), 0, database.size() * sizeof(T), stream_));
     RAFT_CUDA_TRY(
@@ -123,15 +119,32 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     RAFT_CUDA_TRY(
       cudaMemsetAsync(raft_distances_.data(), 0, raft_distances_.size() * sizeof(T), stream_));
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(faiss_indices_.data(), 0, faiss_indices_.size() * sizeof(int64_t), stream_));
+      cudaMemsetAsync(ref_indices_.data(), 0, ref_indices_.size() * sizeof(int64_t), stream_));
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(faiss_distances_.data(), 0, faiss_distances_.size() * sizeof(T), stream_));
+      cudaMemsetAsync(ref_distances_.data(), 0, ref_distances_.size() * sizeof(T), stream_));
   }
 
  protected:
   void testBruteForce()
   {
-    launchFaissBfknn();
+    // calculate the naive knn, by calculating the full pairwise distances and doing a k-select
+    rmm::device_uvector<T> temp_distances(num_db_vecs * num_queries, stream_);
+    distance::pairwise_distance(
+      handle_,
+      raft::make_device_matrix_view<T, int64_t>(search_queries.data(), num_queries, dim),
+      raft::make_device_matrix_view<T, int64_t>(database.data(), num_db_vecs, dim),
+      raft::make_device_matrix_view<T, int64_t>(temp_distances.data(), num_queries, num_db_vecs),
+      metric);
+
+    spatial::knn::select_k<int64_t, T>(temp_distances.data(),
+                                       nullptr,
+                                       num_queries,
+                                       num_db_vecs,
+                                       ref_distances_.data(),
+                                       ref_indices_.data(),
+                                       true,
+                                       k_,
+                                       stream_);
 
     auto index_view =
       raft::make_device_matrix_view<const T, int64_t>(database.data(), num_db_vecs, dim);
@@ -145,14 +158,14 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
       handle_, index_view, query_view, out_indices_view, out_dists_view, metric);
 
     // verify.
-    devArrMatchKnnPair(faiss_indices_.data(),
-                       raft_indices_.data(),
-                       faiss_distances_.data(),
-                       raft_distances_.data(),
-                       num_queries,
-                       k_,
-                       float(0.001),
-                       stream_);
+    ASSERT_TRUE(devArrMatchKnnPair(ref_indices_.data(),
+                                   raft_indices_.data(),
+                                   ref_distances_.data(),
+                                   raft_distances_.data(),
+                                   num_queries,
+                                   k_,
+                                   float(0.001),
+                                   stream_));
   }
 
   void SetUp() override
@@ -169,36 +182,8 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     uniform(handle_, r, search_queries.data(), num_queries * dim, T(-1.0), T(1.0));
   }
 
-  void launchFaissBfknn()
-  {
-    faiss::MetricType m = detail::build_faiss_metric(metric);
-
-    faiss::gpu::StandardGpuResources gpu_res;
-
-    gpu_res.noTempMemory();
-    int device;
-    RAFT_CUDA_TRY(cudaGetDevice(&device));
-    gpu_res.setDefaultStream(device, stream_);
-
-    faiss::gpu::GpuDistanceParams args;
-    args.metric          = m;
-    args.metricArg       = 0;
-    args.k               = k_;
-    args.dims            = dim;
-    args.vectors         = database.data();
-    args.vectorsRowMajor = true;
-    args.numVectors      = num_db_vecs;
-    args.queries         = search_queries.data();
-    args.queriesRowMajor = true;
-    args.numQueries      = num_queries;
-    args.outDistances    = faiss_distances_.data();
-    args.outIndices      = faiss_indices_.data();
-
-    bfKnn(&gpu_res, args);
-  }
-
  private:
-  raft::handle_t handle_;
+  raft::device_resources handle_;
   cudaStream_t stream_ = 0;
   FusedL2KNNInputs params_;
   int num_queries;
@@ -208,8 +193,8 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
   rmm::device_uvector<T> search_queries;
   rmm::device_uvector<int64_t> raft_indices_;
   rmm::device_uvector<T> raft_distances_;
-  rmm::device_uvector<int64_t> faiss_indices_;
-  rmm::device_uvector<T> faiss_distances_;
+  rmm::device_uvector<int64_t> ref_indices_;
+  rmm::device_uvector<T> ref_distances_;
   int k_;
   raft::distance::DistanceType metric;
 };
@@ -223,7 +208,6 @@ const std::vector<FusedL2KNNInputs> inputs = {
   {1000, 10000, 16, 50, raft::distance::DistanceType::L2Expanded},
   {1000, 10000, 32, 50, raft::distance::DistanceType::L2Expanded},
   {10000, 40000, 32, 30, raft::distance::DistanceType::L2Expanded},
-  {131072, 131072, 8, 60, raft::distance::DistanceType::L2Expanded},
   // L2 unexpanded
   {100, 1000, 16, 10, raft::distance::DistanceType::L2Unexpanded},
   {1000, 10000, 16, 10, raft::distance::DistanceType::L2Unexpanded},
@@ -232,7 +216,7 @@ const std::vector<FusedL2KNNInputs> inputs = {
   {1000, 10000, 16, 50, raft::distance::DistanceType::L2Unexpanded},
   {1000, 10000, 32, 50, raft::distance::DistanceType::L2Unexpanded},
   {10000, 40000, 32, 30, raft::distance::DistanceType::L2Unexpanded},
-  {131072, 131072, 8, 60, raft::distance::DistanceType::L2Unexpanded}};
+};
 
 typedef FusedL2KNNTest<float> FusedL2KNNTestF;
 TEST_P(FusedL2KNNTestF, FusedBruteForce) { this->testBruteForce(); }
diff --git a/cpp/test/neighbors/haversine.cu b/cpp/test/neighbors/haversine.cu
index 78bd377156..dc5c8afe18 100644
--- a/cpp/test/neighbors/haversine.cu
+++ b/cpp/test/neighbors/haversine.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/distance/distance_types.hpp>
@@ -100,7 +100,7 @@ class HaversineKNNTest : public ::testing::Test {
   void SetUp() override { basicTest(); }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<value_t> d_train_inputs;
diff --git a/cpp/test/neighbors/knn.cu b/cpp/test/neighbors/knn.cu
index eb5ecf663f..6814d47dcb 100644
--- a/cpp/test/neighbors/knn.cu
+++ b/cpp/test/neighbors/knn.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
@@ -154,7 +154,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
   }
 
  private:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   KNNInputs params_;
diff --git a/cpp/test/neighbors/refine.cu b/cpp/test/neighbors/refine.cu
index 06c1317b1e..174dce5a7f 100644
--- a/cpp/test/neighbors/refine.cu
+++ b/cpp/test/neighbors/refine.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include "ann_utils.cuh"
 
-#include "refine_helper.cuh"
+#include <raft_internal/neighbors/refine_helper.cuh>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/detail/refine.cuh>
@@ -31,8 +31,8 @@
 
 #include <gtest/gtest.h>
 
-#if defined RAFT_NN_COMPILED
-#include <raft/neighbors/specializations.cuh>
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/neighbors/specializations/refine.cuh>
 #endif
 
 #include <vector>
@@ -40,11 +40,11 @@
 namespace raft::neighbors {
 
 template <typename DataT, typename DistanceT, typename IdxT>
-class RefineTest : public ::testing::TestWithParam<detail::RefineInputs<IdxT>> {
+class RefineTest : public ::testing::TestWithParam<RefineInputs<IdxT>> {
  public:
   RefineTest()
     : stream_(handle_.get_stream()),
-      data(handle_, ::testing::TestWithParam<detail::RefineInputs<IdxT>>::GetParam())
+      data(handle_, ::testing::TestWithParam<RefineInputs<IdxT>>::GetParam())
   {
   }
 
@@ -102,31 +102,31 @@ class RefineTest : public ::testing::TestWithParam<detail::RefineInputs<IdxT>> {
   }
 
  public:
-  raft::handle_t handle_;
+  raft::device_resources handle_;
   rmm::cuda_stream_view stream_;
-  detail::RefineHelper<DataT, DistanceT, IdxT> data;
+  RefineHelper<DataT, DistanceT, IdxT> data;
 };
 
-const std::vector<detail::RefineInputs<int64_t>> inputs =
-  raft::util::itertools::product<detail::RefineInputs<int64_t>>(
-    {137},
-    {1000},
-    {16},
-    {1, 10, 33},
-    {33},
+const std::vector<RefineInputs<uint64_t>> inputs =
+  raft::util::itertools::product<RefineInputs<uint64_t>>(
+    {static_cast<uint64_t>(137)},
+    {static_cast<uint64_t>(1000)},
+    {static_cast<uint64_t>(16)},
+    {static_cast<uint64_t>(1), static_cast<uint64_t>(10), static_cast<uint64_t>(33)},
+    {static_cast<uint64_t>(33)},
     {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false, true});
 
-typedef RefineTest<float, float, std::int64_t> RefineTestF;
+typedef RefineTest<float, float, std::uint64_t> RefineTestF;
 TEST_P(RefineTestF, AnnRefine) { this->testRefine(); }
 
 INSTANTIATE_TEST_CASE_P(RefineTest, RefineTestF, ::testing::ValuesIn(inputs));
 
-typedef RefineTest<uint8_t, float, std::int64_t> RefineTestF_uint8;
+typedef RefineTest<uint8_t, float, std::uint64_t> RefineTestF_uint8;
 TEST_P(RefineTestF_uint8, AnnRefine) { this->testRefine(); }
 INSTANTIATE_TEST_CASE_P(RefineTest, RefineTestF_uint8, ::testing::ValuesIn(inputs));
 
-typedef RefineTest<int8_t, float, std::int64_t> RefineTestF_int8;
+typedef RefineTest<int8_t, float, std::uint64_t> RefineTestF_int8;
 TEST_P(RefineTestF_int8, AnnRefine) { this->testRefine(); }
 INSTANTIATE_TEST_CASE_P(RefineTest, RefineTestF_int8, ::testing::ValuesIn(inputs));
 }  // namespace raft::neighbors
diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu
index bfcfca5ead..61a6345e5e 100644
--- a/cpp/test/neighbors/selection.cu
+++ b/cpp/test/neighbors/selection.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/sparse/detail/utils.h>
 #include <raft/spatial/knn/knn.cuh>
@@ -49,10 +49,11 @@ std::ostream& operator<<(std::ostream& os, const SelectTestSpec& ss)
 }
 
 template <typename IdxT>
-auto gen_simple_ids(int n_inputs, int input_len) -> std::vector<IdxT>
+auto gen_simple_ids(int n_inputs, int input_len, const raft::device_resources& handle)
+  -> std::vector<IdxT>
 {
   std::vector<IdxT> out(n_inputs * input_len);
-  auto s = rmm::cuda_stream_default;
+  auto s = handle.get_stream();
   rmm::device_uvector<IdxT> out_d(out.size(), s);
   iota_fill(out_d.data(), IdxT(n_inputs), IdxT(input_len), s);
   update_host(out.data(), out_d.data(), out.size(), s);
@@ -65,14 +66,16 @@ struct SelectInOutSimple {
  public:
   bool not_supported = false;
 
-  SelectInOutSimple(const SelectTestSpec& spec,
+  SelectInOutSimple(std::shared_ptr<raft::device_resources> handle,
+                    const SelectTestSpec& spec,
                     const std::vector<KeyT>& in_dists,
                     const std::vector<KeyT>& out_dists,
                     const std::vector<IdxT>& out_ids)
-    : in_dists_(in_dists),
-      in_ids_(gen_simple_ids<IdxT>(spec.n_inputs, spec.input_len)),
+    : handle_(handle),  // listed first to match member declaration order (avoids -Wreorder)
+      in_dists_(in_dists),
+      in_ids_(gen_simple_ids<IdxT>(spec.n_inputs, spec.input_len, *handle)),
       out_dists_(out_dists),
       out_ids_(out_ids)
   {
   }
 
@@ -82,6 +85,7 @@ struct SelectInOutSimple {
   auto get_out_ids() -> std::vector<IdxT>& { return out_ids_; }
 
  private:
+  std::shared_ptr<raft::device_resources> handle_;
   std::vector<KeyT> in_dists_;
   std::vector<IdxT> in_ids_;
   std::vector<KeyT> out_dists_;
@@ -93,19 +97,22 @@ struct SelectInOutComputed {
  public:
   bool not_supported = false;
 
-  SelectInOutComputed(const SelectTestSpec& spec,
+  SelectInOutComputed(std::shared_ptr<raft::device_resources> handle,
+                      const SelectTestSpec& spec,
                       knn::SelectKAlgo algo,
                       const std::vector<KeyT>& in_dists,
                       const std::optional<std::vector<IdxT>>& in_ids = std::nullopt)
-    : in_dists_(in_dists),
-      in_ids_(in_ids.value_or(gen_simple_ids<IdxT>(spec.n_inputs, spec.input_len))),
+    : handle_(handle),
+      in_dists_(in_dists),
+      in_ids_(in_ids.value_or(gen_simple_ids<IdxT>(spec.n_inputs, spec.input_len, *handle.get()))),
       out_dists_(spec.n_inputs * spec.k),
       out_ids_(spec.n_inputs * spec.k)
+
   {
     // check if the size is supported by the algorithm
     switch (algo) {
       case knn::SelectKAlgo::WARP_SORT:
-        if (spec.k > raft::spatial::knn::detail::topk::kMaxCapacity) {
+        if (spec.k > raft::matrix::detail::select::warpsort::kMaxCapacity) {
           not_supported = true;
           return;
         }
@@ -119,7 +126,7 @@ struct SelectInOutComputed {
       default: break;
     }
 
-    auto stream = rmm::cuda_stream_default;
+    auto stream = handle_.get()->get_stream();
 
     rmm::device_uvector<KeyT> in_dists_d(in_dists_.size(), stream);
     rmm::device_uvector<IdxT> in_ids_d(in_ids_.size(), stream);
@@ -156,6 +163,7 @@ struct SelectInOutComputed {
   auto get_out_ids() -> std::vector<IdxT>& { return out_ids_; }
 
  private:
+  std::shared_ptr<raft::device_resources> handle_;
   std::vector<KeyT> in_dists_;
   std::vector<IdxT> in_ids_;
   std::vector<KeyT> out_dists_;
@@ -205,11 +213,13 @@ struct SelectInOutComputed {
 };
 
 template <typename InOut>
-using Params = std::tuple<SelectTestSpec, knn::SelectKAlgo, InOut>;
+using Params =
+  std::tuple<SelectTestSpec, knn::SelectKAlgo, InOut, std::shared_ptr<raft::device_resources>>;
 
 template <typename KeyT, typename IdxT, template <typename, typename> typename ParamsReader>
 class SelectionTest : public testing::TestWithParam<typename ParamsReader<KeyT, IdxT>::ParamsIn> {
  protected:
+  std::shared_ptr<raft::device_resources> handle_;
   const SelectTestSpec spec;
   const knn::SelectKAlgo algo;
 
@@ -218,10 +228,11 @@ class SelectionTest : public testing::TestWithParam<typename ParamsReader<KeyT,
 
  public:
   explicit SelectionTest(Params<typename ParamsReader<KeyT, IdxT>::InOut> ps)
-    : spec(std::get<0>(ps)),
+    : handle_(std::get<3>(ps)),
+      spec(std::get<0>(ps)),
       algo(std::get<1>(ps)),
       ref(std::get<2>(ps)),
-      res(spec, algo, ref.get_in_dists(), ref.get_in_ids())
+      res(handle_, spec, algo, ref.get_in_dists(), ref.get_in_ids())
   {
   }
 
@@ -238,12 +249,13 @@ class SelectionTest : public testing::TestWithParam<typename ParamsReader<KeyT,
   void run()
   {
     if (ref.not_supported || res.not_supported) { GTEST_SKIP(); }
-    ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare<KeyT>()));
 
+    ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare<KeyT>()));
     // If the dists (keys) are the same, different corresponding ids may end up in the selection due
     // to non-deterministic nature of some implementations.
-    auto& in_ids     = ref.get_in_ids();
-    auto& in_dists   = ref.get_in_dists();
+    auto& in_ids   = ref.get_in_ids();
+    auto& in_dists = ref.get_in_dists();
+
     auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) {
       if (i == j) return true;
       auto ix_i = size_t(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin());
@@ -265,17 +277,20 @@ struct params_simple {
   using InOut = SelectInOutSimple<KeyT, IdxT>;
   using Inputs =
     std::tuple<SelectTestSpec, std::vector<KeyT>, std::vector<KeyT>, std::vector<IdxT>>;
-  using ParamsIn = std::tuple<Inputs, knn::SelectKAlgo>;
+  using Handle   = std::shared_ptr<raft::device_resources>;
+  using ParamsIn = std::tuple<Inputs, knn::SelectKAlgo, Handle>;
 
   static auto read(ParamsIn ps) -> Params<InOut>
   {
-    auto ins  = std::get<0>(ps);
-    auto algo = std::get<1>(ps);
+    auto ins    = std::get<0>(ps);
+    auto algo   = std::get<1>(ps);
+    auto handle = std::get<2>(ps);
     return std::make_tuple(
       std::get<0>(ins),
       algo,
       SelectInOutSimple<KeyT, IdxT>(
-        std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins)));
+        handle, std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins)),
+      handle);
   }
 };
 
@@ -339,38 +354,43 @@ auto inputs_simple_f = testing::Values(
 
 typedef SelectionTest<float, int, params_simple> SimpleFloatInt;
 TEST_P(SimpleFloatInt, Run) { run(); }
-INSTANTIATE_TEST_CASE_P(SelectionTest,
-                        SimpleFloatInt,
-                        testing::Combine(inputs_simple_f,
-                                         testing::Values(knn::SelectKAlgo::FAISS,
-                                                         knn::SelectKAlgo::RADIX_8_BITS,
-                                                         knn::SelectKAlgo::RADIX_11_BITS,
-                                                         knn::SelectKAlgo::WARP_SORT)));
+INSTANTIATE_TEST_CASE_P(
+  SelectionTest,
+  SimpleFloatInt,
+  testing::Combine(inputs_simple_f,
+                   testing::Values(knn::SelectKAlgo::FAISS,
+                                   knn::SelectKAlgo::RADIX_8_BITS,
+                                   knn::SelectKAlgo::RADIX_11_BITS,
+                                   knn::SelectKAlgo::WARP_SORT),
+                   testing::Values(std::make_shared<raft::device_resources>())));
 
 template <knn::SelectKAlgo RefAlgo>
 struct with_ref {
   template <typename KeyT, typename IdxT>
   struct params_random {
     using InOut    = SelectInOutComputed<KeyT, IdxT>;
-    using ParamsIn = std::tuple<SelectTestSpec, knn::SelectKAlgo>;
+    using Handle   = std::shared_ptr<raft::device_resources>;
+    using ParamsIn = std::tuple<SelectTestSpec, knn::SelectKAlgo, Handle>;
 
     static auto read(ParamsIn ps) -> Params<InOut>
     {
-      auto spec = std::get<0>(ps);
-      auto algo = std::get<1>(ps);
+      auto spec   = std::get<0>(ps);
+      auto algo   = std::get<1>(ps);
+      auto handle = std::get<2>(ps);  // shared device resources passed in via the test params
+
       std::vector<KeyT> dists(spec.input_len * spec.n_inputs);
 
-      raft::handle_t handle;
       {
-        auto s = handle.get_stream();
+        auto s = handle->get_stream();  // generate and copy on the shared handle's stream
         rmm::device_uvector<KeyT> dists_d(spec.input_len * spec.n_inputs, s);
         raft::random::RngState r(42);
-        normal(handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0));
+        normal(*handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0));
         update_host(dists.data(), dists_d.data(), dists_d.size(), s);
         s.synchronize();
       }
 
-      return std::make_tuple(spec, algo, SelectInOutComputed<KeyT, IdxT>(spec, RefAlgo, dists));
+      return std::make_tuple(
+        spec, algo, SelectInOutComputed<KeyT, IdxT>(handle, spec, RefAlgo, dists), handle);
     }
   };
 };
@@ -416,11 +436,11 @@ auto inputs_random_largesize = testing::Values(SelectTestSpec{100, 100000, 1, tr
                                                SelectTestSpec{100, 100000, 100, true, false},
                                                SelectTestSpec{100, 100000, 200, true},
                                                SelectTestSpec{100000, 100, 100, false},
-                                               SelectTestSpec{1, 1000000000, 1, true},
-                                               SelectTestSpec{1, 1000000000, 16, false, false},
-                                               SelectTestSpec{1, 1000000000, 64, false},
-                                               SelectTestSpec{1, 1000000000, 128, true, false},
-                                               SelectTestSpec{1, 1000000000, 256, false, false});
+                                               SelectTestSpec{1, 100000000, 1, true},
+                                               SelectTestSpec{1, 100000000, 16, false, false},
+                                               SelectTestSpec{1, 100000000, 64, false},
+                                               SelectTestSpec{1, 100000000, 128, true, false},
+                                               SelectTestSpec{1, 100000000, 256, false, false});
 
 auto inputs_random_largek = testing::Values(SelectTestSpec{100, 100000, 1000, true},
                                             SelectTestSpec{100, 100000, 2000, true},
@@ -431,30 +451,36 @@ auto inputs_random_largek = testing::Values(SelectTestSpec{100, 100000, 1000, tr
 typedef SelectionTest<float, int, with_ref<knn::SelectKAlgo::FAISS>::params_random>
   ReferencedRandomFloatInt;
 TEST_P(ReferencedRandomFloatInt, Run) { run(); }
-INSTANTIATE_TEST_CASE_P(SelectionTest,
-                        ReferencedRandomFloatInt,
-                        testing::Combine(inputs_random_longlist,
-                                         testing::Values(knn::SelectKAlgo::RADIX_8_BITS,
-                                                         knn::SelectKAlgo::RADIX_11_BITS,
-                                                         knn::SelectKAlgo::WARP_SORT)));
+INSTANTIATE_TEST_CASE_P(
+  SelectionTest,
+  ReferencedRandomFloatInt,
+  testing::Combine(inputs_random_longlist,
+                   testing::Values(knn::SelectKAlgo::RADIX_8_BITS,
+                                   knn::SelectKAlgo::RADIX_11_BITS,
+                                   knn::SelectKAlgo::WARP_SORT),
+                   testing::Values(std::make_shared<raft::device_resources>())));
 
 typedef SelectionTest<double, size_t, with_ref<knn::SelectKAlgo::FAISS>::params_random>
   ReferencedRandomDoubleSizeT;
 TEST_P(ReferencedRandomDoubleSizeT, Run) { run(); }
-INSTANTIATE_TEST_CASE_P(SelectionTest,
-                        ReferencedRandomDoubleSizeT,
-                        testing::Combine(inputs_random_longlist,
-                                         testing::Values(knn::SelectKAlgo::RADIX_8_BITS,
-                                                         knn::SelectKAlgo::RADIX_11_BITS,
-                                                         knn::SelectKAlgo::WARP_SORT)));
+INSTANTIATE_TEST_CASE_P(
+  SelectionTest,
+  ReferencedRandomDoubleSizeT,
+  testing::Combine(inputs_random_longlist,
+                   testing::Values(knn::SelectKAlgo::RADIX_8_BITS,
+                                   knn::SelectKAlgo::RADIX_11_BITS,
+                                   knn::SelectKAlgo::WARP_SORT),
+                   testing::Values(std::make_shared<raft::device_resources>())));
 
 typedef SelectionTest<double, int, with_ref<knn::SelectKAlgo::FAISS>::params_random>
   ReferencedRandomDoubleInt;
 TEST_P(ReferencedRandomDoubleInt, LargeSize) { run(); }
-INSTANTIATE_TEST_CASE_P(SelectionTest,
-                        ReferencedRandomDoubleInt,
-                        testing::Combine(inputs_random_largesize,
-                                         testing::Values(knn::SelectKAlgo::WARP_SORT)));
+INSTANTIATE_TEST_CASE_P(
+  SelectionTest,
+  ReferencedRandomDoubleInt,
+  testing::Combine(inputs_random_largesize,
+                   testing::Values(knn::SelectKAlgo::WARP_SORT),
+                   testing::Values(std::make_shared<raft::device_resources>())));
 
 /** TODO: Fix test failure in RAFT CI
  *
diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu
index 1f14fd23f7..c2dbc5dc1c 100644
--- a/cpp/test/random/make_blobs.cu
+++ b/cpp/test/random/make_blobs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cub/cub.cuh>
 #include <gtest/gtest.h>
 #include <raft/core/device_mdarray.hpp>
@@ -147,8 +147,8 @@ class MakeBlobsTest : public ::testing::TestWithParam<MakeBlobsInputs<T>> {
   }
 
  protected:
+  raft::device_resources handle;
   MakeBlobsInputs<T> params;
-  raft::handle_t handle;
   cudaStream_t stream = 0;
 
   device_vector<T, int> mean_var;
diff --git a/cpp/test/random/make_regression.cu b/cpp/test/random/make_regression.cu
index 65d4c4cb31..7508b57bdd 100644
--- a/cpp/test/random/make_regression.cu
+++ b/cpp/test/random/make_regression.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/subtract.cuh>
 #include <raft/linalg/transpose.cuh>
@@ -117,7 +117,7 @@ class MakeRegressionTest : public ::testing::TestWithParam<MakeRegressionInputs<
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
 
   MakeRegressionInputs<T> params;
@@ -127,6 +127,9 @@ class MakeRegressionTest : public ::testing::TestWithParam<MakeRegressionInputs<
 
 typedef MakeRegressionTest<float> MakeRegressionTestF;
 const std::vector<MakeRegressionInputs<float>> inputsf_t = {
+  {0.01f, 256, 32, 16, 1, -1, 0.f, true, raft::random::GenPC, 1234ULL},
+  {0.01f, 1000, 100, 47, 4, 65, 4.2f, true, raft::random::GenPC, 1234ULL},
+  {0.01f, 20000, 500, 450, 13, -1, -3.f, false, raft::random::GenPC, 1234ULL},
   {0.01f, 256, 32, 16, 1, -1, 0.f, true, raft::random::GenPhilox, 1234ULL},
   {0.01f, 1000, 100, 47, 4, 65, 4.2f, true, raft::random::GenPhilox, 1234ULL},
   {0.01f, 20000, 500, 450, 13, -1, -3.f, false, raft::random::GenPhilox, 1234ULL}};
@@ -147,6 +150,9 @@ INSTANTIATE_TEST_CASE_P(MakeRegressionTests, MakeRegressionTestF, ::testing::Val
 
 typedef MakeRegressionTest<double> MakeRegressionTestD;
 const std::vector<MakeRegressionInputs<double>> inputsd_t = {
+  {0.01, 256, 32, 16, 1, -1, 0.0, true, raft::random::GenPC, 1234ULL},
+  {0.01, 1000, 100, 47, 4, 65, 4.2, true, raft::random::GenPC, 1234ULL},
+  {0.01, 20000, 500, 450, 13, -1, -3.0, false, raft::random::GenPC, 1234ULL},
   {0.01, 256, 32, 16, 1, -1, 0.0, true, raft::random::GenPhilox, 1234ULL},
   {0.01, 1000, 100, 47, 4, 65, 4.2, true, raft::random::GenPhilox, 1234ULL},
   {0.01, 20000, 500, 450, 13, -1, -3.0, false, raft::random::GenPhilox, 1234ULL}};
@@ -245,7 +251,7 @@ class MakeRegressionMdspanTest : public ::testing::TestWithParam<MakeRegressionI
 
  private:
   MakeRegressionInputs<T> params{::testing::TestWithParam<MakeRegressionInputs<T>>::GetParam()};
-  raft::handle_t handle;
+  raft::device_resources handle;
   rmm::device_uvector<T> values_ret{params.n_samples * params.n_targets, handle.get_stream()};
   rmm::device_uvector<T> values_prod{params.n_samples * params.n_targets, handle.get_stream()};
   int zero_count = -1;
diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu
index 51a79ae04a..1aa8b6a555 100644
--- a/cpp/test/random/multi_variable_gaussian.cu
+++ b/cpp/test/random/multi_variable_gaussian.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cmath>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -65,7 +65,7 @@ enum Correlation : unsigned char {
 template <typename T>
 struct MVGInputs {
   T tolerance;
-  typename multi_variable_gaussian<T>::Decomposer method;
+  typename detail::multi_variable_gaussian<T>::Decomposer method;
   Correlation corr;
   int dim, nPoints;
   unsigned long long int seed;
@@ -79,9 +79,10 @@ template <typename T>
 
 template <typename T>
 class MVGTest : public ::testing::TestWithParam<MVGInputs<T>> {
- protected:
+ public:
   MVGTest()
-    : workspace_d(0, handle.get_stream()),
+    : params(::testing::TestWithParam<MVGInputs<T>>::GetParam()),
+      workspace_d(0, handle.get_stream()),
       P_d(0, handle.get_stream()),
       x_d(0, handle.get_stream()),
       X_d(0, handle.get_stream()),
@@ -90,6 +91,7 @@ class MVGTest : public ::testing::TestWithParam<MVGInputs<T>> {
   {
   }
 
+ protected:
   void SetUp() override
   {
     // getting params
@@ -139,7 +141,7 @@ class MVGTest : public ::testing::TestWithParam<MVGInputs<T>> {
     raft::update_device(x_d.data(), x.data(), dim, stream);
 
     // initializing the mvg
-    mvg           = new multi_variable_gaussian<T>(handle, dim, method);
+    mvg           = new detail::multi_variable_gaussian<T>(handle, dim, method);
     std::size_t o = mvg->get_workspace_size();
 
     // give the workspace area to mvg
@@ -195,32 +197,32 @@ class MVGTest : public ::testing::TestWithParam<MVGInputs<T>> {
   }
 
  protected:
+  raft::device_resources handle;
   MVGInputs<T> params;
-  std::vector<T> P, x, X;
   rmm::device_uvector<T> workspace_d, P_d, x_d, X_d, Rand_cov, Rand_mean;
+  std::vector<T> P, x, X;
   int dim, nPoints;
-  typename multi_variable_gaussian<T>::Decomposer method;
+  typename detail::multi_variable_gaussian<T>::Decomposer method;
   Correlation corr;
-  multi_variable_gaussian<T>* mvg = NULL;
+  detail::multi_variable_gaussian<T>* mvg = NULL;
   T tolerance;
-  raft::handle_t handle;
 };  // end of MVGTest class
 
 template <typename T>
 class MVGMdspanTest : public ::testing::TestWithParam<MVGInputs<T>> {
  private:
-  static auto old_enum_to_new_enum(typename multi_variable_gaussian<T>::Decomposer method)
+  static auto old_enum_to_new_enum(typename detail::multi_variable_gaussian<T>::Decomposer method)
   {
-    if (method == multi_variable_gaussian<T>::chol_decomp) {
+    if (method == detail::multi_variable_gaussian<T>::chol_decomp) {
       return multi_variable_gaussian_decomposition_method::CHOLESKY;
-    } else if (method == multi_variable_gaussian<T>::jacobi) {
+    } else if (method == detail::multi_variable_gaussian<T>::jacobi) {
       return multi_variable_gaussian_decomposition_method::JACOBI;
     } else {
       return multi_variable_gaussian_decomposition_method::QR;
     }
   }
 
- protected:
+ public:
   MVGMdspanTest()
     : workspace_d(0, handle.get_stream()),
       P_d(0, handle.get_stream()),
@@ -280,7 +282,7 @@ class MVGMdspanTest : public ::testing::TestWithParam<MVGInputs<T>> {
 
     rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource();
     ASSERT_TRUE(mem_resource_ptr != nullptr);
-    raft::random::compute_multi_variable_gaussian(
+    raft::random::multi_variable_gaussian(
       handle, *mem_resource_ptr, x_view, P_view, X_view, method);
 
     // saving the mean of the randoms in Rand_mean
@@ -323,73 +325,79 @@ class MVGMdspanTest : public ::testing::TestWithParam<MVGInputs<T>> {
   }
 
  protected:
+  raft::device_resources handle;
+
   MVGInputs<T> params;
   std::vector<T> P, x, X;
   rmm::device_uvector<T> workspace_d, P_d, x_d, X_d, Rand_cov, Rand_mean;
   int dim, nPoints;
   Correlation corr;
   T tolerance;
-  raft::handle_t handle;
 };  // end of MVGTest class
 
 ///@todo find out the reason that Un-correlated covs are giving problems (in qr)
 // Declare your inputs
 const std::vector<MVGInputs<float>> inputsf = {
   {0.3f,
-   multi_variable_gaussian<float>::Decomposer::chol_decomp,
+   detail::multi_variable_gaussian<float>::Decomposer::chol_decomp,
    Correlation::CORRELATED,
    5,
    30000,
    6ULL},
   {0.1f,
-   multi_variable_gaussian<float>::Decomposer::chol_decomp,
+   detail::multi_variable_gaussian<float>::Decomposer::chol_decomp,
    Correlation::UNCORRELATED,
    5,
    30000,
    6ULL},
   {0.25f,
-   multi_variable_gaussian<float>::Decomposer::jacobi,
+   detail::multi_variable_gaussian<float>::Decomposer::jacobi,
    Correlation::CORRELATED,
    5,
    30000,
    6ULL},
   {0.1f,
-   multi_variable_gaussian<float>::Decomposer::jacobi,
+   detail::multi_variable_gaussian<float>::Decomposer::jacobi,
    Correlation::UNCORRELATED,
    5,
    30000,
    6ULL},
-  {0.2f, multi_variable_gaussian<float>::Decomposer::qr, Correlation::CORRELATED, 5, 30000, 6ULL},
+  {0.2f,
+   detail::multi_variable_gaussian<float>::Decomposer::qr,
+   Correlation::CORRELATED,
+   5,
+   30000,
+   6ULL},
   // { 0.2f,          multi_variable_gaussian<float>::Decomposer::qr,
   // Correlation::UNCORRELATED, 5, 30000, 6ULL}
 };
 const std::vector<MVGInputs<double>> inputsd = {
   {0.25,
-   multi_variable_gaussian<double>::Decomposer::chol_decomp,
+   detail::multi_variable_gaussian<double>::Decomposer::chol_decomp,
    Correlation::CORRELATED,
    10,
    3000000,
    6ULL},
   {0.1,
-   multi_variable_gaussian<double>::Decomposer::chol_decomp,
+   detail::multi_variable_gaussian<double>::Decomposer::chol_decomp,
    Correlation::UNCORRELATED,
    10,
    3000000,
    6ULL},
   {0.25,
-   multi_variable_gaussian<double>::Decomposer::jacobi,
+   detail::multi_variable_gaussian<double>::Decomposer::jacobi,
    Correlation::CORRELATED,
    10,
    3000000,
    6ULL},
   {0.1,
-   multi_variable_gaussian<double>::Decomposer::jacobi,
+   detail::multi_variable_gaussian<double>::Decomposer::jacobi,
    Correlation::UNCORRELATED,
    10,
    3000000,
    6ULL},
   {0.2,
-   multi_variable_gaussian<double>::Decomposer::qr,
+   detail::multi_variable_gaussian<double>::Decomposer::qr,
    Correlation::CORRELATED,
    10,
    3000000,
diff --git a/cpp/test/random/permute.cu b/cpp/test/random/permute.cu
index 32e5540d51..d5fcca270e 100644
--- a/cpp/test/random/permute.cu
+++ b/cpp/test/random/permute.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <raft/random/permute.cuh>
 #include <raft/random/rng.cuh>
@@ -75,7 +75,7 @@ class PermTest : public ::testing::TestWithParam<PermInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   PermInputs<T> params;
   rmm::device_uvector<T> in, out;
   T* in_ptr  = nullptr;
@@ -158,7 +158,7 @@ class PermMdspanTest : public ::testing::TestWithParam<PermInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   PermInputs<T> params;
   rmm::device_uvector<T> in, out;
   T* in_ptr  = nullptr;
diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/test/random/rmat_rectangular_generator.cu
index 0baaaf28cf..aae3898389 100644
--- a/cpp/test/random/rmat_rectangular_generator.cu
+++ b/cpp/test/random/rmat_rectangular_generator.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include <sys/timeb.h>
 #include <vector>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/random/rmat_rectangular_generator.cuh>
 #include <raft/random/rng.cuh>
@@ -242,7 +242,7 @@ class RmatGenTest : public ::testing::TestWithParam<RmatInputs> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RmatInputs params;
@@ -347,7 +347,7 @@ class RmatGenMdspanTest : public ::testing::TestWithParam<RmatInputs> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RmatInputs params;
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index 82f6e0e247..d3b8e44b05 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #include <memory>
 #include <sys/timeb.h>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cub/cub.cuh>
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
@@ -145,8 +145,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
       case RNG_LogNormal: {
         auto var   = params.end * params.end;
         auto mu    = params.start;
-        meanvar[0] = raft::myExp(mu + var * T(0.5));
-        meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
+        meanvar[0] = raft::exp(mu + var * T(0.5));
+        meanvar[1] = (raft::exp(var) - T(1.0)) * raft::exp(T(2.0) * mu + var);
         break;
       }
       case RNG_Uniform:
@@ -169,7 +169,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
         meanvar[1] = meanvar[0] * meanvar[0];
         break;
       case RNG_Rayleigh:
-        meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0));
+        meanvar[0] = params.start * raft::sqrt(T(3.1415 / 2.0));
         meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
         break;
       case RNG_Laplace:
@@ -180,7 +180,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngInputs<T> params;
@@ -239,8 +239,8 @@ class RngMdspanTest : public ::testing::TestWithParam<RngInputs<T>> {
       case RNG_LogNormal: {
         auto var   = params.end * params.end;
         auto mu    = params.start;
-        meanvar[0] = raft::myExp(mu + var * T(0.5));
-        meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
+        meanvar[0] = raft::exp(mu + var * T(0.5));
+        meanvar[1] = (raft::exp(var) - T(1.0)) * raft::exp(T(2.0) * mu + var);
         break;
       }
       case RNG_Uniform:
@@ -263,7 +263,7 @@ class RngMdspanTest : public ::testing::TestWithParam<RngInputs<T>> {
         meanvar[1] = meanvar[0] * meanvar[0];
         break;
       case RNG_Rayleigh:
-        meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0));
+        meanvar[0] = params.start * raft::sqrt(T(3.1415 / 2.0));
         meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
         break;
       case RNG_Laplace:
@@ -274,7 +274,7 @@ class RngMdspanTest : public ::testing::TestWithParam<RngInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngInputs<T> params;
@@ -391,7 +391,7 @@ TEST(Rng, MeanError)
   int num_experiments = 1024;
   int len             = num_samples * num_experiments;
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   auto stream = handle.get_stream();
 
   rmm::device_uvector<float> data(len, stream);
@@ -458,7 +458,7 @@ class ScaledBernoulliTest : public ::testing::Test {
       h_data.get(), h_data.get() + len, [](const T& a) { return a < -scale || a > scale; }));
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<T> data;
@@ -487,7 +487,7 @@ class ScaledBernoulliMdspanTest : public ::testing::Test {
       h_data.get(), h_data.get() + len, [](const T& a) { return a < -scale || a > scale; }));
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<T> data;
@@ -528,7 +528,7 @@ class BernoulliTest : public ::testing::Test {
     delete[] h_data;
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<bool> data;
@@ -559,7 +559,7 @@ class BernoulliMdspanTest : public ::testing::Test {
     ASSERT_TRUE(std::any_of(h_data.get(), h_data.get() + len, [](bool a) { return !a; }));
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   rmm::device_uvector<bool> data;
@@ -635,7 +635,7 @@ class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngNormalTableInputs<T> params;
@@ -691,7 +691,7 @@ class RngNormalTableMdspanTest : public ::testing::TestWithParam<RngNormalTableI
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngNormalTableInputs<T> params;
diff --git a/cpp/test/random/rng_discrete.cu b/cpp/test/random/rng_discrete.cu
new file mode 100644
index 0000000000..741f7c65e0
--- /dev/null
+++ b/cpp/test/random/rng_discrete.cu
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include <algorithm>
+#include <cmath>
+#include <gtest/gtest.h>
+#include <raft/linalg/add.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <vector>
+
+namespace raft {
+namespace random {
+
+/* In this test we generate pseudo-random integers following a probability distribution defined by
+ * an array of weights, such that the probability of the integer i is p_i=w_i/sum(w). A histogram of
+ * the generated integers is compared to the expected probabilities. The histogram is normalized,
+ * i.e. divided by the number of drawn integers n=sampled_len*n_repeat. The expected value for the
+ * index i of the histogram is E_i=p_i, the standard deviation sigma_i=sqrt(p_i*(1-p_i)/n).
+ *
+ * Weights are constructed as a sparse vector containing mostly zeros and a small number of non-zero
+ * values. The test tolerance used to compare the actual and expected histograms is
+ * eps=4*max(sigma_i). For the test to be relevant, the tolerance must be small w.r.t. the
+ * non-zero probabilities (SetUp requires eps < 0.5*min(p_i)). Hence, n_repeat, sampled_len and nnz
+ * must be chosen accordingly. The test automatically computes the tolerance and will fail if it is
+ * estimated too high for the test to be relevant.
+ */
+
+template <typename IdxT>
+struct RngDiscreteInputs {
+  IdxT n_repeat;                // number of discrete() calls accumulated into the histogram
+  IdxT sampled_len;             // number of integers drawn per discrete() call
+  IdxT len;                     // length of the weight vector and of the histogram
+  IdxT nnz;                     // number of weights that are given a non-zero value
+  GeneratorType gtype;          // generator type handed to RngState
+  unsigned long long int seed;  // seeds both RngState and the host-side std::mt19937
+};
+
+template <typename IdxT>  // parameter printer; IdxT must stay the only (deducible) parameter
+::std::ostream& operator<<(::std::ostream& os, const RngDiscreteInputs<IdxT>& d)
+{
+  return os << "{" << d.n_repeat << ", " << d.sampled_len << ", " << d.len << ", " << d.nnz << "}";
+}
+
+template <typename LabelT, typename IdxT>  // adds the histogram of `labels` over [0, len) to `count`
+void update_count(
+  const LabelT* labels, IdxT* count, IdxT sampled_len, IdxT len, const cudaStream_t& stream)
+{
+  IdxT num_levels  = len + 1;  // len bins are delimited by len + 1 boundaries
+  IdxT lower_level = 0;        // together with upper_level: bins span [0, len)
+  IdxT upper_level = len;
+
+  rmm::device_uvector<IdxT> temp_count(len, stream);  // histogram of this batch of labels only
+
+  size_t temp_storage_bytes = 0;  // filled in by the first call below, which gets no workspace
+  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
+                                                    temp_storage_bytes,
+                                                    labels,
+                                                    temp_count.data(),
+                                                    num_levels,
+                                                    lower_level,
+                                                    upper_level,
+                                                    sampled_len,
+                                                    stream));
+
+  rmm::device_uvector<char> workspace(temp_storage_bytes, stream);  // scratch for the real call
+
+  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(),
+                                                    temp_storage_bytes,
+                                                    labels,
+                                                    temp_count.data(),
+                                                    num_levels,
+                                                    lower_level,
+                                                    upper_level,
+                                                    sampled_len,
+                                                    stream));
+
+  raft::linalg::add(count, count, temp_count.data(), len, stream);  // count += temp_count
+}
+
+template <typename IdxT>  // writes histogram[i] = count[i] / scale for the `len` entries
+void normalize_count(
+  float* histogram, const IdxT* count, float scale, IdxT len, const cudaStream_t& stream)
+{
+  raft::linalg::unaryOp(
+    histogram,  // output
+    count,      // input
+    len,
+    [scale] __device__(const IdxT& cnt) { return static_cast<float>(cnt) / scale; },
+    stream);
+}
+
+template <typename OutT, typename WeightT, typename IdxT>
+class RngDiscreteTest : public ::testing::TestWithParam<RngDiscreteInputs<IdxT>> {
+ public:
+  RngDiscreteTest()
+    : stream(handle.get_stream()),  // initializers are listed in member declaration order
+      params(::testing::TestWithParam<RngDiscreteInputs<IdxT>>::GetParam()),
+      out(params.sampled_len, stream),
+      weights(params.len, stream),
+      histogram(params.len, stream),
+      exp_histogram(params.len)
+  {
+  }
+
+ protected:
+  void SetUp() override  // draws the samples and fills both `histogram` and `exp_histogram`
+  {
+    tolerance = 0.0f;
+    std::vector<WeightT> h_weights(params.len, WeightT{0});
+    std::mt19937 gen(params.seed);
+    std::uniform_real_distribution dis(WeightT{0.2}, WeightT{2.0});
+    WeightT total_weight = WeightT{0};
+    for (IdxT i = 0; i < params.nnz; i++) {  // non-zero weights; shuffled below
+      h_weights[i] = dis(gen);  // NOTE(review): requires nnz <= len, which is not checked here
+      total_weight += h_weights[i];
+    }
+    float min_p = 1.f;
+    // Number of draws n; each factor is converted first so the product cannot overflow IdxT.
+    const float n = static_cast<float>(params.n_repeat) * static_cast<float>(params.sampled_len);
+    for (IdxT i = 0; i < params.nnz; i++) {
+      float p     = static_cast<float>(h_weights[i] / total_weight);
+      float sigma = std::sqrt(p * (1.f - p) / n);
+      tolerance   = std::max(tolerance, 4.f * sigma);  // 4 standard deviations of the largest bin
+      min_p       = std::min(min_p, p);
+    }
+    EXPECT_TRUE(tolerance < 0.5f * min_p) << "Test tolerance (" << tolerance
+                                          << ") is too high. Use more samples, more "
+                                             "repetitions or less non-zero weights.";
+    std::shuffle(h_weights.begin(), h_weights.end(), gen);
+    raft::copy(weights.data(), h_weights.data(), params.len, stream);
+
+    RngState r(params.seed, params.gtype);
+    raft::device_vector_view<OutT, IdxT> out_view(out.data(), out.size());
+    auto weights_view =
+      raft::make_device_vector_view<const WeightT, IdxT>(weights.data(), weights.size());
+
+    rmm::device_uvector<IdxT> count(params.len, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, params.len * sizeof(IdxT), stream));
+    for (IdxT iter = 0; iter < params.n_repeat; iter++) {
+      discrete(handle, r, out_view, weights_view);
+      update_count(out.data(), count.data(), params.sampled_len, params.len, stream);
+    }
+    normalize_count(histogram.data(), count.data(), n, params.len, stream);
+
+    // Compute the expected normalized histogram
+    for (IdxT i = 0; i < params.len; i++) {
+      exp_histogram[i] = h_weights[i] / total_weight;
+    }
+  }
+
+ protected:
+  raft::device_resources handle;
+  cudaStream_t stream;  // stream obtained from `handle`
+
+  RngDiscreteInputs<IdxT> params;
+  float tolerance;                       // comparison tolerance, computed in SetUp
+  rmm::device_uvector<OutT> out;         // integers drawn by one discrete() call
+  rmm::device_uvector<WeightT> weights;  // device copy of the shuffled weights
+  rmm::device_uvector<float> histogram;  // normalized histogram of the drawn integers
+  std::vector<float> exp_histogram;      // expected probabilities w_i / sum(w), on the host
+};
+
+const std::vector<RngDiscreteInputs<int>> inputs_i32 = {
+  {1, 10000, 5, 5, GenPC, 123ULL},  // {n_repeat, sampled_len, len, nnz, gtype, seed}
+  {1, 10000, 10, 7, GenPC, 456ULL},
+  {1000, 100, 10000, 20, GenPC, 123ULL},
+  {1, 10000, 5, 5, GenPhilox, 1234ULL},
+};
+const std::vector<RngDiscreteInputs<int64_t>> inputs_i64 = {
+  {1, 10000, 5, 5, GenPC, 123ULL},  // same cases as inputs_i32, with IdxT = int64_t
+  {1, 10000, 10, 7, GenPC, 456ULL},
+  {1000, 100, 10000, 20, GenPC, 123ULL},
+  {1, 10000, 5, 5, GenPhilox, 1234ULL},
+};
+
+#define RNG_DISCRETE_TEST(test_type, test_name, test_inputs)       \
+  typedef RAFT_DEPAREN(test_type) test_name;                       \
+  TEST_P(test_name, Result)                                        \
+  {                                                                \
+    ASSERT_TRUE(devArrMatchHost(exp_histogram.data(),              \
+                                histogram.data(),                  \
+                                exp_histogram.size(),              \
+                                CompareApprox<float>(tolerance))); \
+  }                                                                \
+  INSTANTIATE_TEST_CASE_P(RngDiscreteTests, test_name, ::testing::ValuesIn(test_inputs))  // own prefix
+
+RNG_DISCRETE_TEST((RngDiscreteTest<int, float, int>), RngDiscreteTestI32FI32, inputs_i32);
+RNG_DISCRETE_TEST((RngDiscreteTest<uint32_t, float, int>), RngDiscreteTestU32FI32, inputs_i32);
+RNG_DISCRETE_TEST((RngDiscreteTest<int64_t, float, int>), RngDiscreteTestI64FI32, inputs_i32);
+RNG_DISCRETE_TEST((RngDiscreteTest<int, double, int>), RngDiscreteTestI32DI32, inputs_i32);
+
+// Disable IdxT=int64_t test due to CUB error: https://github.com/NVIDIA/cub/issues/192
+// RNG_DISCRETE_TEST((RngDiscreteTest<int, float, int64_t>), RngDiscreteTestI32FI64, inputs_i64);
+
+}  // namespace random
+}  // namespace raft
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index d5270c456e..83300b3ecc 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cub/cub.cuh>
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
@@ -112,7 +112,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngInputs<T> params;
@@ -165,7 +165,7 @@ class RngMdspanTest : public ::testing::TestWithParam<RngInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   RngInputs<T> params;
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index 482b35168a..ae5a58da3d 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <set>
@@ -75,7 +76,7 @@ class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SWoRInputs<T> params;
@@ -144,7 +145,7 @@ class SWoRMdspanTest : public ::testing::TestWithParam<SWoRInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SWoRInputs<T> params;
@@ -213,7 +214,11 @@ const std::vector<SWoRInputs<float>> inputsf = {{1024, 512, -1, 0.f, GenPhilox,
         << "repeated index @i=" << i << " idx=" << val;                                            \
       occurrence.insert(val);                                                                      \
     }                                                                                              \
-    if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }         \
+    if (params.largeWeightIndex >= 0) {                                                            \
+      ASSERT_TRUE((h_outIdx[0] == params.largeWeightIndex) ||                                      \
+                  (h_outIdx[1] == params.largeWeightIndex) ||                                      \
+                  (h_outIdx[2] == params.largeWeightIndex));                                       \
+    }                                                                                              \
   } while (false)
 
 using SWoRTestF = SWoRTest<float>;
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index 862cbffdc7..eb10432f3d 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,11 @@
 
 #include <gtest/gtest.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/csr.hpp>
 #include <raft/sparse/linalg/add.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/util/cudart_utils.hpp>
 
 #include <iostream>
@@ -126,7 +126,7 @@ class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   CSRAddInputs<Type_f, Index_> params;
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index 1142c6f3f2..ad91d0d284 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,13 @@
 
 #include <gtest/gtest.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/convert/coo.cuh>
 #include <raft/sparse/csr.hpp>
 
 #include <raft/util/cudart_utils.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <iostream>
 #include <limits>
@@ -66,7 +66,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   CSRtoCOOInputs<Index_> params;
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 007cbd7fdb..71d296f665 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/util/cuda_utils.cuh>
 
@@ -181,7 +181,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<index_
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   CSRAdjGraphInputs<index_t> params;
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index 39b235d5f1..73b8691774 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <cusparse_v2.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <gtest/gtest.h>
@@ -24,7 +24,7 @@
 
 #include <rmm/device_uvector.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 namespace raft {
 namespace sparse {
@@ -142,7 +142,7 @@ class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   // input data
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index 5811c5c22b..39a6cc4164 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <cusparse_v2.h>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <gtest/gtest.h>
@@ -24,7 +24,7 @@
 
 #include <rmm/device_uvector.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 namespace raft {
 namespace sparse {
@@ -116,7 +116,7 @@ class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_id
   }
 
  protected:
-  raft::handle_t raft_handle;
+  raft::device_resources raft_handle;
   cudaStream_t stream;
 
   cusparseHandle_t handle;
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index 108d38a8b4..812c3defea 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,12 +18,12 @@
 
 #include <gtest/gtest.h>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/sparse/linalg/transpose.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 namespace raft {
 namespace sparse {
@@ -101,7 +101,7 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
 
   void SetUp() override
   {
-    raft::handle_t handle;
+    raft::device_resources handle;
 
     make_data();
 
@@ -135,7 +135,7 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
   }
 
  protected:
-  raft::handle_t raft_handle;
+  raft::device_resources raft_handle;
   cudaStream_t stream;
 
   cusparseHandle_t handle;
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index a4af021c05..b5b22ff15c 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/util/cudart_utils.hpp>
 
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index c004aeaef0..e768e49f6c 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #include <gtest/gtest.h>
 
+#include <raft/core/operators.cuh>
+#include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/sparse/detail/cusparse_wrappers.h>
@@ -24,9 +26,8 @@
 
 #include <raft/sparse/convert/coo.cuh>
 #include <raft/sparse/distance/detail/coo_spmv.cuh>
-#include <raft/sparse/distance/detail/operators.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <type_traits>
 
@@ -138,33 +139,35 @@ class SparseDistanceCOOSPMVTest
   {
     switch (params.input_configuration.metric) {
       case raft::distance::DistanceType::InnerProduct:
-        compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), true);
+        compute_dist(raft::mul_op(), raft::add_op(), raft::atomic_add_op(), true);
         break;
       case raft::distance::DistanceType::L2Unexpanded:
-        compute_dist(detail::SqDiff(), detail::Sum(), detail::AtomicAdd());
+        compute_dist(raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op());
         break;
       case raft::distance::DistanceType::Canberra:
         compute_dist(
           [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); },
-          detail::Sum(),
-          detail::AtomicAdd());
+          raft::add_op(),
+          raft::atomic_add_op());
         break;
       case raft::distance::DistanceType::L1:
-        compute_dist(detail::AbsDiff(), detail::Sum(), detail::AtomicAdd());
+        compute_dist(absdiff_op(), raft::add_op(), raft::atomic_add_op());
         break;
       case raft::distance::DistanceType::Linf:
-        compute_dist(detail::AbsDiff(), detail::Max(), detail::AtomicMax());
+        compute_dist(absdiff_op(), raft::max_op(), raft::atomic_max_op());
         break;
       case raft::distance::DistanceType::LpUnexpanded: {
         compute_dist(
-          detail::PDiff(params.input_configuration.metric_arg), detail::Sum(), detail::AtomicAdd());
-        float p = 1.0f / params.input_configuration.metric_arg;
-        raft::linalg::unaryOp<value_t>(
-          out_dists.data(),
-          out_dists.data(),
-          dist_config.a_nrows * dist_config.b_nrows,
-          [=] __device__(value_t input) { return powf(input, p); },
-          dist_config.handle.get_stream());
+          raft::compose_op(raft::pow_const_op<value_t>(params.input_configuration.metric_arg),
+                           raft::sub_op()),
+          raft::add_op(),
+          raft::atomic_add_op());
+        value_t p = value_t{1} / params.input_configuration.metric_arg;
+        raft::linalg::unaryOp<value_t>(out_dists.data(),
+                                       out_dists.data(),
+                                       dist_config.a_nrows * dist_config.b_nrows,
+                                       raft::pow_const_op<value_t>{p},
+                                       dist_config.handle.get_stream());
 
       } break;
       default: throw raft::exception("Unknown distance");
@@ -232,7 +235,7 @@ class SparseDistanceCOOSPMVTest
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   // input data
   rmm::device_uvector<value_idx> indptr, indices;
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 4ce2f4cbde..2a973d675c 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
 
 #include <raft/sparse/distance/distance.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 namespace raft {
 namespace sparse {
@@ -129,7 +129,7 @@ class SparseDistanceTest
                   dist_config.handle.get_stream());
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   // input data
   rmm::device_uvector<value_idx> indptr, indices;
diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu
index ba80c84fd5..8c106f8868 100644
--- a/cpp/test/sparse/filter.cu
+++ b/cpp/test/sparse/filter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/util/cudart_utils.hpp>
 
@@ -50,7 +50,7 @@ const std::vector<SparseFilterInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 typedef SparseFilterTests<float> COORemoveZeros;
 TEST_P(COORemoveZeros, Result)
 {
-  raft::handle_t h;
+  raft::device_resources h;
   auto stream = h.get_stream();
   params      = ::testing::TestWithParam<SparseFilterInputs<float>>::GetParam();
 
diff --git a/cpp/test/mst.cu b/cpp/test/sparse/mst.cu
similarity index 98%
rename from cpp/test/mst.cu
rename to cpp/test/sparse/mst.cu
index 544ca80a46..0a80846440 100644
--- a/cpp/test/mst.cu
+++ b/cpp/test/sparse/mst.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,13 @@
 
 #include <bits/stdc++.h>
 
-#include "test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <rmm/device_uvector.hpp>
 #include <vector>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/mst/mst.cuh>
 #include <raft/util/cudart_utils.hpp>
 
@@ -241,7 +241,7 @@ class MSTTest : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, w
   edge_t e;
   int iterations;
 
-  raft::handle_t handle;
+  raft::device_resources handle;
 };
 
 // connected components tests
diff --git a/cpp/test/sparse/neighbors/brute_force.cu b/cpp/test/sparse/neighbors/brute_force.cu
index 8fa5e8322d..49284a498b 100644
--- a/cpp/test/sparse/neighbors/brute_force.cu
+++ b/cpp/test/sparse/neighbors/brute_force.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #include <cusparse_v2.h>
 #include <gtest/gtest.h>
 
-#include "../../test_utils.h"
+#include "../../test_utils.cuh"
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/neighbors/knn.cuh>
 
@@ -140,7 +140,7 @@ class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx,
     out_indices.resize(n_rows * k, stream);
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
 
   int n_rows, nnz, k;
 
diff --git a/cpp/test/sparse/neighbors/connect_components.cu b/cpp/test/sparse/neighbors/connect_components.cu
index fc4eecd4ee..d200744329 100644
--- a/cpp/test/sparse/neighbors/connect_components.cu
+++ b/cpp/test/sparse/neighbors/connect_components.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@
 #include <raft/sparse/hierarchy/single_linkage.cuh>
 #include <rmm/device_uvector.hpp>
 
-#include "../../test_utils.h"
+#include "../../test_utils.cuh"
 
 namespace raft {
 namespace sparse {
@@ -56,7 +56,7 @@ class ConnectComponentsTest
  protected:
   void basicTest()
   {
-    raft::handle_t handle;
+    raft::device_resources handle;
 
     auto stream = handle.get_stream();
 
diff --git a/cpp/test/sparse/neighbors/knn_graph.cu b/cpp/test/sparse/neighbors/knn_graph.cu
index d6f9e8386f..3b025fc082 100644
--- a/cpp/test/sparse/neighbors/knn_graph.cu
+++ b/cpp/test/sparse/neighbors/knn_graph.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../../test_utils.h"
+#include "../../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
@@ -96,7 +96,7 @@ class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, v
   void TearDown() override { delete out; }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   // input data
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 8e54edd6c9..91b7b09fcc 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,9 @@
 
 #include <gtest/gtest.h>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/csr.hpp>
 #include <raft/sparse/linalg/norm.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -81,7 +81,7 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInput
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   CSRRowNormalizeInputs<Type_f, Index_> params;
diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu
index 4280192723..6dc67dbbd8 100644
--- a/cpp/test/sparse/reduce.cu
+++ b/cpp/test/sparse/reduce.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,10 @@
 
 #include <gtest/gtest.h>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <iostream>
 #include <limits>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/op/reduce.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -51,7 +51,7 @@ class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<valu
 
   void Run()
   {
-    raft::handle_t handle;
+    raft::device_resources handle;
 
     auto stream = handle.get_stream();
 
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index 732bd06103..e09af0d9ff 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 #include <raft/sparse/csr.hpp>
 #include <raft/sparse/op/row_op.cuh>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <raft/util/cudart_utils.hpp>
 
 #include <iostream>
@@ -82,7 +82,7 @@ class CSRRowOpTest : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Inde
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   CSRRowOpInputs<Type_f, Index_> params;
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index 9b75965498..319c96bc02 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -51,7 +51,7 @@ TEST_P(COOSort, Result)
 {
   params = ::testing::TestWithParam<SparseSortInput<float>>::GetParam();
   raft::random::RngState r(params.seed);
-  raft::handle_t h;
+  raft::device_resources h;
   auto stream = h.get_stream();
 
   rmm::device_uvector<int> in_rows(params.nnz, stream);
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/sparse/spectral_matrix.cu
similarity index 95%
rename from cpp/test/spectral_matrix.cu
rename to cpp/test/sparse/spectral_matrix.cu
index 867b1e9daf..3b044e3974 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/sparse/spectral_matrix.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #include <gtest/gtest.h>
 #include <iostream>
 #include <memory>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 
 #include <raft/spectral/matrix_wrappers.hpp>
 
@@ -39,7 +39,7 @@ TEST(Raft, SpectralMatrices)
   using index_type = int;
   using value_type = double;
 
-  handle_t h;
+  raft::device_resources h;
   ASSERT_EQ(0, h.get_device());
 
   csr_view_t<index_type, value_type> csr_v{nullptr, nullptr, nullptr, 0, 0};
diff --git a/cpp/test/sparse/spgemmi.cu b/cpp/test/sparse/spgemmi.cu
index a132c94fde..ec77b8e88b 100644
--- a/cpp/test/sparse/spgemmi.cu
+++ b/cpp/test/sparse/spgemmi.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,9 @@
 
 #include <gtest/gtest.h>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/transpose.cuh>
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/util/cudart_utils.hpp>
@@ -120,7 +120,7 @@ class SPGemmiTest : public ::testing::TestWithParam<SPGemmiInputs> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SPGemmiInputs params;
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index 7cf1a1e07d..80a512a019 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <iostream>
 
@@ -114,7 +114,7 @@ class SparseSymmetrizeTest
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   // input data
diff --git a/cpp/test/stats/accuracy.cu b/cpp/test/stats/accuracy.cu
index 192c187794..543b99bda0 100644
--- a/cpp/test/stats/accuracy.cu
+++ b/cpp/test/stats/accuracy.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <optional>
 #include <raft/interruptible.hpp>
@@ -76,7 +76,7 @@ class AccuracyTest : public ::testing::TestWithParam<AccuracyInputs<T>> {
 
  protected:
   AccuracyInputs<T> params;
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
   T expectedVal, actualVal;
 };
diff --git a/cpp/test/stats/adjusted_rand_index.cu b/cpp/test/stats/adjusted_rand_index.cu
index f113af821d..4506a6730a 100644
--- a/cpp/test/stats/adjusted_rand_index.cu
+++ b/cpp/test/stats/adjusted_rand_index.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -137,7 +137,7 @@ class adjustedRandIndexTest : public ::testing::TestWithParam<adjustedRandIndexP
     truth_adjusted_rand_index = 1.0;
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
   adjustedRandIndexParam params;
   T lowerLabelRange, upperLabelRange;
diff --git a/cpp/test/stats/completeness_score.cu b/cpp/test/stats/completeness_score.cu
index 2f8a40afdc..a2a926d41d 100644
--- a/cpp/test/stats/completeness_score.cu
+++ b/cpp/test/stats/completeness_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -100,7 +100,7 @@ class completenessTest : public ::testing::TestWithParam<completenessParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   completenessParam params;
   T lowerLabelRange, upperLabelRange;
   int nElements               = 0;
diff --git a/cpp/test/stats/contingencyMatrix.cu b/cpp/test/stats/contingencyMatrix.cu
index 7943610689..f344b9ae71 100644
--- a/cpp/test/stats/contingencyMatrix.cu
+++ b/cpp/test/stats/contingencyMatrix.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -135,7 +135,7 @@ class ContingencyMatrixTest : public ::testing::TestWithParam<ContingencyMatrixP
                                   raft::Compare<T>()));
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   ContingencyMatrixParam params;
   int numUniqueClasses = -1;
   T minLabel, maxLabel;
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
index 890c5b7826..c8a90b2f7d 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/test/stats/cov.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/stats/cov.cuh>
@@ -53,7 +53,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
 
   void SetUp() override
   {
-    raft::handle_t handle;
+    raft::device_resources handle;
     cudaStream_t stream = handle.get_stream();
 
     params = ::testing::TestWithParam<CovInputs<T>>::GetParam();
@@ -103,10 +103,10 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
   }
 
  protected:
-  CovInputs<T> params;
-  rmm::device_uvector<T> data, mean_act, cov_act, cov_cm, cov_cm_ref;
   cublasHandle_t handle;
   cudaStream_t stream = 0;
+  CovInputs<T> params;
+  rmm::device_uvector<T> data, mean_act, cov_act, cov_cm, cov_cm_ref;
 };
 
 ///@todo: add stable=false after it has been implemented
diff --git a/cpp/test/stats/dispersion.cu b/cpp/test/stats/dispersion.cu
index 4f18c9fb54..261e66af52 100644
--- a/cpp/test/stats/dispersion.cu
+++ b/cpp/test/stats/dispersion.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <optional>
 #include <raft/interruptible.hpp>
@@ -95,7 +95,7 @@ class DispersionTest : public ::testing::TestWithParam<DispersionInputs<T>> {
 
  protected:
   DispersionInputs<T> params;
-  raft::handle_t handle;
+  raft::device_resources handle;
   rmm::device_uvector<T> exp_mean, act_mean;
   cudaStream_t stream = 0;
   int npoints;
diff --git a/cpp/test/stats/entropy.cu b/cpp/test/stats/entropy.cu
index 04aa9f7a80..f19da32bb0 100644
--- a/cpp/test/stats/entropy.cu
+++ b/cpp/test/stats/entropy.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -88,7 +88,7 @@ class entropyTest : public ::testing::TestWithParam<entropyParam> {
                            upperLabelRange);
   }
 
-  raft::handle_t handle;
+  raft::device_resources handle;
   // declaring the data values
   entropyParam params;
   T lowerLabelRange, upperLabelRange;
diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu
index d9793a57df..b3f9fe6782 100644
--- a/cpp/test/stats/histogram.cu
+++ b/cpp/test/stats/histogram.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/interruptible.hpp>
 #include <raft/random/rng.cuh>
@@ -94,7 +94,7 @@ class HistTest : public ::testing::TestWithParam<HistInputs> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   HistInputs params;
   rmm::device_uvector<int> in, bins, ref_bins;
 };
@@ -131,7 +131,7 @@ class HistMdspanTest : public ::testing::TestWithParam<HistInputs> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   HistInputs params;
   rmm::device_uvector<int> in, bins, ref_bins;
 };
diff --git a/cpp/test/stats/homogeneity_score.cu b/cpp/test/stats/homogeneity_score.cu
index 9bd6d9266b..1b48bb1823 100644
--- a/cpp/test/stats/homogeneity_score.cu
+++ b/cpp/test/stats/homogeneity_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -98,7 +98,7 @@ class homogeneityTest : public ::testing::TestWithParam<homogeneityParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   homogeneityParam params;
   T lowerLabelRange, upperLabelRange;
   int nElements              = 0;
diff --git a/cpp/test/stats/information_criterion.cu b/cpp/test/stats/information_criterion.cu
index 4a9a2128c6..45804c6724 100644
--- a/cpp/test/stats/information_criterion.cu
+++ b/cpp/test/stats/information_criterion.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/stats/information_criterion.cuh>
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -109,7 +109,7 @@ class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
   BatchedICInputs<T> params;
   rmm::device_uvector<T> res_d;
diff --git a/cpp/test/stats/kl_divergence.cu b/cpp/test/stats/kl_divergence.cu
index 58a64f7199..15eac6428a 100644
--- a/cpp/test/stats/kl_divergence.cu
+++ b/cpp/test/stats/kl_divergence.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -80,7 +80,7 @@ class klDivergenceTest : public ::testing::TestWithParam<klDivergenceParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   klDivergenceParam params;
   int nElements              = 0;
   DataT truthklDivergence    = 0;
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index b299f81f68..4d011a2425 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/stats/mean.cuh>
@@ -81,7 +81,7 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MeanInputs<T> params;
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 30dcdd475b..e5e01a2b10 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include "../linalg/matrix_vector_op.cuh"
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/random/rng.cuh>
 #include <raft/stats/mean.cuh>
@@ -91,7 +91,7 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxTy
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MeanCenterInputs<T, IdxType> params;
diff --git a/cpp/test/stats/meanvar.cu b/cpp/test/stats/meanvar.cu
index 424395c5e8..d21ec43bba 100644
--- a/cpp/test/stats/meanvar.cu
+++ b/cpp/test/stats/meanvar.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/math.cuh>
 #include <raft/random/rng.cuh>
@@ -89,7 +89,7 @@ class MeanVarTest : public ::testing::TestWithParam<MeanVarInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   MeanVarInputs<T> params;
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
index a2ba6bfc9e..8b58f9692a 100644
--- a/cpp/test/stats/minmax.cu
+++ b/cpp/test/stats/minmax.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <limits>
 #include <raft/core/device_mdspan.hpp>
@@ -130,7 +130,7 @@ class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   MinMaxInputs<T> params;
   rmm::device_uvector<T> minmax_act;
   rmm::device_uvector<T> minmax_ref;
diff --git a/cpp/test/stats/mutual_info_score.cu b/cpp/test/stats/mutual_info_score.cu
index fb9362df52..1b4ce26746 100644
--- a/cpp/test/stats/mutual_info_score.cu
+++ b/cpp/test/stats/mutual_info_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/mutual_info_score.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <random>
@@ -126,7 +126,7 @@ class mutualInfoTest : public ::testing::TestWithParam<mutualInfoParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   mutualInfoParam params;
   T lowerLabelRange, upperLabelRange;
   int nElements             = 0;
diff --git a/cpp/test/stats/r2_score.cu b/cpp/test/stats/r2_score.cu
index d77daacb04..26a1920aae 100644
--- a/cpp/test/stats/r2_score.cu
+++ b/cpp/test/stats/r2_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <optional>
 #include <raft/interruptible.hpp>
@@ -84,7 +84,7 @@ class R2_scoreTest : public ::testing::TestWithParam<R2_scoreInputs<T>> {
 
  protected:
   R2_scoreInputs<T> params;
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream = 0;
   T expectedVal, actualVal;
 };
diff --git a/cpp/test/stats/rand_index.cu b/cpp/test/stats/rand_index.cu
index 67e4ab5517..10a31a27ca 100644
--- a/cpp/test/stats/rand_index.cu
+++ b/cpp/test/stats/rand_index.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
 #include <raft/util/cudart_utils.hpp>
 
@@ -22,7 +22,7 @@
 
 #include <algorithm>
 #include <iostream>
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/stats/rand_index.cuh>
 #include <random>
 
@@ -98,7 +98,7 @@ class randIndexTest : public ::testing::TestWithParam<randIndexParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   randIndexParam params;
   int lowerLabelRange = 0, upperLabelRange = 2;
   uint64_t size            = 0;
diff --git a/cpp/test/stats/regression_metrics.cu b/cpp/test/stats/regression_metrics.cu
index effc3d04dd..9a8e4af6a4 100644
--- a/cpp/test/stats/regression_metrics.cu
+++ b/cpp/test/stats/regression_metrics.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <optional>
@@ -106,8 +106,8 @@ class RegressionTest : public ::testing::TestWithParam<RegressionInputs<T>> {
   }
 
  protected:
+  raft::device_resources handle;
   RegressionInputs<T> params;
-  raft::handle_t handle;
   cudaStream_t stream           = 0;
   double mean_abs_error         = 0;
   double mean_squared_error     = 0;
diff --git a/cpp/test/stats/silhouette_score.cu b/cpp/test/stats/silhouette_score.cu
index 37a6fff786..80e60a4884 100644
--- a/cpp/test/stats/silhouette_score.cu
+++ b/cpp/test/stats/silhouette_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -192,6 +192,7 @@ class silhouetteScoreTest : public ::testing::TestWithParam<silhouetteScoreParam
   }
 
   // declaring the data values
+  raft::device_resources handle;
   silhouetteScoreParam params;
   int nLabels;
   rmm::device_uvector<DataT> d_X;
@@ -203,7 +204,6 @@ class silhouetteScoreTest : public ::testing::TestWithParam<silhouetteScoreParam
   double truthSilhouetteScore    = 0;
   double computedSilhouetteScore = 0;
   double batchedSilhouetteScore  = 0;
-  raft::handle_t handle;
   int chunk;
 };
 
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index 73f30f17e9..dfc31f31d2 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <raft/matrix/math.cuh>
 #include <raft/random/rng.cuh>
@@ -114,7 +114,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   StdDevInputs<T> params;
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index e67988abb0..f6b6ffcc45 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/linalg/eltwise.cuh>
 #include <raft/stats/sum.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -72,7 +72,7 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   cudaStream_t stream;
 
   SumInputs<T> params;
diff --git a/cpp/test/stats/trustworthiness.cu b/cpp/test/stats/trustworthiness.cu
index cbb8228f8f..a2f72516eb 100644
--- a/cpp/test/stats/trustworthiness.cu
+++ b/cpp/test/stats/trustworthiness.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/distance/distance.cuh>
@@ -31,6 +31,9 @@ namespace raft {
 namespace stats {
 
 class TrustworthinessScoreTest : public ::testing::Test {
+ public:
+  TrustworthinessScoreTest() : d_X(0, handle.get_stream()), d_X_embedded(0, handle.get_stream()) {}
+
  protected:
   void basicTest()
   {
@@ -311,13 +314,10 @@ class TrustworthinessScoreTest : public ::testing::Test {
       -0.02323332, 0.04292452,  0.39291084,  -0.94897962, -0.63863206, -0.16546988, 0.23698957,
       -0.30633628};
 
-    raft::handle_t handle;
-
-    cudaStream_t stream = handle.get_stream();
-
-    rmm::device_uvector<float> d_X(X.size(), stream);
-    rmm::device_uvector<float> d_X_embedded(X_embedded.size(), stream);
+    auto stream = handle.get_stream();
 
+    d_X.resize(X.size(), stream);
+    d_X_embedded.resize(X_embedded.size(), stream);
     raft::update_device(d_X.data(), X.data(), X.size(), stream);
     raft::update_device(d_X_embedded.data(), X_embedded.data(), X_embedded.size(), stream);
     auto n_sample            = 50;
@@ -338,6 +338,11 @@ class TrustworthinessScoreTest : public ::testing::Test {
   void TearDown() override {}
 
  protected:
+  raft::device_resources handle;
+
+  rmm::device_uvector<float> d_X;
+  rmm::device_uvector<float> d_X_embedded;
+
   double score;
 };
 
diff --git a/cpp/test/stats/v_measure.cu b/cpp/test/stats/v_measure.cu
index 0cbc2da7d9..9d1522a5c8 100644
--- a/cpp/test/stats/v_measure.cu
+++ b/cpp/test/stats/v_measure.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <iostream>
@@ -103,7 +103,7 @@ class vMeasureTest : public ::testing::TestWithParam<vMeasureParam> {
   }
 
   // declaring the data values
-  raft::handle_t handle;
+  raft::device_resources handle;
   vMeasureParam params;
   T lowerLabelRange, upperLabelRange;
   int nElements           = 0;
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
index 9f33855572..7e28ca9aa3 100644
--- a/cpp/test/stats/weighted_mean.cu
+++ b/cpp/test/stats/weighted_mean.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../test_utils.cuh"
 #include <cstdint>
 #include <gtest/gtest.h>
 #include <raft/core/device_mdspan.hpp>
@@ -112,7 +112,7 @@ class RowWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   WeightedMeanInputs<T> params;
   thrust::host_vector<T> hin, hweights;
   thrust::device_vector<T> din, dweights, dexp, dact;
@@ -186,7 +186,7 @@ class ColWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   WeightedMeanInputs<T> params;
   thrust::host_vector<T> hin, hweights;
   thrust::device_vector<T> din, dweights, dexp, dact;
@@ -244,7 +244,7 @@ class WeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>>
   }
 
  protected:
-  raft::handle_t handle;
+  raft::device_resources handle;
   WeightedMeanInputs<T> params;
   thrust::host_vector<T> hin, hweights;
   thrust::device_vector<T> din, dweights, dexp, dact;
diff --git a/cpp/test/test_utils.cuh b/cpp/test/test_utils.cuh
new file mode 100644
index 0000000000..5704eefae3
--- /dev/null
+++ b/cpp/test/test_utils.cuh
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "test_utils.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/exec_policy.hpp>
+#include <thrust/for_each.h>
+
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace raft {
+
+/*
+ * @brief Helper function to compare 2 device n-D arrays with custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value(s)
+ * @param actual actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ * @{
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(
+  const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
+  // Copy both device arrays to the host and wait on the stream before reading them.
+  std::unique_ptr<T[]> host_expected(new T[size]);
+  std::unique_ptr<T[]> host_actual(new T[size]);
+  raft::update_host<T>(host_expected.get(), expected, size, stream);
+  raft::update_host<T>(host_actual.get(), actual, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  for (size_t pos = 0; pos < size; ++pos) {
+    auto want = host_expected[pos];
+    auto got  = host_actual[pos];
+    if (eq_compare(want, got)) { continue; }
+    return testing::AssertionFailure() << "actual=" << got << " != expected=" << want << " @" << pos;
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
+  // Every element of the device array is compared against the single value `expected`.
+  std::unique_ptr<T[]> host_actual(new T[size]);
+  raft::update_host<T>(host_actual.get(), actual, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  for (size_t pos = 0; pos < size; ++pos) {
+    auto got = host_actual[pos];
+    if (eq_compare(expected, got)) { continue; }
+    return testing::AssertionFailure()
+           << "actual=" << got << " != expected=" << expected << " @" << pos;
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(const T* expected,
+                                     const T* actual,
+                                     size_t rows,
+                                     size_t cols,
+                                     L eq_compare,
+                                     cudaStream_t stream = 0)
+{
+  const size_t n_elems = rows * cols;
+  std::unique_ptr<T[]> host_expected(new T[n_elems]);
+  std::unique_ptr<T[]> host_actual(new T[n_elems]);
+  raft::update_host<T>(host_expected.get(), expected, n_elems, stream);
+  raft::update_host<T>(host_actual.get(), actual, n_elems, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  // Walk the buffer linearly; with the row-major assumption this visits the
+  // cells in the same (row, col) order as a nested row/column loop would.
+  for (size_t flat = 0; flat < n_elems; ++flat) {
+    auto want = host_expected[flat];
+    auto got  = host_actual[flat];
+    if (eq_compare(want, got)) { continue; }
+    const size_t row = flat / cols;
+    const size_t col = flat % cols;
+    return testing::AssertionFailure()
+           << "actual=" << got << " != expected=" << want << " @" << row << "," << col;
+  }
+  return testing::AssertionSuccess();
+}
+
+template <typename T, typename L>
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
+  const size_t n_elems = rows * cols;
+  std::unique_ptr<T[]> host_actual(new T[n_elems]);
+  raft::update_host<T>(host_actual.get(), actual, n_elems, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  // Linear walk over the row-major buffer: same visiting order as nested row/column loops.
+  for (size_t flat = 0; flat < n_elems; ++flat) {
+    auto got = host_actual[flat];
+    if (eq_compare(expected, got)) { continue; }
+    // First mismatch: translate the flat offset back to (row, col) for the message.
+    const size_t row = flat / cols;
+    const size_t col = flat % cols;
+    return testing::AssertionFailure()
+           << "actual=" << got << " != expected=" << expected << " @" << row << "," << col;
+  }
+  return testing::AssertionSuccess();
+}
+
+/*
+ * @brief Helper function to compare a device n-D array with an expected array
+ * on the host, using a custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host array of expected value(s)
+ * @param actual_d device array actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult devArrMatchHost(
+  const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
+  std::unique_ptr<T[]> host_actual(new T[size]);
+  raft::update_host<T>(host_actual.get(), actual_d, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  // Unlike a first-mismatch check, every differing element is appended to one failure message.
+  auto report         = testing::AssertionFailure();
+  size_t n_mismatches = 0;
+  for (size_t pos = 0; pos < size; ++pos) {
+    auto want = expected_h[pos];
+    auto got  = host_actual[pos];
+    if (eq_compare(want, got)) { continue; }
+    ++n_mismatches;
+    report << "actual=" << got << " != expected=" << want << " @" << pos << "; ";
+  }
+  if (n_mismatches != 0) { return report; }
+  return testing::AssertionSuccess();
+}
+
+/**
+ * @brief Helper function to compare host vectors using a custom comparison
+ * @tparam T the element type
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host vector of expected value(s)
+ * @param actual_h host vector actual values
+ * @param eq_compare the comparator
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult hostVecMatch(const std::vector<T>& expected_h,
+                                      const std::vector<T>& actual_h,
+                                      L eq_compare)
+{
+  auto n = actual_h.size();
+  if (n != expected_h.size()) {
+    return testing::AssertionFailure()
+           << "vector size mismatch: "
+           << "actual=" << n << " != expected=" << expected_h.size() << "; ";
+  }
+  for (size_t i = 0; i < n; ++i) {
+    auto want = expected_h[i];
+    auto got  = actual_h[i];
+    if (eq_compare(want, got)) { continue; }
+    return testing::AssertionFailure()
+           << "actual=" << got << " != expected=" << want << " @" << i << "; ";
+  }
+  return testing::AssertionSuccess();
+}
+
+/*
+ * @brief Helper function to compare diagonal values of a 2D matrix
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value along diagonal
+ * @param actual actual matrix
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
+template <typename T, typename L>
+testing::AssertionResult diagonalMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
+  size_t size = rows * cols;
+  std::unique_ptr<T[]> act_h(new T[size]);
+  raft::update_host<T>(act_h.get(), actual, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  // Only cells with row == column are checked, and there are min(rows, cols) of them.
+  // Visit them directly rather than scanning all rows * cols cells and skipping the
+  // off-diagonal ones; the order of checks and the failure message are unchanged.
+  size_t n_diag = rows < cols ? rows : cols;
+  for (size_t i(0); i < n_diag; ++i) {
+    auto act = act_h[i * cols + i];  // row major assumption!
+    if (!eq_compare(expected, act)) {
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << expected << " @" << i << "," << i;
+    }
+  }
+  return testing::AssertionSuccess();
+}
+
+/** Floating-point overload (enabled only when std::is_floating_point_v<T>): forwards all
+ *  arguments to raft::random::uniform, which is given the buffer `out`, its length `len`
+ *  and the bounds; the bounds default to T(-1) and T(1). */
+template <typename T, typename IdxT>
+std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(
+  T* out, raft::random::RngState& rng, IdxT len, cudaStream_t stream,
+  T range_min = T(-1), T range_max = T(1))
+{
+  raft::random::uniform(rng, out, len, range_min, range_max, stream);
+}
+
+/** Integral overload (enabled only when std::is_integral_v<T>): forwards all arguments to
+ *  raft::random::uniformInt, which is given the buffer `out`, its length `len` and the
+ *  bounds; the bounds default to T(0) and T(100). */
+template <typename T, typename IdxT>
+std::enable_if_t<std::is_integral_v<T>> gen_uniform(
+  T* out, raft::random::RngState& rng, IdxT len, cudaStream_t stream,
+  T range_min = T(0), T range_max = T(100))
+{
+  raft::random::uniformInt(rng, out, len, range_min, range_max, stream);
+}
+
+/** Fill `out` with `len` key/value pairs: keys and values are generated separately by the
+ *  other gen_uniform overloads (default ranges) and then zipped together on `stream`. */
+template <typename T1, typename T2, typename IdxT>
+void gen_uniform(
+  raft::KeyValuePair<T1, T2>* out, raft::random::RngState& rng, IdxT len, cudaStream_t stream)
+{
+  rmm::device_uvector<T1> keys(len, stream);
+  rmm::device_uvector<T2> values(len, stream);
+
+  gen_uniform(keys.data(), rng, len, stream);
+  gen_uniform(values.data(), rng, len, stream);
+  // The functor takes IdxT (the counting iterator's value type) so wide offsets are not narrowed.
+  const T1* d_keys   = keys.data();
+  const T2* d_values = values.data();
+  auto counting      = thrust::make_counting_iterator<IdxT>(0);
+  thrust::for_each(rmm::exec_policy(stream),
+                   counting,
+                   counting + len,
+                   [out, d_keys, d_values] __device__(IdxT idx) {
+                     out[idx].key   = d_keys[idx];
+                     out[idx].value = d_values[idx];
+                   });
+}
+
+/** @} */
+
+/** run 'func' 'count' times between two cuda events; 'ms' receives the mean time per call */
+#define TIMEIT_LOOP(ms, count, func)                       \
+  do {                                                     \
+    cudaEvent_t start, stop;                               \
+    RAFT_CUDA_TRY(cudaEventCreate(&start));                \
+    RAFT_CUDA_TRY(cudaEventCreate(&stop));                 \
+    RAFT_CUDA_TRY(cudaEventRecord(start));                 \
+    for (int i = 0; i < (count); ++i) { func; }            \
+    RAFT_CUDA_TRY(cudaEventRecord(stop));                  \
+    RAFT_CUDA_TRY(cudaEventSynchronize(stop));             \
+    ms = 0.f;                                              \
+    RAFT_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop)); \
+    ms /= (count);                                         \
+    RAFT_CUDA_TRY(cudaEventDestroy(start));                \
+    RAFT_CUDA_TRY(cudaEventDestroy(stop));                 \
+  } while (0)
+
+/**
+ * @brief Read a CSV file of numbers into a flat vector; the first line (header) is discarded.
+ * @param filename path of the file to read
+ * @param skip_first_n_columns number of leading columns dropped from every row
+ * @return the remaining values, row after row
+ * @throws std::runtime_error if the file cannot be opened
+ */
+inline std::vector<float> read_csv(std::string filename, int skip_first_n_columns = 1)
+{
+  std::vector<float> result;
+  std::ifstream myFile(filename);
+  if (!myFile.is_open()) throw std::runtime_error("Could not open file");
+
+  std::string line;
+  std::getline(myFile, line);  // discard the header row
+
+  int n_lines = 0;
+  while (std::getline(myFile, line)) {
+    std::stringstream ss(line);
+    float val  = 0.f;
+    int colIdx = 0;
+    while (ss >> val) {
+      if (colIdx >= skip_first_n_columns) { result.push_back(val); }
+      // Consume the separator after every cell, skipped or not; otherwise the
+      // next extraction fails on the ',' and the rest of the row is lost.
+      if (ss.peek() == ',') ss.ignore();
+      colIdx++;
+    }
+    n_lines++;
+  }
+  printf("lines read: %d\n", n_lines);
+  return result;
+}
+
+};  // end namespace raft
\ No newline at end of file
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
index 26483e6b2d..75590463b0 100644
--- a/cpp/test/test_utils.h
+++ b/cpp/test/test_utils.h
@@ -15,23 +15,13 @@
  */
 
 #pragma once
-#include <gtest/gtest.h>
+
+#include <cmath>
 #include <iostream>
-#include <memory>
-#include <raft/core/kvp.hpp>
-#include <raft/random/rng.cuh>
-#include <raft/util/cuda_utils.cuh>
-#include <raft/util/cudart_utils.hpp>
-#include <rmm/exec_policy.hpp>
-#include <thrust/for_each.h>
 
-#include <fstream>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
+#include <gtest/gtest.h>
+
+#include <raft/core/kvp.hpp>
 
 namespace raft {
 
@@ -40,13 +30,22 @@ struct Compare {
   bool operator()(const T& a, const T& b) const { return a == b; }
 };
 
+template <typename Key, typename Value>
+struct Compare<raft::KeyValuePair<Key, Value>> {
+  bool operator()(const raft::KeyValuePair<Key, Value>& a,
+                  const raft::KeyValuePair<Key, Value>& b) const
+  {
+    return a.key == b.key && a.value == b.value;
+  }
+};
+
 template <typename T>
 struct CompareApprox {
   CompareApprox(T eps_) : eps(eps_) {}
   bool operator()(const T& a, const T& b) const
   {
-    T diff  = abs(a - b);
-    T m     = std::max(abs(a), abs(b));
+    T diff  = std::abs(a - b);
+    T m     = std::max(std::abs(a), std::abs(b));
     T ratio = diff > eps ? diff / m : diff;
 
     return (ratio <= eps);
@@ -85,8 +84,8 @@ struct CompareApproxAbs {
   CompareApproxAbs(T eps_) : eps(eps_) {}
   bool operator()(const T& a, const T& b) const
   {
-    T diff  = abs(abs(a) - abs(b));
-    T m     = std::max(abs(a), abs(b));
+    T diff  = std::abs(std::abs(a) - std::abs(b));
+    T m     = std::max(std::abs(a), std::abs(b));
     T ratio = diff >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -98,210 +97,14 @@ struct CompareApproxAbs {
 template <typename T>
 struct CompareApproxNoScaling {
   CompareApproxNoScaling(T eps_) : eps(eps_) {}
-  bool operator()(const T& a, const T& b) const { return (abs(a - b) <= eps); }
+  bool operator()(const T& a, const T& b) const { return (std::abs(a - b) <= eps); }
 
  private:
   T eps;
 };
 
-template <typename T>
-__host__ __device__ T abs(const T& a)
-{
-  return a > T(0) ? a : -a;
-}
-
-/*
- * @brief Helper function to compare 2 device n-D arrays with custom comparison
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected expected value(s)
- * @param actual actual values
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- * @{
- */
-template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
-  std::unique_ptr<T[]> exp_h(new T[size]);
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(exp_h.get(), expected, size, stream);
-  raft::update_host<T>(act_h.get(), actual, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  for (size_t i(0); i < size; ++i) {
-    auto exp = exp_h.get()[i];
-    auto act = act_h.get()[i];
-    if (!eq_compare(exp, act)) {
-      return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i;
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
-template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(act_h.get(), actual, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  for (size_t i(0); i < size; ++i) {
-    auto act = act_h.get()[i];
-    if (!eq_compare(expected, act)) {
-      return testing::AssertionFailure()
-             << "actual=" << act << " != expected=" << expected << " @" << i;
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
-template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T* expected,
-                                     const T* actual,
-                                     size_t rows,
-                                     size_t cols,
-                                     L eq_compare,
-                                     cudaStream_t stream = 0)
-{
-  size_t size = rows * cols;
-  std::unique_ptr<T[]> exp_h(new T[size]);
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(exp_h.get(), expected, size, stream);
-  raft::update_host<T>(act_h.get(), actual, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  for (size_t i(0); i < rows; ++i) {
-    for (size_t j(0); j < cols; ++j) {
-      auto idx = i * cols + j;  // row major assumption!
-      auto exp = exp_h.get()[idx];
-      auto act = act_h.get()[idx];
-      if (!eq_compare(exp, act)) {
-        return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << exp << " @" << i << "," << j;
-      }
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
-template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
-{
-  size_t size = rows * cols;
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(act_h.get(), actual, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  for (size_t i(0); i < rows; ++i) {
-    for (size_t j(0); j < cols; ++j) {
-      auto idx = i * cols + j;  // row major assumption!
-      auto act = act_h.get()[idx];
-      if (!eq_compare(expected, act)) {
-        return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
-      }
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
-/*
- * @brief Helper function to compare a device n-D arrays with an expected array
- * on the host, using a custom comparison
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected_h host array of expected value(s)
- * @param actual_d device array actual values
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- */
-template <typename T, typename L>
-testing::AssertionResult devArrMatchHost(
-  const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(act_h.get(), actual_d, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  bool ok   = true;
-  auto fail = testing::AssertionFailure();
-  for (size_t i(0); i < size; ++i) {
-    auto exp = expected_h[i];
-    auto act = act_h.get()[i];
-    if (!eq_compare(exp, act)) {
-      ok = false;
-      fail << "actual=" << act << " != expected=" << exp << " @" << i << "; ";
-    }
-  }
-  if (!ok) return fail;
-  return testing::AssertionSuccess();
-}
-
-/**
- * @brief Helper function to compare host vectors using a custom comparison
- * @tparam T the element type
- * @tparam L the comparator lambda or object function
- * @param expected_h host vector of expected value(s)
- * @param actual_h host vector actual values
- * @param eq_compare the comparator
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- */
-template <typename T, typename L>
-testing::AssertionResult hostVecMatch(const std::vector<T>& expected_h,
-                                      const std::vector<T>& actual_h,
-                                      L eq_compare)
-{
-  auto n = actual_h.size();
-  if (n != expected_h.size())
-    return testing::AssertionFailure()
-           << "vector sizez mismatch: "
-           << "actual=" << n << " != expected=" << expected_h.size() << "; ";
-  for (size_t i = 0; i < n; ++i) {
-    auto exp = expected_h[i];
-    auto act = actual_h[i];
-    if (!eq_compare(exp, act)) {
-      return testing::AssertionFailure()
-             << "actual=" << act << " != expected=" << exp << " @" << i << "; ";
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
-/*
- * @brief Helper function to compare diagonal values of a 2D matrix
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected expected value along diagonal
- * @param actual actual matrix
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- */
-template <typename T, typename L>
-testing::AssertionResult diagonalMatch(
-  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
-{
-  size_t size = rows * cols;
-  std::unique_ptr<T[]> act_h(new T[size]);
-  raft::update_host<T>(act_h.get(), actual, size, stream);
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  for (size_t i(0); i < rows; ++i) {
-    for (size_t j(0); j < cols; ++j) {
-      if (i != j) continue;
-      auto idx = i * cols + j;  // row major assumption!
-      auto act = act_h.get()[idx];
-      if (!eq_compare(expected, act)) {
-        return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
-      }
-    }
-  }
-  return testing::AssertionSuccess();
-}
-
 template <typename T, typename L>
-testing::AssertionResult match(const T expected, T actual, L eq_compare)
+testing::AssertionResult match(const T& expected, const T& actual, L eq_compare)
 {
   if (!eq_compare(expected, actual)) {
     return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected;
@@ -309,103 +112,4 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare)
   return testing::AssertionSuccess();
 }
 
-template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(T* out,
-                                                                   raft::random::RngState& rng,
-                                                                   IdxT len,
-                                                                   cudaStream_t stream,
-                                                                   T range_min = T(-1),
-                                                                   T range_max = T(1))
-{
-  raft::random::uniform(rng, out, len, range_min, range_max, stream);
-}
-
-template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_integral_v<T>> gen_uniform(T* out,
-                                                             raft::random::RngState& rng,
-                                                             IdxT len,
-                                                             cudaStream_t stream,
-                                                             T range_min = T(0),
-                                                             T range_max = T(100))
-{
-  raft::random::uniformInt(rng, out, len, range_min, range_max, stream);
-}
-
-template <typename T1, typename T2, typename IdxT>
-void gen_uniform(raft::KeyValuePair<T1, T2>* out,
-                 raft::random::RngState& rng,
-                 IdxT len,
-                 cudaStream_t stream)
-{
-  rmm::device_uvector<T1> keys(len, stream);
-  rmm::device_uvector<T2> values(len, stream);
-
-  gen_uniform(keys.data(), rng, len, stream);
-  gen_uniform(values.data(), rng, len, stream);
-
-  const T1* d_keys   = keys.data();
-  const T2* d_values = values.data();
-  auto counting      = thrust::make_counting_iterator<IdxT>(0);
-  thrust::for_each(rmm::exec_policy(stream),
-                   counting,
-                   counting + len,
-                   [out, d_keys, d_values] __device__(int idx) {
-                     out[idx].key   = d_keys[idx];
-                     out[idx].value = d_values[idx];
-                   });
-}
-
-/** @} */
-
-/** time the function call 'func' using cuda events */
-#define TIMEIT_LOOP(ms, count, func)                       \
-  do {                                                     \
-    cudaEvent_t start, stop;                               \
-    RAFT_CUDA_TRY(cudaEventCreate(&start));                \
-    RAFT_CUDA_TRY(cudaEventCreate(&stop));                 \
-    RAFT_CUDA_TRY(cudaEventRecord(start));                 \
-    for (int i = 0; i < count; ++i) {                      \
-      func;                                                \
-    }                                                      \
-    RAFT_CUDA_TRY(cudaEventRecord(stop));                  \
-    RAFT_CUDA_TRY(cudaEventSynchronize(stop));             \
-    ms = 0.f;                                              \
-    RAFT_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop)); \
-    ms /= args.runs;                                       \
-  } while (0)
-
-inline std::vector<float> read_csv(std::string filename, bool skip_first_n_columns = 1)
-{
-  std::vector<float> result;
-  std::ifstream myFile(filename);
-  if (!myFile.is_open()) throw std::runtime_error("Could not open file");
-
-  std::string line, colname;
-  int val;
-
-  if (myFile.good()) {
-    std::getline(myFile, line);
-    std::stringstream ss(line);
-    while (std::getline(ss, colname, ',')) {}
-  }
-
-  int n_lines = 0;
-  while (std::getline(myFile, line)) {
-    std::stringstream ss(line);
-    int colIdx = 0;
-    while (ss >> val) {
-      if (colIdx >= skip_first_n_columns) {
-        result.push_back(val);
-        if (ss.peek() == ',') ss.ignore();
-      }
-      colIdx++;
-    }
-    n_lines++;
-  }
-
-  printf("lines read: %d\n", n_lines);
-  myFile.close();
-  return result;
-}
-
 };  // end namespace raft
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/test/util/bitonic_sort.cu
new file mode 100644
index 0000000000..f45e8ce1e0
--- /dev/null
+++ b/cpp/test/util/bitonic_sort.cu
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/util/bitonic_sort.cuh>
+
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <numeric>
+
+namespace raft::util {
+
+constexpr int kMaxBlockSize = 512;
+constexpr int kMaxCapacity  = 128;
+
+struct test_spec {
+  int n_inputs;    // number of independent chunks to sort (one per subwarp)
+  int warp_width;  // number of threads cooperating on one chunk
+  int capacity;    // number of elements held by each thread
+  bool ascending;  // requested sort direction
+
+  [[nodiscard]] auto len() const -> int { return n_inputs * warp_width * capacity; }
+};
+
+auto operator<<(std::ostream& os, const test_spec& ss) -> std::ostream&
+{
+  os << "spec{n_inputs: " << ss.n_inputs << ", input_len: " << (ss.warp_width * ss.capacity) << " ("
+     << ss.warp_width << " * " << ss.capacity << ")";
+  os << (ss.ascending ? "; asc}" : "; dsc}");
+  return os;
+}
+
+template <int Capacity, typename T>
+__global__ void bitonic_kernel(T* arr, bool ascending, int warp_width, int n_inputs)
+{
+  const int tid          = blockDim.x * blockIdx.x + threadIdx.x;
+  const int subwarp_id   = tid / warp_width;  // index of the chunk this thread works on
+  const int subwarp_lane = tid % warp_width;  // position of the thread within its subwarp
+  T local_arr[Capacity];  // NOLINT
+  // Split the data into chunks of size `warp_width * Capacity`, each thread pointing
+  // to the beginning of its stride within the chunk (elements are `warp_width` apart).
+  T* per_thread_arr = arr + subwarp_id * warp_width * Capacity + subwarp_lane;
+
+  if (subwarp_id < n_inputs) {  // threads past the last chunk skip the load
+#pragma unroll
+    for (int i = 0; i < Capacity; i++) {
+      local_arr[i] = per_thread_arr[i * warp_width];
+    }
+  }
+
+  bitonic<Capacity>(ascending, warp_width).sort(local_arr);  // NB: called by every thread
+
+  if (subwarp_id < n_inputs) {  // write the sorted values back in place
+#pragma unroll
+    for (int i = 0; i < Capacity; i++) {
+      per_thread_arr[i * warp_width] = local_arr[i];
+    }
+  }
+}
+
+template <int Capacity>
+struct bitonic_launch {
+  template <typename T>
+  static void run(const test_spec& spec, T* arr, rmm::cuda_stream_view stream)
+  {
+    ASSERT(spec.capacity <= Capacity, "Invalid input: the requested capacity is too high.");
+    ASSERT(spec.warp_width <= WarpSize,
+           "Invalid input: the requested warp_width must be not larger than the WarpSize.");
+    if constexpr (Capacity > 1) {  // recurse with Capacity / 2 while it exceeds spec.capacity
+      if (spec.capacity < Capacity) {
+        return bitonic_launch<std::max(1, Capacity / 2)>::run(spec, arr, stream);
+      }
+    }
+    int max_block_size, min_grid_size;  // filled in by the occupancy query below
+    RAFT_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+      &min_grid_size, &max_block_size, bitonic_kernel<Capacity, T>, 0, kMaxBlockSize));
+    const int n_warps =
+      ceildiv(std::min(spec.n_inputs * spec.warp_width, max_block_size), WarpSize);
+    const int block_dim  = n_warps * WarpSize;             // always a whole number of warps
+    const int n_subwarps = block_dim / spec.warp_width;    // chunks handled by one block
+    const int grid_dim   = ceildiv(spec.n_inputs, n_subwarps);  // blocks to cover all inputs
+    bitonic_kernel<Capacity, T>
+      <<<grid_dim, block_dim, 0, stream>>>(arr, spec.ascending, spec.warp_width, spec.n_inputs);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());  // report launch errors
+  }
+};
+
+template <typename T>
+class BitonicTest : public testing::TestWithParam<test_spec> {  // NOLINT
+ protected:
+  const test_spec spec;  // NOLINT
+  std::vector<T> in;     // NOLINT
+  std::vector<T> out;    // NOLINT
+  std::vector<T> ref;    // NOLINT
+
+  void segmented_sort(std::vector<T>& vec, int k, bool ascending)  // NOLINT
+  {
+    std::vector<int> p(vec.size());  // after the sort: p[i] = source index of the value for slot i
+    std::iota(p.begin(), p.end(), 0);
+    std::sort(p.begin(), p.end(), [&vec, k, ascending](int i, int j) {
+      const int ik = i / k;  // segment (of length k) containing index i
+      const int jk = j / k;
+      if (ik == jk) { return ascending ? vec[i] < vec[j] : vec[i] > vec[j]; }
+      return ik < jk;  // different segments keep their relative order
+    });
+    for (auto i = int(vec.size()) - 1; i > 0; i--) {  // apply the permutation to vec in place
+      auto j = p[i];
+      while (j > i)  // slots above i are final; follow p to where the value was swapped to
+        j = p[j];
+      std::swap(vec[j], vec[i]);
+    }
+  }
+
+  void fill_random(rmm::device_uvector<T>& arr, rmm::cuda_stream_view stream)
+  {
+    raft::random::Rng rng(42);  // 42: presumably the seed; constant, so runs are repeatable
+    if constexpr (std::is_floating_point_v<T>) {
+      return rng.normal(arr.data(), arr.size(), T(10), T(100), stream);
+    }
+    if constexpr (std::is_integral_v<T>) {
+      return rng.normalInt(arr.data(), arr.size(), T(10), T(100), stream);
+    }
+  }
+
+ public:
+  explicit BitonicTest()
+    : spec(testing::TestWithParam<test_spec>::GetParam()),
+      in(spec.len()),
+      out(spec.len()),
+      ref(spec.len())
+  {
+    auto stream = rmm::cuda_stream_default;
+
+    // generate input
+    rmm::device_uvector<T> arr_d(spec.len(), stream);
+    fill_random(arr_d, stream);
+    update_host(in.data(), arr_d.data(), arr_d.size(), stream);
+
+    // calculate the results
+    bitonic_launch<kMaxCapacity>::run(spec, arr_d.data(), stream);  // sorts arr_d in place
+    update_host(out.data(), arr_d.data(), arr_d.size(), stream);
+
+    // make sure the results are available on host
+    stream.synchronize();
+
+    // calculate the reference
+    std::copy(in.begin(), in.end(), ref.begin());
+    segmented_sort(ref, spec.warp_width * spec.capacity, spec.ascending);
+  }
+
+  void run() { ASSERT_TRUE(hostVecMatch(ref, out, Compare<T>())); }
+};
+
+auto inputs = ::testing::Values(test_spec{1, 1, 1, true},
+                                test_spec{1, 2, 1, true},
+                                test_spec{1, 4, 1, true},
+                                test_spec{1, 8, 1, true},
+                                test_spec{1, 16, 1, false},
+                                test_spec{1, 32, 1, false},
+                                test_spec{1, 32, 2, false},
+                                test_spec{1, 32, 4, true},
+                                test_spec{1, 32, 8, true},
+                                test_spec{5, 32, 2, true},
+                                test_spec{7, 16, 4, true},
+                                test_spec{7, 8, 2, true},
+                                test_spec{70, 4, 32, true},
+                                test_spec{70, 1, 64, true},
+                                test_spec{70, 2, 128, false});
+
+using Floats = BitonicTest<float>;                     // NOLINT
+TEST_P(Floats, Run) { run(); }                         // NOLINT
+INSTANTIATE_TEST_CASE_P(BitonicTest, Floats, inputs);  // NOLINT
+
+using Ints = BitonicTest<int>;                       // NOLINT
+TEST_P(Ints, Run) { run(); }                         // NOLINT
+INSTANTIATE_TEST_CASE_P(BitonicTest, Ints, inputs);  // NOLINT
+
+using Doubles = BitonicTest<double>;                    // NOLINT
+TEST_P(Doubles, Run) { run(); }                         // NOLINT
+INSTANTIATE_TEST_CASE_P(BitonicTest, Doubles, inputs);  // NOLINT
+
+}  // namespace raft::util
diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/util/cudart_utils.cpp
similarity index 98%
rename from cpp/test/cudart_utils.cpp
rename to cpp/test/util/cudart_utils.cpp
index 7e8585c7c7..e6b1aa9676 100644
--- a/cpp/test/cudart_utils.cpp
+++ b/cpp/test/util/cudart_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/device_atomics.cu b/cpp/test/util/device_atomics.cu
similarity index 97%
rename from cpp/test/device_atomics.cu
rename to cpp/test/util/device_atomics.cu
index 4e56b8d486..5e8a67c8f6 100644
--- a/cpp/test/device_atomics.cu
+++ b/cpp/test/util/device_atomics.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/integer_utils.cpp b/cpp/test/util/integer_utils.cpp
similarity index 96%
rename from cpp/test/integer_utils.cpp
rename to cpp/test/util/integer_utils.cpp
index 46fa8d348d..ed5dddf72d 100644
--- a/cpp/test/integer_utils.cpp
+++ b/cpp/test/util/integer_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/pow2_utils.cu b/cpp/test/util/pow2_utils.cu
similarity index 98%
rename from cpp/test/pow2_utils.cu
rename to cpp/test/util/pow2_utils.cu
index 9e9bd80673..e29e4eeb9c 100644
--- a/cpp/test/pow2_utils.cu
+++ b/cpp/test/util/pow2_utils.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/dependencies.yaml b/dependencies.yaml
new file mode 100644
index 0000000000..921e1ca46e
--- /dev/null
+++ b/dependencies.yaml
@@ -0,0 +1,186 @@
+# Dependency list for https://github.com/rapidsai/dependency-file-generator
+files:
+  all:
+    output: conda
+    matrix:
+      cuda: ["11.8"]
+      arch: [x86_64]
+    includes:
+      - build
+      - cudatoolkit
+      - develop
+      - doc
+      - run
+      - test_python
+  test_cpp:
+    output: none
+    includes:
+      - cudatoolkit
+  test_python:
+    output: none
+    includes:
+      - cudatoolkit
+      - py_version
+      - test_python
+  checks:
+    output: none
+    includes:
+      - checks
+      - py_version
+channels:
+  - rapidsai
+  - rapidsai-nightly
+  - dask/label/dev
+  - conda-forge
+  - nvidia
+dependencies:
+  build:
+    common:
+      - output_types: [conda, requirements]
+        packages:
+          - cmake>=3.23.1,!=3.25.0
+          - cuda-python >=11.7.1,<12.0
+          - cython>=0.29,<0.30
+          - ninja
+          - scikit-build>=0.13.1
+      - output_types: [conda]
+        packages:
+          - c-compiler
+          - cxx-compiler
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              arch: x86_64
+            packages:
+              - gcc_linux-64=9
+              - sysroot_linux-64==2.17
+          - matrix:
+              arch: aarch64
+            packages:
+              - gcc_linux-aarch64=9
+              - sysroot_linux-aarch64==2.17
+  checks:
+    common:
+      - output_types: [conda, requirements]
+        packages:
+          - pre-commit
+  develop:
+    common:
+      # conda-only: `clang=11.1.0` is conda match-spec syntax, which is not a
+      # valid pip requirement specifier (pip needs `==`).
+      - output_types: [conda]
+        packages:
+          - clang=11.1.0
+          - clang-tools=11.1.0
+  cudatoolkit:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              cuda: "11.8"
+            packages:
+              - cudatoolkit=11.8
+              - cuda-profiler-api=11.8.86
+              - libcublas-dev=11.11.3.6
+              - libcublas=11.11.3.6
+              - libcurand-dev=10.3.0.86
+              - libcurand=10.3.0.86
+              - libcusolver-dev=11.4.1.48
+              - libcusolver=11.4.1.48
+              - libcusparse-dev=11.7.5.86
+              - libcusparse=11.7.5.86
+          - matrix:
+              cuda: "11.5"
+            packages:
+              - cudatoolkit=11.5
+              - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages
+              - libcublas-dev>=11.7.3.1,<=11.7.4.6
+              - libcublas>=11.7.3.1,<=11.7.4.6
+              - libcurand-dev>=10.2.6.48,<=10.2.7.107
+              - libcurand>=10.2.6.48,<=10.2.7.107
+              - libcusolver-dev>=11.2.1.48,<=11.3.2.107
+              - libcusolver>=11.2.1.48,<=11.3.2.107
+              - libcusparse-dev>=11.7.0.31,<=11.7.0.107
+              - libcusparse>=11.7.0.31,<=11.7.0.107
+          - matrix:
+              cuda: "11.4"
+            packages:
+              - cudatoolkit=11.4
+              - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages
+              - &libcublas_dev114 libcublas-dev>=11.5.2.43,<=11.6.5.2
+              - &libcublas114 libcublas>=11.5.2.43,<=11.6.5.2
+              - &libcurand_dev114 libcurand-dev>=10.2.5.43,<=10.2.5.120
+              - &libcurand114 libcurand>=10.2.5.43,<=10.2.5.120
+              - &libcusolver_dev114 libcusolver-dev>=11.2.0.43,<=11.2.0.120
+              - &libcusolver114 libcusolver>=11.2.0.43,<=11.2.0.120
+              - &libcusparse_dev114 libcusparse-dev>=11.6.0.43,<=11.6.0.120
+              - &libcusparse114 libcusparse>=11.6.0.43,<=11.6.0.120
+          - matrix:
+              cuda: "11.2"
+            packages:
+              - cudatoolkit=11.2
+              - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages
+              # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs,
+              # so 11.2 uses 11.4 packages (the oldest available).
+              - *libcublas_dev114
+              - *libcublas114
+              - *libcurand_dev114
+              - *libcurand114
+              - *libcusolver_dev114
+              - *libcusolver114
+              - *libcusparse_dev114
+              - *libcusparse114
+  doc:
+    common:
+      - output_types: [conda, requirements]
+        packages:
+          - breathe
+      - output_types: [requirements]
+        packages:
+          - sphinx_markdown_tables
+      - output_types: [conda]
+        packages:
+          - doxygen>=1.8.20
+          - sphinx-markdown-tables
+  py_version:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              py: "3.8"
+            packages:
+              - python=3.8
+          - matrix:
+              py: "3.9"
+            packages:
+              - python=3.9
+          - matrix:
+              py: "3.10"
+            packages:
+              - python=3.10
+          - matrix:
+            packages:
+              - python>=3.8,<3.11
+  run:
+    common:
+      - output_types: [conda]
+        packages:
+          - rmm=23.02
+          - dask==2023.1.1
+          - distributed==2023.1.1
+          - ucx>=1.13.0
+          - ucx-py=0.30
+          - ucx-proc=*=gpu
+          - libfaiss>=1.7.1=cuda*
+          - faiss-proc=*=cuda
+          - dask-cuda=23.02
+  test_python:
+    common:
+      - output_types: [conda, requirements]
+        packages:
+          - cupy
+          - pytest
+          - pytest-cov
+          - scikit-learn
+          - scipy
diff --git a/docs/source/build.md b/docs/source/build.md
index 4e692f85c5..4052e49cf8 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -1,4 +1,30 @@
-# Install Guide
+# Installation
+
+### Conda
+
+The easiest way to install RAFT is through conda and several packages are provided.
+- `libraft-headers` RAFT headers
+- `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives.
+- `libraft-distance` (optional) contains shared libraries for distance primitives.
+- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives.
+- `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters.
+
+Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command.
+```bash
+mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft
+```
+
+You can also install the `libraft-*` conda packages individually using the `mamba` command above.
+
+After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed.
+
+### Pip
+
+pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install):
+```bash
+pip install pylibraft-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install raft-dask-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+```
 
 ## Building and installing RAFT
 
@@ -46,6 +72,12 @@ The `-n` flag can be passed to just have the build download the needed dependenc
 ./build.sh libraft -n
 ```
 
+Once installed, the `libraft` headers (and any dependencies downloaded and installed using `rapids-cmake`) can also be uninstalled using `build.sh`:
+```bash
+./build.sh libraft --uninstall
+```
+
+
 ### C++ Shared Libraries (optional)
 
 For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`:
@@ -60,6 +92,12 @@ Individual shared libraries have their own flags and multiple can be used (thoug
 
 In above example the shared libraries are installed by default into `$INSTALL_PREFIX/lib`. To disable this, pass `-n` flag.
 
+Once installed, the shared libraries, headers (and any dependencies downloaded and installed via `rapids-cmake`) can be uninstalled using `build.sh`:
+```bash
+./build.sh libraft --uninstall
+```
+
+
 ### ccache and sccache
 
 `ccache` and `sccache` can be used to better cache parts of the build when rebuilding frequently, such as when working on a new feature. You can also use `ccache` or `sccache` with `build.sh`:
@@ -92,7 +130,7 @@ For example, to run the distance tests:
 It can take sometime to compile all of the tests. You can build individual tests by providing a semicolon-separated list to the `--limit-tests` option in `build.sh`:
 
 ```bash
-./build.sh libraft tests --limit-tests=NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST
+./build.sh libraft tests -n --limit-tests="NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST"
 ```
 
 ### Benchmarks
@@ -105,10 +143,10 @@ The benchmarks are broken apart by algorithm category, so you will find several
 It can take sometime to compile all of the benchmarks. You can build individual benchmarks by providing a semicolon-separated list to the `--limit-bench` option in `build.sh`:
 
 ```bash
-./build.sh libraft bench --limit-bench=NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH
+./build.sh libraft bench -n --limit-bench="NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH"
 ```
 
-### C++ Using Cmake
+### C++ Using Cmake Directly
 
 Use `CMAKE_INSTALL_PREFIX` to install RAFT into a specific location. The snippet below will install it into the current conda environment:
 ```bash
@@ -142,10 +180,10 @@ Currently, shared libraries are provided for the `libraft-nn` and `libraft-dista
 
 ### Python
 
-Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. The following example will install create and install dependencies for a CUDA 11.5 conda environment:
+Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. The following example will create a CUDA 11.8 conda environment and install its dependencies:
 
 ```bash
-mamba env create --name raft_env_name -f conda/environments/raft_dev_cuda11.5.yml
+mamba env create --name raft_env_name -f conda/environments/all_cuda-118_arch-x86_64.yaml
 mamba activate raft_env_name
 ```
 
@@ -179,6 +217,11 @@ cd python/pylibraft
 py.test -s -v
 ```
 
+The Python packages can also be uninstalled using the `build.sh` script:
+```bash
+./build.sh pylibraft raft-dask --uninstall
+```
+
 ### Documentation
 
 The documentation requires that the C++ headers and python packages have been built and installed.
@@ -335,6 +378,14 @@ find_and_configure_raft(VERSION    ${RAFT_VERSION}.00
 
 If using the nearest neighbors APIs without the shared libraries, set `ENABLE_NN_DEPENDENCIES=ON` and keep `USE_NN_LIBRARY=OFF`
 
-### Python/Cython Integration
+## Uninstall
+
+Once built and installed, RAFT can be safely uninstalled using `build.sh` by specifying any or all of the installed components. Please note that since `pylibraft` depends on `libraft`, uninstalling `pylibraft` will also uninstall `libraft`:
+```bash
+./build.sh libraft pylibraft raft-dask --uninstall
+```
 
-Once installed, RAFT's Python library can be added to downstream conda recipes, imported and used directly.
+Leaving off the installed components will uninstall everything that's been installed:
+```bash
+./build.sh --uninstall
+```
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e0c1f4543a..4a0dfe00b5 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,17 +77,17 @@
 
 # General information about the project.
 project = "raft"
-copyright = "2022, nvidia"
-author = "nvidia"
+copyright = "2023, NVIDIA Corporation"
+author = "NVIDIA Corporation"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '22.12'
+version = '23.02'
 # The full version, including alpha/beta/rc tags.
-release = '22.12.01'
+release = '23.02.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -161,7 +161,7 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, "raft.tex", "RAFT Documentation", "nvidia", "manual"),
+    (master_doc, "raft.tex", "RAFT Documentation", "NVIDIA Corporation", "manual"),
 ]
 
 # -- Options for manual page output ---------------------------------------
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 569fd64061..0e82d81e35 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -1,6 +1,6 @@
-~~~~~~~~~~~~~~~~~
-C++ API Reference
-~~~~~~~~~~~~~~~~~
+~~~~~~~
+C++ API
+~~~~~~~
 
 .. _api:
 
@@ -13,8 +13,9 @@ C++ API Reference
    cpp_api/linalg.rst
    cpp_api/matrix.rst
    cpp_api/mdspan.rst
+   cpp_api/mnmg.rst
    cpp_api/neighbors.rst
-   cpp_api/solver.rst
    cpp_api/random.rst
+   cpp_api/solver.rst
    cpp_api/sparse.rst
    cpp_api/stats.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/cluster.rst b/docs/source/cpp_api/cluster.rst
index 6fdc1c696f..77c8332bbd 100644
--- a/docs/source/cpp_api/cluster.rst
+++ b/docs/source/cpp_api/cluster.rst
@@ -1,41 +1,17 @@
 Cluster
 =======
 
-This page provides C++ class references for the publicly-exposed elements of the `raft/cluster` headers. RAFT provides
+This page provides C++ API references for the publicly-exposed elements of the `raft/cluster` headers. RAFT provides
 fundamental clustering algorithms which are, themselves, considered reusable building blocks for other algorithms.
 
 .. role:: py(code)
    :language: c++
    :class: highlight
 
-K-Means
-#######
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
-Header: `raft/cluster/kmeans.cuh`
-
-.. doxygennamespace:: raft::cluster::kmeans
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-Hierarchical Clustering
-#######################
-
-Header: `raft/cluster/single_linkage.cuh`
-
-.. doxygennamespace:: raft::cluster::hierarchy
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-Spectral Clustering
-###################
-
-Header: `raft/spectral/partition.cuh`
-
-.. doxygennamespace:: raft::spectral
-    :project: RAFT
-    :members:
-    :content-only:
+   cluster_kmeans.rst
+   cluster_slhc.rst
+   cluster_spectral.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/cluster_kmeans.rst b/docs/source/cpp_api/cluster_kmeans.rst
new file mode 100644
index 0000000000..fa040ddc18
--- /dev/null
+++ b/docs/source/cpp_api/cluster_kmeans.rst
@@ -0,0 +1,13 @@
+K-Means
+=======
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/cluster/kmeans.cuh>``
+
+.. doxygennamespace:: raft::cluster::kmeans
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/cluster_slhc.rst b/docs/source/cpp_api/cluster_slhc.rst
new file mode 100644
index 0000000000..fc45ae699a
--- /dev/null
+++ b/docs/source/cpp_api/cluster_slhc.rst
@@ -0,0 +1,13 @@
+Hierarchical Clustering
+=======================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/cluster/single_linkage.cuh>``
+
+.. doxygennamespace:: raft::cluster::hierarchy
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/cluster_spectral.rst b/docs/source/cpp_api/cluster_spectral.rst
new file mode 100644
index 0000000000..a71f431ab8
--- /dev/null
+++ b/docs/source/cpp_api/cluster_spectral.rst
@@ -0,0 +1,13 @@
+Spectral Clustering
+===================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/spectral/partition.cuh>``
+
+.. doxygennamespace:: raft::spectral
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index 68965053de..c4728337a0 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -10,64 +10,13 @@ expose in public APIs.
    :language: c++
    :class: highlight
 
-
-handle_t
-########
-
-Header: `raft/core/handle.hpp`
-
-.. doxygenclass:: raft::handle_t
-    :project: RAFT
-    :members:
-
-
-Interruptible
-#############
-
-Header: `raft/core/interupptible.hpp`
-
-.. doxygenclass:: raft::interruptible
-    :project: RAFT
-    :members:
-
-NVTX
-####
-
-Header: `raft/core/nvtx.hpp`
-
-.. doxygennamespace:: raft::common::nvtx
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-Key-Value Pair
-##############
-
-Header: `raft/core/kvp.hpp`
-
-.. doxygenstruct:: raft::KeyValuePair
-    :project: RAFT
-    :members:
-
-
-logger
-######
-
-Header: `raft/core/logger.hpp`
-
-.. doxygenclass:: raft::logger
-    :project: RAFT
-    :members:
-
-
-Multi-node Multi-GPU
-####################
-
-Header: `raft/core/comms.hpp`
-
-.. doxygennamespace:: raft::comms
-    :project: RAFT
-    :members:
-    :content-only:
-
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   core_resources.rst
+   core_logger.rst
+   core_kvp.rst
+   core_nvtx.rst
+   core_interruptible.rst
+   core_operators.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/core_interruptible.rst b/docs/source/cpp_api/core_interruptible.rst
new file mode 100644
index 0000000000..da767cdd6d
--- /dev/null
+++ b/docs/source/cpp_api/core_interruptible.rst
@@ -0,0 +1,15 @@
+Interruptible
+=============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+``#include <raft/core/interruptible.hpp>``
+
+namespace *raft*
+
+.. doxygenclass:: raft::interruptible
+    :project: RAFT
+    :members:
diff --git a/docs/source/cpp_api/core_kvp.rst b/docs/source/cpp_api/core_kvp.rst
new file mode 100644
index 0000000000..60a0da078b
--- /dev/null
+++ b/docs/source/cpp_api/core_kvp.rst
@@ -0,0 +1,15 @@
+Key-Value Pair
+==============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/kvp.hpp>``
+
+namespace *raft*
+
+.. doxygenstruct:: raft::KeyValuePair
+    :project: RAFT
+    :members:
+
diff --git a/docs/source/cpp_api/core_logger.rst b/docs/source/cpp_api/core_logger.rst
new file mode 100644
index 0000000000..60714a63ea
--- /dev/null
+++ b/docs/source/cpp_api/core_logger.rst
@@ -0,0 +1,15 @@
+logger
+======
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/logger.hpp>``
+
+namespace *raft*
+
+.. doxygenclass:: raft::logger
+    :project: RAFT
+    :members:
+
diff --git a/docs/source/cpp_api/core_nvtx.rst b/docs/source/cpp_api/core_nvtx.rst
new file mode 100644
index 0000000000..addcbdda30
--- /dev/null
+++ b/docs/source/cpp_api/core_nvtx.rst
@@ -0,0 +1,17 @@
+NVTX
+====
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/nvtx.hpp>``
+
+namespace *raft::common::nvtx*
+
+.. doxygennamespace:: raft::common::nvtx
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/core_operators.rst b/docs/source/cpp_api/core_operators.rst
new file mode 100644
index 0000000000..be6443069d
--- /dev/null
+++ b/docs/source/cpp_api/core_operators.rst
@@ -0,0 +1,16 @@
+Operators and Functors
+======================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+``#include <raft/core/operators.hpp>``
+
+namespace *raft*
+
+.. doxygengroup:: operators
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst
new file mode 100644
index 0000000000..b148e38e44
--- /dev/null
+++ b/docs/source/cpp_api/core_resources.rst
@@ -0,0 +1,183 @@
+Resources
+=========
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+All resources that are specific to a computing environment, such as host or device, are contained within and managed by
+``raft::resources``. This design simplifies the APIs and eases user burden by keeping the resources opaque by default while still allowing customization based on user preference.
+
+
+Vocabulary
+----------
+
+``#include <raft/core/resource/resource_types.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_types
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Device Resources
+----------------
+
+``#include <raft/core/device_resources.hpp>``
+
+namespace *raft*
+
+.. doxygenclass:: raft::device_resources
+    :project: RAFT
+    :members:
+
+
+Resource Functions
+------------------
+
+Comms
+~~~~~
+
+``#include <raft/core/resource/comms.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_comms
+    :project: RAFT
+    :members:
+    :content-only:
+
+cuBLAS Handle
+~~~~~~~~~~~~~
+
+``#include <raft/core/resource/cublas_handle.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cublas
+    :project: RAFT
+    :members:
+    :content-only:
+
+CUDA Stream
+~~~~~~~~~~~
+
+``#include <raft/core/resource/cuda_stream.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cuda_stream
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+CUDA Stream Pool
+~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/cuda_stream_pool.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cuda_stream_pool
+    :project: RAFT
+    :members:
+    :content-only:
+
+cuSolverDn Handle
+~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/cusolver_dn_handle.hpp>``
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cusolver_dn
+    :project: RAFT
+    :members:
+    :content-only:
+
+cuSolverSp Handle
+~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/cusolver_sp_handle.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cusolver_sp
+    :project: RAFT
+    :members:
+    :content-only:
+
+cuSparse Handle
+~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/cusparse_handle.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_cusparse
+    :project: RAFT
+    :members:
+    :content-only:
+
+Device ID
+~~~~~~~~~
+
+``#include <raft/core/resource/device_id.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_device_id
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Device Memory Resource
+~~~~~~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/device_memory_resource.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_memory_resource
+    :project: RAFT
+    :members:
+    :content-only:
+
+Device Properties
+~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/device_properties.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_device_props
+    :project: RAFT
+    :members:
+    :content-only:
+
+Sub Communicators
+~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/sub_comms.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_sub_comms
+    :project: RAFT
+    :members:
+    :content-only:
+
+Thrust Exec Policy
+~~~~~~~~~~~~~~~~~~
+
+``#include <raft/core/resource/thrust_policy.hpp>``
+
+namespace *raft::resource*
+
+.. doxygengroup:: resource_thrust_policy
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/distance.rst b/docs/source/cpp_api/distance.rst
index e77e311cdc..eb9bc6255d 100644
--- a/docs/source/cpp_api/distance.rst
+++ b/docs/source/cpp_api/distance.rst
@@ -8,12 +8,20 @@ distances have been highly optimized and support a wide assortment of different
    :language: c++
    :class: highlight
 
+Distance Types
+--------------
+
+``#include <raft/distance/distance_types.hpp>``
+
+namespace *raft::distance*
+
+.. doxygenenum:: raft::distance::DistanceType
+   :project: RAFT
 
-Distance
-########
 
-Header: `raft/distance/distance.cuh`
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
-.. doxygennamespace:: raft::distance
-    :project: RAFT
-    :members:
+   distance_pairwise.rst
+   distance_1nn.rst
diff --git a/docs/source/cpp_api/distance_1nn.rst b/docs/source/cpp_api/distance_1nn.rst
new file mode 100644
index 0000000000..8627069a2d
--- /dev/null
+++ b/docs/source/cpp_api/distance_1nn.rst
@@ -0,0 +1,16 @@
+1-Nearest Neighbors
+===================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/distance/fused_l2_nn.cuh>``
+
+namespace *raft::distance*
+
+.. doxygengroup:: fused_l2_nn
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/distance_pairwise.rst b/docs/source/cpp_api/distance_pairwise.rst
new file mode 100644
index 0000000000..2a9c9a92f5
--- /dev/null
+++ b/docs/source/cpp_api/distance_pairwise.rst
@@ -0,0 +1,17 @@
+Pairwise Distance
+=================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/distance/distance.cuh>``
+
+namespace *raft::distance*
+
+.. doxygengroup:: distance_mdspan
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst
index 081eb40298..3cd928c9db 100644
--- a/docs/source/cpp_api/linalg.rst
+++ b/docs/source/cpp_api/linalg.rst
@@ -10,8 +10,13 @@ hide the complexities of lower-level C-based libraries provided in the CUDA tool
    :language: c++
    :class: highlight
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
-.. doxygennamespace:: raft::linalg
-    :project: RAFT
-    :members:
-    :content-only:
+   linalg_arithmetic.rst
+   linalg_blas.rst
+   linalg_map_reduce.rst
+   linalg_matrix.rst
+   linalg_matrix_vector.rst
+   linalg_solver.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/linalg_arithmetic.rst b/docs/source/cpp_api/linalg_arithmetic.rst
new file mode 100644
index 0000000000..7bc428b9f0
--- /dev/null
+++ b/docs/source/cpp_api/linalg_arithmetic.rst
@@ -0,0 +1,117 @@
+Arithmetic
+==========
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Addition
+--------
+
+``#include <raft/linalg/add.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: add_dense
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Binary Op
+---------
+
+``#include <raft/linalg/binary_op.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: binary_op
+    :project: RAFT
+    :members:
+    :content-only:
+
+Division
+--------
+
+``#include <raft/linalg/divide.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: divide
+    :project: RAFT
+    :members:
+    :content-only:
+
+Multiplication
+--------------
+
+``#include <raft/linalg/multiply.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: multiply
+    :project: RAFT
+    :members:
+    :content-only:
+
+Power
+-----
+
+``#include <raft/linalg/power.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: power
+    :project: RAFT
+    :members:
+    :content-only:
+
+Square Root
+-----------
+
+``#include <raft/linalg/sqrt.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: sqrt
+    :project: RAFT
+    :members:
+    :content-only:
+
+Subtraction
+-----------
+
+``#include <raft/linalg/subtract.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: sub
+    :project: RAFT
+    :members:
+    :content-only:
+
+Ternary Op
+----------
+
+``#include <raft/linalg/ternary_op.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: ternary_op
+    :project: RAFT
+    :members:
+    :content-only:
+
+Unary Op
+--------
+
+``#include <raft/linalg/unary_op.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: unary_op
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/linalg_blas.rst b/docs/source/cpp_api/linalg_blas.rst
new file mode 100644
index 0000000000..12133e1dc5
--- /dev/null
+++ b/docs/source/cpp_api/linalg_blas.rst
@@ -0,0 +1,55 @@
+BLAS Routines
+=============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+axpy
+----
+
+``#include <raft/linalg/axpy.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: axpy
+    :project: RAFT
+    :members:
+    :content-only:
+
+dot
+---
+
+``#include <raft/linalg/dot.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: dot
+    :project: RAFT
+    :members:
+    :content-only:
+
+gemm
+----
+
+``#include <raft/linalg/gemm.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: gemm
+    :project: RAFT
+    :members:
+    :content-only:
+
+gemv
+----
+
+``#include <raft/linalg/gemv.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: gemv
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/linalg_map_reduce.rst b/docs/source/cpp_api/linalg_map_reduce.rst
new file mode 100644
index 0000000000..5333a23f43
--- /dev/null
+++ b/docs/source/cpp_api/linalg_map_reduce.rst
@@ -0,0 +1,127 @@
+Mapping and Reduction
+=====================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Coalesced Reduction
+-------------------
+
+``#include <raft/linalg/coalesced_reduction.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: coalesced_reduction
+    :project: RAFT
+    :members:
+    :content-only:
+
+Map
+---
+
+``#include <raft/linalg/map.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: map
+    :project: RAFT
+    :members:
+    :content-only:
+
+Map Reduce
+----------
+
+``#include <raft/linalg/map_reduce.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: map_reduce
+    :project: RAFT
+    :members:
+    :content-only:
+
+Mean Squared Error
+------------------
+
+
+``#include <raft/linalg/mean_squared_error.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: mean_squared_error
+    :project: RAFT
+    :members:
+    :content-only:
+
+Norm
+----
+
+``#include <raft/linalg/norm.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: norm
+    :project: RAFT
+    :members:
+    :content-only:
+
+Normalize
+---------
+
+``#include <raft/linalg/normalize.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: normalize
+    :project: RAFT
+    :members:
+    :content-only:
+
+Reduction
+---------
+
+``#include <raft/linalg/reduce.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: reduction
+    :project: RAFT
+    :members:
+    :content-only:
+
+Reduce Cols By Key
+------------------
+
+``#include <raft/linalg/reduce_cols_by_key.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: reduce_cols_by_key
+    :project: RAFT
+    :members:
+    :content-only:
+
+Reduce Rows By Key
+------------------
+
+``#include <raft/linalg/reduce_rows_by_key.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: reduce_rows_by_key
+    :project: RAFT
+    :members:
+    :content-only:
+
+Strided Reduction
+-----------------
+
+``#include <raft/linalg/strided_reduction.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: strided_reduction
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/linalg_matrix.rst b/docs/source/cpp_api/linalg_matrix.rst
new file mode 100644
index 0000000000..e6024bcd02
--- /dev/null
+++ b/docs/source/cpp_api/linalg_matrix.rst
@@ -0,0 +1,19 @@
+Matrix Operations
+=================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Transpose
+---------
+
+``#include <raft/linalg/transpose.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: transpose
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/linalg_matrix_vector.rst b/docs/source/cpp_api/linalg_matrix_vector.rst
new file mode 100644
index 0000000000..d92a3c9874
--- /dev/null
+++ b/docs/source/cpp_api/linalg_matrix_vector.rst
@@ -0,0 +1,32 @@
+Matrix-Vector Operations
+========================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Arithmetic
+----------
+
+``#include <raft/linalg/matrix_vector.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: matrix_vector
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Operations
+----------
+
+``#include <raft/linalg/matrix_vector_op.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: matrix_vector_op
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/linalg_solver.rst b/docs/source/cpp_api/linalg_solver.rst
new file mode 100644
index 0000000000..1a811e072a
--- /dev/null
+++ b/docs/source/cpp_api/linalg_solver.rst
@@ -0,0 +1,66 @@
+Linear Algebra Solvers
+======================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Eigen Decomposition
+-------------------
+
+``#include <raft/linalg/eig.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: eig
+    :project: RAFT
+    :members:
+    :content-only:
+
+QR Decomposition
+----------------
+
+``#include <raft/linalg/qr.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: qr
+    :project: RAFT
+    :members:
+    :content-only:
+
+Randomized Singular-Value Decomposition
+---------------------------------------
+
+``#include <raft/linalg/rsvd.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: rsvd
+    :project: RAFT
+    :members:
+    :content-only:
+
+Singular-Value Decomposition
+----------------------------
+
+``#include <raft/linalg/svd.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: svd
+    :project: RAFT
+    :members:
+    :content-only:
+
+Least Squares
+-------------
+
+``#include <raft/linalg/lstsq.cuh>``
+
+namespace *raft::linalg*
+
+.. doxygengroup:: lstsq
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/matrix.rst b/docs/source/cpp_api/matrix.rst
index b032281a1c..17953bc128 100644
--- a/docs/source/cpp_api/matrix.rst
+++ b/docs/source/cpp_api/matrix.rst
@@ -9,7 +9,12 @@ headers cover many operations on matrices that are otherwise not covered by `raf
    :class: highlight
 
 
-.. doxygennamespace:: raft::matrix
-    :project: RAFT
-    :members:
-    :content-only:
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   matrix_arithmetic.rst
+   matrix_manipulation.rst
+   matrix_ordering.rst
+   matrix_reduction.rst
+   matrix_selection.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/matrix_arithmetic.rst b/docs/source/cpp_api/matrix_arithmetic.rst
new file mode 100644
index 0000000000..4ed2a41680
--- /dev/null
+++ b/docs/source/cpp_api/matrix_arithmetic.rst
@@ -0,0 +1,80 @@
+Matrix Arithmetic
+=================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Line-wise Operation
+-------------------
+
+``#include <raft/matrix/linewise_op.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: linewise_op
+    :project: RAFT
+    :members:
+    :content-only:
+
+Power
+-----
+
+``#include <raft/matrix/power.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_power
+    :project: RAFT
+    :members:
+    :content-only:
+
+Ratio
+-----
+
+``#include <raft/matrix/ratio.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_ratio
+    :project: RAFT
+    :members:
+    :content-only:
+
+Reciprocal
+----------
+
+``#include <raft/matrix/reciprocal.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_reciprocal
+    :project: RAFT
+    :members:
+    :content-only:
+
+Sign-flip
+---------
+
+``#include <raft/matrix/sign_flip.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_sign_flip
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Square Root
+-----------
+
+``#include <raft/matrix/sqrt.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_sqrt
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/matrix_manipulation.rst b/docs/source/cpp_api/matrix_manipulation.rst
new file mode 100644
index 0000000000..d0da51e4b7
--- /dev/null
+++ b/docs/source/cpp_api/matrix_manipulation.rst
@@ -0,0 +1,44 @@
+Matrix Manipulation
+===================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Initialization
+--------------
+
+``#include <raft/matrix/init.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_init
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Reverse
+-------
+
+``#include <raft/matrix/reverse.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_reverse
+    :project: RAFT
+    :members:
+    :content-only:
+
+Threshold
+---------
+
+``#include <raft/matrix/threshold.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_threshold
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/matrix_ordering.rst b/docs/source/cpp_api/matrix_ordering.rst
new file mode 100644
index 0000000000..0af84e14f5
--- /dev/null
+++ b/docs/source/cpp_api/matrix_ordering.rst
@@ -0,0 +1,54 @@
+Matrix Ordering
+===============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Argmax
+------
+
+``#include <raft/matrix/argmax.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: argmax
+    :project: RAFT
+    :members:
+    :content-only:
+
+Argmin
+------
+
+``#include <raft/matrix/argmin.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: argmin
+    :project: RAFT
+    :members:
+    :content-only:
+
+Select-K
+--------
+
+``#include <raft/matrix/select_k.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: select_k
+    :project: RAFT
+    :members:
+    :content-only:
+
+Column-wise Sort
+----------------
+
+``#include <raft/matrix/col_wise_sort.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: col_wise_sort
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/matrix_reduction.rst b/docs/source/cpp_api/matrix_reduction.rst
new file mode 100644
index 0000000000..440a1528b4
--- /dev/null
+++ b/docs/source/cpp_api/matrix_reduction.rst
@@ -0,0 +1,19 @@
+Matrix Reductions
+=================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Matrix Norm
+-----------
+
+``#include <raft/matrix/norm.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_norm
+    :project: RAFT
+    :members:
+    :content-only:
\ No newline at end of file
diff --git a/docs/source/cpp_api/matrix_selection.rst b/docs/source/cpp_api/matrix_selection.rst
new file mode 100644
index 0000000000..4842a75e0e
--- /dev/null
+++ b/docs/source/cpp_api/matrix_selection.rst
@@ -0,0 +1,69 @@
+Matrix Selection
+================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Copy
+----
+
+``#include <raft/matrix/copy.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_copy
+    :project: RAFT
+    :members:
+    :content-only:
+
+Diagonal
+--------
+
+``#include <raft/matrix/diagonal.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_diagonal
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Gather
+------
+
+``#include <raft/matrix/gather.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_gather
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Slicing
+-------
+
+``#include <raft/matrix/slice.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_slice
+    :project: RAFT
+    :members:
+    :content-only:
+
+Triangular
+----------
+
+``#include <raft/matrix/triangular.cuh>``
+
+namespace *raft::matrix*
+
+.. doxygengroup:: matrix_triangular
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/mdspan.rst b/docs/source/cpp_api/mdspan.rst
index 511ead8573..af38247c01 100644
--- a/docs/source/cpp_api/mdspan.rst
+++ b/docs/source/cpp_api/mdspan.rst
@@ -8,339 +8,11 @@ This page provides C++ class references for the RAFT's 1d span and multi-dimensi
    :class: highlight
 
 
-Representation
-##############
-
-
-Layouts
--------
-
-.. doxygentypedef:: raft::row_major
-    :project: RAFT
-
-.. doxygentypedef:: raft::col_major
-    :project: RAFT
-
-
-Shapes
-------
-
-.. doxygentypedef:: raft::matrix_extent
-    :project: RAFT
-
-.. doxygentypedef:: raft::vector_extent
-    :project: RAFT
-
-.. doxygentypedef:: raft::scalar_extent
-    :project: RAFT
-
-.. doxygentypedef:: raft::extent_3d
-    :project: RAFT
-
-.. doxygentypedef:: raft::extent_4d
-    :project: RAFT
-
-.. doxygentypedef:: raft::extent_5d
-    :project: RAFT
-
-.. doxygenfunction:: raft::flatten(mdspan_type mds)
-    :project: RAFT
-
-.. doxygenfunction:: raft:: flatten(const array_interface_type& mda)
-    :project: RAFT
-
-.. doxygenfunction:: raft::reshape(mdspan_type mds, extents<IndexType, Extents...> new_shape)
-    :project: RAFT
-
-.. doxygenfunction:: raft::reshape(const array_interface_type& mda, extents<IndexType, Extents...> new_shape)
-    :project: RAFT
-
-
-Accessors
----------
-
-.. doxygenstruct:: raft::host_device_accessor
-    :project: RAFT
-    :members:
-
-.. doxygentypedef:: raft::host_accessor
-    :project: RAFT
-
-.. doxygentypedef:: raft::device_accessor
-    :project: RAFT
-
-.. doxygentypedef:: raft::managed_accessor
-    :project: RAFT
-
-
-
-
-mdarray
-#######
-
-.. doxygenclass:: raft::mdarray
-    :project: RAFT
-    :members:
-
-.. doxygenclass:: raft::array_interface
-    :project: RAFT
-    :members:
-
-.. doxygenstruct:: raft::is_array_interface
-    :project: RAFT
-    :members:
-
-.. doxygentypedef:: raft::is_array_interface_t
-    :project RAFT
-
-Device Vocabulary
------------------
-
-.. doxygentypedef:: raft::device_mdarray
-    :project: RAFT
-
-.. doxygentypedef:: raft::device_matrix
-    :project: RAFT
-
-.. doxygentypedef:: raft::device_vector
-    :project: RAFT
-
-.. doxygentypedef:: raft::device_scalar
-    :project: RAFT
-
-
-Device Factories
-----------------
-
-.. doxygenfunction:: raft::make_device_matrix
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_vector
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_scalar
-    :project: RAFT
-
-
-Host Vocabulary
----------------
-
-.. doxygentypedef:: raft::host_matrix
-    :project: RAFT
-
-.. doxygentypedef:: raft::host_vector
-    :project: RAFT
-
-.. doxygentypedef:: raft::host_scalar
-    :project: RAFT
-
-
-Host Factories
---------------
-
-.. doxygenfunction:: raft::make_host_matrix
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_host_vector
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_scalar
-    :project: RAFT
-
-mdspan
-######
-
-.. doxygentypedef:: raft::mdspan
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_mdspan
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_extents
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_strided_layout(Extents extents, Strides strides)
-    :project: RAFT
-
-.. doxygenfunction:: raft::unravel_index
-    :project: RAFT
-
-
-Device Vocabulary
------------------
-
-.. doxygentypedef:: raft::device_mdspan
-   :project: RAFT
-
-.. doxygenstruct:: raft::is_device_mdspan
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_device_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_input_device_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_output_device_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_device_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_input_device_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_output_device_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::device_matrix_view
-   :project: RAFT
-
-.. doxygentypedef:: raft::device_vector_view
-   :project: RAFT
-
-.. doxygentypedef:: raft::device_scalar_view
-   :project: RAFT
-
-
-Device Factories
-----------------
-
-.. doxygenfunction:: raft::make_device_matrix_view
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_vector_view(ElementType* ptr, IndexType n)
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_scalar_view
-   :project: RAFT
-
-
-Managed Vocabulary
-------------------
-
-..doxygentypedef:: raft::managed_mdspan
-  :project: RAFT
-
-.. doxygenstruct:: raft::is_managed_mdspan
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_managed_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_input_managed_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_output_managed_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_managed_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_input_managed_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_output_managed_mdspan
-    :project: RAFT
-
-
-Managed Factories
------------------
-
-.. doxygenfunction:: make_managed_mdspan(ElementType* ptr, extents<IndexType, Extents...> exts)
-
-
-Host Vocabulary
----------------
-
-.. doxygentypedef:: raft::host_mdspan
-   :project: RAFT
-
-.. doxygenstruct:: raft::is_host_mdspan
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_host_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_input_host_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::is_output_host_mdspan_t
-   :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_host_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_input_host_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_output_host_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::host_matrix_view
-   :project: RAFT
-
-.. doxygentypedef:: raft::host_vector_view
-   :project: RAFT
-
-.. doxygentypedef:: raft::host_scalar_view
-   :project: RAFT
-
-Host Factories
---------------
-
-.. doxygenfunction:: raft::make_host_matrix_view
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_host_vector_view
-    :project: RAFT
-
-.. doxygenfunction:: raft::make_device_scalar_view
-    :project: RAFT
-
-
-Validation Routines
--------------------
-
-.. doxygenstruct:: raft::is_mdspan
-    :project: RAFT
-    :members:
-
-.. doxygentypedef:: raft::is_mdspan_t
-    :project: RAFT
-
-.. doxygenstruct:: raft::is_input_mdspan
-    :project: RAFT
-    :members:
-
-.. doxygentypedef:: raft::is_input_mdspan_t
-    :project: RAFT
-
-.. doxygenstruct:: raft::is_output_mdspan
-    :project: RAFT
-    :members:
-
-.. doxygentypedef:: raft::is_output_mdspan_t
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_input_mdspan
-    :project: RAFT
-
-.. doxygentypedef:: raft::enable_if_output_mdspan
-    :project: RAFT
-
-span
-####
-
-.. doxygentypedef:: raft::device_span
-   :project: RAFT
-
-.. doxygentypedef:: raft::host_span
-   :project: RAFT
-
-.. doxygenclass:: raft::span
-    :project: RAFT
-    :members:
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   mdspan_representation.rst
+   mdspan_mdspan.rst
+   mdspan_mdarray.rst
+   mdspan_span.rst
diff --git a/docs/source/cpp_api/mdspan_mdarray.rst b/docs/source/cpp_api/mdspan_mdarray.rst
new file mode 100644
index 0000000000..bf9e9e0139
--- /dev/null
+++ b/docs/source/cpp_api/mdspan_mdarray.rst
@@ -0,0 +1,85 @@
+mdarray: Multi-dimensional Owning Container
+===========================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/mdarray.hpp>``
+
+.. doxygenclass:: raft::mdarray
+    :project: RAFT
+    :members:
+
+.. doxygenclass:: raft::array_interface
+    :project: RAFT
+    :members:
+
+.. doxygenstruct:: raft::is_array_interface
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_array_interface_t
+    :project: RAFT
+
+Device Vocabulary
+-----------------
+
+``#include <raft/core/device_mdarray.hpp>``
+
+.. doxygentypedef:: raft::device_mdarray
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_matrix
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_vector
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_scalar
+    :project: RAFT
+
+
+Device Factories
+----------------
+
+``#include <raft/core/device_mdarray.hpp>``
+
+.. doxygenfunction:: raft::make_device_matrix
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_vector
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_scalar
+    :project: RAFT
+
+
+Host Vocabulary
+---------------
+
+``#include <raft/core/host_mdarray.hpp>``
+
+.. doxygentypedef:: raft::host_matrix
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_vector
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_scalar
+    :project: RAFT
+
+
+Host Factories
+--------------
+
+``#include <raft/core/host_mdarray.hpp>``
+
+.. doxygenfunction:: raft::make_host_matrix
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_vector
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_scalar
+    :project: RAFT
diff --git a/docs/source/cpp_api/mdspan_mdspan.rst b/docs/source/cpp_api/mdspan_mdspan.rst
new file mode 100644
index 0000000000..619150f538
--- /dev/null
+++ b/docs/source/cpp_api/mdspan_mdspan.rst
@@ -0,0 +1,208 @@
+mdspan: Multi-dimensional Non-owning View
+==========================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/mdspan.hpp>``
+
+.. doxygentypedef:: raft::mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_extents
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_strided_layout(Extents extents, Strides strides)
+    :project: RAFT
+
+.. doxygenfunction:: raft::unravel_index
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_const_mdspan(mdspan_type mds)
+    :project: RAFT
+
+
+Device Vocabulary
+-----------------
+
+``#include <raft/core/device_mdspan.hpp>``
+
+.. doxygentypedef:: raft::device_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_device_mdspan
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_device_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_input_device_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_output_device_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::device_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::device_scalar_view
+   :project: RAFT
+
+
+Device Factories
+----------------
+
+``#include <raft/core/device_mdspan.hpp>``
+
+.. doxygenfunction:: raft::make_device_matrix_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_vector_view(ElementType* ptr, IndexType n)
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_scalar_view
+   :project: RAFT
+
+
+Managed Vocabulary
+------------------
+
+``#include <raft/core/device_mdspan.hpp>``
+
+.. doxygentypedef:: raft::managed_mdspan
+  :project: RAFT
+
+.. doxygenstruct:: raft::is_managed_mdspan
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_managed_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_input_managed_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_output_managed_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_managed_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_managed_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_managed_mdspan
+    :project: RAFT
+
+
+Managed Factories
+-----------------
+
+``#include <raft/core/device_mdspan.hpp>``
+
+.. doxygenfunction:: raft::make_managed_mdspan(ElementType* ptr, extents<IndexType, Extents...> exts)
+    :project: RAFT
+
+
+Host Vocabulary
+---------------
+
+``#include <raft/core/host_mdspan.hpp>``
+
+.. doxygentypedef:: raft::host_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_host_mdspan
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_host_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_input_host_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::is_output_host_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::host_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::host_scalar_view
+   :project: RAFT
+
+Host Factories
+--------------
+
+``#include <raft/core/host_mdspan.hpp>``
+
+.. doxygenfunction:: raft::make_host_matrix_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_vector_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_scalar_view
+    :project: RAFT
+
+
+Validation Routines
+-------------------
+
+``#include <raft/core/mdspan.hpp>``
+
+.. doxygenstruct:: raft::is_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_mdspan_t
+    :project: RAFT
+
+.. doxygenstruct:: raft::is_input_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_input_mdspan_t
+    :project: RAFT
+
+.. doxygenstruct:: raft::is_output_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_output_mdspan_t
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_mdspan
+    :project: RAFT
diff --git a/docs/source/cpp_api/mdspan_representation.rst b/docs/source/cpp_api/mdspan_representation.rst
new file mode 100644
index 0000000000..fbae03a3e0
--- /dev/null
+++ b/docs/source/cpp_api/mdspan_representation.rst
@@ -0,0 +1,74 @@
+Multi-dimensional Representation
+================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Data Layouts
+-------------
+
+``#include <raft/core/mdspan.hpp>``
+
+.. doxygentypedef:: raft::row_major
+    :project: RAFT
+
+.. doxygentypedef:: raft::col_major
+    :project: RAFT
+
+
+Shapes
+------
+
+``#include <raft/core/mdspan.hpp>``
+
+.. doxygentypedef:: raft::matrix_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::vector_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::scalar_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_3d
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_4d
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_5d
+    :project: RAFT
+
+.. doxygenfunction:: raft::flatten(mdspan_type mds)
+    :project: RAFT
+
+.. doxygenfunction:: raft::flatten(const array_interface_type& mda)
+    :project: RAFT
+
+.. doxygenfunction:: raft::reshape(mdspan_type mds, extents<IndexType, Extents...> new_shape)
+    :project: RAFT
+
+.. doxygenfunction:: raft::reshape(const array_interface_type& mda, extents<IndexType, Extents...> new_shape)
+    :project: RAFT
+
+
+Accessors
+---------
+
+``#include <raft/core/host_device_accessor.hpp>``
+
+.. doxygenstruct:: raft::host_device_accessor
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::host_accessor
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_accessor
+    :project: RAFT
+
+.. doxygentypedef:: raft::managed_accessor
+    :project: RAFT
+
+
diff --git a/docs/source/cpp_api/mdspan_span.rst b/docs/source/cpp_api/mdspan_span.rst
new file mode 100644
index 0000000000..2bdaf4941e
--- /dev/null
+++ b/docs/source/cpp_api/mdspan_span.rst
@@ -0,0 +1,23 @@
+span: One-dimensional Non-owning View
+=====================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/span.hpp>``
+
+.. doxygenclass:: raft::span
+    :project: RAFT
+    :members:
+
+``#include <raft/core/device_span.hpp>``
+
+.. doxygentypedef:: raft::device_span
+   :project: RAFT
+
+``#include <raft/core/host_span.hpp>``
+
+.. doxygentypedef:: raft::host_span
+   :project: RAFT
+
diff --git a/docs/source/cpp_api/mnmg.rst b/docs/source/cpp_api/mnmg.rst
new file mode 100644
index 0000000000..9543cbb4ee
--- /dev/null
+++ b/docs/source/cpp_api/mnmg.rst
@@ -0,0 +1,50 @@
+Multi-node Multi-GPU
+====================
+
+RAFT contains C++ infrastructure for abstracting the communications layer when writing applications that scale on multiple nodes and across multiple GPUs. This infrastructure assumes OPG (one-process-per-GPU) architectures, where multiple physical parallel units (processes, ranks, or workers) might be executing code concurrently but where each parallel unit communicates with only a single GPU and is the only process communicating with that GPU.
+
+The comms layer in RAFT is intended to provide a facade API for barrier synchronous collective communications, allowing users to write algorithms using a single abstraction layer and deploy in many different types of systems. Currently, RAFT communications code has been deployed in MPI, Dask, and Spark clusters.
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Common Types
+------------
+
+``#include <raft/core/comms.hpp>``
+
+namespace *raft::comms*
+
+.. doxygengroup:: comms_types
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Comms Interface
+---------------
+
+.. doxygengroup:: comms_t
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+MPI Comms
+---------
+
+.. doxygengroup:: mpi_comms_factory
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+NCCL+UCX Comms
+--------------
+
+.. doxygengroup:: std_comms_factory
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst
index 93eecf68b4..9d2e762689 100644
--- a/docs/source/cpp_api/neighbors.rst
+++ b/docs/source/cpp_api/neighbors.rst
@@ -7,57 +7,12 @@ This page provides C++ class references for the publicly-exposed elements of the
    :language: c++
    :class: highlight
 
-
-Brute-force
------------
-
-Header: `raft/neighbors/brute_force.cuh`
-
-.. doxygennamespace:: raft::neighbors::brute_force
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-IVF-Flat
---------
-
-Header: `raft/neighbors/ivf_flat.cuh`
-
-.. doxygennamespace:: raft::neighbors::ivf_flat
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-IVF-PQ
---------
-
-Header: `raft/neighbors/ivf_pq.cuh`
-
-.. doxygennamespace:: raft::neighbors::ivf_pq
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-Epsilon Neighborhood
---------------------
-
-Header: `raft/neighbors/epsilon_neighborhood.cuh`
-
-.. doxygennamespace:: raft::neighbors::epsilon_neighborhood
-    :project: RAFT
-    :members:
-    :content-only:
-
-
-Random Ball Cover
------------------
-
-Header: `raft/neighbors/ball_cover.cuh`
-
-.. doxygennamespace:: raft::neighbors::ball_cover
-    :project: RAFT
-    :members:
-    :content-only:
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   neighbors_brute_force.rst
+   neighbors_ivf_flat.rst
+   neighbors_ivf_pq.rst
+   neighbors_epsilon_neighborhood.rst
+   neighbors_ball_cover.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/neighbors_ball_cover.rst b/docs/source/cpp_api/neighbors_ball_cover.rst
new file mode 100644
index 0000000000..85bd6b2d8e
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_ball_cover.rst
@@ -0,0 +1,17 @@
+Random Ball Cover
+=================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/neighbors/ball_cover.cuh>``
+
+namespace *raft::neighbors::ball_cover*
+
+.. doxygengroup:: random_ball_cover
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/neighbors_brute_force.rst b/docs/source/cpp_api/neighbors_brute_force.rst
new file mode 100644
index 0000000000..525addf428
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_brute_force.rst
@@ -0,0 +1,18 @@
+Brute-Force
+===========
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+``#include <raft/neighbors/brute_force.cuh>``
+
+namespace *raft::neighbors::brute_force*
+
+.. doxygengroup:: brute_force_knn
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/neighbors_epsilon_neighborhood.rst b/docs/source/cpp_api/neighbors_epsilon_neighborhood.rst
new file mode 100644
index 0000000000..f291a7605f
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_epsilon_neighborhood.rst
@@ -0,0 +1,15 @@
+Epsilon Neighborhood
+====================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/neighbors/epsilon_neighborhood.cuh>``
+
+namespace *raft::neighbors::epsilon_neighborhood*
+
+.. doxygengroup:: epsilon_neighbors
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/neighbors_ivf_flat.rst b/docs/source/cpp_api/neighbors_ivf_flat.rst
new file mode 100644
index 0000000000..6f418fb165
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_ivf_flat.rst
@@ -0,0 +1,18 @@
+IVF-Flat
+========
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/neighbors/ivf_flat.cuh>``
+
+namespace *raft::neighbors::ivf_flat*
+
+.. doxygengroup:: ivf_flat
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+
diff --git a/docs/source/cpp_api/neighbors_ivf_pq.rst b/docs/source/cpp_api/neighbors_ivf_pq.rst
new file mode 100644
index 0000000000..d22ea6231f
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_ivf_pq.rst
@@ -0,0 +1,17 @@
+IVF-PQ
+======
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/neighbors/ivf_pq.cuh>``
+
+namespace *raft::neighbors::ivf_pq*
+
+.. doxygengroup:: ivf_pq
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/random.rst b/docs/source/cpp_api/random.rst
index 353f783ed4..9f5cdc7a74 100644
--- a/docs/source/cpp_api/random.rst
+++ b/docs/source/cpp_api/random.rst
@@ -7,105 +7,23 @@ This page provides C++ class references for the publicly-exposed elements of the
    :language: c++
    :class: highlight
 
-Header: `raft/random/rng_state.hpp`
+Random State
+############
 
-.. doxygenstruct:: raft::random::RngState
-    :project: RAFT
-    :members:
-
-
-Data Generation
-###############
-
-make_blobs
-----------
-
-Header: `raft/random/make_blobs.cuh`
-
-.. doxygenfunction:: raft::random::make_blobs(raft::handle_t const& handle, raft::device_matrix_view<DataT, IdxT, layout> out, raft::device_vector_view<IdxT, IdxT> labels, IdxT n_clusters, std::optional<raft::device_matrix_view<DataT, IdxT, layout>> centers, std::optional<raft::device_vector_view<DataT, IdxT>> const cluster_std, const DataT cluster_std_scalar, bool shuffle, DataT center_box_min, DataT center_box_max, uint64_t seed, GeneratorType type)
-    :project: RAFT
-
-make_regression
----------------
-
-Header: `raft/random/make_regression.cuh`
-
-.. doxygenfunction:: raft::random::make_regression(const raft::handle_t& handle, raft::device_matrix_view<DataT, IdxT, raft::row_major> out, raft::device_matrix_view<DataT, IdxT, raft::row_major> values, IdxT n_informative, std::optional<raft::device_matrix_view<DataT, IdxT, raft::row_major>> coef, DataT bias, IdxT effective_rank, DataT tail_strength, DataT noise, bool shuffle, uint64_t seed, GeneratorType type)
-    :project: RAFT
-
-rmat
-----
-
-Header: `raft/random/rmat_rectangular_generator.cuh`
-
-.. doxygenfunction:: raft::random::rmat_rectangular_gen(const raft::handle_t& handle, raft::random::RngState& r, raft::device_vector_view<const ProbT, IdxT> theta, raft::device_mdspan<IdxT, raft::extents<IdxT, raft::dynamic_extent, 2>, raft::row_major> out, raft::device_vector_view<IdxT, IdxT> out_src, raft::device_vector_view<IdxT, IdxT> out_dst, IdxT r_scale, IdxT c_scale)
-    :project: RAFT
-
-
-Random Sampling
-###############
-
-Distributions
--------------
-
-Header: `raft/random/rng.cuh`
-
-.. doxygenfunction:: raft::random::uniform(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType start, OutputValueType end)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::uniformInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType start, OutputValueType end)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::normal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
-    :project: RAFT
+``#include <raft/random/rng_state.hpp>``
 
-.. doxygenfunction:: raft::random::normalInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::normalTable(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<const OutputValueType, IndexType> mu_vec, std::variant<raft::device_vector_view<const OutputValueType, IndexType>, OutputValueType> sigma, raft::device_matrix_view<OutputValueType, IndexType, raft::row_major> out)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::fill(const raft::handle_t& handle, RngState& rng_state, OutputValueType val, raft::device_vector_view<OutputValueType, IndexType> out)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, Type prob)
-    :project: RAFT
+namespace *raft::random*
 
-.. doxygenfunction:: raft::random::scaled_bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType prob, OutputValueType scale)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::gumbel(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType beta)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::lognormal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::logistic(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType scale)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::exponential(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType lambda)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::rayleigh(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType sigma)
-    :project: RAFT
-
-.. doxygenfunction:: raft::random::laplace(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType scale)
-    :project: RAFT
-
-
-Sampling Without Replacement
-----------------------------
-
-Header: `raft/random/rng.cuh`
-
-.. doxygengroup:: sample_without_replacement
+.. doxygenstruct:: raft::random::RngState
     :project: RAFT
     :members:
-    :content-only:
-
-Header: `raft/random/permute.cuh`
 
-.. doxygenfunction:: raft::random::permute(const raft::handle_t& handle, raft::device_matrix_view<const InputOutputValueType, IdxType, Layout> in, std::optional<raft::device_vector_view<IntType, IdxType>> permsOut, std::optional<raft::device_matrix_view<InputOutputValueType, IdxType, Layout>> out)
-    :project: RAFT
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
+   random_datagen.rst
+   random_sampling_univariate.rst
+   random_sampling_multivariable.rst
+   random_sampling_without_replacement.rst
 
diff --git a/docs/source/cpp_api/random_datagen.rst b/docs/source/cpp_api/random_datagen.rst
new file mode 100644
index 0000000000..ec23845b6b
--- /dev/null
+++ b/docs/source/cpp_api/random_datagen.rst
@@ -0,0 +1,46 @@
+Data Generation
+===============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+make_blobs
+----------
+
+``#include <raft/random/make_blobs.cuh>``
+
+namespace *raft::random*
+
+.. doxygengroup:: make_blobs
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+
+make_regression
+---------------
+
+``#include <raft/random/make_regression.cuh>``
+
+namespace *raft::random*
+
+.. doxygengroup:: make_regression
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+rmat
+----
+
+``#include <raft/random/rmat_rectangular_generator.cuh>``
+
+namespace *raft::random*
+
+.. doxygengroup:: rmat
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/random_sampling_multivariable.rst b/docs/source/cpp_api/random_sampling_multivariable.rst
new file mode 100644
index 0000000000..166043b632
--- /dev/null
+++ b/docs/source/cpp_api/random_sampling_multivariable.rst
@@ -0,0 +1,19 @@
+Multi-Variable Random Sampling
+==============================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Multi-Variable Gaussian
+-----------------------
+
+``#include <raft/random/multi_variable_gaussian.hpp>``
+
+namespace *raft::random*
+
+.. doxygengroup:: multi_variable_gaussian
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/random_sampling_univariate.rst b/docs/source/cpp_api/random_sampling_univariate.rst
new file mode 100644
index 0000000000..ffa58a0d3a
--- /dev/null
+++ b/docs/source/cpp_api/random_sampling_univariate.rst
@@ -0,0 +1,57 @@
+Univariate Random Sampling
+==========================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/random/rng.cuh>``
+
+namespace *raft::random*
+
+.. doxygenfunction:: raft::random::uniform(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType start, OutputValueType end)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::uniformInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType start, OutputValueType end)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::normal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::normalInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::normalTable(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<const OutputValueType, IndexType> mu_vec, std::variant<raft::device_vector_view<const OutputValueType, IndexType>, OutputValueType> sigma, raft::device_matrix_view<OutputValueType, IndexType, raft::row_major> out)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::fill(const raft::handle_t& handle, RngState& rng_state, OutputValueType val, raft::device_vector_view<OutputValueType, IndexType> out)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, Type prob)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::scaled_bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType prob, OutputValueType scale)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::gumbel(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType beta)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::lognormal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType sigma)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::logistic(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType scale)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::exponential(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType lambda)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::rayleigh(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType sigma)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::laplace(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view<OutputValueType, IndexType> out, OutputValueType mu, OutputValueType scale)
+    :project: RAFT
+
+.. doxygenfunction:: raft::random::discrete
+    :project: RAFT
+
+
diff --git a/docs/source/cpp_api/random_sampling_without_replacement.rst b/docs/source/cpp_api/random_sampling_without_replacement.rst
new file mode 100644
index 0000000000..ac0d3bea86
--- /dev/null
+++ b/docs/source/cpp_api/random_sampling_without_replacement.rst
@@ -0,0 +1,26 @@
+Sampling Without Replacement
+============================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/random/sample_without_replacement.cuh>``
+
+namespace *raft::random*
+
+.. doxygengroup:: sample_without_replacement
+    :project: RAFT
+    :members:
+    :content-only:
+
+``#include <raft/random/permute.cuh>``
+
+namespace *raft::random*
+
+.. doxygengroup:: permute
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/solver.rst b/docs/source/cpp_api/solver.rst
index d03f3bb1eb..4b939eef49 100644
--- a/docs/source/cpp_api/solver.rst
+++ b/docs/source/cpp_api/solver.rst
@@ -11,7 +11,7 @@ This page provides C++ class references for the publicly-exposed elements of the
 Linear Assignment Problem
 #########################
 
-Header: `raft/solver/linear_assignment.cuh`
+``#include <raft/solver/linear_assignment.cuh>``
 
 .. doxygenclass:: raft::solver::LinearAssignmentProblem
     :project: RAFT
@@ -20,7 +20,7 @@ Header: `raft/solver/linear_assignment.cuh`
 Minimum Spanning Tree
 #####################
 
-Header: `raft/sparse/solver/mst.cuh`
+``#include <raft/sparse/solver/mst.cuh>``
 
 .. doxygenfunction:: raft::sparse::solver::mst
     :project: RAFT
diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst
index ea29dbebd0..2e2bae6253 100644
--- a/docs/source/cpp_api/sparse.rst
+++ b/docs/source/cpp_api/sparse.rst
@@ -8,42 +8,13 @@ This page provides C++ class references for the publicly-exposed elements of the
    :class: highlight
 
 
-Conversion
-##########
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
-.. doxygennamespace:: raft::sparse::convert
-    :project: RAFT
-    :members:
-    :content-only:
+   sparse_distance.rst
+   sparse_linalg.rst
+   sparse_matrix.rst
+   sparse_neighbors.rst
+   sparse_solver.rst
 
-Distance
-########
-
-.. doxygennamespace:: raft::sparse::distance
-    :project: RAFT
-    :members:
-    :content-only:
-
-Linear Algebra
-##############
-
-.. doxygennamespace:: raft::sparse::linalg
-    :project: RAFT
-    :members:
-    :content-only:
-
-Matrix Operations
-#################
-
-.. doxygennamespace:: raft::sparse::op
-    :project: RAFT
-    :members:
-    :content-only:
-
-Neighbors
-#########
-
-.. doxygennamespace:: raft::sparse::neighbors
-    :project: RAFT
-    :members:
-    :content-only:
diff --git a/docs/source/cpp_api/sparse_distance.rst b/docs/source/cpp_api/sparse_distance.rst
new file mode 100644
index 0000000000..e85e43695d
--- /dev/null
+++ b/docs/source/cpp_api/sparse_distance.rst
@@ -0,0 +1,7 @@
+Sparse Distance
+===============
+
+.. doxygennamespace:: raft::sparse::distance
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/sparse_linalg.rst b/docs/source/cpp_api/sparse_linalg.rst
new file mode 100644
index 0000000000..6b20ee6c61
--- /dev/null
+++ b/docs/source/cpp_api/sparse_linalg.rst
@@ -0,0 +1,7 @@
+Sparse Linear Algebra
+=====================
+
+.. doxygennamespace:: raft::sparse::linalg
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/sparse_matrix.rst b/docs/source/cpp_api/sparse_matrix.rst
new file mode 100644
index 0000000000..18c5fea269
--- /dev/null
+++ b/docs/source/cpp_api/sparse_matrix.rst
@@ -0,0 +1,7 @@
+Sparse Matrix Operations
+========================
+
+.. doxygennamespace:: raft::sparse::op
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/sparse_neighbors.rst b/docs/source/cpp_api/sparse_neighbors.rst
new file mode 100644
index 0000000000..9610913da6
--- /dev/null
+++ b/docs/source/cpp_api/sparse_neighbors.rst
@@ -0,0 +1,7 @@
+Sparse Neighbors
+================
+
+.. doxygennamespace:: raft::sparse::neighbors
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/sparse_solver.rst b/docs/source/cpp_api/sparse_solver.rst
new file mode 100644
index 0000000000..d6df5493a3
--- /dev/null
+++ b/docs/source/cpp_api/sparse_solver.rst
@@ -0,0 +1,7 @@
+Sparse Solvers
+==============
+
+.. doxygennamespace:: raft::sparse::solver
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/stats.rst b/docs/source/cpp_api/stats.rst
index f795b9e84c..fd23ce2149 100644
--- a/docs/source/cpp_api/stats.rst
+++ b/docs/source/cpp_api/stats.rst
@@ -7,8 +7,13 @@ This page provides C++ class references for the publicly-exposed elements of the
    :language: c++
    :class: highlight
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
 
-.. doxygennamespace:: raft::stats
-    :project: RAFT
-    :members:
-    :content-only:
+   stats_summary.rst
+   stats_probability.rst
+   stats_regression.rst
+   stats_classification.rst
+   stats_clustering.rst
+   stats_neighborhood.rst
diff --git a/docs/source/cpp_api/stats_classification.rst b/docs/source/cpp_api/stats_classification.rst
new file mode 100644
index 0000000000..929d2808f3
--- /dev/null
+++ b/docs/source/cpp_api/stats_classification.rst
@@ -0,0 +1,20 @@
+Classification Model Scoring
+============================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Accuracy
+--------
+
+``#include <raft/stats/accuracy.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_accuracy
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/stats_clustering.rst b/docs/source/cpp_api/stats_clustering.rst
new file mode 100644
index 0000000000..0ab96cf1f5
--- /dev/null
+++ b/docs/source/cpp_api/stats_clustering.rst
@@ -0,0 +1,81 @@
+Clustering Model Scoring
+========================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+
+Adjusted Rand Index
+-------------------
+
+``#include <raft/stats/adjusted_rand_index.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_adj_rand_index
+    :project: RAFT
+    :members:
+    :content-only:
+
+Completeness Score
+------------------
+
+``#include <raft/stats/completeness_score.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_completeness
+    :project: RAFT
+    :members:
+    :content-only:
+
+Cluster Dispersion
+------------------
+
+``#include <raft/stats/dispersion.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_cluster_dispersion
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Rand Index
+----------
+
+``#include <raft/stats/rand_index.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_rand_index
+    :project: RAFT
+    :members:
+    :content-only:
+
+Silhouette Score
+----------------
+
+``#include <raft/stats/silhouette_score.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_silhouette_score
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+V Measure
+---------
+
+``#include <raft/stats/v_measure.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_vmeasure
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/stats_neighborhood.rst b/docs/source/cpp_api/stats_neighborhood.rst
new file mode 100644
index 0000000000..f80e349c3b
--- /dev/null
+++ b/docs/source/cpp_api/stats_neighborhood.rst
@@ -0,0 +1,18 @@
+Neighborhood Model Scoring
+==========================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Trustworthiness
+---------------
+
+``#include <raft/stats/trustworthiness.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_trustworthiness
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/stats_probability.rst b/docs/source/cpp_api/stats_probability.rst
new file mode 100644
index 0000000000..457879d87c
--- /dev/null
+++ b/docs/source/cpp_api/stats_probability.rst
@@ -0,0 +1,56 @@
+Probability & Information Theory
+================================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Contingency Matrix
+------------------
+
+``#include <raft/stats/contingency_matrix.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: contingency_matrix
+    :project: RAFT
+    :members:
+    :content-only:
+
+Entropy
+-------
+
+``#include <raft/stats/entropy.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_entropy
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+KL-Divergence
+-------------
+
+``#include <raft/stats/kl_divergence.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: kl_divergence
+    :project: RAFT
+    :members:
+    :content-only:
+
+Mutual Information
+------------------
+
+``#include <raft/stats/mutual_info_score.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_mutual_info
+    :project: RAFT
+    :members:
+    :content-only:
+
diff --git a/docs/source/cpp_api/stats_regression.rst b/docs/source/cpp_api/stats_regression.rst
new file mode 100644
index 0000000000..8c172b441d
--- /dev/null
+++ b/docs/source/cpp_api/stats_regression.rst
@@ -0,0 +1,45 @@
+Regression Model Scoring
+========================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Information Criterion
+---------------------
+
+``#include <raft/stats/information_criterion.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_information_criterion
+    :project: RAFT
+    :members:
+    :content-only:
+
+R2 Score
+--------
+
+``#include <raft/stats/r2_score.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_r2_score
+    :project: RAFT
+    :members:
+    :content-only:
+
+
+Regression Metrics
+------------------
+
+``#include <raft/stats/regression_metrics.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_regression_metrics
+    :project: RAFT
+    :members:
+    :content-only:
+
+
diff --git a/docs/source/cpp_api/stats_summary.rst b/docs/source/cpp_api/stats_summary.rst
new file mode 100644
index 0000000000..7b4bf6a801
--- /dev/null
+++ b/docs/source/cpp_api/stats_summary.rst
@@ -0,0 +1,114 @@
+Summary Statistics
+==================
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Covariance
+----------
+
+``#include <raft/stats/cov.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_cov
+    :project: RAFT
+    :members:
+    :content-only:
+
+Histogram
+---------
+
+``#include <raft/stats/histogram.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_histogram
+    :project: RAFT
+    :members:
+    :content-only:
+
+Mean
+----
+
+``#include <raft/stats/mean.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_mean
+    :project: RAFT
+    :members:
+    :content-only:
+
+Mean Center
+-----------
+
+``#include <raft/stats/mean_center.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_mean_center
+    :project: RAFT
+    :members:
+    :content-only:
+
+Mean Variance
+-------------
+
+``#include <raft/stats/mean_var.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_mean_var
+    :project: RAFT
+    :members:
+    :content-only:
+
+Min/Max
+-------
+
+``#include <raft/stats/minmax.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_minmax
+    :project: RAFT
+    :members:
+    :content-only:
+
+Standard Deviation
+------------------
+
+``#include <raft/stats/stddev.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_stddev
+    :project: RAFT
+    :members:
+    :content-only:
+
+Sum
+---
+
+``#include <raft/stats/sum.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_sum
+    :project: RAFT
+    :members:
+    :content-only:
+
+Weighted Average
+----------------
+
+``#include <raft/stats/weighted_mean.cuh>``
+
+namespace *raft::stats*
+
+.. doxygengroup:: stats_weighted_mean
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index b37d5dc1af..2f54753cc6 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -1,5 +1,13 @@
 # Developer Guide
 
+## General
+Please start by reading the [Contributor Guide](contributing.md).
+
+## Performance
+1. In performance critical sections of the code, favor `cudaDeviceGetAttribute` over `cudaGetDeviceProperties`. See the corresponding CUDA devblog [here](https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/) to know more.
+2. If an algo requires you to launch GPU work in multiple cuda streams, do not create multiple `raft::resources` objects, one for each such work stream. Instead, use the stream pool configured on the given `raft::resources` instance's `raft::resources::get_stream_from_stream_pool()` to pick up the right cuda stream. Refer to the section on [CUDA Resources](#resource-management) and the section on [Threading](#threading-model) for more details. TIP: use `raft::resources::get_stream_pool_size()` to know how many such streams are available at your disposal.
+
+
 ## Local Development
 
 Developing features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts.
@@ -8,11 +16,239 @@ The process for working on a CUDA/C++ feature which might span RAFT and one or m
 
 If building a feature which spans projects and not using the source build in cmake, the RAFT changes (both C++ and Python) will need to be installed into the environment of the consuming project before they can be used. The ideal integration of RAFT into consuming projects will enable both the source build in the consuming project only for this case but also rely on a more stable packaging (such as conda packaging) otherwise. 
 
-## API stability
+
+## Threading Model
+
+With the exception of the `raft::resources`, RAFT algorithms should maintain thread-safety and are, in general,
+assumed to be single threaded. This means they should be able to be called from multiple host threads so
+long as different instances of `raft::resources` are used.
+
+Exceptions are made for algorithms that can take advantage of multiple CUDA streams within multiple host threads
+in order to oversubscribe or increase occupancy on a single GPU. In these cases, the use of multiple host
+threads within RAFT algorithms should be used only to maintain concurrency of the underlying CUDA streams.
+Multiple host threads should be used sparingly, be bounded, and should steer clear of performing CPU-intensive
+computations.
+
+A good example of an acceptable use of host threads within a RAFT algorithm might look like the following
+
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+raft::resources res;
+
+...
+
+sync_stream(res);
+
+...
+
+int n_streams = get_stream_pool_size(res);
+
+#pragma omp parallel for num_threads(n_threads)
+for(int i = 0; i < n; i++) {
+    int thread_num = omp_get_thread_num() % n_threads;
+    cudaStream_t s = get_stream_from_stream_pool(res, thread_num);
+    ... possible light cpu pre-processing ...
+    my_kernel1<<<b, tpb, 0, s>>>(...);
+    ...
+    ... some possible async d2h / h2d copies ...
+    my_kernel2<<<b, tpb, 0, s>>>(...);
+    ...
+    sync_stream(res, s);
+    ... possible light cpu post-processing ...
+}
+```
+
+In the example above, if there is no CPU pre-processing at the beginning of the for-loop, an event can be registered in
+each of the streams within the for-loop to make them wait on the stream from the handle. If there is no CPU post-processing
+at the end of each for-loop iteration, `sync_stream(res, s)` can be replaced with a single `sync_stream_pool(res)`
+after the for-loop.
+
+To avoid compatibility issues between different threading models, the only threading programming allowed in RAFT is OpenMP.
+Though RAFT's build enables OpenMP by default, RAFT algorithms should still function properly even when OpenMP has been
+disabled. If the CPU pre- and post-processing were not needed in the example above, OpenMP would not be needed.
+
+The use of threads in third-party libraries is allowed, though they should still avoid depending on a specific OpenMP runtime.
+
+## Public Interface
+
+### General guidelines
+Functions exposed via the C++ API must be stateless. Things that are OK to be exposed on the interface:
+1. Any [POD](https://en.wikipedia.org/wiki/Passive_data_structure) - see [std::is_pod](https://en.cppreference.com/w/cpp/types/is_pod) as a reference for C++11  POD types.
+2. `raft::resources` - since it stores resource-related state which has nothing to do with model/algo state.
+3. Avoid using pointers to POD types (explicitly putting it out, even though it can be considered as a POD) and pass the structures by reference instead.
+   Internal to the C++ API, these stateless functions are free to use their own temporary classes, as long as they are not exposed on the interface.
+4. Accept single- (`raft::span`) and multi-dimensional views (`raft::mdspan`) and validate their metadata wherever possible.
+5. Prefer `std::optional` for any optional arguments (e.g. do not accept `nullptr`)
+6. All public APIs should be lightweight wrappers around calls to private APIs inside the `detail` namespace.
+
+### API stability
 
 Since RAFT is a core library with multiple consumers, it's important that the public APIs maintain stability across versions and any changes to them are done with caution, adding new functions and deprecating the old functions over a couple releases as necessary.
 
-The public APIs should be lightweight wrappers around calls to private APIs inside the `detail` namespace. 
+### Stateless C++ APIs
+
+Using the IVF-PQ algorithm as an example, the following way of exposing its API would be wrong according to the guidelines in this section, since it exposes a non-POD C++ class object in the C++ API:
+```cpp
+template <typename value_t, typename idx_t>
+class ivf_pq {
+  ivf_pq_params params_;
+  raft::resources const& res_;
+  
+public:
+  ivf_pq(raft::resources const& res);
+  void train(raft::device_matrix<value_t, idx_t, raft::row_major> dataset);
+  void search(raft::device_matrix<value_t, idx_t, raft::row_major> queries, 
+              raft::device_matrix<value_t, idx_t, raft::row_major> out_inds, 
+              raft::device_matrix<value_t, idx_t, raft::row_major> out_dists);
+};
+```
+
+An alternative correct way to expose this could be:
+```cpp
+namespace raft::ivf_pq {
+
+template<typename value_t, typename idx_t>
+void ivf_pq_train(raft::resources const& res, const raft::ivf_pq_params &params, raft::ivf_pq_index &index,
+raft::device_matrix<value_t, idx_t, raft::row_major> dataset);
+
+template<typename value_t, typename idx_t>
+void ivf_pq_search(raft::resources const& res, raft::ivf_pq_params const&params, raft::ivf_pq_index const & index,
+raft::device_matrix<value_t, idx_t, raft::row_major> queries,
+raft::device_matrix<value_t, idx_t, raft::row_major> out_inds,
+raft::device_matrix<value_t, idx_t, raft::row_major> out_dists);
+}
+```
+
+### Other functions on state
+
+These guidelines also mean that it is the responsibility of the C++ API to expose methods to load and store (aka marshalling) such a data structure. Further continuing the IVF-PQ example, the following methods could achieve this:
+```cpp
+namespace raft::ivf_pq {
+   void save(raft::ivf_pq_index const& model, std::ostream &os);
+   void load(raft::ivf_pq_index& model, std::istream &is);
+}
+```
+
+
+## Coding style
+
+### Code format
+#### Introduction
+RAFT relies on `clang-format` to enforce code style across all C++ and CUDA source code. The coding style is based on the [Google style guide](https://google.github.io/styleguide/cppguide.html#Formatting). The only digressions from this style are the following.
+1. Do not split empty functions/records/namespaces.
+2. Two-space indentation everywhere, including the line continuations.
+3. Disable reflowing of comments.
+   The reasons behind these deviations from the Google style guide are given in comments [here](../../cpp/.clang-format).
+
+#### How is the check done?
+All formatting checks are done by this python script: [run-clang-format.py](../../cpp/scripts/run-clang-format.py) which is effectively a wrapper over `clang-format`. An error is raised if the code diverges from the format suggested by clang-format. It is expected that developers run this script to detect and fix formatting violations before creating a PR.
+
+##### As part of CI
+[run-clang-format.py](../../cpp/scripts/run-clang-format.py) is executed as part of our `ci/checks/style.sh` CI test. If there are any formatting violations, PR author is expected to fix those to get CI passing. Steps needed to fix the formatting violations are described in the subsequent sub-section.
+
+##### Manually
+Developers can also run this check manually (or set up this command as part of a git pre-commit hook) by executing:
+```bash
+python ./cpp/scripts/run-clang-format.py
+```
+From the root of the RAFT repository.
+
+#### How to know the formatting violations?
+When there are formatting errors, [run-clang-format.py](../../cpp/scripts/run-clang-format.py) prints a `diff` command, showing where there are formatting differences. Unfortunately, unlike `flake8`, `clang-format` does NOT print descriptions of the violations, but instead directly formats the code. So, the only way currently to know about formatting differences is to run the diff command as suggested by this script against each violating source file.
+
+#### How to fix the formatting violations?
+When there are formatting violations, [run-clang-format.py](../../cpp/scripts/run-clang-format.py) prints at the end, the exact command that can be run by developers to fix them. This is the easiest way to fix formatting errors. [This screencast](https://asciinema.org/a/287367) shows how developers can check for formatting violations in their branches and also how to fix those, before sending out PRs.
+
+In short, to bulk-fix all the formatting violations, execute the following command:
+```bash
+python ./cpp/scripts/run-clang-format.py -inplace
+```
+From the root of the RAFT repository.
+
+#### clang-format version?
+To avoid spurious code style violations we specify the exact clang-format version required, currently `11.1.0`. This is enforced by the [run-clang-format.py](../../cpp/scripts/run-clang-format.py) script itself. Refer [here](../../cpp/README.md#dependencies) for the list of build-time dependencies.
+
+#### Additional scripts
+Along with clang-format, there are include checker and copyright checker scripts for checking style, which can be run as part of CI as well as manually.
+
+##### #include style
+[include_checker.py](../../cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
+2. `#include <...>` should be used for referencing everything else
+
+Manually, run the following to bulk-fix include style issues:
+```bash
+python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list of folders which you want to fix]
+```
+
+##### Copyright header
+[copyright.py](../../ci/checks/copyright.py) checks the Copyright header for all git-modified files
+
+Manually, you can run the following to bulk-fix the header if only the years need to be updated:
+```bash
+python ./ci/checks/copyright.py --update-current-year
+```
+Keep in mind that this only applies to files tracked by git and having been modified.
+
+## Error handling
+Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` and `RAFT_CUSOLVER_TRY`. These macros take care of checking the return values of the used API calls and generate an exception when the command is not successful. If you need to avoid an exception, e.g. inside a destructor, use `RAFT_CUDA_TRY_NO_THROW`, `RAFT_CUBLAS_TRY_NO_THROW` and `RAFT_CUSOLVER_TRY_NO_THROW`. These macros log the error but do not throw an exception.
+
+## Logging
+
+### Introduction
+Anything and everything about logging is defined inside [logger.hpp](../../cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+
+### Usage
+```cpp
+#include <raft/core/logger.hpp>
+
+// Inside your method or function, use any of these macros
+RAFT_LOG_TRACE("Hello %s!", "world");
+RAFT_LOG_DEBUG("Hello %s!", "world");
+RAFT_LOG_INFO("Hello %s!", "world");
+RAFT_LOG_WARN("Hello %s!", "world");
+RAFT_LOG_ERROR("Hello %s!", "world");
+RAFT_LOG_CRITICAL("Hello %s!", "world");
+```
+
+### Changing logging level
+There are 7 logging levels with each successive level becoming quieter:
+1. RAFT_LEVEL_TRACE
+2. RAFT_LEVEL_DEBUG
+3. RAFT_LEVEL_INFO
+4. RAFT_LEVEL_WARN
+5. RAFT_LEVEL_ERROR
+6. RAFT_LEVEL_CRITICAL
+7. RAFT_LEVEL_OFF
+   Pass one of these as per your needs into the `set_level()` method as follows:
+```cpp
+raft::logger::get().set_level(RAFT_LEVEL_WARN);
+// From now onwards, this will print only WARN and above kind of messages
+```
+
+### Changing logging pattern
+Pass the [format string](https://github.com/gabime/spdlog/wiki/3.-Custom-formatting) as follows in order to use a different logging pattern than the default.
+```cpp
+raft::logger::get().set_pattern(YourFavoriteFormat);
+```
+One can also use the corresponding `get_pattern()` method to know the current format as well.
+
+### Temporarily changing the logging pattern
+Sometimes, we need to temporarily change the log pattern (eg: for reporting decision tree structure). This can be achieved in a RAII-like approach as follows:
+```cpp
+{
+  PatternSetter _(MyNewTempFormat);
+  // new log format is in effect from here onwards
+  doStuff();
+  // once the above temporary object goes out-of-scope, the old format will be restored
+}
+```
+
+### Tips
+* Do NOT end your logging messages with a newline! It is automatically added by spdlog.
+* The `RAFT_LOG_TRACE()` is by default not compiled due to the `RAFT_ACTIVE_LEVEL` macro setup, for performance reasons. If you need it to be enabled, change this macro accordingly during compilation time
 
 ## Common Design Considerations
 
@@ -26,9 +262,170 @@ The public APIs should be lightweight wrappers around calls to private APIs insi
 
 ## Testing
 
-It's important for RAFT to maintain a high test coverage in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. A well-defined public API can help maintain compile-time stability but means more focus should be placed on testing the functional requirements and verifying execution on the various edge cases within RAFT itself. Ideally, bug fixes and new features should be able to be made to RAFT independently of the consuming projects.
+It's important for RAFT to maintain a high test coverage of the public APIs in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. 
 
+A well-defined public API can help maintain compile-time stability but means more focus should be placed on testing the functional requirements and verifying execution on the various edge cases within RAFT itself. Ideally, bug fixes and new features should be able to be made to RAFT independently of the consuming projects.
 
 ## Documentation
 
-Public APIs always require documentation, since those will be exposed directly to users. In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples.
+Public APIs always require documentation since those will be exposed directly to users. For C++, we use [doxygen](http://www.doxygen.nl) and for Python/cython we use [pydoc](https://docs.python.org/3/library/pydoc.html). In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples.
+
+## Asynchronous operations and stream ordering
+All RAFT algorithms should be as asynchronous as possible, avoiding the use of the default stream (aka the NULL or `0` stream). Implementations that require only one CUDA Stream should use the stream from `raft::resources`:
+
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+
+void foo(const raft::resources& res, ...)
+{
+    cudaStream_t stream = get_cuda_stream(res);
+}
+```
+When multiple streams are needed, e.g. to manage a pipeline, use the internal streams available in `raft::resources` (see [CUDA Resources](#resource-management)). If multiple streams are used, all operations still must be ordered according to `raft::resource::get_cuda_stream()` (from `raft/core/resource/cuda_stream.hpp`). Before any operation in any of the internal CUDA streams is started, all previous work in `raft::resource::get_cuda_stream()` must have completed. Any work enqueued in `raft::resource::get_cuda_stream()` after a RAFT function returns should not start before all work enqueued in the internal streams has completed. E.g. if a RAFT algorithm is called like this:
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+void foo(const double* srcdata, double* result)
+{
+    cudaStream_t stream;
+    RAFT_CUDA_TRY( cudaStreamCreate( &stream ) );
+    raft::resources res;
+    set_cuda_stream(res, stream);
+    
+    ...
+    
+    RAFT_CUDA_TRY( cudaMemcpyAsync( srcdata, h_srcdata.data(), n*sizeof(double), cudaMemcpyHostToDevice, stream ) );
+
+    raft::algo(res, dopredict, srcdata, result, ... );
+
+    RAFT_CUDA_TRY( cudaMemcpyAsync( h_result.data(), result, m*sizeof(int), cudaMemcpyDeviceToHost, stream ) );
+
+    ...
+}
+```
+No work in any stream should start in `raft::algo` before the `cudaMemcpyAsync` in `stream` launched before the call to `raft::algo` is done. And all work in all streams used in `raft::algo` should be done before the `cudaMemcpyAsync` in `stream` launched after the call to `raft::algo` starts.
+
+This can be ensured by introducing interstream dependencies with CUDA events and `cudaStreamWaitEvent`. For convenience, the header `raft/core/device_resources.hpp` provides the class `raft::stream_syncer`, which in its constructor lets all `raft::resources` internal CUDA streams wait on `raft::resource::get_cuda_stream()` and in its destructor lets `raft::resource::get_cuda_stream()` wait on all work enqueued in the `raft::resources` internal CUDA streams. The intended use would be to create a `raft::stream_syncer` object as the first thing in an entry function of the public RAFT API:
+
+```cpp
+namespace raft {
+   void algo(const raft::resources& res, ...)
+   {
+       raft::stream_syncer _(res);
+   }
+}
+```
+This ensures the stream ordering behavior described above.
+
+### Using Thrust
+To ensure that thrust algorithms are executed in the intended stream the `thrust::cuda::par` execution policy should be used. To ensure that thrust algorithms allocate temporary memory via the provided device memory allocator, use the `rmm::exec_policy` available in `raft/core/resource/thrust_policy.hpp`, which can be used through `raft::resources`:
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+void foo(const raft::resources& res, ...)
+{
+    auto execution_policy = get_thrust_policy(res);
+    thrust::for_each(execution_policy, ... );
+}
+```
+
+## Resource Management
+
+Do not create reusable CUDA resources directly in implementations of RAFT algorithms. Instead, use the existing resources in `raft::resources` to avoid constant creation and deletion of reusable resources such as CUDA streams, CUDA events or library handles. Please file a feature request if a resource handle is missing in `raft::resources`.
+The resources can be obtained like this
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+void foo(const raft::resources& h, ...)
+{
+    cublasHandle_t cublasHandle = get_cublas_handle(h);
+    const int num_streams       = get_stream_pool_size(h);
+    const int stream_idx        = ...
+    cudaStream_t stream         = get_stream_from_stream_pool(h, stream_idx);
+    ...
+}
+```
+
+The example below shows one way to create `n_streams` internal CUDA streams with an `rmm::cuda_stream_pool`, which can later be used by the algos inside RAFT.
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+int main(int argc, char** argv)
+{
+    int n_streams = argc > 1 ? atoi(argv[1]) : 0;
+    raft::resources res;
+    set_cuda_stream_pool(res, std::make_shared<rmm::cuda_stream_pool>(n_streams));
+
+    foo(res, ...);
+}
+```
+
+## Multi-GPU
+
+The multi-GPU paradigm of RAFT is **O**ne **P**rocess per **G**PU (OPG). Each algorithm should be implemented in a way that it can run with a single GPU without any specific dependencies to a particular communication library. A multi-GPU implementation should use the methods offered by the class `raft::comms::comms_t` from `raft/core/comms.hpp` for inter-rank/GPU communication. It is the responsibility of the user of RAFT to create an initialized instance of `raft::comms::comms_t`.
+
+E.g. with a CUDA-aware MPI, a RAFT user could use code like this to inject an initialized instance of `raft::comms::mpi_comms` into a `raft::resources`:
+
+```cpp
+#include <mpi.h>
+#include <raft/core/device_resources.hpp>
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/algo.hpp>
+...
+int main(int argc, char * argv[])
+{
+    MPI_Init(&argc, &argv);
+    int rank = -1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    int local_rank = -1;
+    {
+        MPI_Comm local_comm;
+        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &local_comm);
+
+        MPI_Comm_rank(local_comm, &local_rank);
+
+        MPI_Comm_free(&local_comm);
+    }
+
+    cudaSetDevice(local_rank);
+
+    MPI_Comm raft_mpi_comms;
+    MPI_Comm_dup(MPI_COMM_WORLD, &raft_mpi_comms);
+
+    {
+        raft::device_resources res;
+        initialize_mpi_comms(res, raft_mpi_comms);
+
+        ...
+
+        raft::algo(res, ... );
+    }
+
+    MPI_Comm_free(&raft_mpi_comms);
+
+    MPI_Finalize();
+    return 0;
+}
+```
+
+A RAFT developer can assume the following:
+* An instance of `raft::comms::comms_t` was correctly initialized.
+* All processes that are part of `raft::comms::comms_t` call into the RAFT algorithm cooperatively.
+
+The initialized instance of `raft::comms::comms_t` can be accessed from the `raft::resources` instance:
+
+```cpp
+#include <raft/core/resources.hpp>
+#include <raft/core/resource/comms.hpp>
+void foo(const raft::resources& res, ...)
+{
+    const raft::comms::comms_t& communicator = get_comms(res);
+    const int rank = communicator.get_rank();
+    const int size = communicator.get_size();
+    ...
+}
+```
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e66152b904..9890bd932f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -23,7 +23,7 @@ While not exhaustive, the following general categories help summarize the accele
    * - Dense Operations
      - linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems
    * - Sparse Operations
-     - linear algebra, arithmetic, eigenvalue problems, slicing, symmetrization, components & labeling
+     - linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling
    * - Spatial
      - pairwise distances, nearest neighbors, neighborhood graph construction
    * - Basic Clustering
@@ -45,6 +45,7 @@ While not exhaustive, the following general categories help summarize the accele
    cpp_api.rst
    pylibraft_api.rst
    raft_dask_api.rst
+   using_comms.rst
    contributing.md
 
 
diff --git a/docs/source/pylibraft_api.rst b/docs/source/pylibraft_api.rst
index 5c44c5f419..84955283cb 100644
--- a/docs/source/pylibraft_api.rst
+++ b/docs/source/pylibraft_api.rst
@@ -1,14 +1,14 @@
-~~~~~~~~~~~~~~~~~~~~~~~
-PyLibRAFT API Reference
-~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~
+Python API
+~~~~~~~~~~
 
 .. _api:
 
 .. toctree::
    :maxdepth: 4
 
-   pylibraft_api/common.rst
    pylibraft_api/cluster.rst
+   pylibraft_api/common.rst
    pylibraft_api/distance.rst
    pylibraft_api/neighbors.rst
-   pylibraft_api/random.rst
\ No newline at end of file
+   pylibraft_api/random.rst
diff --git a/docs/source/pylibraft_api/cluster.rst b/docs/source/pylibraft_api/cluster.rst
index 1a5aabf48a..59e53e7d4c 100644
--- a/docs/source/pylibraft_api/cluster.rst
+++ b/docs/source/pylibraft_api/cluster.rst
@@ -7,6 +7,15 @@ This page provides pylibraft class references for the publicly-exposed elements
    :language: python
    :class: highlight
 
+.. autoclass:: pylibraft.cluster.kmeans.KMeansParams
+    :members:
+
+.. autofunction:: pylibraft.cluster.kmeans.fit
+
+.. autofunction:: pylibraft.cluster.kmeans.cluster_cost
+
 .. autofunction:: pylibraft.cluster.compute_new_centroids
 
 
+
+
diff --git a/docs/source/pylibraft_api/common.rst b/docs/source/pylibraft_api/common.rst
index 4070243b22..527309aa69 100644
--- a/docs/source/pylibraft_api/common.rst
+++ b/docs/source/pylibraft_api/common.rst
@@ -12,7 +12,7 @@ This page provides `pylibraft` class references for the publicly-exposed element
 Basic Vocabulary
 ################
 
-.. autoclass:: pylibraft.common.Handle
+.. autoclass:: pylibraft.common.DeviceResources
     :members:
 
 .. autoclass:: pylibraft.common.Stream
@@ -25,7 +25,9 @@ Interruptible
 #############
 
 .. autofunction:: pylibraft.common.interruptible.cuda_interruptible
+
 .. autofunction:: pylibraft.common.interruptible.synchronize
+
 .. autofunction:: pylibraft.common.interruptible.cuda_yield
 
 
diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst
index 7112a3878c..89bb577027 100644
--- a/docs/source/pylibraft_api/neighbors.rst
+++ b/docs/source/pylibraft_api/neighbors.rst
@@ -7,12 +7,24 @@ This page provides pylibraft class references for the publicly-exposed elements
    :language: python
    :class: highlight
 
+
+IVF-PQ
+######
+
 .. autoclass:: pylibraft.neighbors.ivf_pq.IndexParams
+    :members:
 
 .. autofunction:: pylibraft.neighbors.ivf_pq.build
 
 .. autofunction:: pylibraft.neighbors.ivf_pq.extend
 
 .. autoclass:: pylibraft.neighbors.ivf_pq.SearchParams
+    :members:
 
 .. autofunction:: pylibraft.neighbors.ivf_pq.search
+
+
+Candidate Refinement
+####################
+
+.. autofunction:: pylibraft.neighbors.refine
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 60071f2461..e955706dc4 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -8,9 +8,9 @@ RAFT relies heavily on the [RMM](https://github.com/rapidsai/rmm) library which
 
 ## Multi-dimensional Spans and Arrays
 
-The APIs in RAFT currently accept raw pointers to device memory and we are in the process of simplifying the APIs with the [mdspan](https://arxiv.org/abs/2010.06474) multi-dimensional array view for representing data in higher dimensions similar to the `ndarray` in the Numpy Python library. RAFT also contains the corresponding owning `mdarray` structure, which simplifies the allocation and management of multi-dimensional data in both host and device (GPU) memory.
+Most of the APIs in RAFT accept the [mdspan](https://arxiv.org/abs/2010.06474) multi-dimensional array view for representing data in higher dimensions, similar to the `ndarray` in the NumPy Python library. RAFT also contains the corresponding owning `mdarray` structure, which simplifies the allocation and management of multi-dimensional data in both host and device (GPU) memory.
 
-The `mdarray` forms a convenience layer over RMM and can be constructed in RAFT using a number of different helper functions:
+The `mdarray` is an owning object that forms a convenience layer over RMM and can be constructed in RAFT using a number of different helper functions:
 
 ```c++
 #include <raft/core/device_mdarray.hpp>
@@ -118,11 +118,54 @@ auto metric = raft::distance::DistanceType::L2SqrtExpanded;
 raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
 ```
 
-## Python Example
+### Python Example
 
-The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The package is currently limited to pairwise distances and RMAT graph generation, but we will continue adding more in future releases.
+The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The number of RAFT algorithms exposed in this package is continuing to grow from release to release.
 
-The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`.
+The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. Note that CuPy is not a required dependency for `pylibraft`.
+
+```python
+import cupy as cp
+
+from pylibraft.distance import pairwise_distance
+
+n_samples = 5000
+n_features = 50
+
+in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+
+output = pairwise_distance(in1, in2, metric="euclidean")
+```
+
+The `output` array in the above example is of type `pylibraft.common.device_ndarray`, which supports [`__cuda_array_interface__`](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2), making it interoperable with other libraries like CuPy, Numba, and PyTorch that also support it. CuPy supports DLPack, which also enables zero-copy conversion from `pylibraft.common.device_ndarray` to JAX and TensorFlow.
+
+Below is an example of converting the output `pylibraft.common.device_ndarray` to a CuPy array:
+```python
+cupy_array = cp.asarray(output)
+```
+
+And converting to a PyTorch tensor:
+```python
+import torch
+
+torch_tensor = torch.as_tensor(output, device='cuda')
+```
+
+When the corresponding library is installed and available in your environment, this conversion can also be done automatically by all RAFT compute APIs by setting a global configuration option:
+```python
+import pylibraft.config
+pylibraft.config.set_output_as("cupy")  # All compute APIs will return cupy arrays
+pylibraft.config.set_output_as("torch") # All compute APIs will return torch tensors
+```
+
+You can also specify a `callable` that accepts a `pylibraft.common.device_ndarray` and performs a custom conversion. The following example converts all output to `numpy` arrays:
+```python
+pylibraft.config.set_output_as(lambda device_ndarray: device_ndarray.copy_to_host())
+```
+
+
+`pylibraft` also supports writing to a pre-allocated output array so any `__cuda_array_interface__` supported array can be written to in-place:
 
 ```python
 import cupy as cp
@@ -136,5 +179,5 @@ in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
 in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
 output = cp.empty((n_samples, n_samples), dtype=cp.float32)
 
-pairwise_distance(in1, in2, output, metric="euclidean")
+pairwise_distance(in1, in2, out=output, metric="euclidean")
 ```
diff --git a/docs/source/raft_dask_api.rst b/docs/source/raft_dask_api.rst
index 10ba8781a2..44720c188c 100644
--- a/docs/source/raft_dask_api.rst
+++ b/docs/source/raft_dask_api.rst
@@ -1,6 +1,6 @@
-~~~~~~~~~~~~~~~~~~~~~~~
-RAFT Dask API Reference
-~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~
+RAFT Dask API
+~~~~~~~~~~~~~
 
 .. role:: py(code)
    :language: python
diff --git a/docs/source/using_comms.rst b/docs/source/using_comms.rst
new file mode 100644
index 0000000000..84ea61c248
--- /dev/null
+++ b/docs/source/using_comms.rst
@@ -0,0 +1,97 @@
+Using RAFT Comms
+================
+
+RAFT provides a communications abstraction for writing distributed algorithms which can scale up to multiple GPUs and scale out to multiple nodes. The communications abstraction is largely based on MPI and NCCL, and allows the user to decouple the design of algorithms from the environments where the algorithms are executed, enabling “write-once deploy everywhere” semantics. Currently, the distributed algorithms in both cuGraph and cuML are being deployed in both MPI and Dask clusters while cuML’s distributed algorithms are also being deployed on GPUs in Apache Spark clusters. This is a powerful concept as distributed algorithms can be non-trivial to write and so maintainability is eased and bug fixes reach further by increasing reuse as much as possible.
+
+While users of RAFT’s communications layer largely get MPI integration for free just by installing MPI and using `mpirun` to run their applications, the `raft-dask` Python package provides a mechanism for executing algorithms written using RAFT’s communications layer in a Dask cluster. It will help to walk through a small example of how one would build an algorithm with RAFT’s communications layer.
+
+First, an instance of `raft::comms::comms_t` is passed through the `raft::device_resources` instance and code is written to utilize collective and/or point-to-point communications as needed.
+
+.. code-block:: cpp
+   :caption: Example function written with the RAFT comms API
+
+   #include <raft/core/comms.hpp>
+   #include <raft/core/device_mdspan.hpp>
+   #include <raft/util/cudart_utils.hpp>
+
+   void test_allreduce(raft::device_resources const &handle, int root) {
+     raft::comms::comms_t const& communicator = handle.get_comms();
+     cudaStream_t stream = handle.get_stream();
+     raft::device_scalar<int> temp_scalar(stream);
+
+     int to_send = 1;
+     raft::copy(temp_scalar.data(), &to_send, 1, stream);
+     communicator.allreduce(temp_scalar.data(), temp_scalar.data(), 1,
+                            raft::comms::op_t::SUM, stream);
+     handle.sync_stream();
+   }
+
+This exact function can now be executed in several different types of GPU clusters. For example, it can be executed with MPI by initializing an instance of `raft::comms::mpi_comms` with the `MPI_Comm`:
+
+.. code-block:: cpp
+   :caption: Example of running test_allreduce() in MPI
+
+   #include <raft/core/mpi_comms.hpp>
+   #include <raft/core/device_resources.hpp>
+
+   raft::device_resources resource_handle;
+   // ...
+   // initialize MPI_Comm
+   // ...
+   raft::comms::initialize_mpi_comms(&resource_handle, mpi_comm);
+   // ...
+   test_allreduce(resource_handle, 0);
+
+Deploying our `test_allreduce` function in Dask requires a lightweight Python interface, which we can accomplish using `pylibraft` and exposing the function through Cython:
+
+.. code-block:: cython
+   :caption: Example of wrapping test_allreduce() w/ cython
+
+   from pylibraft.common.handle cimport device_resources
+   from cython.operator cimport dereference as deref
+
+   cdef extern from "allreduce_test.hpp":
+       void test_allreduce(device_resources const &handle, int root) except +
+
+   def run_test_allreduce(handle, root):
+       cdef const device_resources* h = <device_resources*><size_t>handle.getHandle()
+
+       test_allreduce(deref(h), root)
+
+Finally, we can use `raft_dask` to execute our new algorithm in a Dask cluster (please note this also uses `LocalCUDACluster` from the RAPIDS dask-cuda library):
+
+.. code-block:: python
+   :caption: Example of running test_allreduce() in Dask
+
+   from raft_dask.common import Comms, local_handle
+   from dask.distributed import Client, wait
+   from dask_cuda import LocalCUDACluster
+   cluster = LocalCUDACluster()
+   client = Client(cluster)
+
+   # Create and initialize Comms instance
+   comms = Comms(client=client)
+   comms.init()
+
+   def func_run_allreduce(sessionId, root):
+     handle = local_handle(sessionId)
+     run_test_allreduce(handle, root)
+
+   # Invoke run_test_allreduce on all workers
+   dfs = [
+     client.submit(
+       func_run_allreduce,
+       comms.sessionId,
+       0,
+       pure=False,
+       workers=[w]
+     )
+     for w in comms.worker_addresses
+   ]
+
+   # Wait until processing is done
+   wait(dfs, timeout=5)
+
+   comms.destroy()
+   client.close()
+   cluster.close()
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index f1f5b7e8ca..248f5784c0 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake
   )
 endif()
diff --git a/python/pylibraft/.coveragerc b/python/pylibraft/.coveragerc
new file mode 100644
index 0000000000..fc087fb9c5
--- /dev/null
+++ b/python/pylibraft/.coveragerc
@@ -0,0 +1,3 @@
+# Configuration file for Python coverage tests
+[run]
+source = pylibraft
\ No newline at end of file
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index ba3c9a453e..3efc3a547b 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(pylibraft_version 22.12.01)
+set(pylibraft_version 23.02.00)
 
 include(../../fetch_rapids.cmake)
 
diff --git a/python/pylibraft/_custom_build/backend.py b/python/pylibraft/_custom_build/backend.py
index 7d1b334626..209e9e4b67 100644
--- a/python/pylibraft/_custom_build/backend.py
+++ b/python/pylibraft/_custom_build/backend.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 """Custom build backend for pylibraft to get versioned requirements.
 
@@ -18,9 +18,8 @@ def replace_requirements(func):
     @wraps(func)
     def wrapper(config_settings=None):
         orig_list = getattr(_orig, func.__name__)(config_settings)
-        append_list = [
-            f"rmm{os.getenv('RAPIDS_PY_WHEEL_CUDA_SUFFIX', default='')}"
-        ]
+        cuda_suffix = os.getenv("RAPIDS_PY_WHEEL_CUDA_SUFFIX", default="")
+        append_list = [f"rmm{cuda_suffix}==23.2.*"]
         return orig_list + append_list
 
     return wrapper
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index 273b4497cc..c1a5bf1663 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -12,3 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+import pylibraft.config
+from pylibraft._version import get_versions
+
+__version__ = get_versions()["version"]
+del get_versions
diff --git a/python/pylibraft/pylibraft/cluster/__init__.py b/python/pylibraft/pylibraft/cluster/__init__.py
index 89a403fce2..4facc3dae2 100644
--- a/python/pylibraft/pylibraft/cluster/__init__.py
+++ b/python/pylibraft/pylibraft/cluster/__init__.py
@@ -13,4 +13,6 @@
 # limitations under the License.
 #
 
-from .kmeans import compute_new_centroids
+from .kmeans import KMeansParams, cluster_cost, compute_new_centroids, fit
+
+__all__ = ["KMeansParams", "cluster_cost", "compute_new_centroids", "fit"]
diff --git a/python/pylibraft/pylibraft/cpp/__init__.pxd b/python/pylibraft/pylibraft/cluster/cpp/__init__.pxd
similarity index 100%
rename from python/pylibraft/pylibraft/cpp/__init__.pxd
rename to python/pylibraft/pylibraft/cluster/cpp/__init__.pxd
diff --git a/python/pylibraft/pylibraft/cpp/__init__.py b/python/pylibraft/pylibraft/cluster/cpp/__init__.py
similarity index 100%
rename from python/pylibraft/pylibraft/cpp/__init__.py
rename to python/pylibraft/pylibraft/cluster/cpp/__init__.py
diff --git a/python/pylibraft/pylibraft/cpp/kmeans.pxd b/python/pylibraft/pylibraft/cluster/cpp/kmeans.pxd
similarity index 53%
rename from python/pylibraft/pylibraft/cpp/kmeans.pxd
rename to python/pylibraft/pylibraft/cluster/cpp/kmeans.pxd
index b263952522..c43f18ac3f 100644
--- a/python/pylibraft/pylibraft/cpp/kmeans.pxd
+++ b/python/pylibraft/pylibraft/cluster/cpp/kmeans.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,14 +18,23 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
-from pylibraft.common.handle cimport handle_t
+import numpy as np
 
+from cython.operator cimport dereference as deref
+from libc.stdint cimport uintptr_t
+from libcpp cimport bool, nullptr
 
-cdef extern from "raft_distance/kmeans.hpp" \
-        namespace "raft::cluster::kmeans::runtime":
+from pylibraft.cluster.cpp.kmeans_types cimport KMeansParams
+from pylibraft.common.cpp.mdspan cimport *
+from pylibraft.common.cpp.optional cimport optional
+from pylibraft.common.handle cimport device_resources
+
+
+cdef extern from "raft_runtime/cluster/kmeans.hpp" \
+        namespace "raft::runtime::cluster::kmeans" nogil:
 
     cdef void update_centroids(
-        const handle_t& handle,
+        const device_resources& handle,
         const double *X,
         int n_samples,
         int n_features,
@@ -37,7 +46,7 @@ cdef extern from "raft_distance/kmeans.hpp" \
         double *weight_per_cluster) except +
 
     cdef void update_centroids(
-        const handle_t& handle,
+        const device_resources& handle,
         const float *X,
         int n_samples,
         int n_features,
@@ -49,7 +58,7 @@ cdef extern from "raft_distance/kmeans.hpp" \
         float *weight_per_cluster) except +
 
     cdef void cluster_cost(
-        const handle_t& handle,
+        const device_resources& handle,
         const float* X,
         int n_samples,
         int n_features,
@@ -58,10 +67,28 @@ cdef extern from "raft_distance/kmeans.hpp" \
         float * cost) except +
 
     cdef void cluster_cost(
-        const handle_t& handle,
+        const device_resources& handle,
         const double* X,
         int n_samples,
         int n_features,
         int n_clusters,
         const double * centroids,
         double * cost) except +
+
+    cdef void fit(  # float32 overload
+        const device_resources & handle,
+        const KMeansParams& params,
+        device_matrix_view[float, int, row_major] X,
+        optional[device_vector_view[float, int]] sample_weight,
+        device_matrix_view[float, int, row_major] centroids,  # 5th arg
+        host_scalar_view[float, int] inertia,
+        host_scalar_view[int, int] n_iter) except +
+
+    cdef void fit(  # float64 overload
+        const device_resources & handle,
+        const KMeansParams& params,
+        device_matrix_view[double, int, row_major] X,
+        optional[device_vector_view[double, int]] sample_weight,
+        device_matrix_view[double, int, row_major] centroids,  # 5th arg
+        host_scalar_view[double, int] inertia,
+        host_scalar_view[int, int] n_iter) except +
diff --git a/python/pylibraft/pylibraft/cluster/cpp/kmeans_types.pxd b/python/pylibraft/pylibraft/cluster/cpp/kmeans_types.pxd
new file mode 100644
index 0000000000..12cecd4336
--- /dev/null
+++ b/python/pylibraft/pylibraft/cluster/cpp/kmeans_types.pxd
@@ -0,0 +1,44 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from libcpp cimport bool
+
+from pylibraft.distance.distance_type cimport DistanceType
+from pylibraft.random.cpp.rng_state cimport RngState
+
+
+cdef extern from "raft/cluster/kmeans_types.hpp" \
+        namespace "raft::cluster::kmeans":
+
+    ctypedef enum InitMethod 'raft::cluster::KMeansParams::InitMethod':
+        KMeansPlusPlus 'raft::cluster::kmeans::KMeansParams::InitMethod::KMeansPlusPlus' # noqa
+        Random 'raft::cluster::kmeans::KMeansParams::InitMethod::Random'
+        Array 'raft::cluster::kmeans::KMeansParams::InitMethod::Array'
+
+    cdef cppclass KMeansParams:
+        KMeansParams() except +
+        int n_clusters
+        InitMethod init
+        int max_iter
+        double tol
+        int verbosity
+        RngState rng_state
+        DistanceType metric
+        int n_init
+        double oversampling_factor
+        int batch_samples
+        int batch_centroids
+        bool inertia_check
diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx
index 679523cef4..1d0b9ad241 100644
--- a/python/pylibraft/pylibraft/cluster/kmeans.pyx
+++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,29 +22,34 @@ import numpy as np
 
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uintptr_t
-from libcpp cimport bool, nullptr
+from libcpp cimport nullptr
 
-from pylibraft.common import Handle
+from collections import namedtuple
+from enum import IntEnum
+
+from pylibraft.common import Handle, cai_wrapper, device_ndarray
 from pylibraft.common.handle import auto_sync_handle
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
+from pylibraft.random.cpp.rng_state cimport RngState
 
 from pylibraft.common.input_validation import *
 from pylibraft.distance import DISTANCE_TYPES
 
-from pylibraft.cpp.kmeans cimport (
+from pylibraft.cluster.cpp cimport kmeans as cpp_kmeans, kmeans_types
+from pylibraft.cluster.cpp.kmeans cimport (
     cluster_cost as cpp_cluster_cost,
     update_centroids,
 )
+from pylibraft.common.cpp.mdspan cimport *
+from pylibraft.common.cpp.optional cimport optional
+from pylibraft.common.handle cimport device_resources
 
-
-def is_c_cont(cai, dt):
-    return "strides" not in cai or \
-        cai["strides"] is None or \
-        cai["strides"][1] == dt.itemsize
+from pylibraft.common import auto_convert_output
 
 
 @auto_sync_handle
+@auto_convert_output
 def compute_new_centroids(X,
                           centroids,
                           labels,
@@ -55,9 +60,6 @@ def compute_new_centroids(X,
     """
     Compute new centroids given an input matrix and existing centroids
 
-    Valid values for metric:
-        ["euclidean", "sqeuclidean"]
-
     Parameters
     ----------
 
@@ -81,39 +83,37 @@ def compute_new_centroids(X,
     Examples
     --------
 
-    .. code-block:: python
+    >>> import cupy as cp
 
-        import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.cluster.kmeans import compute_new_centroids
 
-        from pylibraft.common import Handle
-        from pylibraft.cluster.kmeans import compute_new_centroids
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
 
-        # A single RAFT handle can optionally be reused across
-        # pylibraft functions.
-        handle = Handle()
+    >>> n_samples = 5000
+    >>> n_features = 50
+    >>> n_clusters = 3
 
-        n_samples = 5000
-        n_features = 50
-        n_clusters = 3
+    >>> X = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
 
-        X = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
+    >>> centroids = cp.random.random_sample((n_clusters, n_features),
+    ...                                         dtype=cp.float32)
+    ...
+    >>> labels = cp.random.randint(0, high=n_clusters, size=n_samples,
+    ...                            dtype=cp.int32)
 
-        centroids = cp.random.random_sample((n_clusters, n_features),
-                                                dtype=cp.float32)
+    >>> new_centroids = cp.empty((n_clusters, n_features), dtype=cp.float32)
 
-        labels = cp.random.randint(0, high=n_clusters, size=n_samples,
-                                   dtype=cp.int32)
+    >>> compute_new_centroids(
+    ...     X, centroids, labels, new_centroids, handle=handle
+    ... )
 
-        new_centroids = cp.empty((n_clusters, n_features), dtype=cp.float32)
-
-        compute_new_centroids(
-            X, centroids, labels, new_centroids, handle=handle
-        )
-
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
    """
 
     x_cai = X.__cuda_array_interface__
@@ -159,11 +159,11 @@ def compute_new_centroids(X,
         weight_per_cluster_ptr = <uintptr_t>nullptr
 
     handle = handle if handle is not None else Handle()
-    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
 
-    x_c_contiguous = is_c_cont(x_cai, x_dt)
-    centroids_c_contiguous = is_c_cont(centroids_cai, centroids_dt)
-    new_centroids_c_contiguous = is_c_cont(new_centroids_cai, new_centroids_dt)
+    x_c_contiguous = is_c_contiguous(x_cai)
+    centroids_c_contiguous = is_c_contiguous(centroids_cai)
+    new_centroids_c_contiguous = is_c_contiguous(new_centroids_cai)
 
     if not x_c_contiguous or not centroids_c_contiguous \
             or not new_centroids_c_contiguous:
@@ -200,6 +200,7 @@ def compute_new_centroids(X,
 
 
 @auto_sync_handle
+@auto_convert_output
 def cluster_cost(X, centroids, handle=None):
     """
     Compute cluster cost given an input matrix and existing centroids
@@ -214,22 +215,21 @@ def cluster_cost(X, centroids, handle=None):
     Examples
     --------
 
-    .. code-block:: python
-        import cupy as cp
-
-        from pylibraft.cluster.kmeans import cluster_cost
-
-        n_samples = 5000
-        n_features = 50
-        n_clusters = 3
-
-        X = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
-
-        centroids = cp.random.random_sample((n_clusters, n_features),
-                                                dtype=cp.float32)
-
-        inertia = cluster_cost(X, centroids)
+    >>> import cupy as cp
+    >>>
+    >>> from pylibraft.cluster.kmeans import cluster_cost
+    >>>
+    >>> n_samples = 5000
+    >>> n_features = 50
+    >>> n_clusters = 3
+    >>>
+    >>> X = cp.random.random_sample((n_samples, n_features),
+    ...                             dtype=cp.float32)
+
+    >>> centroids = cp.random.random_sample((n_clusters, n_features),
+    ...                                      dtype=cp.float32)
+
+    >>> inertia = cluster_cost(X, centroids)
     """
     x_cai = X.__cuda_array_interface__
     centroids_cai = centroids.__cuda_array_interface__
@@ -250,10 +250,10 @@ def cluster_cost(X, centroids, handle=None):
     centroids_ptr = <uintptr_t>centroids_cai["data"][0]
 
     handle = handle if handle is not None else Handle()
-    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
 
-    x_c_contiguous = is_c_cont(x_cai, x_dt)
-    centroids_c_contiguous = is_c_cont(centroids_cai, centroids_dt)
+    x_c_contiguous = is_c_contiguous(x_cai)
+    centroids_c_contiguous = is_c_contiguous(centroids_cai)
 
     if not x_c_contiguous or not centroids_c_contiguous:
         raise ValueError("Inputs must all be c contiguous")
@@ -285,3 +285,237 @@ def cluster_cost(X, centroids, handle=None):
         return d_cost
     else:
         raise ValueError("dtype %s not supported" % x_dt)
+
+
+class InitMethod(IntEnum):
+    """ Method for initializing kmeans """
+    KMeansPlusPlus = <int> kmeans_types.InitMethod.KMeansPlusPlus
+    Random = <int> kmeans_types.InitMethod.Random
+    Array = <int> kmeans_types.InitMethod.Array
+
+
+cdef class KMeansParams:
+    """ Specifies hyper-parameters for the kmeans algorithm.
+
+    Parameters
+    ----------
+    n_clusters : int, optional
+        The number of clusters to form as well as the number of centroids
+        to generate
+    max_iter : int, optional
+        Maximum number of iterations of the k-means algorithm for a single run
+    tol : float, optional
+        Relative tolerance with regards to inertia to declare convergence
+    verbosity : int, optional
+    seed: int, optional
+        Seed to the random number generator.
+    metric : str, optional
+        Metric names to use for distance computation, see
+        :func:`pylibraft.distance.pairwise_distance` for valid values.
+    init : InitMethod, optional
+    n_init : int, optional
+        Number of times the k-means algorithm will be run with different seeds.
+    oversampling_factor : float, optional
+        Oversampling factor for use in the k-means algorithm
+    """
+    cdef kmeans_types.KMeansParams c_obj
+
+    def __init__(self,
+                 n_clusters: Optional[int] = None,
+                 max_iter: Optional[int] = None,
+                 tol: Optional[float] = None,
+                 verbosity: Optional[int] = None,
+                 seed: Optional[int] = None,
+                 metric: Optional[str] = None,
+                 init: Optional[InitMethod] = None,
+                 n_init: Optional[int] = None,
+                 oversampling_factor: Optional[float] = None,
+                 batch_samples: Optional[int] = None,
+                 batch_centroids: Optional[int] = None,
+                 inertia_check: Optional[bool] = None):
+        if n_clusters is not None:
+            self.c_obj.n_clusters = n_clusters
+        if max_iter is not None:
+            self.c_obj.max_iter = max_iter
+        if tol is not None:
+            self.c_obj.tol = tol
+        if verbosity is not None:
+            self.c_obj.verbosity = verbosity
+        if seed is not None:
+            self.c_obj.rng_state.seed = seed
+        if metric is not None:
+            distance = DISTANCE_TYPES.get(metric)
+            if distance is None:
+                valid_metrics = list(DISTANCE_TYPES.keys())
+                raise ValueError(f"Unknown metric '{metric}'. Valid values "
+                                 f"are: {valid_metrics}")
+            self.c_obj.metric = distance
+        if init is not None:
+            self.c_obj.init = init
+        if n_init is not None:
+            self.c_obj.n_init = n_init
+        if oversampling_factor is not None:
+            self.c_obj.oversampling_factor = oversampling_factor
+        if batch_samples is not None:
+            self.c_obj.batch_samples = batch_samples
+        if batch_centroids is not None:
+            self.c_obj.batch_centroids = batch_centroids
+        if inertia_check is not None:
+            self.c_obj.inertia_check = inertia_check
+
+    @property
+    def n_clusters(self):
+        return self.c_obj.n_clusters
+
+    @property
+    def max_iter(self):
+        return self.c_obj.max_iter
+
+    @property
+    def tol(self):
+        return self.c_obj.tol
+
+    @property
+    def verbosity(self):
+        return self.c_obj.verbosity
+
+    @property
+    def seed(self):
+        return self.c_obj.rng_state.seed
+
+    @property
+    def init(self):
+        return InitMethod(self.c_obj.init)
+
+    @property
+    def oversampling_factor(self):
+        return self.c_obj.oversampling_factor
+
+    @property
+    def batch_samples(self):
+        return self.c_obj.batch_samples
+
+    @property
+    def batch_centroids(self):
+        return self.c_obj.batch_centroids
+
+    @property
+    def inertia_check(self):
+        return self.c_obj.inertia_check
+
+FitOutput = namedtuple("FitOutput", "centroids inertia n_iter")
+
+
+@auto_sync_handle
+@auto_convert_output
+def fit(
+    KMeansParams params, X, centroids=None, sample_weights=None, handle=None
+):
+    """
+    Find clusters with the k-means algorithm
+
+    Parameters
+    ----------
+
+    params : KMeansParams
+        Parameters to use to fit KMeans model
+    X : Input CUDA array interface compliant matrix shape (m, k)
+    centroids : Optional writable CUDA array interface compliant matrix
+                shape (n_clusters, k)
+    sample_weights : Optional input CUDA array interface compliant vector shape
+                     (m,) default: None
+    {handle_docstring}
+
+    Returns
+    -------
+    centroids : pylibraft.common.device_ndarray
+        The computed centroids for each cluster
+    inertia : float
+       Sum of squared distances of samples to their closest cluster center
+    n_iter : int
+        The number of iterations used to fit the model
+
+    Examples
+    --------
+
+    >>> import cupy as cp
+    >>>
+    >>> from pylibraft.cluster.kmeans import fit, KMeansParams
+    >>>
+    >>> n_samples = 5000
+    >>> n_features = 50
+    >>> n_clusters = 3
+    >>>
+    >>> X = cp.random.random_sample((n_samples, n_features),
+    ...                             dtype=cp.float32)
+
+    >>> params = KMeansParams(n_clusters=n_clusters)
+    >>> centroids, inertia, n_iter = fit(params, X)
+    """
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
+
+    cdef float f_inertia = 0.0
+    cdef double d_inertia = 0.0
+    cdef int n_iter = 0
+
+    cdef optional[device_vector_view[const double, int]] d_sample_weights
+    cdef optional[device_vector_view[const float, int]] f_sample_weights
+
+    X_cai = cai_wrapper(X)
+    dtype = X_cai.dtype
+
+    if centroids is None:
+        centroids_shape = (params.n_clusters, X_cai.shape[1])
+        centroids = device_ndarray.empty(centroids_shape, dtype=dtype)
+    centroids_cai = cai_wrapper(centroids)
+
+    # validate inputs are all c-contiguous, and have a consistent dtype
+    # and expected shape
+    X_cai.validate_shape_dtype(2)
+    centroids_cai.validate_shape_dtype(2, dtype)
+    if sample_weights is not None:
+        sample_weights_cai = cai_wrapper(sample_weights)
+        sample_weights_cai.validate_shape_dtype(1, dtype)
+
+    if dtype == np.float64:
+        if sample_weights is not None:
+            d_sample_weights = make_device_vector_view(
+                <const double *><uintptr_t>sample_weights_cai.data,
+                <int>sample_weights_cai.shape[0])
+
+        cpp_kmeans.fit(
+            deref(h),
+            params.c_obj,
+            make_device_matrix_view[double, int, row_major](
+                <double *><uintptr_t>X_cai.data,
+                <int>X_cai.shape[0], <int>X_cai.shape[1]),
+            d_sample_weights,
+            make_device_matrix_view[double, int, row_major](
+                <double *><uintptr_t>centroids_cai.data,
+                <int>centroids_cai.shape[0], <int>centroids_cai.shape[1]),
+            make_host_scalar_view[double, int](&d_inertia),
+            make_host_scalar_view[int, int](&n_iter))
+        return FitOutput(centroids, d_inertia, n_iter)
+
+    elif dtype == np.float32:
+        if sample_weights is not None:
+            f_sample_weights = make_device_vector_view(
+                <const float *><uintptr_t>sample_weights_cai.data,
+                <int>sample_weights_cai.shape[0])
+
+        cpp_kmeans.fit(
+            deref(h),
+            params.c_obj,
+            make_device_matrix_view[float, int, row_major](
+                <float *><uintptr_t>X_cai.data,
+                <int>X_cai.shape[0], <int>X_cai.shape[1]),
+            f_sample_weights,
+            make_device_matrix_view[float, int, row_major](
+                <float *><uintptr_t>centroids_cai.data,
+                <int>centroids_cai.shape[0], <int>centroids_cai.shape[1]),
+            make_host_scalar_view[float, int](&f_inertia),
+            make_host_scalar_view[int, int](&n_iter))
+        return FitOutput(centroids, f_inertia, n_iter)
+
+    else:
+        raise ValueError(f"unhandled dtype {dtype}")
diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt
index 3b49cef429..6ce1dfe347 100644
--- a/python/pylibraft/pylibraft/common/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/common/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -13,7 +13,7 @@
 # =============================================================================
 
 # Set the list of Cython files to build
-set(cython_sources cuda.pyx handle.pyx interruptible.pyx)
+set(cython_sources cuda.pyx handle.pyx mdspan.pyx interruptible.pyx)
 set(linked_libraries raft::raft)
 
 # Build all of the Cython targets
diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py
index 33c2986487..0385ae0899 100644
--- a/python/pylibraft/pylibraft/common/__init__.py
+++ b/python/pylibraft/pylibraft/common/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,11 @@
 # limitations under the License.
 #
 
+from .ai_wrapper import ai_wrapper
 from .cai_wrapper import cai_wrapper
 from .cuda import Stream
 from .device_ndarray import device_ndarray
-from .handle import Handle
+from .handle import DeviceResources, Handle
+from .outputs import auto_convert_output
+
+__all__ = ["DeviceResources", "Handle", "Stream"]
diff --git a/python/pylibraft/pylibraft/common/ai_wrapper.py b/python/pylibraft/pylibraft/common/ai_wrapper.py
new file mode 100644
index 0000000000..b6b1f02187
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/ai_wrapper.py
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import numpy as np
+
+from pylibraft.common import input_validation
+
+
+class ai_wrapper:
+    """
+    Simple wrapper around an array interface object to reduce
+    boilerplate for extracting common information from the underlying
+    dictionary.
+    """
+
+    def __init__(self, ai_arr):
+        """
+        Constructor accepts an array interface compliant array
+
+        Parameters
+        ----------
+        ai_arr : object exposing the ``__array_interface__`` attribute
+        """
+        self.ai_ = ai_arr.__array_interface__
+
+    @property
+    def dtype(self):
+        """
+        Returns the numpy dtype built from the interface's "typestr" entry
+        """
+        return np.dtype(self.ai_["typestr"])
+
+    @property
+    def shape(self):
+        """
+        Returns the "shape" entry of the underlying array interface
+        """
+        return self.ai_["shape"]
+
+    @property
+    def c_contiguous(self):
+        """
+        Returns whether the underlying array interface has
+        c-ordered (row-major) layout
+        """
+        return input_validation.is_c_contiguous(self.ai_)
+
+    @property
+    def f_contiguous(self):
+        """
+        Returns True when the underlying array interface is not
+        c-contiguous (treated here as f-ordered, column-major layout)
+        """
+        return not input_validation.is_c_contiguous(self.ai_)
+
+    @property
+    def data(self):
+        """
+        Returns the data pointer (first item of the "data" entry)
+        """
+        return self.ai_["data"][0]
+
+    def validate_shape_dtype(self, expected_dims=None, expected_dtype=None):
+        """Raise ValueError if dims/dtype mismatch or not c-contiguous"""
+        if expected_dims is not None and len(self.shape) != expected_dims:
+            raise ValueError(
+                f"unexpected shape {self.shape} - "
+                f"expected {expected_dims} dimensions"
+            )
+
+        if expected_dtype is not None and self.dtype != expected_dtype:
+            raise ValueError(
+                f"invalid dtype {self.dtype}: expected " f"{expected_dtype}"
+            )
+
+        if not self.c_contiguous:  # checked even when no expectations given
+            raise ValueError("input must be c-contiguous")
diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py
index fdfc6b0b09..cf11ea29ce 100644
--- a/python/pylibraft/pylibraft/common/cai_wrapper.py
+++ b/python/pylibraft/pylibraft/common/cai_wrapper.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import numpy as np
+from types import SimpleNamespace
 
-from pylibraft.common import input_validation
+from pylibraft.common.ai_wrapper import ai_wrapper
 
 
-class cai_wrapper:
+class cai_wrapper(ai_wrapper):
     """
     Simple wrapper around a CUDA array interface object to reduce
     boilerplate for extracting common information from the underlying
@@ -33,41 +33,14 @@ def __init__(self, cai_arr):
         ----------
         cai_arr : CUDA array interface array
         """
-        self.cai_ = cai_arr.__cuda_array_interface__
+        helper = SimpleNamespace(
+            __array_interface__=cai_arr.__cuda_array_interface__
+        )
+        super().__init__(helper)
 
-    @property
-    def dtype(self):
-        """
-        Returns the dtype of the underlying CUDA array interface
-        """
-        return np.dtype(self.cai_["typestr"])
 
-    @property
-    def shape(self):
-        """
-        Returns the shape of the underlying CUDA array interface
-        """
-        return self.cai_["shape"]
-
-    @property
-    def c_contiguous(self):
-        """
-        Returns whether the underlying CUDA array interface has
-        c-ordered (row-major) layout
-        """
-        return input_validation.is_c_contiguous(self.cai_)
-
-    @property
-    def f_contiguous(self):
-        """
-        Returns whether the underlying CUDA array interface has
-        f-ordered (column-major) layout
-        """
-        return not input_validation.is_c_contiguous(self.cai_)
-
-    @property
-    def data(self):
-        """
-        Returns the data pointer of the underlying CUDA array interface
-        """
-        return self.cai_["data"][0]
+def wrap_array(array):
+    try:
+        return cai_wrapper(array)  # needs __cuda_array_interface__
+    except AttributeError:
+        return ai_wrapper(array)  # fall back to __array_interface__
diff --git a/python/pylibraft/pylibraft/common/cpp/__init__.pxd b/python/pylibraft/pylibraft/common/cpp/__init__.pxd
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/common/cpp/__init__.py b/python/pylibraft/pylibraft/common/cpp/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/common/cpp/mdspan.pxd b/python/pylibraft/pylibraft/common/cpp/mdspan.pxd
new file mode 100644
index 0000000000..c3e5abb47e
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/cpp/mdspan.pxd
@@ -0,0 +1,113 @@
+#
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from libcpp.string cimport string
+
+from pylibraft.common.handle cimport device_resources
+
+
+cdef extern from "raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_stride.hpp" namespace "std::experimental":  # noqa: E501
+    cdef cppclass layout_right:
+        pass
+
+    cdef cppclass layout_left:
+        pass
+
+
+cdef extern from "raft/core/mdspan_types.hpp" \
+        namespace "raft":
+    ctypedef layout_right row_major
+    ctypedef layout_left col_major
+    cdef cppclass matrix_extent[IndexType]:
+        pass
+
+
+cdef extern from "raft/core/device_mdspan.hpp" namespace "raft" nogil:
+
+    cdef cppclass device_vector_view[ElementType, IndexType]:
+        pass
+
+    cdef cppclass device_scalar_view[ElementType, IndexType]:
+        pass
+
+    cdef cppclass device_matrix_view[ElementType, IndexType, LayoutType]:
+        pass
+
+    cdef device_matrix_view[ElementType, IndexType, LayoutPolicy] \
+        make_device_matrix_view[ElementType, IndexType, LayoutPolicy](
+            ElementType* ptr, IndexType n_rows, IndexType n_cols) except +
+
+    cdef device_vector_view[ElementType, IndexType] \
+        make_device_vector_view[ElementType, IndexType](
+            ElementType* ptr, IndexType n) except +
+
+    cdef device_scalar_view[ElementType, IndexType] \
+        make_device_scalar_view[ElementType, IndexType](
+            ElementType* ptr) except +  # binds raft::make_device_scalar_view
+
+
+cdef extern from "raft/core/host_mdspan.hpp" \
+        namespace "raft" nogil:
+
+    cdef cppclass host_matrix_view[ElementType, IndexType, LayoutPolicy]:
+        pass
+
+    cdef cppclass host_vector_view[ElementType, IndexType]:
+        pass
+
+    cdef cppclass host_scalar_view[ElementType, IndexType]:
+        pass
+
+    cdef cppclass host_mdspan[ElementType, Extents, LayoutPolicy]:
+        pass
+
+    cdef host_matrix_view[ElementType, IndexType, LayoutPolicy] \
+        make_host_matrix_view[ElementType, IndexType, LayoutPolicy](
+            ElementType* ptr, IndexType n_rows, IndexType n_cols) except +
+
+    cdef host_vector_view[ElementType, IndexType] \
+        make_host_vector_view[ElementType, IndexType](
+            ElementType* ptr, IndexType n) except +
+
+    cdef host_scalar_view[ElementType, IndexType] \
+        make_host_scalar_view[ElementType, IndexType](
+            ElementType *ptr) except +
+
+cdef extern from "<sstream>" namespace "std" nogil:
+    cdef cppclass ostringstream:
+        ostringstream() except +
+        string str() except +
+
+
+cdef extern from "<ostream>" namespace "std" nogil:
+
+    cdef cppclass ostream:
+        pass
+
+cdef extern from "raft/core/mdspan.hpp" namespace "raft" nogil:
+    cdef cppclass dextents[IndexType, Rank]:
+        pass  # no members are exposed to Cython
+
+cdef extern from "raft/core/serialize.hpp" namespace "raft" nogil:
+
+    cdef void serialize_mdspan[ElementType, Extents, LayoutPolicy](
+        const device_resources& handle, ostream& os,
+        const host_mdspan[ElementType, Extents, LayoutPolicy]& obj)
diff --git a/python/pylibraft/pylibraft/common/cpp/optional.pxd b/python/pylibraft/pylibraft/common/cpp/optional.pxd
new file mode 100644
index 0000000000..a6dd8a2dcd
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/cpp/optional.pxd
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# We're still using cython v0.29.x - which doesn't have std::optional
+# support. Include the minimal definition here as suggested by
+# https://github.com/cython/cython/issues/3293#issuecomment-1223058101
+
+cdef extern from "<optional>" namespace "std" nogil:
+    cdef cppclass optional[T]:
+        optional()
+        optional& operator=[U](U&)
diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx
index 606860dbe9..c164a463ae 100644
--- a/python/pylibraft/pylibraft/common/cuda.pyx
+++ b/python/pylibraft/pylibraft/common/cuda.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@ from cuda.ccudart cimport (
     cudaStreamSynchronize,
     cudaSuccess,
 )
+from libc.stdint cimport uintptr_t
 
 
 class CudaRuntimeError(RuntimeError):
@@ -50,12 +51,10 @@ cdef class Stream:
     Examples
     --------
 
-    .. code-block:: python
-
-        from pylibraft.common.cuda import Stream
-        stream = Stream()
-        stream.sync()
-        del stream  # optional!
+    >>> from pylibraft.common.cuda import Stream
+    >>> stream = Stream()
+    >>> stream.sync()
+    >>> del stream  # optional!
     """
     def __cinit__(self):
         cdef cudaStream_t stream
@@ -82,3 +81,9 @@ cdef class Stream:
 
     cdef cudaStream_t getStream(self):
         return self.s
+
+    def get_ptr(self):
+        """
+        Return the uintptr_t pointer of the underlying cudaStream_t handle
+        """
+        return <uintptr_t>self.s
diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd
index e763768eac..c090663547 100644
--- a/python/pylibraft/pylibraft/common/handle.pxd
+++ b/python/pylibraft/pylibraft/common/handle.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,6 +26,8 @@ from rmm._lib.cuda_stream_pool cimport cuda_stream_pool
 from rmm._lib.cuda_stream_view cimport cuda_stream_view
 
 
+# Keeping `handle_t` around for backwards compatibility at the
+# cython layer but users are encouraged to switch to device_resources
 cdef extern from "raft/core/handle.hpp" namespace "raft" nogil:
     cdef cppclass handle_t:
         handle_t() except +
@@ -35,7 +37,17 @@ cdef extern from "raft/core/handle.hpp" namespace "raft" nogil:
         cuda_stream_view get_stream() except +
         void sync_stream() except +
 
-cdef class Handle:
-    cdef unique_ptr[handle_t] c_obj
+
+cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil:
+    cdef cppclass device_resources:
+        device_resources() except +
+        device_resources(cuda_stream_view stream_view) except +
+        device_resources(cuda_stream_view stream_view,
+                         shared_ptr[cuda_stream_pool] stream_pool) except +
+        cuda_stream_view get_stream() except +
+        void sync_stream() except +
+
+cdef class DeviceResources:
+    cdef unique_ptr[device_resources] c_obj
     cdef shared_ptr[cuda_stream_pool] stream_pool
     cdef int n_streams
diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx
index c14c22f5aa..b4cdb9b0c1 100644
--- a/python/pylibraft/pylibraft/common/handle.pyx
+++ b/python/pylibraft/pylibraft/common/handle.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
 
 import functools
 
+from cuda.ccudart cimport cudaStream_t
+from libc.stdint cimport uintptr_t
+
 from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread, cuda_stream_view
 
 from .cuda cimport Stream
@@ -28,36 +31,63 @@ from .cuda cimport Stream
 from .cuda import CudaRuntimeError
 
 
-cdef class Handle:
+cdef class DeviceResources:
     """
-    Handle is a lightweight python wrapper around the corresponding C++ class
-    of handle_t exposed by RAFT's C++ interface. Refer to the header file
-    raft/handle.hpp for interface level details of this struct
+    DeviceResources is a lightweight python wrapper around the corresponding
+    C++ class of device_resources exposed by RAFT's C++ interface. Refer to
+    the header file raft/core/device_resources.hpp for interface level
+    details of this struct
+
+    Parameters
+    ----------
+    stream : Optional stream to use for ordering CUDA instructions
+             Accepts pylibraft.common.Stream() or uintptr_t (cudaStream_t)
 
     Examples
     --------
 
-    .. code-block:: python
-
-        from raft.common import Stream, Handle
-        stream = Stream()
-        handle = Handle(stream)
-
-        # call algos here
-
-        # final sync of all work launched in the stream of this handle
-        # this is same as `raft.cuda.Stream.sync()` call, but safer in case
-        # the default stream inside the `handle_t` is being used
-        handle.sync()
-        del handle  # optional!
+    Basic usage:
+
+    >>> from pylibraft.common import Stream, DeviceResources
+    >>> stream = Stream()
+    >>> handle = DeviceResources(stream)
+    >>>
+    >>> # call algos here
+    >>>
+    >>> # final sync of all work launched in the stream of this handle
+    >>> # this is same as `raft.cuda.Stream.sync()` call, but safer in case
+    >>> # the default stream inside the `device_resources` is being used
+    >>> handle.sync()
+    >>> del handle  # optional!
+
+    Using a cuPy stream with RAFT device_resources:
+
+    >>> import cupy
+    >>> from pylibraft.common import Stream, DeviceResources
+    >>>
+    >>> cupy_stream = cupy.cuda.Stream()
+    >>> handle = DeviceResources(stream=cupy_stream.ptr)
+
+    Using a RAFT stream with CuPy ExternalStream:
+
+    >>> import cupy
+    >>> from pylibraft.common import Stream
+    >>>
+    >>> raft_stream = Stream()
+    >>> cupy_stream = cupy.cuda.ExternalStream(raft_stream.get_ptr())
     """
 
-    def __cinit__(self, stream: Stream = None, n_streams=0):
+    def __cinit__(self, stream=None, n_streams=0):
         self.n_streams = n_streams
+
         if n_streams > 0:
             self.stream_pool.reset(new cuda_stream_pool(n_streams))
 
+        cdef uintptr_t s
         cdef cuda_stream_view c_stream
+
+        # We should either have a pylibraft.common.Stream or a uintptr_t
+        # of a cudaStream_t
         if stream is None:
             # this constructor will construct a "main" handle on
             # per-thread default stream, which is non-blocking
@@ -65,22 +95,98 @@ cdef class Handle:
                                           self.stream_pool))
         else:
             # this constructor constructs a handle on user stream
-            c_stream = cuda_stream_view(stream.getStream())
+            if isinstance(stream, Stream):
+                # Stream is pylibraft Stream()
+                s = stream.get_ptr()
+                c_stream = cuda_stream_view(<cudaStream_t>s)
+            elif isinstance(stream, int):
+                # Stream is a pointer, cast to cudaStream_t
+                s = stream
+                c_stream = cuda_stream_view(<cudaStream_t>s)
+            else:
+                raise ValueError("stream should be common.Stream() or "
+                                 "uintptr_t to cudaStream_t")
+
             self.c_obj.reset(new handle_t(c_stream,
-                                          self.stream_pool))
+                             self.stream_pool))
 
     def sync(self):
         """
-        Issues a sync on the stream set for this handle.
+        Issues a sync on the stream set for this instance.
         """
         self.c_obj.get()[0].sync_stream()
 
     def getHandle(self):
+        """
+        Return the pointer to the underlying raft::device_resources
+        instance as a size_t
+        """
         return <size_t> self.c_obj.get()
 
     def __getstate__(self):
         return self.n_streams
 
+    def __setstate__(self, state):
+        self.n_streams = state
+        if self.n_streams > 0:
+            self.stream_pool.reset(new cuda_stream_pool(self.n_streams))
+
+        self.c_obj.reset(new device_resources(cuda_stream_per_thread,
+                                              self.stream_pool))
+
+
+cdef class Handle(DeviceResources):
+    """
+    Handle is a lightweight python wrapper around the corresponding
+    C++ class of handle_t exposed by RAFT's C++ interface. Refer to
+    the header file raft/core/handle.hpp for interface level
+    details of this struct
+
+    Note: This API is officially deprecated in favor of DeviceResources
+    and will be removed in a future release.
+
+    Parameters
+    ----------
+    stream : Optional stream to use for ordering CUDA instructions
+            Accepts pylibraft.common.Stream() or uintptr_t (cudaStream_t)
+
+    Examples
+    --------
+
+    Basic usage:
+
+    >>> from pylibraft.common import Stream, Handle
+    >>> stream = Stream()
+    >>> handle = Handle(stream)
+    >>>
+    >>> # call algos here
+    >>>
+    >>> # final sync of all work launched in the stream of this handle
+    >>> # this is same as `raft.cuda.Stream.sync()` call, but safer in case
+    >>> # the default stream inside the `handle_t` is being used
+    >>> handle.sync()
+    >>> del handle  # optional!
+
+    Using a cuPy stream with RAFT device_resources:
+
+    >>> import cupy
+    >>> from pylibraft.common import Stream, Handle
+    >>>
+    >>> cupy_stream = cupy.cuda.Stream()
+    >>> handle = Handle(stream=cupy_stream.ptr)
+
+    Using a RAFT stream with CuPy ExternalStream:
+
+    >>> import cupy
+    >>> from pylibraft.common import Stream
+    >>>
+    >>> raft_stream = Stream()
+    >>> cupy_stream = cupy.cuda.ExternalStream(raft_stream.get_ptr())
+
+    """
+    def __getstate__(self):
+        return self.n_streams
+
     def __setstate__(self, state):
         self.n_streams = state
         if self.n_streams > 0:
@@ -91,11 +197,12 @@ cdef class Handle:
 
 
 _HANDLE_PARAM_DOCSTRING = """
-     handle : Optional RAFT handle for reusing expensive CUDA resources
-        If a handle isn't supplied, CUDA resources will be allocated
-        inside this function and synchronized before the function exits.
-        If a handle is supplied, you will need to explicitly synchronize
-        yourself by calling `handle.sync()` before accessing the output.
+     handle : Optional RAFT resource handle for reusing expensive CUDA
+        resources. If a handle isn't supplied, CUDA resources will be
+        allocated inside this function and synchronized before the
+        function exits. If a handle is supplied, you will need to
+        explicitly synchronize yourself by calling `handle.sync()`
+        before accessing the output.
 """.strip()
 
 
@@ -113,7 +220,7 @@ def auto_sync_handle(f):
     @functools.wraps(f)
     def wrapper(*args, handle=None, **kwargs):
         sync_handle = handle is None
-        handle = handle if handle is not None else Handle()
+        handle = handle if handle is not None else DeviceResources()
 
         ret_value = f(*args, handle=handle, **kwargs)
 
diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx
index fc2e6d9e1f..bb5415428f 100644
--- a/python/pylibraft/pylibraft/common/interruptible.pyx
+++ b/python/pylibraft/pylibraft/common/interruptible.pyx
@@ -38,20 +38,15 @@ def cuda_interruptible():
 
     Use this on a long-running C++ function imported via cython:
 
-    .. code-block:: python
-
-        with cuda_interruptible():
-            my_long_running_function(...)
+    >>> with cuda_interruptible():
+    ...     my_long_running_function(...)
 
     It's also recommended to release the GIL during the call, to
     make sure the handler has a chance to run:
 
-    .. code-block:: python
-
-        with cuda_interruptible():
-            with nogil:
-                my_long_running_function(...)
-
+    >>> with cuda_interruptible():
+    ...     with nogil:
+    ...         my_long_running_function(...)
     '''
     cdef shared_ptr[interruptible] token = get_token()
 
@@ -59,11 +54,17 @@ def cuda_interruptible():
         with nogil:
             dereference(token).cancel()
 
-    oldhr = signal.signal(signal.SIGINT, newhr)
+    try:
+        oldhr = signal.signal(signal.SIGINT, newhr)
+    except ValueError:
+        # the signal creation would fail if this is not the main thread
+        # That's fine! The feature is disabled.
+        oldhr = None
     try:
         yield
     finally:
-        signal.signal(signal.SIGINT, oldhr)
+        if oldhr is not None:
+            signal.signal(signal.SIGINT, oldhr)
 
 
 def synchronize(stream: Stream):
diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd
deleted file mode 100644
index e0cd6fb12f..0000000000
--- a/python/pylibraft/pylibraft/common/mdspan.pxd
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-# cython: language_level = 3
-
-import pylibraft.common.handle
-
-from cython.operator cimport dereference as deref
-
-
-cdef extern from "raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_stride.hpp" namespace "std::experimental":  # noqa: E501
-    cdef cppclass layout_right:
-        pass
-
-
-cdef extern from "raft/core/mdspan_types.hpp" \
-        namespace "raft":
-    ctypedef layout_right row_major
-
-
-cdef extern from "raft/core/device_mdspan.hpp" \
-        namespace "raft" nogil:
-
-    cdef cppclass device_matrix_view[ElementType, IndexType, LayoutPolicy]:
-        pass
-
-    cdef device_matrix_view[ElementType, IndexType, LayoutPolicy] \
-        make_device_matrix_view[ElementType, IndexType, LayoutPolicy](
-            ElementType* ptr, IndexType n_rows, IndexType n_cols)
-
-
-cdef extern from "raft/core/host_mdspan.hpp" \
-        namespace "raft" nogil:
-
-    cdef cppclass host_matrix_view[ElementType, IndexType, LayoutPolicy]:
-        pass
-
-    cdef host_matrix_view[ElementType, IndexType, LayoutPolicy] \
-        make_host_matrix_view[ElementType, IndexType, LayoutPolicy](
-            ElementType* ptr, IndexType n_rows, IndexType n_cols)
diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx
new file mode 100644
index 0000000000..ec825495f4
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/mdspan.pyx
@@ -0,0 +1,146 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import io
+
+import numpy as np
+
+from cpython.object cimport PyObject
+from cython.operator cimport dereference as deref
+from libc.stddef cimport size_t
+from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t, uintptr_t
+
+from pylibraft.common.cpp.mdspan cimport (
+    col_major,
+    host_mdspan,
+    make_host_matrix_view,
+    matrix_extent,
+    ostream,
+    ostringstream,
+    row_major,
+    serialize_mdspan,
+)
+from pylibraft.common.handle cimport device_resources
+
+from pylibraft.common import DeviceResources
+
+
+cdef extern from "Python.h":
+    Py_buffer* PyMemoryView_GET_BUFFER(PyObject* mview)
+
+
+def run_roundtrip_test_for_mdspan(X, fortran_order=False):
+    if not isinstance(X, np.ndarray) or len(X.shape) != 2:
+        raise ValueError("Please call this function with a NumPy array with "
+                         "2 dimensions")
+    handle = DeviceResources()
+    cdef device_resources * handle_ = \
+        <device_resources *> <size_t> handle.getHandle()
+    cdef ostringstream oss  # receives the bytes written by serialize_mdspan
+    if X.dtype == np.float32:
+        if fortran_order:
+            serialize_mdspan[float, matrix_extent[size_t], col_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[float, matrix_extent[size_t],
+                                   col_major] &>
+                make_host_matrix_view[float, size_t, col_major](
+                    <float *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+        else:
+            serialize_mdspan[float, matrix_extent[size_t], row_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[float, matrix_extent[size_t],
+                                   row_major]&>
+                make_host_matrix_view[float, size_t, row_major](
+                    <float *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+    elif X.dtype == np.float64:
+        if fortran_order:
+            serialize_mdspan[double, matrix_extent[size_t], col_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[double, matrix_extent[size_t],
+                                   col_major]&>
+                make_host_matrix_view[double, size_t, col_major](
+                    <double *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+        else:
+            serialize_mdspan[double, matrix_extent[size_t], row_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[double, matrix_extent[size_t],
+                                   row_major]&>
+                make_host_matrix_view[double, size_t, row_major](
+                    <double *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+    elif X.dtype == np.int32:
+        if fortran_order:
+            serialize_mdspan[int32_t, matrix_extent[size_t], col_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[int32_t, matrix_extent[size_t],
+                                   col_major]&>
+                make_host_matrix_view[int32_t, size_t, col_major](
+                    <int32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+        else:
+            serialize_mdspan[int32_t, matrix_extent[size_t], row_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[int32_t, matrix_extent[size_t],
+                                   row_major]&>
+                make_host_matrix_view[int32_t, size_t, row_major](
+                    <int32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+    elif X.dtype == np.uint32:
+        if fortran_order:
+            serialize_mdspan[uint32_t, matrix_extent[size_t], col_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[uint32_t, matrix_extent[size_t],
+                                   col_major]&>
+                make_host_matrix_view[uint32_t, size_t, col_major](
+                    <uint32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+        else:
+            serialize_mdspan[uint32_t, matrix_extent[size_t], row_major](
+                deref(handle_),
+                <ostream&>oss,
+                <const host_mdspan[uint32_t, matrix_extent[size_t],
+                                   row_major]&>
+                make_host_matrix_view[uint32_t, size_t, row_major](
+                    <uint32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
+                        <PyObject *> X.data).buf,
+                    X.shape[0], X.shape[1]))
+    else:
+        raise NotImplementedError()  # only float32/64, int32, uint32 handled
+    f = io.BytesIO(oss.str())  # serialized bytes are read back by np.load
+    X2 = np.load(f)
+    assert np.all(X.shape == X2.shape)
+    assert np.all(X == X2)  # the round trip must reproduce X exactly
diff --git a/python/pylibraft/pylibraft/common/outputs.py b/python/pylibraft/pylibraft/common/outputs.py
new file mode 100644
index 0000000000..e5b08e1798
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/outputs.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import functools
+import warnings
+
+import pylibraft.config
+
+
+def import_warn_(lib):
+    """Warn that `lib` is missing and the output is returned unconverted."""
+    msg = "%s is not available and output cannot be converted. " % lib
+    msg += "Returning original output instead."
+    warnings.warn(msg)
+
+
+def convert_to_torch(device_ndarray):  # best-effort torch conversion
+    try:
+        import torch  # optional dependency, imported on every call
+
+        return torch.as_tensor(device_ndarray, device="cuda")
+    except ImportError:
+        import_warn_("PyTorch")  # no usable torch: warn rather than raise
+        return device_ndarray  # caller gets the input back unchanged
+
+
+def convert_to_cupy(device_ndarray):  # best-effort cupy conversion
+    try:
+        import cupy  # optional dependency, imported on every call
+
+        return cupy.asarray(device_ndarray)
+    except ImportError:
+        import_warn_("CuPy")  # no usable cupy: warn rather than raise
+        return device_ndarray  # caller gets the input back unchanged
+
+
+def no_conversion(device_ndarray):  # identity converter
+    return device_ndarray  # the very same object, nothing is copied
+
+
+def convert_to_cai_type(device_ndarray):
+    """Convert `device_ndarray` per `pylibraft.config.output_as_`."""
+    target = pylibraft.config.output_as_
+    if callable(target):
+        return target(device_ndarray)
+    if target == "raft":
+        return device_ndarray
+    if target == "torch":
+        return convert_to_torch(device_ndarray)
+    if target == "cupy":
+        return convert_to_cupy(device_ndarray)
+    raise ValueError("No valid type conversion found for %s" % target)
+
+
+def conv(ret):
+    """Lazily yield items of `ret`, converting each device_ndarray."""
+    for item in ret:
+        # Only RAFT device arrays are converted; other items pass through.
+        is_raft = isinstance(item, pylibraft.common.device_ndarray)
+        yield convert_to_cai_type(item) if is_raft else item
+
+
+def auto_convert_output(f):
+    """Decorate `f` so device_ndarray results use the configured type.
+
+    A device_ndarray result, or each device_ndarray found directly inside
+    a returned tuple or list, goes through `convert_to_cai_type`.
+    """
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        result = f(*args, **kwargs)
+        if isinstance(result, pylibraft.common.device_ndarray):
+            return convert_to_cai_type(result)
+        if isinstance(result, tuple):
+            return tuple(conv(result))
+        if isinstance(result, list):
+            return list(conv(result))
+        return result
+
+    return wrapper
diff --git a/python/pylibraft/pylibraft/config.py b/python/pylibraft/pylibraft/config.py
new file mode 100644
index 0000000000..c173bca2bd
--- /dev/null
+++ b/python/pylibraft/pylibraft/config.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+SUPPORTED_OUTPUT_TYPES = ["torch", "cupy", "raft"]  # valid str options
+
+output_as_ = "raft"  # By default, return device_ndarray from functions
+
+
+def set_output_as(output):
+    """
+    Set the output format used by RAFT functions.
+
+    By default RAFT returns a `pylibraft.common.device_ndarray` for arrays
+    on GPU memory. Calling `set_output_as` makes RAFT return cupy arrays or
+    pytorch tensors instead, or the result of a user-supplied callable.
+    Returning cupy or torch output requires cupy or torch to be installed.
+
+    Parameters
+    ----------
+    output : { "raft", "cupy", "torch" } or callable
+        The output format to convert to. Can either be a str describing the
+        framework to convert to, or a callable that accepts a
+        device_ndarray and returns the converted type.
+
+    Raises
+    ------
+    ValueError
+        If `output` is neither a supported name nor a callable.
+    """
+    if output not in SUPPORTED_OUTPUT_TYPES and not callable(output):
+        # A "%s" placeholder is required: without one the `%` operator
+        # raises TypeError, masking the intended ValueError.
+        raise ValueError("Unsupported output option %s" % (output,))
+    global output_as_
+    output_as_ = output
diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py
index b251e71ba3..f059b5f3dd 100644
--- a/python/pylibraft/pylibraft/distance/__init__.py
+++ b/python/pylibraft/pylibraft/distance/__init__.py
@@ -15,3 +15,5 @@
 
 from .fused_l2_nn import fused_l2_nn_argmin
 from .pairwise_distance import DISTANCE_TYPES, distance as pairwise_distance
+
+__all__ = ["fused_l2_nn_argmin", "pairwise_distance"]
diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
index 3f30e6d0a8..c8e7101ee0 100644
--- a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
+++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,17 +26,22 @@ from libcpp cimport bool
 
 from .distance_type cimport DistanceType
 
-from pylibraft.common import Handle, cai_wrapper, device_ndarray
+from pylibraft.common import (
+    Handle,
+    auto_convert_output,
+    cai_wrapper,
+    device_ndarray,
+)
 from pylibraft.common.handle import auto_sync_handle
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
 
 
-cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
-        namespace "raft::distance::runtime":
+cdef extern from "raft_runtime/distance/fused_l2_nn.hpp" \
+        namespace "raft::runtime::distance" nogil:
 
     void fused_l2_nn_min_arg(
-        const handle_t &handle,
+        const device_resources &handle,
         int* min,
         const float* x,
         const float* y,
@@ -46,7 +51,7 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
         bool sqrt) except +
 
     void fused_l2_nn_min_arg(
-        const handle_t &handle,
+        const device_resources &handle,
         int* min,
         const double* x,
         const double* y,
@@ -57,6 +62,7 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
 
 
 @auto_sync_handle
+@auto_convert_output
 def fused_l2_nn_argmin(X, Y, out=None, sqrt=True, handle=None):
     """
     Compute the 1-nearest neighbors between X and Y using the L2 distance
@@ -73,54 +79,50 @@ def fused_l2_nn_argmin(X, Y, out=None, sqrt=True, handle=None):
     --------
     To compute the 1-nearest neighbors argmin:
 
-    .. code-block:: python
-
-        import cupy as cp
-        from pylibraft.common import Handle
-        from pylibraft.distance import fused_l2_nn_argmin
-        n_samples = 5000
-        n_clusters = 5
-        n_features = 50
-        in1 = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
-        in2 = cp.random.random_sample((n_clusters, n_features),
-                                      dtype=cp.float32)
-        # A single RAFT handle can optionally be reused across
-        # pylibraft functions.
-        handle = Handle()
-        ...
-        output = fused_l2_nn_argmin(in1, in2, output, handle=handle)
-        ...
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import fused_l2_nn_argmin
+    >>> n_samples = 5000
+    >>> n_clusters = 5
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_clusters, n_features),
+    ...                               dtype=cp.float32)
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
+
+    >>> output = fused_l2_nn_argmin(in1, in2, handle=handle)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
 
     The output can also be computed in-place on a preallocated
     array:
 
-    .. code-block:: python
-
-        import cupy as cp
-        from pylibraft.common import Handle
-        from pylibraft.distance import fused_l2_nn_argmin
-        n_samples = 5000
-        n_clusters = 5
-        n_features = 50
-        in1 = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
-        in2 = cp.random.random_sample((n_clusters, n_features),
-                                      dtype=cp.float32)
-        output = cp.empty((n_samples, 1), dtype=cp.int32)
-        # A single RAFT handle can optionally be reused across
-        # pylibraft functions.
-        handle = Handle()
-        ...
-        fused_l2_nn_argmin(in1, in2, out=output, handle=handle)
-        ...
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
-
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import fused_l2_nn_argmin
+    >>> n_samples = 5000
+    >>> n_clusters = 5
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_clusters, n_features),
+    ...                               dtype=cp.float32)
+    >>> output = cp.empty((n_samples, 1), dtype=cp.int32)
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
+
+    >>> fused_l2_nn_argmin(in1, in2, out=output, handle=handle)
+    array(...)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
    """
 
     x_cai = cai_wrapper(X)
@@ -152,7 +154,7 @@ def fused_l2_nn_argmin(X, Y, out=None, sqrt=True, handle=None):
     d_ptr = <uintptr_t>output_cai.data
 
     handle = handle if handle is not None else Handle()
-    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
 
     d_dt = output_cai.dtype
 
diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
index 76e70e3926..9649531b61 100644
--- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
+++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,15 +29,15 @@ from .distance_type cimport DistanceType
 from pylibraft.common import Handle
 from pylibraft.common.handle import auto_sync_handle
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
 
-from pylibraft.common import cai_wrapper, device_ndarray
+from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 
 
-cdef extern from "raft_distance/pairwise_distance.hpp" \
-        namespace "raft::distance::runtime":
+cdef extern from "raft_runtime/distance/pairwise_distance.hpp" \
+        namespace "raft::runtime::distance" nogil:
 
-    cdef void pairwise_distance(const handle_t &handle,
+    cdef void pairwise_distance(const device_resources &handle,
                                 float *x,
                                 float *y,
                                 float *dists,
@@ -48,7 +48,7 @@ cdef extern from "raft_distance/pairwise_distance.hpp" \
                                 bool isRowMajor,
                                 float metric_arg) except +
 
-    cdef void pairwise_distance(const handle_t &handle,
+    cdef void pairwise_distance(const device_resources &handle,
                                 double *x,
                                 double *y,
                                 double *dists,
@@ -89,6 +89,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product",
 
 
 @auto_sync_handle
+@auto_convert_output
 def distance(X, Y, out=None, metric="euclidean", p=2.0, handle=None):
     """
     Compute pairwise distances between X and Y
@@ -118,63 +119,53 @@ def distance(X, Y, out=None, metric="euclidean", p=2.0, handle=None):
     --------
     To compute pairwise distances on cupy arrays:
 
-    .. code-block:: python
-
-        import cupy as cp
-        from pylibraft.common import Handle
-        from pylibraft.distance import pairwise_distance
-        n_samples = 5000
-        n_features = 50
-        in1 = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
-        in2 = cp.random.random_sample((n_samples, n_features),
-                                      dtype=cp.float32)
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import pairwise_distance
+    >>> n_samples = 5000
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
 
     A single RAFT handle can optionally be reused across
     pylibraft functions.
 
-    .. code-block:: python
-
-        handle = Handle()
-        output = pairwise_distance(in1, in2, metric="euclidean", handle=handle)
+    >>> handle = Handle()
+    >>> output = pairwise_distance(in1, in2, metric="euclidean", handle=handle)
 
     pylibraft functions are often asynchronous so the
     handle needs to be explicitly synchronized
 
-    .. code-block:: python
-
-        handle.sync()
+    >>> handle.sync()
 
     It's also possible to write to a pre-allocated output array:
 
-    .. code-block:: python
-
-        import cupy as cp
-        from pylibraft.common import Handle
-        from pylibraft.distance import pairwise_distance
-        n_samples = 5000
-        n_features = 50
-        in1 = cp.random.random_sample((n_samples, n_features),
-                                     dtype=cp.float32)
-        in2 = cp.random.random_sample((n_samples, n_features),
-                                     dtype=cp.float32)
-        output = cp.empty((n_samples, n_samples), dtype=cp.float32)
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import pairwise_distance
+    >>> n_samples = 5000
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                              dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_samples, n_features),
+    ...                              dtype=cp.float32)
+    >>> output = cp.empty((n_samples, n_samples), dtype=cp.float32)
 
     A single RAFT handle can optionally be reused across
     pylibraft functions.
 
-    .. code-block:: python
-
-        handle = Handle()
-        pairwise_distance(in1, in2, out=output,
-                         metric="euclidean", handle=handle)
+    >>>
+    >>> handle = Handle()
+    >>> pairwise_distance(in1, in2, out=output,
+    ...                  metric="euclidean", handle=handle)
+    array(...)
 
     pylibraft functions are often asynchronous so the
     handle needs to be explicitly synchronized
 
-    .. code-block:: python
-
-        handle.sync()
+    >>> handle.sync()
     """
 
     x_cai = cai_wrapper(X)
@@ -205,7 +196,7 @@ def distance(X, Y, out=None, metric="euclidean", p=2.0, handle=None):
     d_ptr = <uintptr_t>dists_cai.data
 
     handle = handle if handle is not None else Handle()
-    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
 
     d_dt = dists_cai.dtype
 
diff --git a/python/pylibraft/pylibraft/neighbors/__init__.py b/python/pylibraft/pylibraft/neighbors/__init__.py
index 2f5104bd6b..dd8cdd8445 100644
--- a/python/pylibraft/pylibraft/neighbors/__init__.py
+++ b/python/pylibraft/pylibraft/neighbors/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
 #
 from .refine import refine
+
+__all__ = ["refine"]
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py b/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py
index 8a231b2c8c..3d604f829d 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py
@@ -13,4 +13,24 @@
 # limitations under the License.
 #
 
-from .ivf_pq import Index, IndexParams, SearchParams, build, extend, search
+from .ivf_pq import (
+    Index,
+    IndexParams,
+    SearchParams,
+    build,
+    extend,
+    load,
+    save,
+    search,
+)
+
+__all__ = [
+    "Index",
+    "IndexParams",
+    "SearchParams",
+    "build",
+    "extend",
+    "load",
+    "save",
+    "search",
+]
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/__init__.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/__init__.pxd
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/__init__.py b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/__init__.py
new file mode 100644
index 0000000000..273b4497cc
--- /dev/null
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
similarity index 82%
rename from python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd
rename to python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
index 9728495bf8..c56c3e9d9b 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,10 +32,11 @@ from libc.stdint cimport (
     uintptr_t,
 )
 from libcpp cimport bool, nullptr
+from libcpp.string cimport string
 
 from rmm._lib.memory_resource cimport device_memory_resource
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
 from pylibraft.distance.distance_type cimport DistanceType
 
 
@@ -79,7 +80,7 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \
         bool force_random_rotation
 
     cdef cppclass index[IdxT](ann_index):
-        index(const handle_t& handle,
+        index(const device_resources& handle,
               DistanceType metric,
               codebook_gen codebook_kind,
               uint32_t n_lists,
@@ -104,49 +105,49 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \
         cudaDataType_t internal_distance_dtype
 
 
-cdef extern from "raft/neighbors/specializations/ivf_pq_specialization.hpp" \
-        namespace "raft::neighbors::ivf_pq" nogil:
+cdef extern from "raft_runtime/neighbors/ivf_pq.hpp" \
+        namespace "raft::runtime::neighbors::ivf_pq" nogil:
 
-    cdef void build(const handle_t& handle,
+    cdef void build(const device_resources& handle,
                     const index_params& params,
                     const float* dataset,
                     uint64_t n_rows,
                     uint32_t dim,
                     index[uint64_t]* index) except +
 
-    cdef void build(const handle_t& handle,
+    cdef void build(const device_resources& handle,
                     const index_params& params,
                     const int8_t* dataset,
                     uint64_t n_rows,
                     uint32_t dim,
                     index[uint64_t]* index) except +
 
-    cdef void build(const handle_t& handle,
+    cdef void build(const device_resources& handle,
                     const index_params& params,
                     const uint8_t* dataset,
                     uint64_t n_rows,
                     uint32_t dim,
                     index[uint64_t]* index) except +
 
-    cdef void extend(const handle_t& handle,
+    cdef void extend(const device_resources& handle,
                      index[uint64_t]* index,
                      const float* new_vectors,
                      const uint64_t* new_indices,
                      uint64_t n_rows) except +
 
-    cdef void extend(const handle_t& handle,
+    cdef void extend(const device_resources& handle,
                      index[uint64_t]* index,
                      const int8_t* new_vectors,
                      const uint64_t* new_indices,
                      uint64_t n_rows) except +
 
-    cdef void extend(const handle_t& handle,
+    cdef void extend(const device_resources& handle,
                      index[uint64_t]* index,
                      const uint8_t* new_vectors,
                      const uint64_t* new_indices,
                      uint64_t n_rows) except +
 
-    cdef void search(const handle_t& handle,
+    cdef void search(const device_resources& handle,
                      const search_params& params,
                      const index[uint64_t]& index,
                      const float* queries,
@@ -156,7 +157,7 @@ cdef extern from "raft/neighbors/specializations/ivf_pq_specialization.hpp" \
                      float* distances,
                      device_memory_resource* mr) except +
 
-    cdef void search(const handle_t& handle,
+    cdef void search(const device_resources& handle,
                      const search_params& params,
                      const index[uint64_t]& index,
                      const int8_t* queries,
@@ -166,7 +167,7 @@ cdef extern from "raft/neighbors/specializations/ivf_pq_specialization.hpp" \
                      float* distances,
                      device_memory_resource* mr) except +
 
-    cdef void search(const handle_t& handle,
+    cdef void search(const device_resources& handle,
                      const search_params& params,
                      const index[uint64_t]& index,
                      const uint8_t* queries,
@@ -175,3 +176,11 @@ cdef extern from "raft/neighbors/specializations/ivf_pq_specialization.hpp" \
                      uint64_t* neighbors,
                      float* distances,
                      device_memory_resource* mr) except +
+
+    cdef void serialize(const device_resources& handle,
+                        const string& filename,
+                        const index[uint64_t]& index) except +
+
+    cdef void deserialize(const device_resources& handle,
+                          const string& filename,
+                          index[uint64_t]* index) except +
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
index 75b7cd3abb..e7b69ddbea 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
+import warnings
+
 import numpy as np
 
 from cython.operator cimport dereference as deref
@@ -30,13 +32,21 @@ from libc.stdint cimport (
     uintptr_t,
 )
 from libcpp cimport bool, nullptr
+from libcpp.string cimport string
 
 from pylibraft.distance.distance_type cimport DistanceType
 
-from pylibraft.common import Handle, cai_wrapper, device_ndarray
+from pylibraft.common import (
+    DeviceResources,
+    ai_wrapper,
+    auto_convert_output,
+    cai_wrapper,
+    device_ndarray,
+)
+from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
 
 from pylibraft.common.handle import auto_sync_handle
 from pylibraft.common.input_validation import is_c_contiguous
@@ -46,26 +56,33 @@ from rmm._lib.memory_resource cimport (
     device_memory_resource,
 )
 
-cimport pylibraft.neighbors.ivf_pq.c_ivf_pq as c_ivf_pq
-from pylibraft.neighbors.ivf_pq.c_ivf_pq cimport index_params, search_params
+cimport pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq as c_ivf_pq
+from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (
+    index_params,
+    search_params,
+)
 
 
 def _get_metric(metric):
     SUPPORTED_DISTANCES = {
-        "l2_expanded": DistanceType.L2Expanded,
-        # TODO(tfeher): fix inconsistency: index building for L2SqrtExpanded is
-        # only supported by build, not by search.
-        # "euclidean": DistanceType.L2SqrtExpanded
+        "sqeuclidean": DistanceType.L2Expanded,
+        "euclidean": DistanceType.L2SqrtExpanded,
         "inner_product": DistanceType.InnerProduct
     }
     if metric not in SUPPORTED_DISTANCES:
+        if metric == "l2_expanded":
+            warnings.warn("Using l2_expanded as a metric name is deprecated,"
+                          " use sqeuclidean instead", FutureWarning)
+            return DistanceType.L2Expanded
+
         raise ValueError("metric %s is not supported" % metric)
     return SUPPORTED_DISTANCES[metric]
 
 
 cdef _get_metric_string(DistanceType metric):
-    return {DistanceType.L2Expanded : "l2_expanded",
-            DistanceType.InnerProduct: "inner_product"}[metric]
+    return {DistanceType.L2Expanded : "sqeuclidean",
+            DistanceType.InnerProduct: "inner_product",
+            DistanceType.L2SqrtExpanded: "euclidean"}[metric]
 
 
 cdef _get_codebook_string(c_ivf_pq.codebook_gen codebook):
@@ -108,7 +125,7 @@ cdef class IndexParams:
 
     def __init__(self, *,
                  n_lists=1024,
-                 metric="l2_expanded",
+                 metric="sqeuclidean",
                  kmeans_n_iters=20,
                  kmeans_trainset_fraction=0.5,
                  pq_bits=8,
@@ -123,10 +140,12 @@ cdef class IndexParams:
         ----------
         n_list : int, default = 1024
             The number of clusters used in the coarse quantizer.
-        metric : string denoting the metric type, default="l2_expanded"
-            Valid values for metric: ["l2_expanded", "inner_product"], where
-            - l2_expanded is the equclidean distance without the square root
+        metric : string denoting the metric type, default="sqeuclidean"
+            Valid values for metric: ["sqeuclidean", "inner_product",
+            "euclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
               operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - euclidean is the euclidean distance
             - inner product distance is defined as
               distance(a, b) = \\sum_i a_i * b_i.
         kmeans_n_iters : int, default = 20
@@ -233,13 +252,14 @@ cdef class Index:
         self.trained = False
         self.index = NULL
         if handle is None:
-            handle = Handle()
-        cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+            handle = DeviceResources()
+        cdef device_resources* handle_ = \
+            <device_resources*><size_t>handle.getHandle()
 
         # We create a placeholder object. The actual parameter values do
         # not matter, it will be replaced with a built index object later.
         self.index = new c_ivf_pq.index[uint64_t](
-            deref(handle_), _get_metric("l2_expanded"),
+            deref(handle_), _get_metric("sqeuclidean"),
             c_ivf_pq.codebook_gen.PER_SUBSPACE,
             <uint32_t>1,
             <uint32_t>4,
@@ -299,14 +319,18 @@ cdef class Index:
 
 
 @auto_sync_handle
+@auto_convert_output
 def build(IndexParams index_params, dataset, handle=None):
     """
     Builds an IVF-PQ index that can be later used for nearest neighbor search.
 
+    The input array can be either a CUDA array interface compliant matrix or
+    an array interface compliant matrix in host memory.
+
     Parameters
     ----------
     index_params : IndexParams object
-    dataset : CUDA array interface compliant matrix shape (n_samples, dim)
+    dataset : array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int8, uint8]
     {handle_docstring}
 
@@ -317,42 +341,39 @@ def build(IndexParams index_params, dataset, handle=None):
     Examples
     --------
 
-    .. code-block:: python
-
-        import cupy as cp
-
-        from pylibraft.common import Handle
-        from pylibraft.neighbors import ivf_pq
-
-        n_samples = 50000
-        n_features = 50
-        n_queries = 1000
-
-        dataset = cp.random.random_sample((n_samples, n_features),
-            dtype=cp.float32)
-        handle = Handle()
-        index_params = ivf_pq.IndexParams(
-            n_lists=1024,
-            metric="l2_expanded",
-            pq_dim=10)
-        index = ivf_pq.build(index_params, dataset, handle=handle)
-
-        # Search using the built index
-        queries = cp.random.random_sample((n_queries, n_features),
-                                          dtype=cp.float32)
-        k = 10
-        distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index,
-                                             queries, k, handle=handle)
-
-        distances = cp.asarray(distances)
-        neighbors = cp.asarray(neighbors)
-
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
-
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq
+
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> n_queries = 1000
+
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+    >>> handle = DeviceResources()
+    >>> index_params = ivf_pq.IndexParams(
+    ...     n_lists=1024,
+    ...     metric="sqeuclidean",
+    ...     pq_dim=10)
+    >>> index = ivf_pq.build(index_params, dataset, handle=handle)
+
+    >>> # Search using the built index
+    >>> queries = cp.random.random_sample((n_queries, n_features),
+    ...                                   dtype=cp.float32)
+    >>> k = 10
+    >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index,
+    ...                                      queries, k, handle=handle)
+
+    >>> distances = cp.asarray(distances)
+    >>> neighbors = cp.asarray(neighbors)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
     """
-    dataset_cai = cai_wrapper(dataset)
+    dataset_cai = wrap_array(dataset)
     dataset_dt = dataset_cai.dtype
     _check_input_array(dataset_cai, [np.dtype('float32'), np.dtype('byte'),
                                      np.dtype('ubyte')])
@@ -362,8 +383,9 @@ def build(IndexParams index_params, dataset, handle=None):
     cdef uint32_t dim = dataset_cai.shape[1]
 
     if handle is None:
-        handle = Handle()
-    cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+        handle = DeviceResources()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
 
     idx = Index()
 
@@ -401,18 +423,21 @@ def build(IndexParams index_params, dataset, handle=None):
 
 
 @auto_sync_handle
+@auto_convert_output
 def extend(Index index, new_vectors, new_indices, handle=None):
     """
     Extend an existing index with new vectors.
 
+    The input array can be either a CUDA array interface compliant matrix or
+    an array interface compliant matrix in host memory.
 
     Parameters
     ----------
     index : ivf_pq.Index
         Trained ivf_pq object.
-    new_vectors : CUDA array interface compliant matrix shape (n_samples, dim)
+    new_vectors : array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int8, uint8]
-    new_indices : CUDA array interface compliant matrix shape (n_samples, dim)
+    new_indices : array interface compliant vector shape (n_samples)
         Supported dtype [uint64]
     {handle_docstring}
 
@@ -423,51 +448,50 @@ def extend(Index index, new_vectors, new_indices, handle=None):
     Examples
     --------
 
-    .. code-block:: python
-
-        import cupy as cp
+    >>> import cupy as cp
 
-        from pylibraft.common import Handle
-        from pylibraft.neighbors import ivf_pq
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq
 
-        n_samples = 50000
-        n_features = 50
-        n_queries = 1000
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> n_queries = 1000
 
-        dataset = cp.random.random_sample((n_samples, n_features),
-                                          dtype=cp.float32)
-        handle = Handle()
-        index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+    >>> handle = DeviceResources()
+    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
 
-        n_rows = 100
-        more_data = cp.random.random_sample((n_rows, n_features),
-                                            dtype=cp.float32)
-        indices = index.size + cp.arange(n_rows, dtype=cp.uint64)
-        index = ivf_pq.extend(index, more_data, indices)
+    >>> n_rows = 100
+    >>> more_data = cp.random.random_sample((n_rows, n_features),
+    ...                                     dtype=cp.float32)
+    >>> indices = index.size + cp.arange(n_rows, dtype=cp.uint64)
+    >>> index = ivf_pq.extend(index, more_data, indices)
 
-        # Search using the built index
-        queries = cp.random.random_sample((n_queries, n_features),
-                                          dtype=cp.float32)
-        k = 10
-        distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(),
-                                             index, queries,
-                                             k, handle=handle)
+    >>> # Search using the built index
+    >>> queries = cp.random.random_sample((n_queries, n_features),
+    ...                                   dtype=cp.float32)
+    >>> k = 10
+    >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(),
+    ...                                      index, queries,
+    ...                                      k, handle=handle)
 
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
 
-        distances = cp.asarray(distances)
-        neighbors = cp.asarray(neighbors)
+    >>> distances = cp.asarray(distances)
+    >>> neighbors = cp.asarray(neighbors)
     """
     if not index.trained:
         raise ValueError("Index need to be built before calling extend.")
 
     if handle is None:
-        handle = Handle()
-    cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+        handle = DeviceResources()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
 
-    vecs_cai = cai_wrapper(new_vectors)
+    vecs_cai = wrap_array(new_vectors)
     vecs_dt = vecs_cai.dtype
     cdef uint64_t n_rows = vecs_cai.shape[0]
     cdef uint32_t dim = vecs_cai.shape[1]
@@ -476,7 +500,7 @@ def extend(Index index, new_vectors, new_indices, handle=None):
                                   np.dtype('ubyte')],
                        exp_cols=index.dim)
 
-    idx_cai = cai_wrapper(new_indices)
+    idx_cai = wrap_array(new_indices)
     _check_input_array(idx_cai, [np.dtype('uint64')], exp_rows=n_rows)
     if len(idx_cai.shape)!=1:
         raise ValueError("Indices array is expected to be 1D")
@@ -567,6 +591,7 @@ cdef class SearchParams:
 
 
 @auto_sync_handle
+@auto_convert_output
 def search(SearchParams search_params,
            Index index,
            queries,
@@ -602,59 +627,59 @@ def search(SearchParams search_params,
 
     Examples
     --------
-    .. code-block:: python
-
-        import cupy as cp
-
-        from pylibraft.common import Handle
-        from pylibraft.neighbors import ivf_pq
-
-        n_samples = 50000
-        n_features = 50
-        n_queries = 1000
-        dataset = cp.random.random_sample((n_samples, n_features),
-                                          dtype=cp.float32)
-
-        # Build index
-        handle = Handle()
-        index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
-
-        # Search using the built index
-        queries = cp.random.random_sample((n_queries, n_features),
-                                          dtype=cp.float32)
-        k = 10
-        search_params = ivf_pq.SearchParams(
-            n_probes=20,
-            lut_dtype=ivf_pq.np.float16,
-            internal_distance_dtype=ivf_pq.np.float32
-        )
-
-        # Using a pooling allocator reduces overhead of temporary array
-        # creation during search. This is useful if multiple searches
-        # are performad with same query size.
-        mr = rmm.mr.PoolMemoryResource(
-            rmm.mr.CudaMemoryResource(),
-            initial_pool_size=2**29,
-            maximum_pool_size=2**31
-        )
-        distances, neighbors = ivf_pq.search(search_params, index, queries,
-                                             k, memory_resource=mr,
-                                             handle=handle)
-
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
-
-        neighbors = cp.asarray(neighbors)
-        distances = cp.asarray(distances)
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq
+
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> n_queries = 1000
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+
+    >>> # Build index
+    >>> handle = DeviceResources()
+    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
+
+    >>> # Search using the built index
+    >>> queries = cp.random.random_sample((n_queries, n_features),
+    ...                                   dtype=cp.float32)
+    >>> k = 10
+    >>> search_params = ivf_pq.SearchParams(
+    ...     n_probes=20,
+    ...     lut_dtype=cp.float16,
+    ...     internal_distance_dtype=cp.float32
+    ... )
+
+    >>> # Using a pooling allocator reduces overhead of temporary array
+    >>> # creation during search. This is useful if multiple searches
+    >>> # are performed with the same query size.
+    >>> import rmm
+    >>> mr = rmm.mr.PoolMemoryResource(
+    ...     rmm.mr.CudaMemoryResource(),
+    ...     initial_pool_size=2**29,
+    ...     maximum_pool_size=2**31
+    ... )
+    >>> distances, neighbors = ivf_pq.search(search_params, index, queries,
+    ...                                      k, memory_resource=mr,
+    ...                                      handle=handle)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
+
+    >>> neighbors = cp.asarray(neighbors)
+    >>> distances = cp.asarray(distances)
     """
 
     if not index.trained:
         raise ValueError("Index need to be built before calling search.")
 
     if handle is None:
-        handle = Handle()
-    cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+        handle = DeviceResources()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
 
     queries_cai = cai_wrapper(queries)
     queries_dt = queries_cai.dtype
@@ -725,3 +750,109 @@ def search(SearchParams search_params,
         raise ValueError("query dtype %s not supported" % queries_dt)
 
     return (distances, neighbors)
+
+
+@auto_sync_handle
+def save(filename, Index index, handle=None):
+    """
+    Saves the index to file.
+
+    Saving / loading the index is experimental. The serialization format is
+    subject to change.
+
+    Parameters
+    ----------
+    filename : string
+        Name of the file.
+    index : Index
+        Trained IVF-PQ index.
+    {handle_docstring}
+
+    Examples
+    --------
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq
+
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+
+    >>> # Build index
+    >>> handle = DeviceResources()
+    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
+    >>> ivf_pq.save("my_index.bin", index, handle=handle)
+    """
+    if not index.trained:
+        raise ValueError("Index need to be built before saving it.")
+
+    if handle is None:
+        handle = DeviceResources()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
+
+    cdef string c_filename = filename.encode('utf-8')
+
+    c_ivf_pq.serialize(deref(handle_), c_filename, deref(index.index))
+
+
+@auto_sync_handle
+def load(filename, handle=None):
+    """
+    Loads index from file.
+
+    Saving / loading the index is experimental. The serialization format is
+    subject to change, therefore loading an index saved with a previous
+    version of raft is not guaranteed to work.
+
+    Parameters
+    ----------
+    filename : string
+        Name of the file.
+    {handle_docstring}
+
+    Returns
+    -------
+    index : Index
+
+    Examples
+    --------
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq
+
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+
+    >>> # Build and save index
+    >>> handle = DeviceResources()
+    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
+    >>> ivf_pq.save("my_index.bin", index, handle=handle)
+    >>> del index
+
+    >>> n_queries = 100
+    >>> queries = cp.random.random_sample((n_queries, n_features),
+    ...                                   dtype=cp.float32)
+    >>> handle = DeviceResources()
+    >>> index = ivf_pq.load("my_index.bin", handle=handle)
+
+    >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index,
+    ...                                      queries, k=10, handle=handle)
+    """
+    if handle is None:
+        handle = DeviceResources()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
+
+    cdef string c_filename = filename.encode('utf-8')
+    index = Index()
+
+    c_ivf_pq.deserialize(deref(handle_), c_filename, index.index)
+    index.trained = True
+
+    return index
diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx
index 2d8d4196e3..5c652f7c73 100644
--- a/python/pylibraft/pylibraft/neighbors/refine.pyx
+++ b/python/pylibraft/pylibraft/neighbors/refine.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,9 +33,14 @@ from libcpp cimport bool, nullptr
 
 from pylibraft.distance.distance_type cimport DistanceType
 
-from pylibraft.common import Handle, cai_wrapper, device_ndarray
+from pylibraft.common import (
+    DeviceResources,
+    auto_convert_output,
+    cai_wrapper,
+    device_ndarray,
+)
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
 
 from pylibraft.common.handle import auto_sync_handle
 from pylibraft.common.input_validation import is_c_contiguous
@@ -46,24 +51,27 @@ from pylibraft.distance.distance_type cimport DistanceType
 import pylibraft.neighbors.ivf_pq as ivf_pq
 from pylibraft.neighbors.ivf_pq.ivf_pq import _get_metric
 
-cimport pylibraft.neighbors.ivf_pq.c_ivf_pq as c_ivf_pq
-from pylibraft.common.mdspan cimport (
+cimport pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq as c_ivf_pq
+from pylibraft.common.cpp.mdspan cimport (
     device_matrix_view,
     host_matrix_view,
     make_device_matrix_view,
     make_host_matrix_view,
     row_major,
 )
-from pylibraft.neighbors.ivf_pq.c_ivf_pq cimport index_params, search_params
+from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (
+    index_params,
+    search_params,
+)
 
 
 # We omit the const qualifiers in the interface for refine, because cython
 # has an issue parsing it (https://github.com/cython/cython/issues/4180).
-cdef extern from "raft/neighbors/specializations/refine.hpp" \
-        namespace "raft::neighbors" nogil:
+cdef extern from "raft_runtime/neighbors/refine.hpp" \
+        namespace "raft::runtime::neighbors" nogil:
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         device_matrix_view[float, uint64_t, row_major] dataset,
         device_matrix_view[float, uint64_t, row_major] queries,
         device_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -71,8 +79,8 @@ cdef extern from "raft/neighbors/specializations/refine.hpp" \
         device_matrix_view[float, uint64_t, row_major] distances,
         DistanceType metric) except +
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         device_matrix_view[uint8_t, uint64_t, row_major] dataset,
         device_matrix_view[uint8_t, uint64_t, row_major] queries,
         device_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -80,8 +88,8 @@ cdef extern from "raft/neighbors/specializations/refine.hpp" \
         device_matrix_view[float, uint64_t, row_major] distances,
         DistanceType metric) except +
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         device_matrix_view[int8_t, uint64_t, row_major] dataset,
         device_matrix_view[int8_t, uint64_t, row_major] queries,
         device_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -89,8 +97,8 @@ cdef extern from "raft/neighbors/specializations/refine.hpp" \
         device_matrix_view[float, uint64_t, row_major] distances,
         DistanceType metric) except +
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         host_matrix_view[float, uint64_t, row_major] dataset,
         host_matrix_view[float, uint64_t, row_major] queries,
         host_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -98,8 +106,8 @@ cdef extern from "raft/neighbors/specializations/refine.hpp" \
         host_matrix_view[float, uint64_t, row_major] distances,
         DistanceType metric) except +
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         host_matrix_view[uint8_t, uint64_t, row_major] dataset,
         host_matrix_view[uint8_t, uint64_t, row_major] queries,
         host_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -107,8 +115,8 @@ cdef extern from "raft/neighbors/specializations/refine.hpp" \
         host_matrix_view[float, uint64_t, row_major] distances,
         DistanceType metric) except +
 
-    cdef void c_refine "raft::neighbors::refine" (
-        const handle_t& handle,
+    cdef void c_refine "raft::runtime::neighbors::refine" (
+        const device_resources& handle,
         host_matrix_view[int8_t, uint64_t, row_major] dataset,
         host_matrix_view[int8_t, uint64_t, row_major] queries,
         host_matrix_view[uint64_t, uint64_t, row_major] candidates,
@@ -205,8 +213,9 @@ cdef host_matrix_view[int8_t, uint64_t, row_major] \
 
 
 @auto_sync_handle
+@auto_convert_output
 def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
-           metric="l2_expanded", handle=None):
+           metric="sqeuclidean", handle=None):
     """
     Refine nearest neighbor search.
 
@@ -250,48 +259,42 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
     Examples
     --------
 
-    .. code-block:: python
-
-        import cupy as cp
-
-        from pylibraft.common import Handle
-        from pylibraft.neighbors import ivf_pq, refine
-
-        n_samples = 50000
-        n_features = 50
-        n_queries = 1000
-
-        dataset = cp.random.random_sample((n_samples, n_features),
-            dtype=cp.float32)
-        handle = Handle()
-        index_params = ivf_pq.IndexParams(
-            n_lists=1024,
-            metric="l2_expanded",
-            pq_dim=10)
-        index = ivf_pq.build(index_params, dataset, handle=handle)
-
-        # Search using the built index
-        queries = cp.random.random_sample((n_queries, n_features),
-                                          dtype=cp.float32)
-        k = 40
-        _, candidates = ivf_pq.search(ivf_pq.SearchParams(), index,
-                                             queries, k, handle=handle)
-
-        k = 10
-        distances, neighbors = refine(dataset, queries, candidates, k,
-                                      handle=handle)
-        distances = cp.asarray(distances)
-        neighbors = cp.asarray(neighbors)
-
-
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
-
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import DeviceResources
+    >>> from pylibraft.neighbors import ivf_pq, refine
+
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> n_queries = 1000
+
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+    >>> handle = DeviceResources()
+    >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean",
+    ...                                   pq_dim=10)
+    >>> index = ivf_pq.build(index_params, dataset, handle=handle)
+
+    >>> # Search using the built index
+    >>> queries = cp.random.random_sample((n_queries, n_features),
+    ...                                   dtype=cp.float32)
+    >>> k = 40
+    >>> _, candidates = ivf_pq.search(ivf_pq.SearchParams(), index,
+    ...                               queries, k, handle=handle)
+
+    >>> k = 10
+    >>> distances, neighbors = refine(dataset, queries, candidates, k,
+    ...                               handle=handle)
+    >>> distances = cp.asarray(distances)
+    >>> neighbors = cp.asarray(neighbors)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
     """
 
     if handle is None:
-        handle = Handle()
+        handle = DeviceResources()
 
     if hasattr(dataset, "__cuda_array_interface__"):
         return _refine_device(dataset, queries, candidates, k, indices,
@@ -303,7 +306,8 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
 
 def _refine_device(dataset, queries, candidates, k, indices, distances,
                    metric, handle):
-    cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
 
     cdef device_matrix_view[uint64_t, uint64_t, row_major] candidates_view = \
         get_device_matrix_view_uint64(candidates)
@@ -364,7 +368,8 @@ def _refine_device(dataset, queries, candidates, k, indices, distances,
 
 def _refine_host(dataset, queries, candidates, k, indices, distances,
                  metric, handle):
-    cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
 
     if k is None:
         if indices is not None:
diff --git a/python/pylibraft/pylibraft/random/__init__.py b/python/pylibraft/pylibraft/random/__init__.py
index c34e4e6bdb..1c47a6eaac 100644
--- a/python/pylibraft/pylibraft/random/__init__.py
+++ b/python/pylibraft/pylibraft/random/__init__.py
@@ -14,3 +14,5 @@
 #
 
 from .rmat_rectangular_generator import rmat
+
+__all__ = ["rmat"]
diff --git a/python/pylibraft/pylibraft/random/cpp/__init__.pxd b/python/pylibraft/pylibraft/random/cpp/__init__.pxd
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/random/cpp/__init__.py b/python/pylibraft/pylibraft/random/cpp/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/random/rng_state.pxd b/python/pylibraft/pylibraft/random/cpp/rng_state.pxd
similarity index 100%
rename from python/pylibraft/pylibraft/random/rng_state.pxd
rename to python/pylibraft/pylibraft/random/cpp/rng_state.pxd
diff --git a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
index ef785a900b..2c7e0430b4 100644
--- a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
+++ b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,15 +28,14 @@ from pylibraft.common.handle import auto_sync_handle
 
 from libcpp cimport bool
 
-from pylibraft.common.handle cimport handle_t
+from pylibraft.common.handle cimport device_resources
+from pylibraft.random.cpp.rng_state cimport RngState
 
-from .rng_state cimport RngState
 
+cdef extern from "raft_runtime/random/rmat_rectangular_generator.hpp" \
+        namespace "raft::runtime::random" nogil:
 
-cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \
-        namespace "raft::random::runtime":
-
-    cdef void rmat_rectangular_gen(const handle_t &handle,
+    cdef void rmat_rectangular_gen(const device_resources &handle,
                                    int* out,
                                    int* out_src,
                                    int* out_dst,
@@ -46,7 +45,7 @@ cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \
                                    int n_edges,
                                    RngState& r) except +
 
-    cdef void rmat_rectangular_gen(const handle_t &handle,
+    cdef void rmat_rectangular_gen(const device_resources &handle,
                                    int64_t* out,
                                    int64_t* out_src,
                                    int64_t* out_dst,
@@ -56,7 +55,7 @@ cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \
                                    int64_t n_edges,
                                    RngState& r) except +
 
-    cdef void rmat_rectangular_gen(const handle_t &handle,
+    cdef void rmat_rectangular_gen(const device_resources &handle,
                                    int* out,
                                    int* out_src,
                                    int* out_dst,
@@ -66,7 +65,7 @@ cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \
                                    int n_edges,
                                    RngState& r) except +
 
-    cdef void rmat_rectangular_gen(const handle_t &handle,
+    cdef void rmat_rectangular_gen(const device_resources &handle,
                                    int64_t* out,
                                    int64_t* out_src,
                                    int64_t* out_dst,
@@ -98,30 +97,28 @@ def rmat(out, theta, r_scale, c_scale, seed=12345, handle=None):
     Examples
     --------
 
-    .. code-block:: python
+    >>> import cupy as cp
+
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.random import rmat
 
-        import cupy as cp
+    >>> n_edges = 5000
+    >>> r_scale = 16
+    >>> c_scale = 14
+    >>> theta_len = max(r_scale, c_scale) * 4
 
-        from pylibraft.common import Handle
-        from pylibraft.random import rmat
+    >>> out = cp.empty((n_edges, 2), dtype=cp.int32)
+    >>> theta = cp.random.random_sample(theta_len, dtype=cp.float32)
 
-        n_edges = 5000
-        r_scale = 16
-        c_scale = 14
-        theta_len = max(r_scale, c_scale) * 4
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
 
-        out = cp.empty((n_edges, 2), dtype=cp.int32)
-        theta = cp.random.random_sample(theta_len, dtype=cp.float32)
+    >>> rmat(out, theta, r_scale, c_scale, handle=handle)
 
-        # A single RAFT handle can optionally be reused across
-        # pylibraft functions.
-        handle = Handle()
-        ...
-        rmat(out, theta, r_scale, c_scale, handle=handle)
-        ...
-        # pylibraft functions are often asynchronous so the
-        # handle needs to be explicitly synchronized
-        handle.sync()
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
    """
 
     if theta is None:
@@ -141,7 +138,7 @@ def rmat(out, theta, r_scale, c_scale, seed=12345, handle=None):
     cdef RngState *rng = new RngState(seed)
 
     handle = handle if handle is not None else Handle()
-    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
 
     if out_dt == np.int32 and theta_dt == np.float32:
         rmat_rectangular_gen(deref(h),
diff --git a/python/pylibraft/pylibraft/test/test_config.py b/python/pylibraft/pylibraft/test/test_config.py
new file mode 100644
index 0000000000..27a697d388
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_config.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import numpy as np
+import pytest
+
+try:
+    import cupy
+except ImportError:
+    pytest.skip(reason="cupy not installed.", allow_module_level=True)
+
+import pylibraft.config
+from pylibraft.common import auto_convert_output, device_ndarray
+
+
+@auto_convert_output
+def gen_cai(m, n, t=None):
+    if t is None:
+        return device_ndarray.empty((m, n))
+    elif t == tuple:
+        return device_ndarray.empty((m, n)), device_ndarray.empty((m, n))
+    elif t == list:
+        return [device_ndarray.empty((m, n)), device_ndarray.empty((m, n))]
+
+
+@pytest.mark.parametrize(
+    "out_type",
+    [
+        ["cupy", cupy.ndarray],
+        ["raft", pylibraft.common.device_ndarray],
+        [lambda arr: arr.copy_to_host(), np.ndarray],
+    ],
+)
+@pytest.mark.parametrize("gen_t", [None, tuple, list])
+def test_auto_convert_output(out_type, gen_t):
+    """Check every array returned by gen_cai has the configured type."""
+    conf, t = out_type
+    pylibraft.config.set_output_as(conf)
+    try:
+        output = gen_cai(1, 5, gen_t)
+
+        # Treat a bare array as a one-element tuple so a single loop
+        # covers the single, tuple and list return shapes.
+        if not isinstance(output, (list, tuple)):
+            output = (output,)
+        for o in output:
+            assert isinstance(o, t)
+    finally:
+        # Reset even on failure so later tests see the "raft" default
+        pylibraft.config.set_output_as("raft")
diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py
index a08656d3aa..dd6050a098 100644
--- a/python/pylibraft/pylibraft/test/test_distance.py
+++ b/python/pylibraft/pylibraft/test/test_distance.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 import pytest
 from scipy.spatial.distance import cdist
 
-from pylibraft.common import Handle, device_ndarray
+from pylibraft.common import DeviceResources, Stream, device_ndarray
 from pylibraft.distance import pairwise_distance
 
 
@@ -64,9 +64,10 @@ def test_distance(n_rows, n_cols, inplace, metric, order, dtype):
     input1_device = device_ndarray(input1)
     output_device = device_ndarray(output) if inplace else None
 
-    handle = Handle()
+    s2 = Stream()
+    handle = DeviceResources(stream=s2)
     ret_output = pairwise_distance(
-        input1_device, input1_device, output_device, metric
+        input1_device, input1_device, output_device, metric, handle=handle
     )
     handle.sync()
 
diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/test/test_doctests.py
new file mode 100644
index 0000000000..3276ca115f
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_doctests.py
@@ -0,0 +1,123 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import doctest
+import inspect
+import io
+
+import pytest
+
+import pylibraft.cluster
+import pylibraft.distance
+import pylibraft.neighbors
+import pylibraft.random
+
+# Code adapted from https://github.com/rapidsai/cudf/blob/branch-23.02/python/cudf/cudf/tests/test_doctests.py  # noqa
+
+
+def _name_in_all(parent, name):
+    return name in getattr(parent, "__all__", ())
+
+
+def _is_public_name(parent, name):
+    return name[:1] != "_"
+
+
+def _find_doctests_in_obj(obj, finder=None, criteria=None):
+    """Find all doctests in an object.
+
+    Parameters
+    ----------
+    obj : module or class
+        The object to search for docstring examples.
+    finder : doctest.DocTestFinder, optional
+        The DocTestFinder object to use. If not provided, a DocTestFinder is
+        constructed.
+    criteria : callable, optional
+        Called as ``criteria(obj, name)`` to decide whether a member of the
+        provided object is examined. If not provided, names not defined in
+        the object's ``__all__`` property are ignored.
+
+    Yields
+    ------
+    doctest.DocTest
+        The next doctest found in the object.
+    """
+    if finder is None:
+        finder = doctest.DocTestFinder()
+    if criteria is None:
+        criteria = _name_in_all
+    for docstring in finder.find(obj):
+        if docstring.examples:
+            yield docstring
+    for name, member in inspect.getmembers(obj):
+        # Only recurse over members matching the criteria
+        if not criteria(obj, name):
+            continue
+        # Recurse over the public API of modules (objects defined in the
+        # module's __all__)
+        if inspect.ismodule(member):
+            yield from _find_doctests_in_obj(
+                member, finder, criteria=_name_in_all
+            )
+        # Recurse over the public API of classes (attributes not prefixed with
+        # an underscore)
+        elif inspect.isclass(member):
+            yield from _find_doctests_in_obj(
+                member, finder, criteria=_is_public_name
+            )
+        # Classes are callable too, but the recursion above already ran
+        # finder.find() on them, so only other non-function callables
+        # (e.g. cython functions, which fail `inspect.isfunction`) land here
+        elif callable(member) and not inspect.isfunction(member):
+            for docstring in finder.find(member):
+                if docstring.examples:
+                    yield docstring
+
+
+# The root pylibraft module imports no submodules and defines no __all__,
+# so the submodules to run doctests for are listed explicitly here.
+# NOTE(review): pylibraft.common / .neighbors.ivf_pq lack explicit imports.
+DOC_STRINGS = list(_find_doctests_in_obj(pylibraft.cluster))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.common))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.distance))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_pq))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.random))
+
+
+@pytest.mark.parametrize(
+    "docstring",
+    DOC_STRINGS,
+    ids=lambda docstring: docstring.name,
+)
+def test_docstring(docstring):
+    # NORMALIZE_WHITESPACE makes the comparison insensitive to whitespace
+    # differences, and ELLIPSIS lets "..." in the expected output match any
+    # substring, which helps with e.g. memory addresses or imprecise
+    # floating point values.
+    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+    runner = doctest.DocTestRunner(optionflags=flags)
+
+    # Run with stdout captured so failure output can go into the message.
+    captured = io.StringIO()
+    with contextlib.redirect_stdout(captured):
+        runner.run(docstring)
+        summary = runner.summarize()
+    assert not summary.failed, (
+        f"{summary.failed} of {summary.attempted} doctests failed for "
+        f"{docstring.name}:\n{captured.getvalue()}"
+    )
diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
index b05ad3d530..086bb26f17 100644
--- a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 import pytest
 from scipy.spatial.distance import cdist
 
-from pylibraft.common import Handle, device_ndarray
+from pylibraft.common import DeviceResources, device_ndarray
 from pylibraft.distance import fused_l2_nn_argmin
 
 
@@ -42,7 +42,7 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype, inplace):
     input2_device = device_ndarray(input2)
     output_device = device_ndarray(output) if inplace else None
 
-    handle = Handle()
+    handle = DeviceResources()
     ret_output = fused_l2_nn_argmin(
         input1_device, input2_device, output_device, True, handle=handle
     )
diff --git a/python/pylibraft/pylibraft/test/test_handle.py b/python/pylibraft/pylibraft/test/test_handle.py
new file mode 100644
index 0000000000..ae519ea965
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_handle.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pytest
+
+from pylibraft.common import DeviceResources, Stream, device_ndarray
+from pylibraft.distance import pairwise_distance
+
+try:
+    import cupy  # optional dependency; the module is skipped without it
+except ImportError:
+    pytest.skip(reason="cupy not installed.", allow_module_level=True)
+
+
+@pytest.mark.parametrize("stream", [cupy.cuda.Stream().ptr, Stream()])
+def test_handle_external_stream(stream):
+    """DeviceResources must accept external streams and reject a float."""
+    input1 = np.random.random_sample((50, 3))
+    input1 = np.asarray(input1, order="F").astype("float")
+
+    output = np.zeros((50, 50), dtype="float")
+
+    input1_device = device_ndarray(input1)
+    output_device = device_ndarray(output)
+
+    # We are just testing that this doesn't segfault
+    handle = DeviceResources(stream)
+    pairwise_distance(
+        input1_device, input1_device, output_device, "euclidean", handle=handle
+    )
+    handle.sync()
+
+    with pytest.raises(ValueError):
+        handle = DeviceResources(stream=1.0)
diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py
index 4c102873d1..977d365633 100644
--- a/python/pylibraft/pylibraft/test/test_ivf_pq.py
+++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -58,18 +58,15 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None):
     for i in range(queries.shape[0]):
         X = queries[np.newaxis, i, :]
         Y = dataset[out_idx[i, :], :]
-        if metric == "l2_expanded":
+        if metric == "sqeuclidean":
+            dist[i, :] = pairwise_distances(X, Y, "sqeuclidean")
+        elif metric == "euclidean":
             dist[i, :] = pairwise_distances(X, Y, "euclidean")
         elif metric == "inner_product":
             dist[i, :] = np.matmul(X, Y.T)
         else:
             raise ValueError("Invalid metric")
 
-    # Note: raft l2 metric does not include the square root operation like
-    # sklearn's euclidean.
-    if metric == "l2_expanded":
-        dist = np.power(dist, 2)
-
     dist_eps = abs(dist)
     dist_eps[dist < 1e-3] = 1e-3
     diff = abs(out_dist - dist) / dist_eps
@@ -97,6 +94,7 @@ def run_ivf_pq_build_search_test(
     kmeans_n_iters=20,
     compare=True,
     inplace=True,
+    array_type="device",
 ):
     dataset = generate_data((n_rows, n_cols), dtype)
     if metric == "inner_product":
@@ -115,7 +113,10 @@ def run_ivf_pq_build_search_test(
         add_data_on_build=add_data_on_build,
     )
 
-    index = ivf_pq.build(build_params, dataset_device)
+    if array_type == "device":
+        index = ivf_pq.build(build_params, dataset_device)
+    else:
+        index = ivf_pq.build(build_params, dataset)
 
     assert index.trained
     if pq_dim != 0:
@@ -125,14 +126,20 @@ def run_ivf_pq_build_search_test(
     assert index.n_lists == build_params.n_lists
 
     if not add_data_on_build:
-        dataset_1_device = device_ndarray(dataset[: n_rows // 2, :])
-        dataset_2_device = device_ndarray(dataset[n_rows // 2 :, :])
+        dataset_1 = dataset[: n_rows // 2, :]
+        dataset_2 = dataset[n_rows // 2 :, :]
         indices_1 = np.arange(n_rows // 2, dtype=np.uint64)
-        indices_1_device = device_ndarray(indices_1)
         indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint64)
-        indices_2_device = device_ndarray(indices_2)
-        index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
-        index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
+        if array_type == "device":
+            dataset_1_device = device_ndarray(dataset_1)
+            dataset_2_device = device_ndarray(dataset_2)
+            indices_1_device = device_ndarray(indices_1)
+            indices_2_device = device_ndarray(indices_2)
+            index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
+            index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
+        else:
+            index = ivf_pq.extend(index, dataset_1, indices_1)
+            index = ivf_pq.extend(index, dataset_2, indices_2)
 
     assert index.size >= n_rows
 
@@ -169,9 +176,11 @@ def run_ivf_pq_build_search_test(
     out_dist = out_dist_device.copy_to_host()
 
     # Calculate reference values with sklearn
-    skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[
-        metric
-    ]
+    skl_metric = {
+        "sqeuclidean": "sqeuclidean",
+        "inner_product": "cosine",
+        "euclidean": "euclidean",
+    }[metric]
     nn_skl = NearestNeighbors(
         n_neighbors=k, algorithm="brute", metric=skl_metric
     )
@@ -190,18 +199,22 @@ def run_ivf_pq_build_search_test(
 @pytest.mark.parametrize("n_queries", [100])
 @pytest.mark.parametrize("n_lists", [100])
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
-def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
+@pytest.mark.parametrize("array_type", ["host", "device"])
+def test_ivf_pq_dtypes(
+    n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type
+):
     # Note that inner_product tests use normalized input which we cannot
-    # represent in int8, therefore we test only l2_expanded metric here.
+    # represent in int8, therefore we test only sqeuclidean metric here.
     run_ivf_pq_build_search_test(
         n_rows=n_rows,
         n_cols=n_cols,
         n_queries=n_queries,
         k=10,
         n_lists=n_lists,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=dtype,
         inplace=inplace,
+        array_type=array_type,
     )
 
 
@@ -218,7 +231,7 @@ def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
             },
             marks=pytest.mark.xfail(reason="empty dataset"),
         ),
-        {"n_rows": 1, "n_cols": 10, "n_queries": 10, "k": 1, "n_lists": 10},
+        {"n_rows": 1, "n_cols": 10, "n_queries": 10, "k": 1, "n_lists": 1},
         {"n_rows": 10, "n_cols": 1, "n_queries": 10, "k": 10, "n_lists": 10},
         # {"n_rows": 999, "n_cols": 42, "n_queries": 453, "k": 137,
         #  "n_lists": 53},
@@ -233,13 +246,15 @@ def test_ivf_pq_n(params):
         n_queries=params["n_queries"],
         k=params["k"],
         n_lists=params["n_lists"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         compare=False,
     )
 
 
-@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"])
+@pytest.mark.parametrize(
+    "metric", ["sqeuclidean", "inner_product", "euclidean"]
+)
 @pytest.mark.parametrize("dtype", [np.float32])
 @pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"])
 @pytest.mark.parametrize("rotation", [True, False])
@@ -283,7 +298,7 @@ def test_ivf_pq_params(params):
         n_queries=1000,
         k=10,
         n_lists=params["n_lists"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         pq_bits=params["pq_bits"],
         pq_dim=params["pq_dims"],
@@ -329,7 +344,7 @@ def test_ivf_pq_search_params(params):
         k=params["k"],
         n_lists=100,
         n_probes=params["n_probes"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         lut_dtype=params["lut"],
         internal_distance_dtype=params["idd"],
@@ -337,16 +352,18 @@ def test_ivf_pq_search_params(params):
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
-def test_extend(dtype):
+@pytest.mark.parametrize("array_type", ["host", "device"])
+def test_extend(dtype, array_type):
     run_ivf_pq_build_search_test(
         n_rows=10000,
         n_cols=10,
         n_queries=100,
         k=10,
         n_lists=100,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=dtype,
         add_data_on_build=False,
+        array_type=array_type,
     )
 
 
@@ -358,7 +375,7 @@ def test_build_assertions():
             n_queries=100,
             k=10,
             n_lists=100,
-            metric="l2_expanded",
+            metric="sqeuclidean",
             dtype=np.float64,
         )
 
@@ -371,7 +388,7 @@ def test_build_assertions():
 
     index_params = ivf_pq.IndexParams(
         n_lists=50,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         kmeans_n_iters=20,
         kmeans_trainset_fraction=1,
         add_data_on_build=False,
@@ -465,7 +482,7 @@ def test_search_inputs(params):
     out_dist_device = device_ndarray(out_dist)
 
     index_params = ivf_pq.IndexParams(
-        n_lists=50, metric="l2_expanded", add_data_on_build=True
+        n_lists=50, metric="sqeuclidean", add_data_on_build=True
     )
 
     dataset = generate_data((n_rows, n_cols), dtype)
@@ -483,3 +500,51 @@ def test_search_inputs(params):
             out_idx_device,
             out_dist_device,
         )
+
+
+def test_save_load(tmp_path):
+    """Search results from a saved-and-reloaded index match the original."""
+    n_rows = 10000
+    n_cols = 50
+    n_queries = 1000
+    dtype = np.float32
+
+    dataset = generate_data((n_rows, n_cols), dtype)
+    dataset_device = device_ndarray(dataset)
+
+    build_params = ivf_pq.IndexParams(n_lists=100, metric="sqeuclidean")
+    index = ivf_pq.build(build_params, dataset_device)
+
+    assert index.trained
+    # Write into pytest's per-test tmp_path so no file is left in the CWD.
+    filename = str(tmp_path / "my_index.bin")
+    ivf_pq.save(filename, index)
+    loaded_index = ivf_pq.load(filename)
+
+    assert index.pq_dim == loaded_index.pq_dim
+    assert index.pq_bits == loaded_index.pq_bits
+    assert index.metric == loaded_index.metric
+    assert index.n_lists == loaded_index.n_lists
+    assert index.size == loaded_index.size
+
+    queries = generate_data((n_queries, n_cols), dtype)
+    queries_device = device_ndarray(queries)
+    search_params = ivf_pq.SearchParams(n_probes=100)
+    k = 10
+
+    distance_dev, neighbors_dev = ivf_pq.search(
+        search_params, index, queries_device, k
+    )
+    neighbors = neighbors_dev.copy_to_host()
+    dist = distance_dev.copy_to_host()
+    del index
+
+    distance_dev, neighbors_dev = ivf_pq.search(
+        search_params, loaded_index, queries_device, k
+    )
+
+    neighbors2 = neighbors_dev.copy_to_host()
+    dist2 = distance_dev.copy_to_host()
+
+    assert np.all(neighbors == neighbors2)
+    assert np.allclose(dist, dist2, rtol=1e-6)
diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py
index 44f60be310..4c2388de62 100644
--- a/python/pylibraft/pylibraft/test/test_kmeans.py
+++ b/python/pylibraft/pylibraft/test/test_kmeans.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,11 +16,42 @@
 import numpy as np
 import pytest
 
-from pylibraft.cluster.kmeans import cluster_cost, compute_new_centroids
-from pylibraft.common import Handle, device_ndarray
+from pylibraft.cluster.kmeans import (
+    KMeansParams,
+    cluster_cost,
+    compute_new_centroids,
+    fit,
+)
+from pylibraft.common import DeviceResources, device_ndarray
 from pylibraft.distance import pairwise_distance
 
 
+@pytest.mark.parametrize("n_rows", [100])
+@pytest.mark.parametrize("n_cols", [5, 25])
+@pytest.mark.parametrize("n_clusters", [5, 15])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype):
+    # random points; the first n_clusters rows seed the centroids
+    points_host = np.random.random_sample((n_rows, n_cols)).astype(dtype)
+    seeds = device_ndarray(points_host[:n_clusters])
+    points = device_ndarray(points_host)
+
+    # inertia of the seed centroids, measured before fitting
+    inertia_before = cluster_cost(points, seeds)
+
+    kmeans_params = KMeansParams(n_clusters=n_clusters, seed=42)
+
+    # fitting must lower the inertia and take at least one iteration
+    # TODO: once we have make_blobs exposed to python
+    # (https://github.com/rapidsai/raft/issues/1059)
+    # we should use that to test out the kmeans fit, like the C++
+    # tests do right now
+    fitted, inertia, n_iter = fit(kmeans_params, points, seeds)
+    assert inertia < inertia_before
+    assert n_iter >= 1
+    assert np.allclose(cluster_cost(points, fitted), inertia, rtol=1e-6)
+
+
 @pytest.mark.parametrize("n_rows", [100])
 @pytest.mark.parametrize("n_cols", [5, 25])
 @pytest.mark.parametrize("n_clusters", [5, 15])
@@ -33,7 +64,7 @@ def test_compute_new_centroids(
 
     # A single RAFT handle can optionally be reused across
     # pylibraft functions.
-    handle = Handle()
+    handle = DeviceResources()
 
     X = np.random.random_sample((n_rows, n_cols)).astype(dtype)
     X_device = device_ndarray(X)
diff --git a/python/pylibraft/pylibraft/test/test_mdspan_serializer.py b/python/pylibraft/pylibraft/test/test_mdspan_serializer.py
new file mode 100644
index 0000000000..412cf676d0
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_mdspan_serializer.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pytest
+
+from pylibraft.common.mdspan import run_roundtrip_test_for_mdspan
+
+
+# TODO(hcho3): Set up hypothesis
+@pytest.mark.parametrize("dtype", ["float32", "float64", "int32", "uint32"])
+def test_mdspan_serializer(dtype):
+    X = np.random.uniform(0, 100, (2, 3)).astype(dtype)  # not all-zero ints
+    run_roundtrip_test_for_mdspan(X)
diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/test/test_random.py
index 229baffff5..76c0f53d3e 100644
--- a/python/pylibraft/pylibraft/test/test_random.py
+++ b/python/pylibraft/pylibraft/test/test_random.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 import numpy as np
 import pytest
 
-from pylibraft.common import Handle, device_ndarray
+from pylibraft.common import DeviceResources, device_ndarray
 from pylibraft.random import rmat
 
 
@@ -46,7 +46,7 @@ def test_rmat(n_edges, r_scale, c_scale, dtype):
     out_buff = np.empty((n_edges, 2), dtype=dtype)
     output_device = device_ndarray(out_buff)
 
-    handle = Handle()
+    handle = DeviceResources()
     rmat(output_device, theta_device, r_scale, c_scale, 12345, handle=handle)
     handle.sync()
     output = output_device.copy_to_host()
diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py
index 49e4e71f9a..8502d0575c 100644
--- a/python/pylibraft/pylibraft/test/test_refine.py
+++ b/python/pylibraft/pylibraft/test/test_refine.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ def run_refine(
     n_rows=500,
     n_cols=50,
     n_queries=100,
-    metric="l2_expanded",
+    metric="sqeuclidean",
     k0=40,
     k=10,
     inplace=False,
@@ -49,7 +49,7 @@ def run_refine(
     queries_device = device_ndarray(queries)
 
     # Calculate reference values with sklearn
-    skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[
+    skl_metric = {"sqeuclidean": "euclidean", "inner_product": "cosine"}[
         metric
     ]
     nn_skl = NearestNeighbors(
@@ -106,7 +106,7 @@ def run_refine(
     if recall <= 0.999:
         # We did not find the same neighbor indices.
         # We could have found other neighbor with same distance.
-        if metric == "l2_expanded":
+        if metric == "sqeuclidean":
             skl_dist = np.power(skl_dist[:, :k], 2)
         elif metric == "inner_product":
             skl_dist = 1 - skl_dist[:, :k]
@@ -120,12 +120,10 @@ def run_refine(
 
 @pytest.mark.parametrize("n_queries", [100, 1024, 37])
 @pytest.mark.parametrize("inplace", [True, False])
-@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"])
+@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("memory_type", ["device", "host"])
 def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type):
-    if memory_type == "device" and dtype == np.int8:
-        pytest.xfail("Possibly incorrect distance calculation (IVF-Flat)")
     run_refine(
         n_rows=2000,
         n_queries=n_queries,
diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py
index 15889fcd71..230b9127e3 100644
--- a/python/pylibraft/setup.py
+++ b/python/pylibraft/setup.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 install_requires = [
     "numpy",
     "cuda-python>=11.7.1,<12.0",
-    f"rmm{cuda_suffix}",
+    f"rmm{cuda_suffix}==23.2.*",
 ]
 
 extras_require = {
@@ -69,6 +69,7 @@ def get_versions():
         "Programming Language :: Python",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
     author="NVIDIA Corporation",
     include_package_data=True,
diff --git a/python/pylibraft/setuputils.py b/python/pylibraft/setuputils.py
deleted file mode 100755
index 0a3f421856..0000000000
--- a/python/pylibraft/setuputils.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import glob
-import os
-import shutil
-import sys
-
-
-def get_environment_option(name):
-    ENV_VARIABLE = os.environ.get(name, False)
-
-    if not ENV_VARIABLE:
-        print("-- " + name + " environment variable not set.")
-
-    else:
-        print("-- " + name + " detected with value: " + str(ENV_VARIABLE))
-
-    return ENV_VARIABLE
-
-
-def get_cli_option(name):
-    if name in sys.argv:
-        print("-- Detected " + str(name) + " build option.")
-        return True
-
-    else:
-        return False
-
-
-def clean_folder(path):
-    """
-    Function to clean all Cython and Python artifacts and cache folders. It
-    clean the folder as well as its direct children recursively.
-
-    Parameters
-    ----------
-    path : String
-        Path to the folder to be cleaned.
-    """
-    shutil.rmtree(path + "/__pycache__", ignore_errors=True)
-
-    folders = glob.glob(path + "/*/")
-    for folder in folders:
-        shutil.rmtree(folder + "/__pycache__", ignore_errors=True)
-
-        clean_folder(folder)
-
-        cython_exts = glob.glob(folder + "/*.cpp")
-        cython_exts.extend(glob.glob(folder + "/*.cpython*"))
-        for file in cython_exts:
-            os.remove(file)
diff --git a/python/raft-dask/.coveragerc b/python/raft-dask/.coveragerc
new file mode 100644
index 0000000000..968c4b898a
--- /dev/null
+++ b/python/raft-dask/.coveragerc
@@ -0,0 +1,3 @@
+# Configuration file for Python coverage tests
+[run]
+source = raft_dask
\ No newline at end of file
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 2a3b8390e3..742cd522c3 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(raft_dask_version 22.12.01)
+set(raft_dask_version 23.02.00)
 
 include(../../fetch_rapids.cmake)
 
@@ -69,6 +69,8 @@ if(NOT raft_FOUND)
   endif()
 
   add_subdirectory(../../cpp raft-cpp ${_exclude_from_all})
+  list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
+  find_package(NCCL REQUIRED)
 endif()
 
 include(rapids-cython)
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 5face05ef3..092e68670a 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -13,4 +13,9 @@
 # limitations under the License.
 #
 
+from raft_dask._version import get_versions
+
 from .include_test import raft_include_test
+
+__version__ = get_versions()["version"]
+del get_versions
diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt
index 9827869b98..3798b5ac4b 100644
--- a/python/raft-dask/raft_dask/common/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/common/CMakeLists.txt
@@ -12,10 +12,8 @@
 # the License.
 # =============================================================================
 
-include(${raft-dask-python_SOURCE_DIR}/cmake/thirdparty/get_nccl.cmake)
-
 set(cython_sources comms_utils.pyx nccl.pyx)
-set(linked_libraries raft::raft raft::distributed NCCL::NCCL)
+set(linked_libraries raft::raft raft::distributed)
 rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
                                                                             CXX
diff --git a/python/raft-dask/raft_dask/common/comms_utils.pyx b/python/raft-dask/raft_dask/common/comms_utils.pyx
index 7db04ef455..768ba0e422 100644
--- a/python/raft-dask/raft_dask/common/comms_utils.pyx
+++ b/python/raft-dask/raft_dask/common/comms_utils.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,41 +30,49 @@ cdef extern from "nccl.h":
     ctypedef ncclComm *ncclComm_t
 
 cdef extern from "raft/core/handle.hpp" namespace "raft":
-    cdef cppclass handle_t:
-        handle_t() except +
+    cdef cppclass device_resources:
+        device_resources() except +
+
+cdef extern from "raft/core/device_resources.hpp" namespace "raft":
+    cdef cppclass device_resources:
+        device_resources() except +
 
 cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms":
 
-    void build_comms_nccl_ucx(handle_t *handle,
+    void build_comms_nccl_ucx(device_resources *handle,
                               ncclComm_t comm,
                               void *ucp_worker,
                               void *eps,
                               int size,
                               int rank) except +
 
-    void build_comms_nccl_only(handle_t *handle,
+    void build_comms_nccl_only(device_resources *handle,
                                ncclComm_t comm,
                                int size,
                                int rank) except +
 
 cdef extern from "raft/comms/comms_test.hpp" namespace "raft::comms":
 
-    bool test_collective_allreduce(const handle_t &h, int root) except +
-    bool test_collective_broadcast(const handle_t &h, int root) except +
-    bool test_collective_reduce(const handle_t &h, int root) except +
-    bool test_collective_allgather(const handle_t &h, int root) except +
-    bool test_collective_gather(const handle_t &h, int root) except +
-    bool test_collective_gatherv(const handle_t &h, int root) except +
-    bool test_collective_reducescatter(const handle_t &h, int root) except +
-    bool test_pointToPoint_simple_send_recv(const handle_t &h,
+    bool test_collective_allreduce(const device_resources &h, int root) \
+        except +
+    bool test_collective_broadcast(const device_resources &h, int root) \
+        except +
+    bool test_collective_reduce(const device_resources &h, int root) except +
+    bool test_collective_allgather(const device_resources &h, int root) \
+        except +
+    bool test_collective_gather(const device_resources &h, int root) except +
+    bool test_collective_gatherv(const device_resources &h, int root) except +
+    bool test_collective_reducescatter(const device_resources &h, int root) \
+        except +
+    bool test_pointToPoint_simple_send_recv(const device_resources &h,
                                             int numTrials) except +
-    bool test_pointToPoint_device_send_or_recv(const handle_t &h,
+    bool test_pointToPoint_device_send_or_recv(const device_resources &h,
                                                int numTrials) except +
-    bool test_pointToPoint_device_sendrecv(const handle_t &h,
+    bool test_pointToPoint_device_sendrecv(const device_resources &h,
                                            int numTrials) except +
-    bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
+    bool test_pointToPoint_device_multicast_sendrecv(const device_resources &h,
                                                      int numTrials) except +
-    bool test_commsplit(const handle_t &h, int n_colors) except +
+    bool test_commsplit(const device_resources &h, int n_colors) except +
 
 
 def perform_test_comms_allreduce(handle, root):
@@ -76,7 +84,8 @@ def perform_test_comms_allreduce(handle, root):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_allreduce(deref(h), root)
 
 
@@ -89,7 +98,8 @@ def perform_test_comms_reduce(handle, root):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_reduce(deref(h), root)
 
 
@@ -102,7 +112,8 @@ def perform_test_comms_reducescatter(handle, root):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_reducescatter(deref(h), root)
 
 
@@ -115,7 +126,8 @@ def perform_test_comms_bcast(handle, root):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_broadcast(deref(h), root)
 
 
@@ -128,7 +140,8 @@ def perform_test_comms_allgather(handle, root):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_allgather(deref(h), root)
 
 
@@ -143,7 +156,8 @@ def perform_test_comms_gather(handle, root):
     root : int
            Rank of the root worker
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_gather(deref(h), root)
 
 
@@ -158,7 +172,8 @@ def perform_test_comms_gatherv(handle, root):
     root : int
            Rank of the root worker
     """
-    cdef const handle_t* h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources* h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_collective_gatherv(deref(h), root)
 
 
@@ -173,7 +188,8 @@ def perform_test_comms_send_recv(handle, n_trials):
     n_trilas : int
                Number of test trials
     """
-    cdef const handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources *h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_pointToPoint_simple_send_recv(deref(h), <int>n_trials)
 
 
@@ -188,7 +204,8 @@ def perform_test_comms_device_send_or_recv(handle, n_trials):
     n_trilas : int
                Number of test trials
     """
-    cdef const handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources *h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_pointToPoint_device_send_or_recv(deref(h), <int>n_trials)
 
 
@@ -203,7 +220,8 @@ def perform_test_comms_device_sendrecv(handle, n_trials):
     n_trilas : int
                Number of test trials
     """
-    cdef const handle_t *h = <handle_t*><size_t>handle.getHandle()
+    cdef const device_resources *h = \
+        <device_resources*><size_t>handle.getHandle()
     return test_pointToPoint_device_sendrecv(deref(h), <int>n_trials)
 
 
@@ -218,7 +236,8 @@ def perform_test_comms_device_multicast_sendrecv(handle, n_trials):
     n_trilas : int
                Number of test trials
     """
-    cdef const handle_t *h = <handle_t *> <size_t> handle.getHandle()
+    cdef const device_resources *h = \
+        <device_resources *> <size_t> handle.getHandle()
     return test_pointToPoint_device_multicast_sendrecv(deref(h), <int>n_trials)
 
 
@@ -231,7 +250,8 @@ def perform_test_comm_split(handle, n_colors):
     handle : raft.common.Handle
              handle containing comms_t to use
     """
-    cdef const handle_t * h = < handle_t * > < size_t > handle.getHandle()
+    cdef const device_resources * h = \
+        < device_resources * > < size_t > handle.getHandle()
     return test_commsplit(deref(h), < int > n_colors)
 
 
@@ -254,7 +274,7 @@ def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose):
     """
 
     cdef size_t handle_size_t = <size_t>handle.getHandle()
-    handle_ = <handle_t*>handle_size_t
+    handle_ = <device_resources*>handle_size_t
 
     cdef size_t nccl_comm_size_t = <size_t>nccl_inst.get_comm()
     nccl_comm_ = <ncclComm_t*>nccl_comm_size_t
@@ -296,7 +316,7 @@ def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
     cdef void* ucp_worker_st = <void*><size_t>ucp_worker
 
     cdef size_t handle_size_t = <size_t>handle.getHandle()
-    handle_ = <handle_t*>handle_size_t
+    handle_ = <device_resources*>handle_size_t
 
     cdef size_t nccl_comm_size_t = <size_t>nccl_inst.get_comm()
     nccl_comm_ = <ncclComm_t*>nccl_comm_size_t
diff --git a/python/raft-dask/setup.cfg b/python/raft-dask/setup.cfg
index b005a7ab8f..e218f00c3e 100644
--- a/python/raft-dask/setup.cfg
+++ b/python/raft-dask/setup.cfg
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 [versioneer]
 VCS = git
@@ -46,4 +46,4 @@ skip=
 
 [options]
 packages = find:
-python_requires = >=3.7,<3.10
+python_requires = >=3.8,<3.11
diff --git a/python/raft-dask/setup.py b/python/raft-dask/setup.py
index 3171867928..5feddb626e 100644
--- a/python/raft-dask/setup.py
+++ b/python/raft-dask/setup.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,11 +26,11 @@
     "numpy",
     "numba>=0.49",
     "joblib>=0.11",
-    "dask-cuda==22.12",
-    "dask==2022.11.1",
-    f"ucx-py{cuda_suffix}",
-    "distributed==2022.11.1",
-    f"pylibraft{cuda_suffix}",
+    "dask-cuda==23.2.*",
+    "dask==2023.1.1",
+    f"ucx-py{cuda_suffix}==0.30.*",
+    "distributed==2023.1.1",
+    f"pylibraft{cuda_suffix}==23.2.*",
 ]
 
 extras_require = {
@@ -73,6 +73,7 @@ def get_versions():
         "Programming Language :: Python",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
     author="NVIDIA Corporation",
     include_package_data=True,
diff --git a/python/raft-dask/setuputils.py b/python/raft-dask/setuputils.py
deleted file mode 100755
index 9370d29876..0000000000
--- a/python/raft-dask/setuputils.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import glob
-import os
-import shutil
-import sys
-
-
-def get_environment_option(name):
-    ENV_VARIABLE = os.environ.get(name, False)
-
-    if not ENV_VARIABLE:
-        print("-- " + name + " environment variable not set.")
-
-    else:
-        print("-- " + name + " detected with value: " + str(ENV_VARIABLE))
-
-    return ENV_VARIABLE
-
-
-def get_cli_option(name):
-    if name in sys.argv:
-        print("-- Detected " + str(name) + " build option.")
-        return True
-
-    else:
-        return False
-
-
-def clean_folder(path):
-    """
-    Function to clean all Cython and Python artifacts and cache folders. It
-    clean the folder as well as its direct children recursively.
-
-    Parameters
-    ----------
-    path : String
-        Path to the folder to be cleaned.
-    """
-    shutil.rmtree(path + "/__pycache__", ignore_errors=True)
-
-    folders = glob.glob(path + "/*/")
-    for folder in folders:
-        shutil.rmtree(folder + "/__pycache__", ignore_errors=True)
-
-        clean_folder(folder)
-
-        cython_exts = glob.glob(folder + "/*.cpp")
-        cython_exts.extend(glob.glob(folder + "/*.cpython*"))
-        for file in cython_exts:
-            os.remove(file)
diff --git a/thirdparty/LICENSES/LICENSE.faiss b/thirdparty/LICENSES/LICENSE.faiss
new file mode 100644
index 0000000000..87cbf536c6
--- /dev/null
+++ b/thirdparty/LICENSES/LICENSE.faiss
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file