diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 536537f07f..27f619f391 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 92e7613a9b..db2a5dbcc6 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,27 +5,27 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json similarity index 84% rename from .devcontainer/cuda12.2-conda/devcontainer.json rename to .devcontainer/cuda12.5-conda/devcontainer.json index 948680eaf6..836a5feacd 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -3,24 +3,24 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - 
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json similarity index 80% rename from .devcontainer/cuda12.2-pip/devcontainer.json rename to .devcontainer/cuda12.5-pip/devcontainer.json index cd287569d8..28798cbbf5 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -3,36 +3,36 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { - "version": "12.2", + "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", @@ -40,7 +40,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d1cc52592c..a7f455e200 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -17,7 +17,7 @@ build.sh @rapidsai/raft-cmake-codeowners /.pre-commit-config.yaml @rapidsai/ci-codeowners #packaging code owners -/.devcontainers/ @rapidsai/packaging-codeowners +/.devcontainer/ @rapidsai/packaging-codeowners /conda/ @rapidsai/packaging-codeowners /dependencies.yaml @rapidsai/packaging-codeowners /build.sh @rapidsai/packaging-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e013d4f1c5..e6f7043f82 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} 
@@ -88,7 +88,7 @@ jobs: package-name: pylibraft wheel-build-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2d9556859..e6c9604221 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,37 +77,37 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 
with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 18094cc05a..0eba0f27d1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: _ZN\d+raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2b89948ec1..692fce5e16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -94,17 +94,12 @@ repos: additional_dependencies: [tomli] args: ["--toml", "pyproject.toml"] exclude: (?x)^(^CHANGELOG.md$) - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: check-json - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright files: | @@ -120,6 +115,13 @@ repos: cpp/include/raft/thirdparty/| docs/source/sphinxext/github_link[.]py| 
cpp/cmake/modules/FindAVX[.]cmake + - id: verify-alpha-spec + args: ["--fix", "--mode=release"] + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index e0599dae8a..0685145dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,53 @@ +# raft 24.08.00 (7 Aug 2024) + +## 🚨 Breaking Changes + +- [Refactor] move `popc` to under util ([#2394](https://github.com/rapidsai/raft/pull/2394)) [@rhdong](https://github.com/rhdong) +- [Opt] Expose the `detail::popc` as public API ([#2346](https://github.com/rapidsai/raft/pull/2346)) [@rhdong](https://github.com/rhdong) + +## 🐛 Bug Fixes + +- Add timeout to UCXX generic operations ([#2398](https://github.com/rapidsai/raft/pull/2398)) [@pentschev](https://github.com/pentschev) +- [Fix] bitmap set/test issue ([#2371](https://github.com/rapidsai/raft/pull/2371)) [@rhdong](https://github.com/rhdong) +- Fix 0 recall issue in `raft_cagra_hnswlib` ANN benchmark ([#2369](https://github.com/rapidsai/raft/pull/2369)) [@divyegala](https://github.com/divyegala) +- Fix `ef` setting in HNSW wrapper ([#2367](https://github.com/rapidsai/raft/pull/2367)) [@divyegala](https://github.com/divyegala) +- Fix cagra graph opt bug ([#2365](https://github.com/rapidsai/raft/pull/2365)) [@enp1s0](https://github.com/enp1s0) +- Fix a bug where the wrong API is used to free the memory ([#2361](https://github.com/rapidsai/raft/pull/2361)) [@PointKernel](https://github.com/PointKernel) +- Allow anonymous user in devcontainer name ([#2355](https://github.com/rapidsai/raft/pull/2355)) [@bdice](https://github.com/bdice) +- Fix compilation error when _CLK_BREAKDOWN is defined in cagra. 
([#2350](https://github.com/rapidsai/raft/pull/2350)) [@jiangyinzuo](https://github.com/jiangyinzuo) +- ensure raft-dask wheel tests install pylibraft wheel from the same CI run, fix wheel dependencies ([#2349](https://github.com/rapidsai/raft/pull/2349)) [@jameslamb](https://github.com/jameslamb) +- Change --config-setting to --config-settings ([#2342](https://github.com/rapidsai/raft/pull/2342)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add workaround for syevd in CUDA 12.0 ([#2332](https://github.com/rapidsai/raft/pull/2332)) [@lowener](https://github.com/lowener) + +## 🚀 New Features + +- [FEA] add the support of `masked_matmul` ([#2362](https://github.com/rapidsai/raft/pull/2362)) [@rhdong](https://github.com/rhdong) +- [FEA] Dice Distance for Dense Inputs ([#2359](https://github.com/rapidsai/raft/pull/2359)) [@aamijar](https://github.com/aamijar) +- [Opt] Expose the `detail::popc` as public API ([#2346](https://github.com/rapidsai/raft/pull/2346)) [@rhdong](https://github.com/rhdong) +- Enable distance return for NN Descent ([#2345](https://github.com/rapidsai/raft/pull/2345)) [@jinsolp](https://github.com/jinsolp) + +## 🛠️ Improvements + +- [Refactor] move `popc` to under util ([#2394](https://github.com/rapidsai/raft/pull/2394)) [@rhdong](https://github.com/rhdong) +- split up CUDA-suffixed dependencies in dependencies.yaml ([#2388](https://github.com/rapidsai/raft/pull/2388)) [@jameslamb](https://github.com/jameslamb) +- Use workflow branch 24.08 again ([#2385](https://github.com/rapidsai/raft/pull/2385)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add cusparseSpMV_preprocess to cusparse wrapper ([#2384](https://github.com/rapidsai/raft/pull/2384)) [@Kh4ster](https://github.com/Kh4ster) +- Consolidate SUM reductions ([#2381](https://github.com/rapidsai/raft/pull/2381)) [@mfoerste4](https://github.com/mfoerste4) +- Use slicing kernel to copy distances inside NN Descent ([#2380](https://github.com/rapidsai/raft/pull/2380)) [@jinsolp](https://github.com/jinsolp) +- Build and test with CUDA 12.5.1 ([#2378](https://github.com/rapidsai/raft/pull/2378)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add CUDA_STATIC_MATH_LIBRARIES ([#2376](https://github.com/rapidsai/raft/pull/2376)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- skip CMake 3.30.0 ([#2375](https://github.com/rapidsai/raft/pull/2375)) [@jameslamb](https://github.com/jameslamb) +- Use verify-alpha-spec hook ([#2373](https://github.com/rapidsai/raft/pull/2373)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Binarize Dice Distance for Dense Inputs ([#2370](https://github.com/rapidsai/raft/pull/2370)) [@aamijar](https://github.com/aamijar) +- [FEA] Add distance epilogue for NN Descent ([#2364](https://github.com/rapidsai/raft/pull/2364)) [@jinsolp](https://github.com/jinsolp) +- resolve dependency-file-generator warning, other rapids-build-backend followup ([#2360](https://github.com/rapidsai/raft/pull/2360)) [@jameslamb](https://github.com/jameslamb) +- Remove text builds of documentation ([#2354](https://github.com/rapidsai/raft/pull/2354)) [@vyasr](https://github.com/vyasr) +- Use default init in reduction ([#2351](https://github.com/rapidsai/raft/pull/2351)) [@akifcorduk](https://github.com/akifcorduk) +- ensure update-version.sh preserves alpha spec, add tests on version constants ([#2344](https://github.com/rapidsai/raft/pull/2344)) [@jameslamb](https://github.com/jameslamb) +- remove unnecessary 'setuptools' dependencies 
([#2343](https://github.com/rapidsai/raft/pull/2343)) [@jameslamb](https://github.com/jameslamb) +- Use rapids-build-backend ([#2331](https://github.com/rapidsai/raft/pull/2331)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add FAISS with RAFT enabled Benchmarking to raft-ann-bench ([#2026](https://github.com/rapidsai/raft/pull/2026)) [@tarang-jain](https://github.com/tarang-jain) + # raft 24.06.00 (5 Jun 2024) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index fc56859557..d6f3ef9320 100755 --- a/README.md +++ b/README.md @@ -281,19 +281,19 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers ``` ```bash -# for CUDA 12.0 -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 +# for CUDA 12.5 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.5 ``` Note that the above commands will also install `libraft-headers` and `libraft`. You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project: ```bash -# for CUDA 12.0 -mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 +# for CUDA 12.5 +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5 ``` -If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. 
### Installing Python through Pip @@ -369,7 +369,7 @@ If citing the k-selection routines, please consider the following bibtex: isbn = {9798400701092}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, - location = {Denver, CO, USA} + location = {Denver, CO, USA}, series = {SC '23} } ``` diff --git a/VERSION b/VERSION index 0bff6981a3..ec8489fda9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/build.sh b/build.sh index 148d23c9c1..a77dd188f4 100755 --- a/build.sh +++ b/build.sh @@ -496,18 +496,18 @@ fi # Build and (optionally) install the pylibraft Python package if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/pylibraft fi # Build and (optionally) install the raft-dask Python package if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/raft-dask fi # Build and (optionally) install the raft-ann-bench Python package if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-ann-bench -vvv + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/raft-ann-bench -vvv fi if hasArg docs; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 2778c2a7d7..c456bcae80 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -13,10 +13,8 @@ export CMAKE_GENERATOR=Ninja rapids-print-env -version=$(rapids-generate-version) - rapids-logger "Begin cpp build" -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild conda/recipes/libraft +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libraft rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 9605b52f8b..a2447f5f06 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -8,7 +8,7 @@ rapids-logger "Create test conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key docs \ + --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n docs @@ -41,10 +41,8 @@ popd rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml source _html -sphinx-build -b text source _text -mkdir -p "${RAPIDS_DOCS_DIR}/raft/"{html,txt} +mkdir -p "${RAPIDS_DOCS_DIR}/raft/"html mv _html/* "${RAPIDS_DOCS_DIR}/raft/html" -mv _text/* "${RAPIDS_DOCS_DIR}/raft/txt" popd rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index a8b76269ae..80d37b5ae3 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -22,12 +22,6 @@ git_commit=$(git rev-parse HEAD) export RAPIDS_PACKAGE_VERSION=${version} echo "${version}" > VERSION -package_dir="python" -for package_name in pylibraft raft-dask; do - underscore_package_name=$(echo "${package_name}" | tr "-" "_") - sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/${underscore_package_name}/_version.py" -done - # TODO: Remove `--no-test` flags once importing on a CPU # node works 
correctly rapids-conda-retry mambabuild \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index e3e7ce9c89..62d93a668e 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -14,47 +14,9 @@ rm -rf /usr/lib64/libuc* source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -git_commit=$(git rev-parse HEAD) - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# This is the version of the suffix with a preceding hyphen. It's used -# everywhere except in the final wheel name. -PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" - -# Patch project metadata files to include the CUDA version suffix and version override. -pyproject_file="${package_dir}/pyproject.toml" -version_file="${package_dir}/${underscore_package_name}/_version.py" - -sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} -echo "${version}" > VERSION -sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file} - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -if [[ ${package_name} == "raft-dask" ]]; then - sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} -else - sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} -fi - -if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} -fi +rapids-generate-version > VERSION cd "${package_dir}" diff --git a/ci/check_style.sh b/ci/check_style.sh index d7baa88e8f..d7ba4cae25 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n checks diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9554a7dde8..194ad9a07b 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -56,21 +56,21 @@ DEPENDENCIES=( ) for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; done - sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* ucx-py-cu12==/ 
s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; done for FILE in python/*/pyproject.toml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done - sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index f83ddf616d..05323e4f5d 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -11,7 +11,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_cpp \ + --file-key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/ci/test_python.sh b/ci/test_python.sh index 59da1f0bc4..01e5ac9456 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -11,7 +11,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index bd531e7e85..9b1187592d 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -11,7 +11,8 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl -python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/ +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install -v "$(echo ./dist/raft_dask_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" test_dir="python/raft-dask/raft_dask/test" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml 
b/conda/environments/all_cuda-118_arch-aarch64.yaml index 590c3eb68b..0728f1257d 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,16 +44,18 @@ dependencies: - nvcc_linux-aarch64=11.8 - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.38.* +- ucx-py==0.39.* name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 00ed8fa65e..a46ace8a29 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,16 +44,18 @@ dependencies: - nvcc_linux-64=11.8 - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.38.* +- ucx-py==0.39.* name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml similarity index 76% rename from conda/environments/all_cuda-122_arch-aarch64.yaml rename to conda/environments/all_cuda-125_arch-aarch64.yaml index f1f346706d..1f7604a5b5 100644 --- a/conda/environments/all_cuda-122_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -11,18 +11,18 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - 
libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,16 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.38.* -name: all_cuda-122_arch-aarch64 +- ucx-py==0.39.* +name: all_cuda-125_arch-aarch64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 76% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index 505a4f1a97..576666e2f4 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,18 +11,18 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,16 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.38.* -name: all_cuda-122_arch-x86_64 +- ucx-py==0.39.* +name: all_cuda-125_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 7315f82c13..1116bbb971 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 @@ -30,7 +30,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -39,7 +39,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index ff973acc0c..85121af42f 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 @@ -30,7 +30,7 @@ dependencies: - 
libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -39,7 +39,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 056550fc07..7abf661289 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -35,7 +35,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 41a48f4a12..e9f299cf67 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -35,7 +35,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index bb9c715e3a..00b133c821 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index e3ca633eb9..001878ff25 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -17,4 +17,4 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index cbeaec3b55..31086e30aa 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -55,7 +55,7 @@ requirements: - python x.x - rmm ={{ minor_version }} - scikit-build-core >=0.7.0 - - setuptools + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml index 4de3b98f48..70d1f0490e 100644 --- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml @@ -11,7 +11,7 @@ 
c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml index d0748fdb16..94f7102726 100644 --- a/conda/recipes/raft-ann-bench-cpu/meta.yaml +++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml @@ -53,6 +53,7 @@ requirements: - python - pyyaml - pandas + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - glog {{ glog_version }} diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index cf025a06a4..db0083b583 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index 8a6a3d033d..d6aeb5f860 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -82,6 +82,7 @@ requirements: - pyyaml # rmm is needed to determine if package is gpu-enabled - rmm ={{ minor_version }} + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index b157e41753..decd1fad18 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -17,10 +17,10 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.38.*" + - "0.39.*" ucxx_version: - - "0.38.*" + - "0.39.*" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index af22c8853e..641a11a241 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -55,9 +55,9 @@ requirements: - python x.x - rmm ={{ minor_version }} - scikit-build-core >=0.7.0 - - setuptools - ucx-py {{ ucx_py_version }} - ucxx {{ ucxx_version }} + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: {% if cuda_major == "11" %} - cudatoolkit diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 39472cae67..d7eeb60b27 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -59,7 +59,8 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) +option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF) option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) @@ -105,7 +106,10 @@ message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELIN message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}") message(VERBOSE - "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}" + "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}" +) +message(VERBOSE + "RAFT: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}" ) # Set RMM logging level @@ -135,7 
+139,7 @@ endif() # * compiler options ---------------------------------------------------------- set(_ctk_static_suffix "") -if(CUDA_STATIC_RUNTIME) +if(CUDA_STATIC_MATH_LIBRARIES) set(_ctk_static_suffix "_static") endif() @@ -302,6 +306,8 @@ if(RAFT_COMPILE_LIBRARY) src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index f489cc62c6..35df378438 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -58,10 +58,6 @@ if(BUILD_CPU_ONLY) set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) set(RAFT_ANN_BENCH_USE_GGNN OFF) -elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) - # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled. - # https://github.com/rapidsai/raft/issues/1627 - set(RAFT_FAISS_ENABLE_GPU OFF) endif() set(RAFT_ANN_BENCH_USE_RAFT OFF) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 8762ccd1fe..185d54a0a3 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -459,8 +459,14 @@ void register_search(std::shared_ptr> dataset, */ ->MeasureProcessCPUTime() ->UseRealTime(); - - if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); } + if (metric_objective == Objective::THROUGHPUT) { + if (index.algo.find("faiss_gpu") != std::string::npos) { + log_warn( + "FAISS GPU does not work in throughput mode because the underlying " + "StandardGpuResources object is not thread-safe. 
This will cause unexpected results"); + } + b->ThreadRange(threads[0], threads[1]); + } } } } diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp index 965522d929..234b33d80a 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp +++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp @@ -51,10 +51,10 @@ void parse_build_param(const nlohmann::json& conf, { parse_base_build_param(conf, param); param.M = conf.at("M"); - if (conf.contains("usePrecomputed")) { - param.usePrecomputed = conf.at("usePrecomputed"); + if (conf.contains("use_precomputed_table")) { + param.use_precomputed_table = conf.at("use_precomputed_table"); } else { - param.usePrecomputed = false; + param.use_precomputed_table = false; } if (conf.contains("bitsPerCode")) { param.bitsPerCode = conf.at("bitsPerCode"); diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 3caca15b7f..c7ce4595b5 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -229,7 +229,7 @@ class FaissCpuIVFPQ : public FaissCpu { struct BuildParam : public FaissCpu::BuildParam { int M; int bitsPerCode; - bool usePrecomputed; + bool use_precomputed_table; }; FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu index c5056cb364..b47c497e3d 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -45,6 +45,11 @@ void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) { parse_base_build_param(conf, param); + if (conf.contains("use_raft")) { + param.use_raft = conf.at("use_raft"); + } else { + param.use_raft = false; + } } template @@ -63,6 +68,16 @@ void parse_build_param(const nlohmann::json& conf, } else { param.useFloat16 = false; } + if (conf.contains("use_raft")) { + param.use_raft = conf.at("use_raft"); + } else { + param.use_raft = false; + } + if (conf.contains("bitsPerCode")) { + param.bitsPerCode = conf.at("bitsPerCode"); + } else { + param.bitsPerCode = 8; + } } template @@ -160,5 +175,18 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + // and is initially sized to half of free device memory. 
+ rmm::mr::pool_memory_resource pool_mr{ + &cuda_mr, rmm::percent_of_free_device_memory(50)}; + // Updates the current device resource pointer to `pool_mr` + auto old_mr = rmm::mr::set_current_device_resource(&pool_mr); + auto ret = raft::bench::ann::run_main(argc, argv); + // Restores the current device resource pointer to its previous value + rmm::mr::set_current_device_resource(old_mr); + return ret; +} #endif diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 2effe631e5..6955201c5d 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -17,15 +17,29 @@ #define FAISS_WRAPPER_H_ #include "../common/ann_types.hpp" +#include "../raft/raft_ann_bench_utils.h" +#include +#include +#include +#include #include +#include +#include #include +#include + +#include +#include +#include + #include #include #include #include #include +#include #include #include #include @@ -43,7 +57,7 @@ namespace { -faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +faiss::MetricType parse_metric_faiss(raft::bench::ann::Metric metric) { if (metric == raft::bench::ann::Metric::kInnerProduct) { return faiss::METRIC_INNER_PRODUCT; @@ -95,7 +109,7 @@ class FaissGpu : public ANN, public AnnGPU { FaissGpu(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim), gpu_resource_{std::make_shared()}, - metric_type_(parse_metric_type(metric)), + metric_type_(parse_metric_faiss(metric)), nlist_{param.nlist}, training_sample_fraction_{1.0 / double(param.ratio)} { @@ -127,7 +141,7 @@ class FaissGpu : public ANN, public AnnGPU { AlgoProperty property; // to enable building big dataset which is larger than GPU memory property.dataset_memory_type = MemoryType::Host; - property.query_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; return property; } @@ -162,8 +176,10 @@ class FaissGpu : public ANN, public AnnGPU { int device_; double training_sample_fraction_; std::shared_ptr search_params_; + std::shared_ptr refine_search_params_{nullptr}; const T* dataset_; float refine_ratio_ = 1.0; + Objective metric_objective_; }; template @@ -201,19 +217,65 @@ template void FaissGpu::search( const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { + ASSERT(Objective::LATENCY, "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); + using IdxT = faiss::idx_t; static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); - if (this->refine_ratio_ > 1.0) { - // TODO: FAISS changed their search APIs to accept the search parameters as a struct object - // but their refine API doesn't allow the struct to be passed in. 
Once this is fixed, we - // need to re-enable refinement below - // index_refine_->search(batch_size, queries, k, distances, - // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: - // https://github.com/facebookresearch/faiss/issues/3118 - throw std::runtime_error( - "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " - "benchmarks for the time being."); + if (refine_ratio_ > 1.0) { + if (raft::get_device_for_address(queries) >= 0) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto distances_tmp = raft::make_device_matrix( + gpu_resource_->getRaftHandle(device_), batch_size, k0); + auto candidates = + raft::make_device_matrix(gpu_resource_->getRaftHandle(device_), batch_size, k0); + index_->search(batch_size, + queries, + k0, + distances_tmp.data_handle(), + candidates.data_handle(), + this->search_params_.get()); + + auto queries_host = raft::make_host_matrix(batch_size, index_->d); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + auto dataset_v = raft::make_host_matrix_view( + this->dataset_, index_->ntotal, index_->d); + + raft::device_resources handle_ = gpu_resource_->getRaftHandle(device_); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream()); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + handle_.get_stream()); + + // wait for the queries to copy to host in 'stream` + handle_.sync_stream(); + + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + parse_metric_type(this->metric_)); + + raft::copy(neighbors, + (size_t*)neighbors_host.data_handle(), + neighbors_host.size(), + handle_.get_stream()); + raft::copy( + distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); + } else { + index_refine_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->refine_search_params_.get()); + } } else { index_->search(batch_size, queries, @@ -255,13 +317,16 @@ void FaissGpu::load_(const std::string& file) template class FaissGpuIVFFlat : public FaissGpu { public: - using typename FaissGpu::BuildParam; + struct BuildParam : public FaissGpu::BuildParam { + bool use_raft; + }; FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) { faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = this->device_; - this->index_ = std::make_shared( + config.device = this->device_; + config.use_raft = param.use_raft; + this->index_ = std::make_shared( this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config); } @@ -295,6 +360,8 @@ class FaissGpuIVFPQ : public FaissGpu { int M; bool useFloat16; bool usePrecomputed; + bool use_raft; + int bitsPerCode; }; FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) @@ -302,16 +369,17 @@ class FaissGpuIVFPQ : public FaissGpu { faiss::gpu::GpuIndexIVFPQConfig config; config.useFloat16LookupTables = param.useFloat16; config.usePrecomputedTables = param.usePrecomputed; + config.use_raft = param.use_raft; + config.interleavedLayout = param.use_raft; config.device = this->device_; - this->index_ = - std::make_shared(this->gpu_resource_.get(), - dim, - param.nlist, - param.M, - 8, // FAISS only supports 
bitsPerCode=8 - this->metric_type_, - config); + this->index_ = std::make_shared(this->gpu_resource_.get(), + dim, + param.nlist, + param.M, + param.bitsPerCode, + this->metric_type_, + config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override @@ -329,6 +397,11 @@ class FaissGpuIVFPQ : public FaissGpu { this->index_refine_ = std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; + faiss::IndexRefineSearchParameters faiss_refine_search_params; + faiss_refine_search_params.k_factor = this->index_refine_.get()->k_factor; + faiss_refine_search_params.base_index_params = this->search_params_.get(); + this->refine_search_params_ = + std::make_unique(faiss_refine_search_params); } } @@ -385,6 +458,11 @@ class FaissGpuIVFSQ : public FaissGpu { this->index_refine_ = std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; + faiss::IndexRefineSearchParameters faiss_refine_search_params; + faiss_refine_search_params.k_factor = this->index_refine_.get()->k_factor; + faiss_refine_search_params.base_index_params = this->search_params_.get(); + this->refine_search_params_ = + std::make_unique(faiss_refine_search_params); } } diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h index 1c4b847d1a..1d2a1076ab 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -31,7 +31,7 @@ class RaftCagraHnswlib : public ANN, public AnnGPU { RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) : ANN(metric, dim), - cagra_build_{metric, dim, param, concurrent_searches}, + cagra_build_{metric, dim, param, concurrent_searches, true}, // HnswLib param values don't matter since we don't build with HnswLib hnswlib_search_{metric, dim, typename HnswLib::BuildParam{50, 100}} { diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 0b892dec35..b03f875a8e 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -72,11 +72,16 @@ class RaftCagra : public ANN, public AnnGPU { std::optional ivf_pq_search_params = std::nullopt; }; - RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) + RaftCagra(Metric metric, + int dim, + const BuildParam& param, + int concurrent_searches = 1, + bool shall_include_dataset = false) : ANN(metric, dim), index_params_(param), dimension_(dim), need_dataset_update_(true), + shall_include_dataset_(shall_include_dataset), dataset_(std::make_shared>( std::move(make_device_matrix(handle_, 0, 0)))), graph_(std::make_shared>( @@ -135,6 +140,7 @@ class RaftCagra : public ANN, public AnnGPU { float refine_ratio_; BuildParam index_params_; bool need_dataset_update_; + bool shall_include_dataset_; raft::neighbors::cagra::search_params search_params_; std::shared_ptr> index_; int dimension_; @@ -161,7 +167,7 @@ void RaftCagra::build(const T* dataset, size_t nrow) auto& params = index_params_.cagra_params; // Do include the compressed dataset for the CAGRA-Q - bool shall_include_dataset = params.compression.has_value(); + bool include_dataset = params.compression.has_value() || shall_include_dataset_; index_ = std::make_shared>( std::move(raft::neighbors::cagra::detail::build(handle_, @@ -171,7 +177,7 @@ void RaftCagra::build(const T* dataset, 
size_t nrow) index_params_.ivf_pq_refine_rate, index_params_.ivf_pq_build_params, index_params_.ivf_pq_search_params, - shall_include_dataset))); + include_dataset))); } inline std::string allocator_to_string(AllocatorType mem_type) diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 0771a60e58..9d80cbaac2 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -75,7 +75,20 @@ endfunction() if(BUILD_PRIMS_BENCH) ConfigureBench( - NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp + NAME + CORE_BENCH + PATH + core/bitset.cu + core/copy.cu + main.cpp + ) + + ConfigureBench( + NAME + UTIL_BENCH + PATH + util/popc.cu + main.cpp ) ConfigureBench( @@ -111,6 +124,7 @@ if(BUILD_PRIMS_BENCH) PATH linalg/add.cu linalg/map_then_reduce.cu + linalg/masked_matmul.cu linalg/matrix_vector_op.cu linalg/norm.cu linalg/normalize.cu diff --git a/cpp/bench/prims/linalg/masked_matmul.cu b/cpp/bench/prims/linalg/masked_matmul.cu new file mode 100644 index 0000000000..eda9cb1710 --- /dev/null +++ b/cpp/bench/prims/linalg/masked_matmul.cu @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft::bench::linalg { + +template +struct MaskedMatmulBenchParams { + size_t m; + size_t k; + size_t n; + float sparsity; + value_t alpha = 1.0; + value_t beta = 0.0; +}; + +template +inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams& params) + -> std::ostream& +{ + os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n + << "\tsparsity=" << params.sparsity; + if (params.sparsity == 1.0) { os << "<-inner product for comparison"; } + return os; +} + +template +struct MaskedMatmulBench : public fixture { + MaskedMatmulBench(const MaskedMatmulBenchParams& p) + : fixture(true), + params(p), + handle(stream), + a_data_d(0, stream), + b_data_d(0, stream), + c_indptr_d(0, stream), + c_indices_d(0, stream), + c_data_d(0, stream), + bitmap_d(0, stream), + c_dense_data_d(0, stream) + { + index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8)); + std::vector bitmap_h(element); + + a_data_d.resize(params.m * params.k, stream); + b_data_d.resize(params.k * params.n, stream); + bitmap_d.resize(element, stream); + + raft::random::RngState rng(2024ULL); + raft::random::uniform( + handle, rng, a_data_d.data(), params.m * params.k, value_t(-1.0), value_t(1.0)); + raft::random::uniform( + handle, rng, b_data_d.data(), params.k * params.n, value_t(-1.0), value_t(1.0)); + + std::vector c_dense_data_h(params.m * params.n); + + c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + + std::vector values(c_true_nnz); + std::vector indices(c_true_nnz); + std::vector indptr(params.m + 1); + + c_data_d.resize(c_true_nnz, stream); + c_indptr_d.resize(params.m + 1, 
stream); + c_indices_d.resize(c_true_nnz, stream); + c_dense_data_d.resize(params.m * params.n, stream); + + cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr); + RAFT_EXPECTS(c_true_nnz == c_indices_d.size(), + "Something wrong. The c_true_nnz != c_indices_d.size()!"); + + update_device(c_data_d.data(), values.data(), c_true_nnz, stream); + update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream); + update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream); + update_device(bitmap_d.data(), bitmap_h.data(), element, stream); + } + + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; + index_t bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void cpu_convert_to_csr(std::vector& bitmap, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitmap_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitmap[index / (8 * sizeof(bitmap_t))]; + bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + ~MaskedMatmulBench() {} + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + auto a = raft::make_device_matrix_view( + a_data_d.data(), params.m, params.k); + + auto b = raft::make_device_matrix_view( + b_data_d.data(), params.n, params.k); + + auto c_structure = raft::make_device_compressed_structure_view( + c_indptr_d.data(), + c_indices_d.data(), + params.m, + params.n, + static_cast(c_indices_d.size())); + + auto mask = + raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); + + auto c = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); + + if (params.sparsity < 1.0) { + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + raft::distance::pairwise_distance(handle, + a_data_d.data(), + b_data_d.data(), + c_dense_data_d.data(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + raft::distance::DistanceType::InnerProduct, + true); + } + resource::sync_stream(handle); + + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + resource::sync_stream(handle); + + loop_on_state(state, [this, &a, &b, &mask, &c]() { + if (params.sparsity < 1.0) { + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + raft::distance::pairwise_distance(handle, + a_data_d.data(), + b_data_d.data(), + c_dense_data_d.data(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + raft::distance::DistanceType::InnerProduct, + true); + } + resource::sync_stream(handle); + }); + 
} + + private: + const raft::device_resources handle; + MaskedMatmulBenchParams params; + + rmm::device_uvector a_data_d; + rmm::device_uvector b_data_d; + rmm::device_uvector bitmap_d; + + rmm::device_uvector c_dense_data_d; + + size_t c_true_nnz = 0; + rmm::device_uvector c_indptr_d; + rmm::device_uvector c_indices_d; + rmm::device_uvector c_data_d; +}; + +template +static std::vector> getInputs() +{ + std::vector> param_vec; + struct TestParams { + size_t m; + size_t k; + size_t n; + float sparsity; + }; + + const std::vector params_group = + raft::util::itertools::product({size_t(10), size_t(1024)}, + {size_t(128), size_t(1024)}, + {size_t(1024 * 1024)}, + {0.01f, 0.1f, 0.2f, 0.5f, 1.0f}); + + param_vec.reserve(params_group.size()); + for (TestParams params : params_group) { + param_vec.push_back( + MaskedMatmulBenchParams({params.m, params.k, params.n, params.sparsity})); + } + return param_vec; +} + +RAFT_BENCH_REGISTER((MaskedMatmulBench), "", getInputs()); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu new file mode 100644 index 0000000000..249dc13d1e --- /dev/null +++ b/cpp/bench/prims/util/popc.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +namespace raft::bench::core { + +template +struct PopcInputs { + index_t n_rows; + index_t n_cols; + float sparsity; +}; + +template +inline auto operator<<(std::ostream& os, const PopcInputs& params) -> std::ostream& +{ + os << params.n_rows << "#" << params.n_cols << "#" << params.sparsity; + return os; +} + +template +struct popc_bench : public fixture { + popc_bench(const PopcInputs& p) + : params(p), + n_element(raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8))), + bits_d{raft::make_device_vector(res, n_element)}, + nnz_actual_d{raft::make_device_scalar(res, 0)} + { + } + + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + std::vector bits_h(n_element); + auto stream = raft::resource::get_cuda_stream(res); + + create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + update_device(bits_d.data_handle(), bits_h.data(), bits_h.size(), stream); + + resource::sync_stream(res); + + loop_on_state(state, [this]() { + auto bits_view = + raft::make_device_vector_view(bits_d.data_handle(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + auto nnz_actual_view = + nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle()); + raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view); + }); + } + + private: + raft::resources res; + PopcInputs params; + index_t n_element; + + raft::device_vector bits_d; + raft::device_scalar nnz_actual_d; +}; + +template +const std::vector> popc_input_vecs{ + {2, 131072, 0.4}, {8, 131072, 0.5}, {16, 131072, 0.2}, {2, 8192, 0.4}, {16, 8192, 0.5}, + {128, 8192, 0.2}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, + {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, + {1024, 8192, 0.5}, {1024, 8192, 0.2}, + + {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}}; + +using PopcBenchI64 = popc_bench; + +RAFT_BENCH_REGISTER(PopcBenchI64, "", popc_input_vecs); + +} // namespace raft::bench::core diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 288da763bf..706b0c2f11 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -55,6 +55,7 @@ function(find_and_configure_faiss) EXCLUDE_FROM_ALL ${exclude} OPTIONS "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" + "FAISS_ENABLE_RAFT ${PKG_ENABLE_GPU}" 
"FAISS_ENABLE_PYTHON OFF" "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" @@ -115,4 +116,4 @@ endfunction() find_and_configure_faiss( BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU} -) +) \ No newline at end of file diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index cb1accc95e..c5d64f6a29 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -307,13 +307,16 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made if (worker->isProgressThreadRunning()) { - // Wait for a UCXX progress thread roundtrip + // Wait for a UCXX progress thread roundtrip, prevent waiting for longer + // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }); + worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, + 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }); + worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, + 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh index 2c23a77e47..cafd1977ab 100644 --- a/cpp/include/raft/core/bitmap.cuh +++ b/cpp/include/raft/core/bitmap.cuh @@ -39,7 +39,7 @@ _RAFT_HOST_DEVICE void bitmap_view::set(const index_t row, const index_t col, bool new_value) const { - set(row * cols_ + col, &new_value); + set(row * cols_ + col, new_value); } } // end namespace raft::core diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp index 5c77866164..86b2d77478 100644 --- a/cpp/include/raft/core/bitmap.hpp +++ b/cpp/include/raft/core/bitmap.hpp @@ -41,6 +41,9 @@ namespace raft::core { */ template struct bitmap_view : public bitset_view { + using bitset_view::set; + using bitset_view::test; + static_assert((std::is_same::type, uint32_t>::value || std::is_same::type, uint64_t>::value), "The bitmap_t must be uint32_t or uint64_t."); diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index d7eedee92e..0cdb4c1fb6 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include #include @@ -167,9 +167,10 @@ template void bitset::count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) { + auto max_len = raft::make_host_scalar_view(&bitset_len_); auto values = raft::make_device_vector_view(bitset_.data(), n_elements()); - raft::detail::popc(res, values, bitset_len_, count_gpu_scalar); + raft::popc(res, values, max_len, count_gpu_scalar); } } // end namespace raft::core diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp index 9058f4847d..5a1968b529 100644 --- a/cpp/include/raft/core/cusparse_macros.hpp +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -34,7 +34,8 @@ // // (i.e., before including this header) // -#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) +#define 
CUDA_VER_10_1_UP (CUDART_VERSION >= 10010) +#define CUDA_VER_12_4_UP (CUDART_VERSION >= 12040) namespace raft { @@ -59,7 +60,7 @@ namespace detail { inline const char* cusparse_error_to_string(cusparseStatus_t err) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10010 return cusparseGetErrorString(err); #else // CUDART_VERSION switch (err) { diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 9ce768bf40..e082aaf41a 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -56,7 +56,11 @@ constexpr RAFT_INLINE_FUNCTION auto abs(T x) !std::is_same_v, T> { - return x < T{0} ? -x : x; + if constexpr (std::is_unsigned_v) { + return x; + } else { + return x < T{0} ? -x : x; + } } #if defined(_RAFT_HAS_CUDA) template diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index a5c8c0ef4b..a39dbf6700 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -55,6 +55,7 @@ using distance_tag = std::integral_constant; * - DistanceType::Canberra: * - DistanceType::CorrelationExpanded: * - DistanceType::CosineExpanded: + * - DistanceType::DiceExpanded: * - DistanceType::HammingUnexpanded: * - DistanceType::HellingerExpanded: * - DistanceType::JensenShannon: @@ -238,6 +239,61 @@ void distance_impl(raft::resources const& handle, distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // unused +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT(!(worksize < (m + n) * sizeof(AccT)), "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + DataT* x_norm = workspace; + DataT* y_norm = workspace; + // TODO: Column major case looks to have lower accuracy for X == Y, + // perhaps the use of stridedSummationKernel could be causing this, + // need to investigate and fix. 
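  // For DiceExpanded the "norms" are per-row nonzero counts (raft::nz_op
  // reduced with raft::add_op). When x == y in row-major layout, a single
  // reduction over max(m, n) rows fills the shared workspace used for both
  // x_norm and y_norm; otherwise y_norm is offset by m and the two operands
  // are reduced separately.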
+ if (x == y && is_row_major) { + raft::linalg::reduce(x_norm, + x, + k, + std::max(m, n), + (AccT)0, + is_row_major, + true, + stream, + false, + raft::nz_op(), + raft::add_op()); + } else { + y_norm += m; + raft::linalg::reduce( + x_norm, x, k, m, (AccT)0, is_row_major, true, stream, false, raft::nz_op(), raft::add_op()); + raft::linalg::reduce( + y_norm, y, k, n, (AccT)0, is_row_major, true, stream, false, raft::nz_op(), raft::add_op()); + } + + ops::dice_distance_op distance_op{}; + pairwise_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + template void distance_impl(raft::resources const& handle, distance_tag distance_type, @@ -794,9 +850,11 @@ template size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = (distanceType <= raft::distance::DistanceType::CosineExpanded) || - (distanceType == raft::distance::DistanceType::CorrelationExpanded); + size_t worksize = 0; + constexpr bool is_allocated = + (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded) || + (distanceType == raft::distance::DistanceType::DiceExpanded); constexpr int numOfBuffers = (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; diff --git a/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh b/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh index 3e8f4e86fb..84eb3c705b 100644 --- a/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/distance/detail/distance_ops/dice.cuh b/cpp/include/raft/distance/detail/distance_ops/dice.cuh new file mode 100644 index 0000000000..362ba7eab7 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/dice.cuh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include // DI + +namespace raft::distance::detail::ops { + +// Epilogue operator for CUTLASS based kernel +template +struct dice_cutlass_op { + __device__ dice_cutlass_op() noexcept {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + return static_cast(1.0) - static_cast(2 * accVal / (aNorm + bNorm)); + } + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + +/** + * @brief the expanded dice distance matrix calculation + * + * It computes the following equation: + * + * d(x, y) = 1 - 2*(x ⋅ y) / ( Σ(x) + Σ(y) ) + */ +template +struct dice_distance_op { + using DataT = DataType; + using AccT = AccType; + using IdxT = IdxType; + + // Load norms of input data + static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + static constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += (x != DataT(0) ? DataT(1) : DataT(0)) * (y != DataT(0) ? DataT(1) : DataT(0)); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = 1.0 - (2 * acc[i][j] / (regxn[i] + regyn[j])); + } + } + } + + constexpr dice_cutlass_op get_cutlass_op() const + { + return dice_cutlass_op(); + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh index e1dc6f9b37..bced721ec8 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -120,6 +120,10 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, double, double, double, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh index a634e8c995..2d41e029fe 100644 --- a/cpp/include/raft/distance/distance-ext.cuh +++ b/cpp/include/raft/distance/distance-ext.cuh @@ -204,6 +204,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( @@ -286,6 +290,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -362,6 +370,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -429,6 +441,10 @@ instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); 
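// With the DiceExpanded instantiations above, the new metric is reachable
// through the same runtime entry point exercised elsewhere in this diff.
// A minimal usage sketch; here x, y, out, m, n and k are hypothetical
// row-major device buffers and extents, not names taken from this patch:
//
//   raft::device_resources handle;
//   raft::distance::pairwise_distance(handle,
//                                     x,    // [m, k] device pointer
//                                     y,    // [n, k] device pointer
//                                     out,  // [m, n] device pointer
//                                     m,
//                                     n,
//                                     k,
//                                     raft::distance::DistanceType::DiceExpanded,
//                                     true /* row-major */);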
instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( @@ -547,6 +563,22 @@ instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineE double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, float, float, @@ -822,6 +854,22 @@ instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, float, float, diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh index 647c5b2908..13c9d57efd 100644 --- a/cpp/include/raft/distance/distance-inl.cuh +++ b/cpp/include/raft/distance/distance-inl.cuh @@ -306,6 +306,9 @@ void pairwise_distance(raft::resources const& handle, case DistanceType::RusselRaoExpanded: dispatch(std::integral_constant{}); break; + case DistanceType::DiceExpanded: + dispatch(std::integral_constant{}); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 9f3be7ce0e..9680cbc636 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -42,6 +42,18 @@ struct ReductionThinPolicy { static constexpr bool NoSequentialReduce = noLoop; }; +template +DI void KahanBabushkaNeumaierSum(Type& sum, Type& compensation, const Type& cur_value) +{ + const Type t = sum + cur_value; + if (abs(sum) >= abs(cur_value)) { + compensation += (sum - t) + cur_value; + } else { + compensation += (cur_value - t) + sum; + } + sum = t; +} + template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. 
+ * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). + */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; + + OutType acc[Policy::RowsPerLogicalWarp]; + OutType thread_c[Policy::RowsPerLogicalWarp]; + +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; + thread_c[k] = 0; + } + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). + * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. + */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, raft::add_op()); + + raft::logicalWarpReduceVector( + thread_c, threadIdx.x, raft::add_op()); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; + if (inplace) { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(dots[i] + acc[k] + thread_c[k]); } + } + } else { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k] + thread_c[k]); } + } + } + } +} + template (Policy::NoSequentialReduce)); dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - coalescedReductionThinKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumThinKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionThinKernel<<>>( + dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -240,6 +350,44 @@ RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, } } +template +RAFT_KERNEL __launch_bounds__(TPB) 
coalescedSumMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(dots[blockIdx.x] + block_acc + block_c); + } else { + dots[blockIdx.x] = final_op(block_acc + block_c); + } + } +} + template fun_scope("coalescedReductionMedium<%d>", TPB); - coalescedReductionMediumKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumMediumKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -322,6 +475,32 @@ RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } } +template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThickKernel( + OutType* buffer, const InType* data, IdxType D, IdxType N, OutType init, MainLambda main_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = block_acc + block_c; + } +} + template - <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + if constexpr (std::is_same_v) { + coalescedSumThickKernel + <<>>(buffer.data(), data, D, N, init, main_op); + } else { + coalescedReductionThickKernel + <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); coalescedReductionThin(dots, @@ -391,18 +574,16 @@ void coalescedReductionThickDispatcher(OutType* dots, { // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling - if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. 
reduce along // rows for row major or reduce along columns for column major layout. Can do an inplace reduction // adding to original values of dots if requested. +// In case of an add-reduction, a compensated summation will be performed in order to reduce +// numerical error. Note that the compensation will only be performed 'per-thread' for performance +// reasons and therefore not be equivalent to a sequential compensation. template = IdxType(4) * numSMs) { + if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (N < numSMs && D >= IdxType(16384)) { + } else if (N < numSMs && D >= IdxType(1 << 17)) { coalescedReductionThickDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 2a4cfd52ec..ba7ed3dcdf 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -19,10 +19,12 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include #include +#include #include #include @@ -90,7 +92,19 @@ void eigDC(raft::resources const& handle, { #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); + return; +#endif + +#if CUDART_VERSION <= 12040 + // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + rmm::cuda_stream stream_new_wrapper; + cudaStream_t stream_new = stream_new_wrapper.value(); + cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); #else + cudaStream_t stream_new = stream; +#endif cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -108,15 +122,13 @@ void eigDC(raft::resources const& handle, eig_vals, &workspaceDevice, &workspaceHost, - stream)); + stream_new)); - rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); - rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream_new); + rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); - raft::matrix::copy(handle, - make_device_matrix_view(in, n_rows, n_cols), - make_device_matrix_view(eig_vectors, n_rows, n_cols)); + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, dn_params, @@ -131,14 +143,19 @@ void eigDC(raft::resources const& handle, h_work.data(), workspaceHost, d_dev_info.data(), - stream)); + stream_new)); RAFT_CUDA_TRY(cudaGetLastError()); RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); - int dev_info = d_dev_info.value(stream); + int dev_info = d_dev_info.value(stream_new); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" "This usually occurs when some of the features do not vary enough."); + +#if CUDART_VERSION <= 12040 + // Synchronize the created stream with the original stream before return + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); #endif } diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 617ac6d874..567dc6220e 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -28,38 +28,63 @@ namespace raft { namespace linalg { namespace detail { -// Kernel to perform reductions along the strided dimension +// Kernel to perform summation along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout +// A compensated summation will be performed in order to reduce numerical error. +// Note that the compensation will only be performed 'per-block' for performance +// reasons and therefore not be equivalent to a sequential compensation. + template RAFT_KERNEL stridedSummationKernel( - Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) + Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction - Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + Type thread_sum = Type(init); + Type thread_c = Type(0); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; - thread_data += main_op(data[idx], j); + + // KahanBabushkaNeumaierSum + const Type cur_value = main_op(data[idx], j); + const Type t = thread_sum + cur_value; + if (abs(thread_sum) >= abs(cur_value)) { + thread_c += (thread_sum - t) + cur_value; + } else { + thread_c += (cur_value - t) + thread_sum; + } + thread_sum = t; } } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - Type* temp = (Type*)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; - temp[myidx] = thread_data; + extern __shared__ char tmp[]; + auto* block_sum = (Type*)tmp; + auto* block_c = block_sum + blockDim.x; + + if (threadIdx.y == 0) { + block_sum[threadIdx.x] = Type(0); + block_c[threadIdx.x] = Type(0); + } __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; - __syncthreads(); + // also compute compensation for block-sum + const Type old_sum = atomicAdd(block_sum + threadIdx.x, thread_sum); + const Type t = old_sum + thread_sum; + if (abs(old_sum) >= abs(thread_sum)) { + thread_c += (old_sum - t) + thread_sum; + } else { + thread_c += (thread_sum - t) + old_sum; } + raft::myAtomicAdd(block_c + threadIdx.x, thread_c); + __syncthreads(); // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); + if (colStart < D && (threadIdx.y == 0)) + raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x] + block_c[threadIdx.x]); } // Kernel to perform reductions along the strided dimension @@ -127,23 +152,35 @@ void stridedReduction(OutType* dots, /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) raft::linalg::unaryOp(dots, dots, D, raft::const_op(init), stream); - // Arbitrary numbers for now, 
probably need to tune - const dim3 thrds(32, 16); - IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; - const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), - raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); - const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; - ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if constexpr (std::is_same::value && - std::is_same::value) + std::is_same::value) { + constexpr int TPB = 256; + constexpr int ColsPerBlk = 8; + constexpr dim3 Block(ColsPerBlk, TPB / ColsPerBlk); + constexpr int MinRowsPerThread = 16; + constexpr int MinRowsPerBlk = Block.y * MinRowsPerThread; + constexpr int MaxBlocksDimY = 8192; + + const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk), + raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk))); + const size_t shmemSize = sizeof(OutType) * Block.x * 2; + stridedSummationKernel - <<>>(dots, data, D, N, init, main_op); - else + <<>>(dots, data, D, N, init, main_op); + } else { + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + stridedReductionKernel <<>>(dots, data, D, N, init, main_op, reduce_op); + } ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index a4523c6926..8fd6e45d37 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -31,6 +31,9 @@ namespace linalg { /** * @brief Compute reduction of the input matrix along the requested dimension + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * * @tparam InType the data type of the input * @tparam OutType the data type of the output (as well as the data type for @@ -92,7 +95,9 @@ void reduce(OutType* dots, * is either row-major or column-major, while allowing the choose the * dimension for reduction. Depending upon the dimension chosen for * reduction, the memory accesses may be coalesced or strided. - * + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. 
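 *
 * The underlying kernels use the Kahan-Babushka-Neumaier update added in
 * coalesced_reduction-inl.cuh; as an illustrative scalar equivalent of one
 * accumulation loop (with `values` standing in for the reduced elements):
 * @code{.cpp}
 *   double sum = 0.0, c = 0.0;                  // running sum and compensation
 *   for (double v : values) {
 *     const double t = sum + v;
 *     if (std::abs(sum) >= std::abs(v)) { c += (sum - t) + v; }
 *     else                              { c += (v - t) + sum; }
 *     sum = t;                                  // keep the rounded partial sum
 *   }
 *   const double total = sum + c;               // apply the compensation once
 * @endcode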
* @tparam InElementType the input data-type of underlying raft::matrix_view * @tparam LayoutPolicy The layout of Input/Output (row or col major) * @tparam OutElementType the output data-type of underlying raft::matrix_view and reduction diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index e4e3ea3512..93faf9dd19 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -423,24 +423,39 @@ void optimize(raft::resources const& res, const auto num_full = host_stats.data_handle()[1]; // Create pruned kNN graph - uint32_t max_detour = 0; -#pragma omp parallel for reduction(max : max_detour) +#pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - uint64_t pk = 0; - for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) { - if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } + // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable + // count of the neighbors while increasing the target detourable count from zero. + uint64_t pk = 0; + uint32_t num_detour = 0; + while (pk < output_graph_degree) { + uint32_t next_num_detour = std::numeric_limits::max(); for (uint64_t k = 0; k < input_graph_degree; k++) { - if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } + const auto num_detour_k = detour_count.data_handle()[k + (input_graph_degree * i)]; + // Find the detourable count to check in the next iteration + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } + + // Store the neighbor index if its detourable count is equal to `num_detour`. 
+ if (num_detour_k != num_detour) { continue; } output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; if (pk >= output_graph_degree) break; } if (pk >= output_graph_degree) break; + + assert(next_num_detour != std::numeric_limits::max()); + num_detour = next_num_detour; } - assert(pk == output_graph_degree); + RAFT_EXPECTS(pk == output_graph_degree, + "Couldn't find the output_graph_degree (%u) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + static_cast(i)); } - // RAFT_LOG_DEBUG("# max_detour: %u\n", max_detour); const double time_prune_end = cur_time(); RAFT_LOG_DEBUG( diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index cfbb1e100c..16bb555aa4 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -209,7 +210,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #if 0 /* debug */ - for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += BLOCK_SIZE) { + for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { result_indices_buffer[i] = utils::get_max_value(); result_distances_buffer[i] = utils::get_max_value(); } @@ -351,16 +352,19 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( } #ifdef _CLK_BREAKDOWN - if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && (blockIdx.x == 0) && + if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) && ((query_id * 3) % gridDim.y < 3)) { - RAFT_LOG_DEBUG( + printf( + "%s:%d " "query, %d, thread, %d" - ", init, %d" + ", init, %lu" ", 1st_distance, %lu" ", topk, %lu" ", pickup_parents, %lu" ", distance, %lu" "\n", + __FILE__, + __LINE__, query_id, threadIdx.x, clk_init, diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index e8104bd6f6..232dcb782a 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -448,14 +449,6 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, } } -template -__device__ inline void set_value_device(T* const ptr, const T fill, const std::uint32_t count) -{ - for (std::uint32_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - ptr[i] = fill; - } -} - // One query one thread block template neighbors, raft::host_matrix_view distances) { + idx.set_ef(params.ef); auto const* hnswlib_index = reinterpret_cast::type> const*>( idx.get_index()); diff --git a/cpp/include/raft/neighbors/detail/hnsw_types.hpp b/cpp/include/raft/neighbors/detail/hnsw_types.hpp index 9d35effd1a..8d601f59ae 100644 --- a/cpp/include/raft/neighbors/detail/hnsw_types.hpp +++ b/cpp/include/raft/neighbors/detail/hnsw_types.hpp @@ -93,6 +93,11 @@ struct index_impl : index { */ auto get_index() const -> void const* override { return appr_alg_.get(); } + /** + @brief Set ef for search + */ + void set_ef(int ef) const override { appr_alg_->ef_ = ef; } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; diff --git 
a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh index cd3c6f3947..9c37ee146d 100644 --- a/cpp/include/raft/neighbors/detail/nn_descent.cuh +++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh @@ -19,10 +19,14 @@ #include "../nn_descent_types.hpp" #include +#include #include #include +#include +#include #include #include +#include #include #include #include // raft::util::arch::SM_* @@ -45,6 +49,7 @@ #include #include +#include #include #include @@ -217,6 +222,7 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; }; template @@ -338,14 +344,19 @@ struct GnndGraph { ~GnndGraph(); }; -template +template class GNND { public: GNND(raft::resources const& res, const BuildConfig& build_config); GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances, + epilogue_op distance_epilogue = raft::identity_op()); ~GNND() = default; using ID_t = InternalID_t; @@ -355,7 +366,7 @@ class GNND { Index_t* d_rev_graph_ptr, int2* list_sizes, cudaStream_t stream = 0); - void local_join(cudaStream_t stream = 0); + void local_join(cudaStream_t stream = 0, epilogue_op distance_epilogue = raft::identity_op()); raft::resources const& res; @@ -688,7 +699,9 @@ __device__ __forceinline__ void remove_duplicates( // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template > +template , + typename epilogue_op = raft::identity_op> RAFT_KERNEL #ifdef __CUDA_ARCH__ #if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) @@ -710,7 +723,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + epilogue_op distance_epilogue) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -820,14 +834,17 @@ __launch_bounds__(BLOCK_SIZE, 4) __syncthreads(); for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { - if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && - i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { + auto row_idx = i % SKEWED_MAX_NUM_BI_SAMPLES; + auto col_idx = i / SKEWED_MAX_NUM_BI_SAMPLES; + if (row_idx < list_new_size && col_idx < list_new_size) { + auto r = new_neighbors[row_idx]; + auto c = new_neighbors[col_idx]; if (l2_norms == nullptr) { - s_distances[i] = -s_distances[i]; + auto dist_val = -s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } else { - s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + - l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - - 2.0 * s_distances[i]; + auto dist_val = l2_norms[r] + l2_norms[c] - 2.0 * s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } } else { s_distances[i] = std::numeric_limits::max(); @@ -899,14 +916,17 @@ __launch_bounds__(BLOCK_SIZE, 4) __syncthreads(); for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { - if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size && - i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { + auto row_idx = i % 
SKEWED_MAX_NUM_BI_SAMPLES; + auto col_idx = i / SKEWED_MAX_NUM_BI_SAMPLES; + if (row_idx < list_old_size && col_idx < list_new_size) { + auto r = old_neighbors[row_idx]; + auto c = new_neighbors[col_idx]; if (l2_norms == nullptr) { - s_distances[i] = -s_distances[i]; + auto dist_val = -s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } else { - s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + - l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - - 2.0 * s_distances[i]; + auto dist_val = l2_norms[r] + l2_norms[c] - 2.0 * s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } } else { s_distances[i] = std::numeric_limits::max(); @@ -1134,8 +1154,9 @@ GnndGraph::~GnndGraph() assert(h_graph == nullptr); } -template -GNND::GNND(raft::resources const& res, const BuildConfig& build_config) +template +GNND::GNND(raft::resources const& res, + const BuildConfig& build_config) : res(res), build_config_(build_config), graph_(build_config.max_dataset_size, @@ -1174,12 +1195,12 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); }; -template -void GNND::add_reverse_edges(Index_t* graph_ptr, - Index_t* h_rev_graph_ptr, - Index_t* d_rev_graph_ptr, - int2* list_sizes, - cudaStream_t stream) +template +void GNND::add_reverse_edges(Index_t* graph_ptr, + Index_t* h_rev_graph_ptr, + Index_t* d_rev_graph_ptr, + int2* list_sizes, + cudaStream_t stream) { add_rev_edges_kernel<<>>( graph_ptr, d_rev_graph_ptr, NUM_SAMPLES, list_sizes); @@ -1187,8 +1208,9 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, h_rev_graph_ptr, d_rev_graph_ptr, nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); } -template -void GNND::local_join(cudaStream_t stream) +template +void GNND::local_join(cudaStream_t stream, + epilogue_op distance_epilogue) { thrust::fill(thrust::device.on(stream), dists_buffer_.data_handle(), @@ -1208,11 +1230,17 @@ void GNND::local_join(cudaStream_t stream) dists_buffer_.data_handle(), DEGREE_ON_DEVICE, d_locks_.data_handle(), - l2_norms_.data_handle()); + l2_norms_.data_handle(), + distance_epilogue); } -template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +template +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances, + epilogue_op distance_epilogue) { using input_t = typename std::remove_const::type; @@ -1308,7 +1336,7 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out raft::util::arch::SM_range(raft::util::arch::SM_70(), raft::util::arch::SM_future()); if (wmma_range.contains(runtime_arch)) { - local_join(stream); + local_join(stream, distance_epilogue); } else { THROW("NN_DESCENT cannot be run for __CUDA_ARCH__ < 700"); } @@ -1338,6 +1366,26 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates 
coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1365,13 +1413,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out } template , memory_type::host>> void build(raft::resources const& res, const index_params& params, mdspan, row_major, Accessor> dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", @@ -1410,10 +1460,25 @@ void build(raft::resources const& res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; - - GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree}; + + GNND nnd(res, build_config); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle(), + distance_epilogue); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1425,12 +1490,14 @@ void build(raft::resources const& res, } template , memory_type::host>> index build(raft::resources const& res, const index_params& params, - mdspan, row_major, Accessor> dataset) + mdspan, row_major, Accessor> dataset, + epilogue_op distance_epilogue = raft::identity_op()) { size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1444,9 +1511,10 @@ index build(raft::resources const& res, graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; - build(res, params, dataset, idx); + build(res, params, dataset, idx, distance_epilogue); return idx; } diff --git a/cpp/include/raft/neighbors/hnsw.hpp b/cpp/include/raft/neighbors/hnsw.hpp index 964c3ffacd..ee3f61e550 100644 --- a/cpp/include/raft/neighbors/hnsw.hpp +++ b/cpp/include/raft/neighbors/hnsw.hpp @@ -35,7 +35,7 @@ namespace raft::neighbors::hnsw { /** * @brief Construct an hnswlib base-layer-only index from a CAGRA index - * NOTE: 1. This method uses the filesystem to write the CAGRA index in `/tmp/cagra_index.bin` + * NOTE: 1. This method uses the filesystem to write the CAGRA index in `/tmp/.bin` * before reading it as an hnswlib index, then deleting the temporary file. * 2. 
This function is only offered as a compiled symbol in `libraft.so` * diff --git a/cpp/include/raft/neighbors/hnsw_types.hpp b/cpp/include/raft/neighbors/hnsw_types.hpp index 645a0903b7..f90de6f01b 100644 --- a/cpp/include/raft/neighbors/hnsw_types.hpp +++ b/cpp/include/raft/neighbors/hnsw_types.hpp @@ -62,6 +62,11 @@ struct index : ann::index { auto metric() const -> raft::distance::DistanceType { return metric_; } + /** + @brief Set ef for search + */ + virtual void set_ef(int ef) const; + private: int dim_; raft::distance::DistanceType metric_; diff --git a/cpp/include/raft/neighbors/nn_descent.cuh b/cpp/include/raft/neighbors/nn_descent.cuh index ceb5ae5643..a46a2006d6 100644 --- a/cpp/include/raft/neighbors/nn_descent.cuh +++ b/cpp/include/raft/neighbors/nn_descent.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,19 +48,22 @@ namespace raft::neighbors::experimental::nn_descent { * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param[in] res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] distance_epilogue epilogue operation for distances * @return index index containing all-neighbors knn graph in host memory */ -template +template index build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) + raft::device_matrix_view dataset, + epilogue_op distance_epilogue = raft::identity_op()) { - return detail::build(res, params, dataset); + return detail::build(res, params, dataset, distance_epilogue); } /** @@ -85,6 +88,7 @@ index build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm @@ -92,14 +96,16 @@ index build(raft::resources const& res, * in device memory * @param[out] idx raft::neighbors::experimental::nn_descentindex containing all-neighbors knn graph * in host memory + * @param[in] distance_epilogue epilogue operation for distances */ -template +template void build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { - detail::build(res, params, dataset, idx); + detail::build(res, params, dataset, idx, distance_epilogue); } /** @@ -122,19 +128,22 @@ void build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] distance_epilogue epilogue operation for distances * @return 
index index containing all-neighbors knn graph in host memory */ -template +template index build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) + raft::host_matrix_view dataset, + epilogue_op distance_epilogue = raft::identity_op()) { - return detail::build(res, params, dataset); + return detail::build(res, params, dataset, distance_epilogue); } /** @@ -159,6 +168,7 @@ index build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param[in] res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm @@ -166,14 +176,16 @@ index build(raft::resources const& res, * in host memory * @param[out] idx raft::neighbors::experimental::nn_descentindex containing all-neighbors knn graph * in host memory + * @param[in] distance_epilogue epilogue operation for distances */ -template +template void build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { - detail::build(res, params, dataset, idx); + detail::build(res, params, dataset, idx, distance_epilogue); } /** @} */ // end group nn-descent diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp index e1fc96878a..5d23ff2c2e 100644 --- a/cpp/include/raft/neighbors/nn_descent_types.hpp +++ b/cpp/include/raft/neighbors/nn_descent_types.hpp @@ -18,6 +18,8 @@ #include "ann_types.hpp" +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include + namespace raft::neighbors::experimental::nn_descent { /** * @ingroup nn-descent @@ -51,6 +55,7 @@ struct index_params : ann::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. 
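A hedged usage sketch for the distance_epilogue and return_distances additions in these nn-descent hunks (the return_distances field is introduced just below; header paths and the default template arguments are taken from this diff, everything else is illustrative):

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/nn_descent.cuh>

// Build an all-neighbors graph and also retrieve the distances. IdxT defaults to uint32_t;
// a custom distance_epilogue (invoked as op(distance, row_id, col_id) in the local-join
// kernel) could be passed as an extra argument, raft::identity_op() being the default.
void nn_descent_with_distances(raft::resources const& res,
                               raft::device_matrix_view<const float, int64_t> dataset)
{
  namespace nnd = raft::neighbors::experimental::nn_descent;

  nnd::index_params params;
  params.graph_degree     = 64;
  params.return_distances = true;  // makes the index allocate a device-side distances matrix

  auto idx = nnd::build(res, params, dataset);

  auto graph     = idx.graph();      // [n_rows, graph_degree] neighbor ids, host memory
  auto distances = idx.distances();  // std::optional device view, engaged because of the flag
  (void)graph;
  (void)distances;
}

When return_distances is left at its default of false, distances() returns a disengaged optional and no device-side distances matrix is allocated.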
+ bool return_distances = false; // return distances if true }; /** @@ -79,14 +84,20 @@ struct index : ann::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to allocate and get distances information */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_(return_distances) { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -98,14 +109,23 @@ struct index : ann::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view std::optional> for + * storing knn-graph distances + * @param return_distances whether to allocate and get distances information */ index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + distances_{raft::make_device_matrix(res_, 0, 0)}, + graph_view_{graph_view}, + distances_view_(distances_view), + return_distances_(return_distances) { } @@ -133,6 +153,13 @@ struct index : ann::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -144,8 +171,11 @@ struct index : ann::index { raft::resources const& res_; raft::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index 08efbb7106..ae552cc687 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -393,6 +393,34 @@ inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } +// cusparseSpMV_preprocess is only available starting CUDA 12.4 +#if CUDA_VER_12_4_UP +template < + typename T, + typename std::enable_if_t || std::is_same_v>* = nullptr> +cusparseStatus_t cusparsespmv_preprocess(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + T* externalBuffer, + cudaStream_t stream) +{ + auto constexpr float_type = []() constexpr { + if constexpr (std::is_same_v) { + return CUDA_R_32F; + } else if constexpr 
(std::is_same_v) { + return CUDA_R_64F; + } + }(); + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_preprocess( + handle, opA, alpha, matA, vecX, beta, vecY, float_type, alg, externalBuffer); +} +#endif /** @} */ #else /** diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh new file mode 100644 index 0000000000..ef74316d04 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain A copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace sparse { +namespace linalg { +namespace detail { + +template +void masked_matmul(raft::resources const& handle, + raft::device_matrix_view& A, + raft::device_matrix_view& B, + raft::core::bitmap_view& mask, + raft::device_csr_matrix_view& C, + std::optional> alpha, + std::optional> beta) +{ + index_t m = A.extent(0); + index_t n = B.extent(0); + index_t dim = A.extent(1); + + auto compressed_C_view = C.structure_view(); + + RAFT_EXPECTS(A.extent(1) == B.extent(1), "The dim of A must be equal to the dim of B."); + RAFT_EXPECTS(A.extent(0) == compressed_C_view.get_n_rows(), + "Number of rows in C must match the number of rows in A."); + RAFT_EXPECTS(B.extent(0) == compressed_C_view.get_n_cols(), + "Number of columns in C must match the number of columns in B."); + + auto stream = raft::resource::get_cuda_stream(handle); + + auto C_matrix = raft::make_device_csr_matrix(handle, compressed_C_view); + + // fill C + raft::sparse::convert::bitmap_to_csr(handle, mask, C_matrix); + + if (m > 10 || alpha.has_value() || beta.has_value()) { + auto C_view = raft::make_device_csr_matrix_view( + C.get_elements().data(), compressed_C_view); + + // create B col_major view + auto B_col_major = raft::make_device_matrix_view( + B.data_handle(), dim, n); + + value_t default_alpha = static_cast(1.0f); + value_t default_beta = static_cast(0.0f); + + if (!alpha.has_value()) { alpha = raft::make_host_scalar_view(&default_alpha); } + if (!beta.has_value()) { beta = raft::make_host_scalar_view(&default_beta); } + + raft::sparse::linalg::sddmm(handle, + A, + B_col_major, + C_view, + raft::linalg::Operation::NON_TRANSPOSE, + raft::linalg::Operation::NON_TRANSPOSE, + *alpha, + *beta); + } else { + raft::sparse::distance::detail::faster_dot_on_csr(handle, + C.get_elements().data(), + compressed_C_view.get_nnz(), + compressed_C_view.get_indptr().data(), + compressed_C_view.get_indices().data(), + A.data_handle(), + B.data_handle(), + compressed_C_view.get_n_rows(), + dim); + } +} + +} // namespace detail +} // namespace linalg +} // namespace sparse +} // namespace raft diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp new file mode 100644 index 
0000000000..560cd3f715 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain A copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @defgroup masked_matmul Masked Matrix Multiplication + * @{ + */ + +/** + * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise + * multiplication with the sparsity pattern defined by the mask, resulting in the computation + * C = alpha * ((A * B) ∘ spy(mask)) + beta * C. + * + * This function multiplies two dense matrices A and B, and then applies an element-wise + * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha + * and added to beta times the original matrix C. + * + * @tparam value_t Data type of elements in the input/output matrices (e.g., float, double) + * @tparam index_t Type used for matrix indices + * @tparam nnz_t Type used for the number of non-zero entries in CSR format + * @tparam bitmap_t Type of the bitmap used for the mask + * + * @param[in] handle RAFT handle for resource management + * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k] + * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k] + * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape + * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is + * included (1) or masked out (0). + * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m, + * n] + * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if + * std::nullopt) + * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt) + */ +template +void masked_matmul(raft::resources const& handle, + raft::device_matrix_view A, + raft::device_matrix_view B, + raft::core::bitmap_view mask, + raft::device_csr_matrix_view C, + std::optional> alpha = std::nullopt, + std::optional> beta = std::nullopt) +{ + detail::masked_matmul(handle, A, B, mask, C, alpha, beta); +} + +/** @} */ // end of masked_matmul + +} // end namespace linalg +} // end namespace sparse +} // end namespace raft diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 6c330acb26..ee39c87a68 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,61 +26,23 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! 
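For the masked_matmul API added above, a naive host-side reference of the computation it performs, C = alpha * ((A * B^T) o spy(mask)) + beta * C, using plain vectors for clarity (the GPU path instead builds the CSR structure from the bitmap and dispatches to SDDMM or a fused dot kernel):

#include <cstdint>
#include <vector>

// A is m x k, B is n x k (both row-major); mask is m x n with 1 = entry kept.
// Entries masked out are simply absent from the CSR output; they are written as 0 here.
void masked_matmul_reference(const std::vector<float>& A,
                             const std::vector<float>& B,
                             const std::vector<uint8_t>& mask,
                             std::vector<float>& C,
                             int m, int n, int k,
                             float alpha = 1.0f, float beta = 0.0f)
{
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      if (!mask[i * n + j]) {
        C[i * n + j] = 0.0f;
        continue;
      }
      float dot = 0.0f;
      for (int l = 0; l < k; ++l) {
        dot += A[i * k + l] * B[j * k + l];
      }
      C[i * n + j] = alpha * dot + beta * C[i * n + j];
    }
  }
}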
-template -RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } -} - template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); - } else { - meanKernelColMajor<<>>(mu, data, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + raft::linalg::reduce(mu, + data, + D, + N, + Type(0), + rowMajor, + false, + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index bc2644a233..4c861b49fb 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -16,7 +16,9 @@ #pragma once +#include #include +#include #include #include @@ -25,63 +27,6 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColPerBlk has been tested only for 32! -template -RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - Type val = (colId < D) ? 
data[i * D + colId] : Type(0); - thread_data += val * val; - } - __shared__ Type sstd[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(sstd + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]); -} - -template -RAFT_KERNEL stddevKernelColMajor(Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } -} - -template -RAFT_KERNEL varsKernelColMajor(Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } -} - /** * @brief Compute stddev of the input matrix * @@ -110,26 +55,22 @@ void stddev(Type* std, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, - stream); - } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } /** @@ -160,21 +101,21 @@ void vars(Type* var, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); - Type ratio = Type(1) / (sample ? 
Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); - } else { - varsKernelColMajor<<>>(var, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 4f85536e6c..39bd2c3b6c 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,106 +26,10 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - thread_sum += (colId < D) ? data[i * D + colId] : Type(0); - } - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - Type thread_c = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - // KahanBabushkaNeumaierSum - const Type cur_value = (colId < D) ? 
data[i * D + colId] : Type(0); - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_sum = Type(0); - Type thread_c = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - // KahanBabushkaNeumaierSum - IdxType idx = colStart + i; - const Type cur_value = data[idx]; - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - Type acc = BlockReduce(temp_storage).Sum(thread_sum); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } -} - template void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int ColsPerBlk = 8; - static const int MinRowsPerThread = 16; - static const int MinRowsPerBlk = (TPB / ColsPerBlk) * MinRowsPerThread; - static const int MaxBlocksDimX = 8192; - - const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk); - const IdxType grid_x = - raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk)); - - dim3 grid(grid_x, grid_y); - RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); - sumKahanKernelRowMajor - <<>>(output, input, D, N); - } else { - sumKahanKernelColMajor<<>>(output, input, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce(output, input, D, N, Type(0), rowMajor, false, stream); } } // namespace detail diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh similarity index 86% rename from cpp/include/raft/core/detail/popc.cuh rename to cpp/include/raft/util/detail/popc.cuh index d74b68b715..20b4814216 100644 --- a/cpp/include/raft/core/detail/popc.cuh +++ b/cpp/include/raft/util/detail/popc.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -28,15 +29,15 @@ namespace raft::detail { * @tparam value_t the value type of the vector. * @tparam index_t the index type of vector and scalar. * - * @param[in] res raft handle for managing expensive resources - * @param[in] values Number of row in the matrix. + * @param[in] res RAFT handle for managing expensive resources + * @param[in] values Device vector view containing the values to be processed. * @param[in] max_len Maximum number of bits to count. - * @param[out] counter Number of bits that are set to 1. + * @param[out] counter Device scalar view to store the number of bits that are set to 1. 
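For the mean/stddev/vars/sum rework above, the identity the new stddev path relies on, condensed into a host-side reference (illustrative, not part of the patch): reduce() first produces sum(x^2) per column, and the binaryOp then forms sum(x^2) * ratio - mu^2 * ratio_mean, which for the sample case (ratio = 1/(N-1), ratio_mean = N/(N-1)) equals (sum(x^2) - N * mu^2) / (N - 1).

#include <cmath>
#include <vector>

float sample_stddev_reference(const std::vector<float>& x)
{
  const float N = static_cast<float>(x.size());
  float sum = 0.f, sum_sq = 0.f;
  for (float v : x) { sum += v; sum_sq += v * v; }
  const float mu         = sum / N;
  const float ratio      = 1.f / (N - 1.f);       // sample case
  const float ratio_mean = ratio * N;             // 1.0 in the population case
  // abs() mirrors the raft::abs_op in the composed epilogue, guarding tiny negative rounding.
  return std::sqrt(std::abs(sum_sq * ratio - mu * mu * ratio_mean));
}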
*/ template void popc(const raft::resources& res, device_vector_view values, - index_t max_len, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { auto values_size = values.size(); @@ -46,7 +47,7 @@ void popc(const raft::resources& res, static constexpr index_t len_per_item = sizeof(value_t) * 8; - value_t tail_len = (max_len % len_per_item); + value_t tail_len = (max_len[0] % len_per_item); value_t tail_mask = tail_len ? (value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0}; raft::linalg::coalesced_reduction( res, diff --git a/cpp/include/raft/util/popc.cuh b/cpp/include/raft/util/popc.cuh new file mode 100644 index 0000000000..153694e45e --- /dev/null +++ b/cpp/include/raft/util/popc.cuh @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +/** + * @brief Count the number of bits that are set to 1 in a vector. + * + * @tparam value_t the value type of the vector. + * @tparam index_t the index type of vector and scalar. + * + * @param[in] res RAFT handle for managing expensive resources + * @param[in] values Device vector view containing the values to be processed. + * @param[in] max_len Host scalar view to store the Maximum number of bits to count. + * @param[out] counter Device scalar view to store the number of bits that are set to 1. + */ +template +void popc(const raft::resources& res, + device_vector_view values, + raft::host_scalar_view max_len, + raft::device_scalar_view counter) +{ + detail::popc(res, values, max_len, counter); +} + +} // namespace raft diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh index c0d3da7609..504f35fa0f 100644 --- a/cpp/include/raft/util/reduction.cuh +++ b/cpp/include/raft/util/reduction.cuh @@ -98,7 +98,7 @@ DI T blockReduce(T val, char* smem, ReduceLambda reduce_op = raft::add_op{}) val = warpReduce(val, reduce_op); if (lid == 0) sTemp[wid] = val; __syncthreads(); - val = lid < nWarps ? sTemp[lid] : T(0); + val = lid < nWarps ? sTemp[lid] : T(); return warpReduce(val, reduce_op); } diff --git a/cpp/include/raft_runtime/neighbors/refine.hpp b/cpp/include/raft_runtime/neighbors/refine.hpp index fba7d0fc0e..592c8be82b 100644 --- a/cpp/include/raft_runtime/neighbors/refine.hpp +++ b/cpp/include/raft_runtime/neighbors/refine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
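A short usage sketch for the relocated raft::popc wrapper above, counting every set bit in a device vector of 32-bit words; the factory calls and header paths are assumptions based on the signatures quoted in this diff:

#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/popc.cuh>

void count_bits(raft::resources const& res,
                raft::device_vector_view<uint32_t, uint32_t> values)
{
  // Count all bits of every word; a smaller max_len would mask off the tail of the last word.
  uint32_t max_len = static_cast<uint32_t>(values.size()) * 32;
  auto counter     = raft::make_device_scalar<uint32_t>(res, 0);
  raft::popc(res,
             values,
             raft::make_host_scalar_view<uint32_t>(&max_len),
             counter.view());
}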
@@ -17,8 +17,9 @@ #pragma once #include +#include #include -// #include +#include namespace raft::runtime::neighbors { @@ -29,7 +30,7 @@ namespace raft::runtime::neighbors { raft::device_matrix_view neighbor_candidates, \ raft::device_matrix_view indices, \ raft::device_matrix_view distances, \ - distance::DistanceType metric); \ + raft::distance::DistanceType metric); \ \ void refine(raft::resources const& handle, \ raft::host_matrix_view dataset, \ @@ -37,7 +38,7 @@ namespace raft::runtime::neighbors { raft::host_matrix_view neighbor_candidates, \ raft::host_matrix_view indices, \ raft::host_matrix_view distances, \ - distance::DistanceType metric); + raft::distance::DistanceType metric); RAFT_INST_REFINE(int64_t, float); RAFT_INST_REFINE(int64_t, uint8_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py index 97fe120458..6adff0eee1 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ # NOTE: this template is not perfectly formatted. Use pre-commit to get # everything in shape again. header = """/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -95,6 +95,11 @@ OpT="raft::distance::detail::ops::cosine_distance_op", archs = [60, 80], ), + dict( + path_prefix="dice", + OpT="raft::distance::detail::ops::dice_distance_op", + archs = [60, 80], + ), dict( path_prefix="hamming_unexpanded", OpT="raft::distance::detail::ops::hamming_distance_op", diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu new file mode 100644 index 0000000000..a259f8b3b0 --- /dev/null +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by dispatch_00_generate.py + * + * Make changes there and run in this directory: + * + * > python dispatch_00_generate.py + * + */ + +#include // raft::identity_op +#include // ops::* +#include // dispatch +#include +#include +#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ + OpT, DataT, AccT, OutT, FinOpT, IdxT) \ + template void raft::distance::detail:: \ + pairwise_matrix_dispatch, DataT, AccT, OutT, FinOpT, IdxT>( \ + OpT distance_op, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + const DataT* x, \ + const DataT* y, \ + const DataT* x_norm, \ + const DataT* y_norm, \ + OutT* out, \ + FinOpT fin_op, \ + cudaStream_t stream, \ + bool is_row_major) + +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, double, double, double, raft::identity_op, int); + +#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu new file mode 100644 index 0000000000..e89f8b422c --- /dev/null +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by dispatch_00_generate.py + * + * Make changes there and run in this directory: + * + * > python dispatch_00_generate.py + * + */ + +#include // raft::identity_op +#include // ops::* +#include // dispatch +#include +#include +#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ + OpT, DataT, AccT, OutT, FinOpT, IdxT) \ + template void raft::distance::detail:: \ + pairwise_matrix_dispatch, DataT, AccT, OutT, FinOpT, IdxT>( \ + OpT distance_op, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + const DataT* x, \ + const DataT* y, \ + const DataT* x_norm, \ + const DataT* y_norm, \ + OutT* out, \ + FinOpT fin_op, \ + cudaStream_t stream, \ + bool is_row_major) + +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, float, float, float, raft::identity_op, int); + +#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index 8c94608311..8fe0bf2007 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -72,6 +72,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( @@ -154,6 +158,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -230,6 +238,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -297,6 +309,10 @@ instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( @@ -415,6 +431,22 @@ instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineE double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, float, float, @@ -690,6 +722,22 @@ instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, double, 
raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, float, float, diff --git a/cpp/src/raft_runtime/neighbors/hnsw.cpp b/cpp/src/raft_runtime/neighbors/hnsw.cpp index 6eb770abd6..5356e708d2 100644 --- a/cpp/src/raft_runtime/neighbors/hnsw.cpp +++ b/cpp/src/raft_runtime/neighbors/hnsw.cpp @@ -21,6 +21,8 @@ #include #include +#include +#include namespace raft::neighbors::hnsw { #define RAFT_INST_HNSW(T) \ @@ -28,7 +30,11 @@ namespace raft::neighbors::hnsw { std::unique_ptr> from_cagra( \ raft::resources const& res, raft::neighbors::cagra::index cagra_index) \ { \ - std::string filepath = "/tmp/cagra_index.bin"; \ + std::random_device dev; \ + std::mt19937 rng(dev()); \ + std::uniform_int_distribution dist(0); \ + auto uuid = std::to_string(dist(rng)); \ + std::string filepath = "/tmp/" + uuid + ".bin"; \ raft::runtime::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index); \ auto hnsw_index = raft::runtime::neighbors::hnsw::deserialize_file( \ res, filepath, cagra_index.dim(), cagra_index.metric()); \ diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index 11d2403963..0f1d5ff020 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. 
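A hedged sketch of calling the newly instantiated DiceExpanded metric from the public pairwise_distance API; the mdspan overload and its exact parameter types are assumptions, only the DistanceType::DiceExpanded value comes from this diff:

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/distance/distance.cuh>

// Compute an m x n Dice distance matrix between the rows of x [m, k] and y [n, k].
void dice_pairwise(raft::resources const& handle,
                   raft::device_matrix_view<float, int> x,
                   raft::device_matrix_view<float, int> y,
                   raft::device_matrix_view<float, int> out)
{
  raft::distance::pairwise_distance(
    handle, x, y, out, raft::distance::DistanceType::DiceExpanded);
}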
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.06") +set(RAPIDS_VERSION "24.08") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index ff0518a4d0..e3af6ebb78 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -113,6 +113,7 @@ if(BUILD_TESTS) NAME CORE_TEST PATH + core/bitmap.cu core/bitset.cu core/device_resources_manager.cpp core/device_setter.cpp @@ -156,6 +157,7 @@ if(BUILD_TESTS) distance/dist_canberra.cu distance/dist_correlation.cu distance/dist_cos.cu + distance/dist_dice.cu distance/dist_hamming.cu distance/dist_hellinger.cu distance/dist_inner_product.cu @@ -315,6 +317,7 @@ if(BUILD_TESTS) sparse/csr_transpose.cu sparse/degree.cu sparse/filter.cu + sparse/masked_matmul.cu sparse/norm.cu sparse/normalize.cu sparse/reduce.cu @@ -505,6 +508,7 @@ if(BUILD_TESTS) util/integer_utils.cpp util/integer_utils.cu util/memory_type_dispatcher.cu + util/popc.cu util/pow2_utils.cu util/reduction.cu ) diff --git a/cpp/test/core/bitmap.cu b/cpp/test/core/bitmap.cu new file mode 100644 index 0000000000..358c08a50f --- /dev/null +++ b/cpp/test/core/bitmap.cu @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace raft::core { + +template +struct test_spec_bitmap { + index_t rows; + index_t cols; + index_t mask_len; + index_t query_len; +}; + +template +auto operator<<(std::ostream& os, const test_spec_bitmap& ss) -> std::ostream& +{ + os << "bitmap{rows: " << ss.rows << ", cols: " << ss.cols << ", mask_len: " << ss.mask_len + << ", query_len: " << ss.query_len << "}"; + return os; +} + +template +void create_cpu_bitmap(std::vector& bitmap, + std::vector& mask_idx, + const index_t rows, + const index_t cols) +{ + for (size_t i = 0; i < bitmap.size(); i++) { + bitmap[i] = ~bitmap_t(0x00); + } + constexpr size_t bitmap_element_size = sizeof(bitmap_t) * 8; + for (size_t i = 0; i < mask_idx.size(); i++) { + auto row = mask_idx[i] / cols; + auto col = mask_idx[i] % cols; + auto idx = row * cols + col; + bitmap[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size)); + } +} + +template +void test_cpu_bitmap(const std::vector& bitmap, + const std::vector& queries, + std::vector& result, + index_t rows, + index_t cols) +{ + constexpr size_t bitmap_element_size = sizeof(bitmap_t) * 8; + for (size_t i = 0; i < queries.size(); i++) { + auto row = queries[i] / cols; + auto col = queries[i] % cols; + auto idx = row * cols + col; + result[i] = uint8_t( + (bitmap[idx / bitmap_element_size] & (bitmap_t{1} << (idx % bitmap_element_size))) != 0); + } +} + +template +class BitmapTest : public testing::TestWithParam> { + protected: + index_t static constexpr const bitmap_element_size = sizeof(bitmap_t) * 8; + const test_spec_bitmap spec; + std::vector bitmap_result; + std::vector bitmap_ref; + raft::resources res; + + public: + explicit BitmapTest() + : spec(testing::TestWithParam>::GetParam()), + bitmap_result(raft::ceildiv(spec.rows * spec.cols, index_t(bitmap_element_size))), + bitmap_ref(raft::ceildiv(spec.rows * spec.cols, index_t(bitmap_element_size))) + { + } + + void run() + { + auto stream = resource::get_cuda_stream(res); + + raft::random::RngState rng(42); + auto mask_device = raft::make_device_vector(res, spec.mask_len); + std::vector mask_cpu(spec.mask_len); + raft::random::uniformInt( + res, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream); + resource::sync_stream(res, stream); + + create_cpu_bitmap(bitmap_ref, mask_cpu, spec.rows, spec.cols); + + auto bitset_d = raft::core::bitset( + res, raft::make_const_mdspan(mask_device.view()), index_t(spec.rows * spec.cols)); + + auto bitmap_view_d = + raft::core::bitmap_view(bitset_d.data(), spec.rows, spec.cols); + + ASSERT_EQ(bitmap_view_d.get_n_rows(), spec.rows); + ASSERT_EQ(bitmap_view_d.get_n_cols(), spec.cols); + + auto query_device = raft::make_device_vector(res, spec.query_len); + auto result_device = raft::make_device_vector(res, spec.query_len); + auto query_cpu = std::vector(spec.query_len); + auto result_cpu = std::vector(spec.query_len); + auto result_ref = std::vector(spec.query_len); + + raft::random::uniformInt( + res, rng, query_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream); + + auto queries_device_view = + raft::make_device_vector_view(query_device.data_handle(), spec.query_len); + + raft::linalg::map( + res, + result_device.view(), + [bitmap_view_d] __device__(index_t 
query) { + auto row = query / bitmap_view_d.get_n_cols(); + auto col = query % bitmap_view_d.get_n_cols(); + return (uint8_t)(bitmap_view_d.test(row, col)); + }, + queries_device_view); + + raft::update_host(result_cpu.data(), result_device.data_handle(), query_device.size(), stream); + resource::sync_stream(res, stream); + + test_cpu_bitmap(bitmap_ref, query_cpu, result_ref, spec.rows, spec.cols); + + ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare())); + + raft::random::uniformInt( + res, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream); + resource::sync_stream(res, stream); + + thrust::for_each_n(raft::resource::get_thrust_policy(res), + mask_device.data_handle(), + mask_device.extent(0), + [bitmap_view_d] __device__(const index_t sample_index) { + auto row = sample_index / bitmap_view_d.get_n_cols(); + auto col = sample_index % bitmap_view_d.get_n_cols(); + bitmap_view_d.set(row, col, false); + }); + + raft::update_host(bitmap_result.data(), bitmap_view_d.data(), bitmap_result.size(), stream); + + for (size_t i = 0; i < mask_cpu.size(); i++) { + auto row = mask_cpu[i] / spec.cols; + auto col = mask_cpu[i] % spec.cols; + auto idx = row * spec.cols + col; + bitmap_ref[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size)); + } + resource::sync_stream(res, stream); + ASSERT_TRUE(hostVecMatch(bitmap_ref, bitmap_result, raft::Compare())); + } +}; + +template +auto inputs_bitmap = + ::testing::Values(test_spec_bitmap{32, 32, 5, 10}, + test_spec_bitmap{100, 100, 30, 10}, + test_spec_bitmap{1024, 1024, 55, 100}, + test_spec_bitmap{10000, 10000, 1000, 1000}, + test_spec_bitmap{1 << 15, 1 << 15, 1 << 3, 1 << 12}, + test_spec_bitmap{1 << 15, 1 << 15, 1 << 24, 1 << 13}); + +using BitmapTest_Uint32_32 = BitmapTest; +TEST_P(BitmapTest_Uint32_32, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint32_32, inputs_bitmap); + +using BitmapTest_Uint64_32 = BitmapTest; +TEST_P(BitmapTest_Uint64_32, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint64_32, inputs_bitmap); + +using BitmapTest_Uint32_64 = BitmapTest; +TEST_P(BitmapTest_Uint32_64, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint32_64, inputs_bitmap); + +using BitmapTest_Uint64_64 = BitmapTest; +TEST_P(BitmapTest_Uint64_64, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint64_64, inputs_bitmap); + +} // namespace raft::core diff --git a/cpp/test/core/memory_type.cpp b/cpp/test/core/memory_type.cpp index 07ca66ccc1..f020215595 100644 --- a/cpp/test/core/memory_type.cpp +++ b/cpp/test/core/memory_type.cpp @@ -49,7 +49,7 @@ TEST(MemoryTypeFromPointer, Host) auto ptr1 = static_cast(nullptr); cudaMallocHost(&ptr1, 1); EXPECT_EQ(memory_type_from_pointer(ptr1), memory_type::host); - cudaFree(ptr1); + cudaFreeHost(ptr1); auto ptr2 = static_cast(nullptr); EXPECT_EQ(memory_type_from_pointer(ptr2), memory_type::host); } diff --git a/cpp/test/distance/dist_dice.cu b/cpp/test/distance/dist_dice.cu new file mode 100644 index 0000000000..e127659dc6 --- /dev/null +++ b/cpp/test/distance/dist_dice.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceExpDice : public DistanceTest { +}; + +template +class DistanceExpDiceXequalY + : public DistanceTestSameBuffer {}; + +const std::vector> inputsf = { + {0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL}, + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, (65536 + 128) * 128, 128, 8, false, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; + +const std::vector> inputsXeqYf = { + {0.01f, 1024, 1024, 32, true, 1234ULL}, + {0.01f, 1024, 32, 1024, true, 1234ULL}, + {0.01f, 32, 1024, 1024, true, 1234ULL}, + {0.03f, 1024, 1024, 1024, true, 1234ULL}, + {0.01f, 1024, 1024, 32, false, 1234ULL}, + {0.01f, 1024, 32, 1024, false, 1234ULL}, + {0.01f, 32, 1024, 1024, false, 1234ULL}, + {0.03f, 1024, 1024, 1024, false, 1234ULL}, +}; + +typedef DistanceExpDice DistanceExpDiceF; +TEST_P(DistanceExpDiceF, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApproxNaN(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceF, ::testing::ValuesIn(inputsf)); + +typedef DistanceExpDiceXequalY DistanceExpDiceXequalYF; +TEST_P(DistanceExpDiceXequalYF, Result) +{ + int m = params.m; + int n = params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref[0].data(), + dist[0].data(), + m, + n, + raft::CompareApproxNaN(params.tolerance), + stream)); + n = params.isRowMajor ? m : m / 2; + m = params.isRowMajor ? m / 2 : m; + + ASSERT_TRUE(raft::devArrMatch(dist_ref[1].data(), + dist[1].data(), + m, + n, + raft::CompareApproxNaN(params.tolerance), + stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYF, ::testing::ValuesIn(inputsXeqYf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceExpDice DistanceExpDiceD; +TEST_P(DistanceExpDiceD, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApproxNaN(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inputsd)); + +class BigMatrixDice : public BigMatrixDistanceTest {}; +TEST_F(BigMatrixDice, Result) {} + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 2854a8f3df..f44fb18519 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -96,6 +96,34 @@ RAFT_KERNEL naiveL1_Linf_CanberraDistanceKernel(DataType* dist, dist[outidx] = acc; } +template +RAFT_KERNEL naiveDiceDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) { return; } + + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); + DataType acc_ab = DataType(0); + + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc_a += a; + acc_b += b; + acc_ab += a * b; + } + + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + + // Use 1.0 - (dice dissimilarity) to calc the distance + dist[outidx] = (DataType)1.0 - (2 * acc_ab / ((acc_a) + (acc_b))); +} + template RAFT_KERNEL naiveCosineDistanceKernel( DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) @@ -391,6 +419,9 @@ void naiveDistance(DataType* dist, naiveCorrelationDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor); break; + case raft::distance::DistanceType::DiceExpanded: + naiveDiceDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); + break; default: FAIL() << "should be here\n"; } RAFT_CUDA_TRY(cudaPeekAtLastError()); @@ -482,7 +513,8 @@ class DistanceTest : public ::testing::TestWithParam> { // Hellinger works only on positive numbers uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0)); uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0)); - } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded) { + } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded || + distanceType == raft::distance::DistanceType::DiceExpanded) { uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0)); uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0)); // Russel rao works on boolean values. 
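For reference, the naive kernel added above computes dist(x, y) = 1 − 2·Σᵢ xᵢyᵢ / (Σᵢ xᵢ + Σᵢ yᵢ), which for 0/1 vectors is the usual Dice dissimilarity 1 − 2|X∩Y| / (|X| + |Y|). The following is a minimal NumPy sketch of that reference, illustrative only (the function names are hypothetical and not part of the RAFT API), assuming row vectors of equal length:

```python
# Host-side reference for the DiceExpanded distance exercised by the tests above.
# Illustrative sketch only -- not a RAFT API; names are hypothetical.
import numpy as np


def dice_distance(x: np.ndarray, y: np.ndarray) -> float:
    """1 - 2*sum(x*y) / (sum(x) + sum(y)); for 0/1 vectors this is the Dice dissimilarity."""
    num = 2.0 * np.dot(x, y)
    den = x.sum() + y.sum()
    return float(1.0 - num / den)


def pairwise_dice(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """All-pairs distances between the rows of a (m x k) and b (n x k)."""
    return np.array([[dice_distance(x, y) for y in b] for x in a])


# Tiny usage example with binarized inputs:
rng = np.random.default_rng(1234)
a = (rng.random((4, 8)) > 0.5).astype(np.float32)
b = (rng.random((3, 8)) > 0.5).astype(np.float32)
print(pairwise_dice(a, b))
```

A pair of all-zero rows drives the denominator to zero and the result to NaN (here and in the device kernel alike), which is presumably why these tests compare results through the NaN-aware CompareApproxNaN matcher rather than plain CompareApprox.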
@@ -571,7 +603,8 @@ class DistanceTestSameBuffer : public ::testing::TestWithParam> { eig_vals_large, eig_vals_jacobi_large; }; +TEST(Raft, EigStream) +{ + // Separate test to check eig_dc stream workaround for CUDA 12+ + raft::resources handle; + auto n_rows = 5000; + auto cov_matrix_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vectors_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vals_stream = raft::make_device_vector(handle, n_rows); + + raft::linalg::eig_dc(handle, + raft::make_const_mdspan(cov_matrix_stream.view()), + eig_vectors_stream.view(), + eig_vals_stream.view()); + raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); +} + const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 495af081f1..f74cadb415 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -15,11 +15,11 @@ */ #pragma once -#include "../test_utils.cuh" #include "ann_utils.cuh" #include #include +#include #include #include @@ -65,7 +65,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,6 +83,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); resource::sync_stream(handle_); } @@ -91,6 +94,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -102,20 +106,39 @@ class AnnNNDescentTest : public ::testing::TestWithParam { auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); auto index = nn_descent::build(handle_, index_params, database_host_view); - update_host( + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { auto index = nn_descent::build(handle_, index_params, database_view); - update_host( + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } resource::sync_stream(handle_); } double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); } } diff --git a/cpp/test/sparse/masked_matmul.cu 
b/cpp/test/sparse/masked_matmul.cu new file mode 100644 index 0000000000..0ece716a1b --- /dev/null +++ b/cpp/test/sparse/masked_matmul.cu @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace raft { +namespace sparse { + +template +struct MaskedMatmulInputs { + value_t tolerance; + + index_t m; + index_t k; + index_t n; + + value_t sparsity; + + unsigned long long int seed; +}; + +template +struct sum_abs_op { + __host__ __device__ value_t operator()(const value_t& x, const value_t& y) const + { + return y >= value_t(0.0) ? (x + y) : (x - y); + } +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MaskedMatmulInputs& params) +{ + os << " m: " << params.m << "\tk: " << params.k << "\tn: " << params.n + << "\tsparsity: " << params.sparsity; + + return os; +} + +template +class MaskedMatmulTest : public ::testing::TestWithParam> { + public: + MaskedMatmulTest() + : params(::testing::TestWithParam>::GetParam()), + stream(resource::get_cuda_stream(handle)), + a_data_d(0, resource::get_cuda_stream(handle)), + b_data_d(0, resource::get_cuda_stream(handle)), + bitmap_d(0, resource::get_cuda_stream(handle)), + c_indptr_d(0, resource::get_cuda_stream(handle)), + c_indices_d(0, resource::get_cuda_stream(handle)), + c_data_d(0, resource::get_cuda_stream(handle)), + c_expected_data_d(0, resource::get_cuda_stream(handle)) + { + } + + protected: + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; + index_t bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void cpu_convert_to_csr(std::vector& bitmap, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitmap_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitmap[index / (8 * sizeof(bitmap_t))]; + bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + void cpu_sddmm(const std::vector& A, + const std::vector& B, + std::vector& 
vals, + const std::vector& cols, + const std::vector& row_ptrs, + bool is_row_major_A, + bool is_row_major_B) + { + if (params.m * params.k != static_cast(A.size()) || + params.k * params.n != static_cast(B.size())) { + std::cerr << "Matrix dimensions and vector size do not match!" << std::endl; + return; + } + + for (index_t i = 0; i < params.m; ++i) { + for (index_t j = row_ptrs[i]; j < row_ptrs[i + 1]; ++j) { + value_t sum = 0; + for (index_t l = 0; l < params.k; ++l) { + index_t a_index = i * params.k + l; + index_t b_index = cols[j] * params.k + l; + sum += A[a_index] * B[b_index]; + } + vals[j] = sum; + } + } + } + + void make_data() + { + index_t a_size = params.m * params.k; + index_t b_size = params.k * params.n; + index_t c_size = params.m * params.n; + + index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bitmap_t) * 8)); + std::vector bitmap_h(element); + + std::vector a_data_h(a_size); + std::vector b_data_h(b_size); + + a_data_d.resize(a_size, stream); + b_data_d.resize(b_size, stream); + bitmap_d.resize(bitmap_h.size(), stream); + + auto blobs_a_b = raft::make_device_matrix(handle, 1, a_size + b_size); + auto labels = raft::make_device_vector(handle, 1); + + raft::random::make_blobs(blobs_a_b.data_handle(), + labels.data_handle(), + 1, + a_size + b_size, + 1, + stream, + false, + nullptr, + nullptr, + value_t(1.0), + false, + value_t(-1.0f), + value_t(1.0f), + uint64_t(2024)); + + raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + resource::sync_stream(handle); + + index_t c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + + std::vector c_indptr_h(params.m + 1); + std::vector c_indices_h(c_true_nnz); + std::vector c_data_h(c_true_nnz); + + cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h); + + c_data_d.resize(c_data_h.size(), stream); + + update_device(c_data_d.data(), c_data_h.data(), c_data_h.size(), stream); + update_device(bitmap_d.data(), bitmap_h.data(), bitmap_h.size(), stream); + resource::sync_stream(handle); + + cpu_sddmm(a_data_h, b_data_h, c_data_h, c_indices_h, c_indptr_h, true, true); + + c_indptr_d.resize(c_indptr_h.size(), stream); + c_indices_d.resize(c_indices_h.size(), stream); + c_expected_data_d.resize(c_data_h.size(), stream); + + update_device(c_indptr_d.data(), c_indptr_h.data(), c_indptr_h.size(), stream); + update_device(c_indices_d.data(), c_indices_h.data(), c_indices_h.size(), stream); + update_device(c_expected_data_d.data(), c_data_h.data(), c_data_h.size(), stream); + + resource::sync_stream(handle); + } + + void SetUp() override { make_data(); } + + void Run() + { + auto A = + raft::make_device_matrix_view(a_data_d.data(), params.m, params.k); + auto B = + raft::make_device_matrix_view(b_data_d.data(), params.n, params.k); + + auto mask = + raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); + + auto c_structure = raft::make_device_compressed_structure_view( + c_indptr_d.data(), + c_indices_d.data(), + params.m, + params.n, + static_cast(c_indices_d.size())); + + auto C = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); + + raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); + + resource::sync_stream(handle); + + ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), + 
C.get_elements().data(), + c_expected_data_d.size(), + raft::CompareApprox(params.tolerance), + stream)); + + thrust::device_ptr expected_data_ptr = + thrust::device_pointer_cast(c_expected_data_d.data()); + value_t sum_abs = thrust::reduce(thrust::cuda::par.on(stream), + expected_data_ptr, + expected_data_ptr + c_expected_data_d.size(), + value_t(0.0f), + sum_abs_op()); + value_t avg = sum_abs / (1.0f * c_expected_data_d.size()); + + ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); + } + + raft::resources handle; + cudaStream_t stream; + MaskedMatmulInputs params; + + rmm::device_uvector a_data_d; + rmm::device_uvector b_data_d; + rmm::device_uvector bitmap_d; + + rmm::device_uvector c_indptr_d; + rmm::device_uvector c_indices_d; + rmm::device_uvector c_data_d; + + rmm::device_uvector c_expected_data_d; +}; + +using MaskedMatmulTestF = MaskedMatmulTest; +TEST_P(MaskedMatmulTestF, Result) { Run(); } + +using MaskedMatmulTestD = MaskedMatmulTest; +TEST_P(MaskedMatmulTestD, Result) { Run(); } + +const std::vector> sddmm_inputs_f = { + {0.0001f, 10, 5, 32, 0.1, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.0003f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, + {0.0003f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}}; + +const std::vector> sddmm_inputs_d = { + {0.0001f, 10, 5, 32, 0.01, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.0001f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.0001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, + {0.0001f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 1024, 0.1, 1234ULL}}; + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f)); + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 41812979b6..602f356b9f 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -40,7 +40,8 @@ struct CovInputs { template ::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -71,8 +72,7 @@ class CovTest : public ::testing::TestWithParam> { cov_act.resize(cols * cols, stream); normal(handle, r, data.data(), len, params.mean, var); - raft::stats::mean( - mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream); if (params.rowMajor) { using layout = raft::row_major; cov(handle, @@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam> { raft::update_device(data_cm.data(), data_h, 6, stream); raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); - raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream); cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 61b57ce739..c5fe83d95b 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,12 +35,14 @@ struct MeanInputs { 
int rows, cols; bool sample, rowMajor; unsigned long long int seed; + T stddev = (T)1.0; }; template ::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl; } template @@ -61,7 +63,7 @@ class MeanTest : public ::testing::TestWithParam> { { raft::random::RngState r(params.seed); int len = rows * cols; - normal(handle, r, data.data(), len, params.mean, (T)1.0); + normal(handle, r, data.data(), len, params.mean, params.stddev); meanSGtest(data.data(), stream); } @@ -96,38 +98,72 @@ class MeanTest : public ::testing::TestWithParam> { // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, - {2.0f, -1.f, 31, 120, false, false, 1234ULL}, {2.0f, -1.f, 1, 130, true, false, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, - {2.0f, -1.f, 31, 120, false, true, 1234ULL}, {2.0f, -1.f, 1, 130, false, true, 1234ULL}, - {2.0f, -1.f, 1, 1, false, false, 1234ULL}, {2.0f, -1.f, 1, 1, false, true, 1234ULL}, - {2.0f, -1.f, 7, 23, false, false, 1234ULL}, {2.0f, -1.f, 7, 23, false, true, 1234ULL}, - {2.0f, -1.f, 17, 5, false, false, 1234ULL}, {2.0f, -1.f, 17, 5, false, true, 1234ULL}}; + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, + {2.0f, -1.f, 31, 120, false, false, 1234ULL}, + {2.0f, -1.f, 1, 130, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, + {2.0f, -1.f, 31, 120, false, true, 1234ULL}, + {2.0f, -1.f, 1, 130, false, true, 1234ULL}, + {2.0f, -1.f, 1, 1, false, false, 1234ULL}, + {2.0f, -1.f, 1, 
1, false, true, 1234ULL}, + {2.0f, -1.f, 7, 23, false, false, 1234ULL}, + {2.0f, -1.f, 7, 23, false, true, 1234ULL}, + {2.0f, -1.f, 17, 5, false, false, 1234ULL}, + {2.0f, -1.f, 17, 5, false, true, 1234ULL}, + {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f}, + {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}}; const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, {0.15, -1.0, 1024, 256, false, true, 1234ULL}, - {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, - {2.0, -1.0, 31, 120, false, false, 1234ULL}, {2.0, -1.0, 1, 130, true, false, 1234ULL}, - {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, - {2.0, -1.0, 31, 120, false, true, 1234ULL}, {2.0, -1.0, 1, 130, false, true, 1234ULL}, - {2.0, -1.0, 1, 1, false, false, 1234ULL}, {2.0, -1.0, 1, 1, false, true, 1234ULL}, - {2.0, -1.0, 7, 23, false, false, 1234ULL}, {2.0, -1.0, 7, 23, false, true, 1234ULL}, - {2.0, -1.0, 17, 5, false, false, 1234ULL}, {2.0, -1.0, 17, 5, false, true, 1234ULL}}; + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}, + {0.15, -1.0, 1030, 1, false, false, 1234ULL}, + {0.15, -1.0, 1030, 60, true, false, 1234ULL}, + {2.0, -1.0, 31, 120, false, false, 1234ULL}, + {2.0, -1.0, 1, 130, false, false, 1234ULL}, + {0.15, -1.0, 1030, 1, false, true, 1234ULL}, + {0.15, -1.0, 1030, 60, true, true, 1234ULL}, + {2.0, -1.0, 31, 120, false, true, 1234ULL}, + {2.0, -1.0, 1, 130, false, true, 1234ULL}, + {2.0, -1.0, 1, 1, false, false, 1234ULL}, + {2.0, -1.0, 1, 1, false, true, 1234ULL}, + {2.0, -1.0, 7, 23, false, false, 1234ULL}, + {2.0, -1.0, 7, 23, false, true, 1234ULL}, + {2.0, -1.0, 17, 5, false, false, 1234ULL}, + {2.0, -1.0, 17, 5, false, true, 1234ULL}, + {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001}, + {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}}; typedef MeanTest MeanTestF; TEST_P(MeanTestF, Result) diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 641621c1c6..f4c5f92f49 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -39,7 +39,8 @@ struct StdDevInputs { template ::std::ostream& 
operator<<(::std::ostream& os, const StdDevInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -81,7 +82,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -99,7 +100,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -147,13 +148,15 @@ const std::vector> inputsf = { {0.5f, -1.f, 2.f, 31, 1, true, true, 1234ULL}, {1.f, -1.f, 2.f, 1, 257, false, true, 1234ULL}, {0.5f, -1.f, 2.f, 31, 1, false, false, 1234ULL}, - {1.f, -1.f, 2.f, 1, 257, true, false, 1234ULL}, + {1.f, -1.f, 2.f, 1, 257, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, false, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, false, 1234ULL}, {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, true, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, true, 1234ULL}, - {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}}; + {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, false, 1234ULL}}; const std::vector> inputsd = { {0.1, 1.0, 2.0, 1024, 32, true, false, 1234ULL}, @@ -177,13 +180,15 @@ const std::vector> inputsd = { {0.5, -1.0, 2.0, 31, 1, true, true, 1234ULL}, {1.0, -1.0, 2.0, 1, 257, false, true, 1234ULL}, {0.5, -1.0, 2.0, 31, 1, false, false, 1234ULL}, - {1.0, -1.0, 2.0, 1, 257, true, false, 1234ULL}, + {1.0, -1.0, 2.0, 1, 257, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, false, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, false, 1234ULL}, {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, true, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, true, 1234ULL}, - {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}}; + {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, false, 1234ULL}}; typedef StdDevTest StdDevTestF; TEST_P(StdDevTestF, Result) diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf2aa44a2c..fbb398cc5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -40,7 +40,8 @@ struct SumInputs { template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.rowMajor << ", " << dims.value << "}" << std::endl; } template @@ -57,13 +58,31 @@ class SumTest : public ::testing::TestWithParam> { } protected: - void runTest() + void runTest(bool checkErrorCompensation = false) { int len = rows * cols; + double large_factor = 1e7; + + if constexpr (std::is_same_v) large_factor = 1e12; + std::vector data_h(len); for (int i = 0; i < len; i++) { - data_h[i] = T(params.value); + data_h[i] = double(params.value); + int row = params.rowMajor ? 
i / cols : i % rows; + + // every 3 elements (in a column) contain 2 large dummy elements + // (one of them negative) and one element with 3x compensating + // for the 2 missing elements + if (checkErrorCompensation && row % 3 == 2) { + data_h[i] = double(params.value) * large_factor; + // compensate with opposite error 3 rows up + int idx2 = params.rowMajor ? (i - cols) : (i - 1); + data_h[idx2] = -1 * double(params.value) * large_factor; + // compensate 2 missing values + int idx3 = params.rowMajor ? (i - 2 * cols) : (i - 2); + data_h[idx3] = 3.0 * double(params.value); + } } raft::update_device(data.data(), data_h.data(), len, stream); @@ -83,8 +102,10 @@ class SumTest : public ::testing::TestWithParam> { double expected = double(params.rows) * params.value; + double tolerance = checkErrorCompensation ? 100 * params.tolerance : params.tolerance; + ASSERT_TRUE(raft::devArrMatch( - T(expected), sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); + T(expected), sum_act.data(), params.cols, raft::CompareApprox(tolerance))); } protected: @@ -96,43 +117,29 @@ class SumTest : public ::testing::TestWithParam> { rmm::device_uvector data, sum_act; }; -const std::vector> inputsf = {{0.0001f, 4, 5, true, 1}, - {0.0001f, 1024, 32, true, 1}, - {0.0001f, 1024, 256, true, 1}, - {0.0001f, 100000000, 1, true, 0.001}, - {0.0001f, 1, 30, true, 0.001}, - {0.0001f, 1, 1, true, 0.001}, - {0.0001f, 17, 5, true, 0.001}, - {0.0001f, 7, 23, true, 0.001}, - {0.0001f, 3, 97, true, 0.001}, - {0.0001f, 4, 5, false, 1}, - {0.0001f, 1024, 32, false, 1}, - {0.0001f, 1024, 256, false, 1}, - {0.0001f, 100000000, 1, false, 0.001}, - {0.0001f, 1, 30, false, 0.001}, - {0.0001f, 1, 1, false, 0.001}, - {0.0001f, 17, 5, false, 0.001}, - {0.0001f, 7, 23, false, 0.001}, - {0.0001f, 3, 97, false, 0.001}}; - -const std::vector> inputsd = {{0.000001, 1024, 32, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 100000000, 1, true, 0.001}, - {0.000001, 1, 30, true, 0.0001}, - {0.000001, 1, 1, true, 0.0001}, - {0.000001, 17, 5, true, 0.0001}, - {0.000001, 7, 23, true, 0.0001}, - {0.000001, 3, 97, true, 0.0001}, - {0.000001, 1024, 32, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 100000000, 1, false, 0.001}, - {0.000001, 1, 30, false, 0.0001}, - {0.000001, 1, 1, false, 0.0001}, - {0.000001, 17, 5, false, 0.0001}, - {0.000001, 7, 23, false, 0.0001}, - {0.000001, 3, 97, false, 0.0001}}; +const std::vector> inputsf = { + {0.0001f, 4, 5, true, 1}, {0.0001f, 1024, 32, true, 1}, + {0.0001f, 1024, 256, true, 1}, {0.0001f, 100000000, 1, true, 0.001}, + {0.0001f, 1 << 27, 2, true, 0.1}, {0.0001f, 1, 30, true, 0.001}, + {0.0001f, 1, 1, true, 0.001}, {0.0001f, 17, 5, true, 0.001}, + {0.0001f, 7, 23, true, 0.001}, {0.0001f, 3, 97, true, 0.001}, + {0.0001f, 4, 5, false, 1}, {0.0001f, 1024, 32, false, 1}, + {0.0001f, 1024, 256, false, 1}, {0.0001f, 100000000, 1, false, 0.001}, + {0.0001f, 1 << 27, 2, false, 0.1}, {0.0001f, 1, 30, false, 0.001}, + {0.0001f, 1, 1, false, 0.001}, {0.0001f, 17, 5, false, 0.001}, + {0.0001f, 7, 23, false, 0.001}, {0.0001f, 3, 97, false, 0.001}}; + +const std::vector> inputsd = { + {0.000001, 1024, 32, true, 1}, {0.000001, 1024, 256, true, 1}, + {0.000001, 1024, 256, true, 1}, {0.000001, 100000000, 1, true, 0.001}, + {1e-9, 1 << 27, 2, true, 0.1}, {0.000001, 1, 30, true, 0.0001}, + {0.000001, 1, 1, true, 0.0001}, {0.000001, 17, 5, true, 0.0001}, + {0.000001, 7, 23, true, 0.0001}, {0.000001, 3, 97, true, 0.0001}, + 
{0.000001, 1024, 32, false, 1}, {0.000001, 1024, 256, false, 1}, + {0.000001, 1024, 256, false, 1}, {0.000001, 100000000, 1, false, 0.001}, + {1e-9, 1 << 27, 2, false, 0.1}, {0.000001, 1, 30, false, 0.0001}, + {0.000001, 1, 1, false, 0.0001}, {0.000001, 17, 5, false, 0.0001}, + {0.000001, 7, 23, false, 0.0001}, {0.000001, 3, 97, false, 0.0001}}; typedef SumTest SumTestF; typedef SumTest SumTestD; @@ -140,6 +147,9 @@ typedef SumTest SumTestD; TEST_P(SumTestF, Result) { runTest(); } TEST_P(SumTestD, Result) { runTest(); } +TEST_P(SumTestF, Accuracy) { runTest(true); } +TEST_P(SumTestD, Accuracy) { runTest(true); } + INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf)); INSTANTIATE_TEST_CASE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index cf9a885cfb..63ec74d1da 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -55,6 +55,23 @@ struct CompareApprox { T eps; }; +template +struct CompareApproxNaN { + CompareApproxNaN(T eps_) : eps(eps_) {} + bool operator()(const T& a, const T& b) const + { + T diff = std::abs(a - b); + T m = std::max(std::abs(a), std::abs(b)); + T ratio = diff > eps ? diff / m : diff; + + if (std::isnan(a) && std::isnan(b)) { return true; } + return (ratio <= eps); + } + + private: + T eps; +}; + template ::std::ostream& operator<<(::std::ostream& os, const raft::KeyValuePair& kv) { diff --git a/cpp/test/util/popc.cu b/cpp/test/util/popc.cu new file mode 100644 index 0000000000..c08faacb07 --- /dev/null +++ b/cpp/test/util/popc.cu @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +template +struct PopcInputs { + index_t n_rows; + index_t n_cols; + float sparsity; + bool owning; +}; + +template +class PopcTest : public ::testing::TestWithParam> { + public: + PopcTest() + : stream(resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + bits_d(0, stream) + { + } + + protected: + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void SetUp() override + { + index_t element = raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8)); + std::vector bits_h(element); + + nnz_expected = create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + bits_d.resize(bits_h.size(), stream); + update_device(bits_d.data(), bits_h.data(), bits_h.size(), stream); + + resource::sync_stream(handle); + } + + void Run() + { + auto bits_view = + raft::make_device_vector_view(bits_d.data(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + + index_t nnz_actual_h = 0; + rmm::device_scalar nnz_actual_d(0, stream); + auto nnz_actual_view = raft::make_device_scalar_view(nnz_actual_d.data()); + + raft::popc(handle, bits_view, max_len_view, nnz_actual_view); + raft::copy(&nnz_actual_h, nnz_actual_d.data(), 1, stream); + resource::sync_stream(handle); + + ASSERT_EQ(nnz_expected, nnz_actual_h); + } + + protected: + raft::resources handle; + cudaStream_t stream; + + PopcInputs params; + rmm::device_uvector bits_d; + index_t nnz_expected; +}; + +using PopcTestI32 = PopcTest; +TEST_P(PopcTestI32, Result) { Run(); } + +template +const std::vector> popc_inputs = { + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, +}; + +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs)); + +} // namespace raft diff --git a/dependencies.yaml b/dependencies.yaml index 98fc7fa8fc..34e7998ddf 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,10 +3,10 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64, aarch64] includes: - - build + - rapids_build - build_pylibraft - cuda - cuda_version @@ -14,9 +14,10 @@ files: - depends_on_distributed_ucxx - develop - checks - - build_wheels - test_libraft - docs + - rapids_build_setuptools + - rapids_build_skbuild - 
run_raft_dask - run_pylibraft - test_python_common @@ -27,12 +28,13 @@ files: cuda: ["11.8", "12.0"] arch: [x86_64, aarch64] includes: - - build + - rapids_build - cuda - cuda_version - develop - nn_bench - nn_bench_python + - rapids_build_skbuild test_cpp: output: none includes: @@ -65,7 +67,15 @@ files: extras: table: build-system includes: - - build + - rapids_build_skbuild + py_rapids_build_pylibraft: + output: pyproject + pyproject_dir: python/pylibraft + extras: + table: tool.rapids-build-backend + key: requires + includes: + - rapids_build - build_pylibraft py_run_pylibraft: output: pyproject @@ -90,7 +100,15 @@ files: extras: table: build-system includes: - - build + - rapids_build_skbuild + py_rapids_build_raft_dask: + output: pyproject + pyproject_dir: python/raft-dask + extras: + table: tool.rapids-build-backend + key: requires + includes: + - rapids_build - depends_on_ucx_build py_run_raft_dask: output: pyproject @@ -114,7 +132,7 @@ files: extras: table: build-system includes: - - build_wheels + - rapids_build_setuptools py_run_raft_ann_bench: output: pyproject pyproject_dir: python/raft-ann-bench @@ -129,11 +147,22 @@ channels: - conda-forge - nvidia dependencies: - build: + rapids_build_skbuild: + common: + - output_types: [conda, requirements, pyproject] + packages: + - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 + - output_types: [conda] + packages: + - scikit-build-core>=0.7.0 + - output_types: [requirements, pyproject] + packages: + - scikit-build-core[pyproject]>=0.7.0 + rapids_build: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - cython>=3.0.0 - ninja - output_types: [conda] @@ -141,11 +170,7 @@ dependencies: - c-compiler - cxx-compiler - nccl>=2.9.9 - - libucxx==0.38.* - - scikit-build-core>=0.7.0 - - output_types: [requirements, pyproject] - packages: - - scikit-build-core[pyproject]>=0.7.0 + - libucxx==0.39.* specific: - output_types: conda matrices: @@ -184,7 +209,7 @@ dependencies: common: - output_types: [conda] packages: - - &rmm_conda rmm==24.6.* + - &rmm_unsuffixed rmm==24.8.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -198,18 +223,26 @@ dependencies: cuda: "12.*" packages: - &cuda_python12 cuda-python>=12.0,<13.0a0 - - matrix: # All CUDA 11 versions + - matrix: + cuda: "11.*" packages: - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - matrix: + packages: + - &cuda_python cuda-python - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - &rmm_cu12 rmm-cu12==24.6.* - - matrix: {cuda: "11.*"} + - &rmm_cu12 rmm-cu12==24.8.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - &rmm_cu11 rmm-cu11==24.6.* - - {matrix: null, packages: [*rmm_conda] } + - &rmm_cu11 rmm-cu11==24.8.* + - {matrix: null, packages: [*rmm_unsuffixed] } checks: common: - output_types: [conda, requirements] @@ -231,7 +264,7 @@ dependencies: - h5py>=3.8.0 - benchmark>=1.8.2 - openblas - - *rmm_conda + - *rmm_unsuffixed nn_bench_python: common: - output_types: [conda] @@ -268,6 +301,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda @@ -375,12 +412,13 @@ dependencies: - recommonmark - sphinx-copybutton - sphinx-markdown-tables - build_wheels: + rapids_build_setuptools: common: - output_types: [requirements, 
pyproject] packages: - wheel - setuptools + - *rapids_build_backend py_version: specific: - output_types: conda @@ -407,7 +445,7 @@ dependencies: - &numpy numpy>=1.23,<2.0a0 - output_types: [conda] packages: - - *rmm_conda + - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -421,9 +459,13 @@ dependencies: cuda: "12.*" packages: - *cuda_python12 - - matrix: # All CUDA 11 versions + - matrix: + cuda: "11.*" packages: - *cuda_python11 + - matrix: + packages: + - *cuda_python - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} @@ -432,23 +474,20 @@ dependencies: - matrix: {cuda: "11.*"} packages: - *rmm_cu11 - - {matrix: null, packages: [*rmm_conda]} + - {matrix: null, packages: [*rmm_unsuffixed]} run_raft_dask: common: - output_types: [conda, pyproject] packages: - - dask-cuda==24.6.* + - dask-cuda==24.8.* - joblib>=0.11 - numba>=0.57 - *numpy - - rapids-dask-dependency==24.6.* - - ucx-py==0.38.* + - rapids-dask-dependency==24.8.* - output_types: conda packages: - - &ucx_py_conda ucx-py==0.38.* - - output_types: pyproject - packages: - - &pylibraft_conda pylibraft==24.6.* + - &pylibraft_unsuffixed pylibraft==24.8.* + - &ucx_py_unsuffixed ucx-py==0.39.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -458,15 +497,19 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - &pylibraft_cu12 pylibraft-cu12==24.6.* - - &ucx_py_cu12 ucx-py-cu12==0.38.* - - matrix: {cuda: "11.*"} + - &pylibraft_cu12 pylibraft-cu12==24.8.* + - &ucx_py_cu12 ucx-py-cu12==0.39.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - &pylibraft_cu11 pylibraft-cu11==24.6.* - - &ucx_py_cu11 ucx-py-cu11==0.38.* - - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]} + - &pylibraft_cu11 pylibraft-cu11==24.8.* + - &ucx_py_cu11 ucx-py-cu11==0.39.* + - {matrix: null, packages: [*pylibraft_unsuffixed, *ucx_py_unsuffixed]} test_python_common: common: - output_types: [conda, requirements, pyproject] @@ -485,7 +528,7 @@ dependencies: packages: # UCXX is not currently a hard-dependency thus only installed during tests, # this will change in the future. 
- - &distributed_ucxx_conda distributed-ucxx==0.38.* + - &distributed_ucxx_unsuffixed distributed-ucxx==0.39.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -494,18 +537,22 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - distributed-ucxx-cu12==0.38.* - - matrix: {cuda: "11.*"} + - distributed-ucxx-cu12==0.39.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - distributed-ucxx-cu11==0.38.* - - {matrix: null, packages: [*distributed_ucxx_conda]} + - distributed-ucxx-cu11==0.39.* + - {matrix: null, packages: [*distributed_ucxx_unsuffixed]} depends_on_ucx_build: common: - output_types: conda packages: - - &ucx_conda_build ucx==1.15.0 + - ucx==1.15.0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -514,10 +561,14 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - libucx-cu12==1.15.0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - libucx-cu11==1.15.0 - matrix: null @@ -527,7 +578,7 @@ dependencies: common: - output_types: conda packages: - - &ucx_conda_run ucx>=1.15.0 + - ucx>=1.15.0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -536,10 +587,14 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - libucx-cu12>=1.15.0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - libucx-cu11>=1.15.0 - matrix: null diff --git a/docs/source/ann_benchmarks_build.md b/docs/source/ann_benchmarks_build.md index 80730c5d68..56af8e555c 100644 --- a/docs/source/ann_benchmarks_build.md +++ b/docs/source/ann_benchmarks_build.md @@ -36,9 +36,12 @@ You can limit the algorithms that are built by providing a semicolon-delimited l ``` Available targets to use with `--limit-bench-ann` are: -- FAISS_IVF_FLAT_ANN_BENCH -- FAISS_IVF_PQ_ANN_BENCH -- FAISS_BFKNN_ANN_BENCH +- FAISS_GPU_IVF_FLAT_ANN_BENCH +- FAISS_GPU_IVF_PQ_ANN_BENCH +- FAISS_CPU_IVF_FLAT_ANN_BENCH +- FAISS_CPU_IVF_PQ_ANN_BENCH +- FAISS_GPU_FLAT_ANN_BENCH +- FAISS_CPU_FLAT_ANN_BENCH - GGNN_ANN_BENCH - HNSWLIB_ANN_BENCH - RAFT_CAGRA_ANN_BENCH diff --git a/docs/source/build.md b/docs/source/build.md index c0abf3f995..64f3bd01a2 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. 
You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ## Installing Python through Pip @@ -106,7 +106,7 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth Conda environment scripts are provided for installing the necessary dependencies to build both the C++ and Python libraries from source. It is preferred to use `mamba`, as it provides significant speedup over `conda`: ```bash -mamba env create --name rapids_raft -f conda/environments/all_cuda-122_arch-x86_64.yaml +mamba env create --name rapids_raft -f conda/environments/all_cuda-125_arch-x86_64.yaml mamba activate rapids_raft ``` @@ -257,19 +257,20 @@ RAFT's CMake has the following configurable flags available: | Flag | Possible Values | Default Value | Behavior | |---------------------------------|----------------------| --- |------------------------------------------------------------------------------| -| BUILD_TESTS | ON, OFF | ON | Compile Googletests | +| BUILD_TESTS | ON, OFF | ON | Compile Googletests | | BUILD_PRIMS_BENCH | ON, OFF | OFF | Compile benchmarks | -| BUILD_ANN_BENCH | ON, OFF | OFF | Compile end-to-end ANN benchmarks | +| BUILD_ANN_BENCH | ON, OFF | OFF | Compile end-to-end ANN benchmarks | | CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | | CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | | CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | -| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | -| raft_FIND_COMPONENTS | compiled distributed | | Configures the optional components as a space-separated list | +| CUDA_STATIC_MATH_LIBRARIES | ON, OFF | OFF | Statically link the CUDA math libraries | +| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | +| raft_FIND_COMPONENTS | compiled distributed | | Configures the optional components as a space-separated list | | RAFT_COMPILE_LIBRARY | ON, OFF | ON if either BUILD_TESTS or BUILD_PRIMS_BENCH is ON; otherwise OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_ENABLE_CUBLAS_DEPENDENCY | ON, OFF | ON | Link against cublas library in `raft::raft` | -| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against cusolver library in `raft::raft` | -| RAFT_ENABLE_CUSPARSE_DEPENDENCY | ON, OFF | ON | Link against cusparse library in `raft::raft` | -| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against curand library in `raft::raft` | +| RAFT_ENABLE_CUBLAS_DEPENDENCY | ON, OFF | ON | Link against cublas library in `raft::raft` | +| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against cusolver library in `raft::raft` | +| RAFT_ENABLE_CUSPARSE_DEPENDENCY | ON, OFF | ON | Link against cusparse library in `raft::raft` | +| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against curand library in `raft::raft` | | RAFT_NVTX | ON, OFF | OFF | Enable NVTX Markers | ### Build documentation diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 5e288e9f2f..36588f3450 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. 
Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 4b3aef5600..21a8404212 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10: ```bash -docker pull rapidsai/raft-ann-bench:24.06a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. +docker pull rapidsai/raft-ann-bench:24.08a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. ``` The CUDA and python versions can be changed for the supported values: @@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site: [//]: # () [//]: # (```bash) -[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.06-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) +[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.08-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) 
[//]: # (```) @@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10 \ + rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \ @@ -355,7 +355,7 @@ Usage of the above command is as follows: | Argument | Description | |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------| -| `rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | +| `rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | | `"--dataset deep-image-96-angular"` | Dataset name | | `"--normalize"` | Whether to normalize the dataset | | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` | @@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta export DATA_FOLDER=path/to/store/datasets/and/results docker run --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench-cpu:24.06a-py3.10 \ + rapidsai/raft-ann-bench-cpu:24.08a-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms hnswlib --batch-size 10 -k 10" \ @@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u) \ --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10 + rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 ``` This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 7a2d77041d..6cbe8e4cbf 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -53,6 +53,7 @@ if(NOT raft_FOUND) set(BUILD_ANN_BENCH OFF) set(RAFT_COMPILE_LIBRARY ON) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL) diff --git a/python/pylibraft/pylibraft/_version.py b/python/pylibraft/pylibraft/_version.py index 3b359e2ff3..0fa0ba80bc 100644 --- a/python/pylibraft/pylibraft/_version.py +++ b/python/pylibraft/pylibraft/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,22 @@ # limitations under the License. 
# - import importlib.resources __version__ = ( - importlib.resources.files("pylibraft") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/pylibraft/pylibraft/neighbors/hnsw.pyx b/python/pylibraft/pylibraft/neighbors/hnsw.pyx index aa589ffb65..e6f2d69eb8 100644 --- a/python/pylibraft/pylibraft/neighbors/hnsw.pyx +++ b/python/pylibraft/pylibraft/neighbors/hnsw.pyx @@ -52,6 +52,7 @@ from pylibraft.common.mdspan cimport ( from pylibraft.neighbors.common cimport _get_metric_string import os +import uuid import numpy as np @@ -292,7 +293,7 @@ def from_cagra(Index index, handle=None): Returns an hnswlib base-layer-only index from a CAGRA index. NOTE: This method uses the filesystem to write the CAGRA index in - `/tmp/cagra_index.bin` before reading it as an hnswlib index, + `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary file. Saving / loading the index is experimental. The serialization format is @@ -320,7 +321,8 @@ def from_cagra(Index index, handle=None): >>> # Serialize the CAGRA index to hnswlib base layer only index format >>> hnsw_index = hnsw.from_cagra(index, handle=handle) """ - filename = "/tmp/cagra_index.bin" + uuid_num = uuid.uuid4() + filename = f"/tmp/{uuid_num}.bin" save(filename, index, handle=handle) hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type), _get_metric_string(index.metric), handle=handle) diff --git a/python/pylibraft/pylibraft/test/test_version.py b/python/pylibraft/pylibraft/test/test_version.py new file mode 100644 index 0000000000..6d6c107bcc --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pylibraft + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(pylibraft.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(pylibraft.__version__, str) + assert len(pylibraft.__version__) > 0 diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index df1001538e..8b61df5d1b 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -15,14 +15,10 @@ [build-system] requires = [ - "cmake>=3.26.4", - "cuda-python>=11.7.1,<12.0a0", - "cython>=3.0.0", - "ninja", - "rmm==24.6.*", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" [project] name = "pylibraft" @@ -35,9 +31,9 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python", "numpy>=1.23,<2.0a0", - "rmm==24.6.*", + "rmm==24.8.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -117,6 +113,18 @@ provider = "scikit_build_core.metadata.regex" input = "pylibraft/VERSION" regex = "(?P.*)" +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +requires = [ + "cmake>=3.26.4,!=3.30.0", + "cuda-python", + "cython>=3.0.0", + "ninja", + "rmm==24.8.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" + [tool.pytest.ini_options] filterwarnings = [ "error", diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index e1f0e18304..d22dd567fe 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -1,8 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -61,3 +62,10 @@ skip = [ [tool.setuptools.dynamic] version = { file = "raft_ann_bench/VERSION" } + +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +requires = [] +dependencies-file = "../../dependencies.yaml" +commit-files = ["src/raft_ann_bench/GIT_COMMIT"] +matrix-entry = "cuda_suffixed=true" diff --git a/python/raft-ann-bench/src/raft_ann_bench/_version.py b/python/raft-ann-bench/src/raft_ann_bench/_version.py index 394acd755d..0fa0ba80bc 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/_version.py +++ b/python/raft-ann-bench/src/raft_ann_bench/_version.py @@ -13,13 +13,22 @@ # limitations under the License. # - import importlib.resources __version__ = ( - importlib.resources.files("raft_ann_bench") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py b/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py index 2b7b2728fe..e94ee56c92 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py +++ b/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
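> Editor's note: the `_version.py` rewrite in this PR (applied identically to `pylibraft`, `raft_ann_bench`, and `raft_dask`) resolves both module constants from package data instead of hard-coding `__git_commit__ = ""`. Below is a minimal standalone sketch of that behavior, assuming a package that ships a `VERSION` file and, in built distributions, a `GIT_COMMIT` file (per the `commit-files` entry under `[tool.rapids-build-backend]` above); the helper name and the `pylibraft` example in the `__main__` block are illustrative, not part of the patch.

```python
# Sketch of the VERSION/GIT_COMMIT lookup pattern used by the _version.py
# changes in this diff. RAFT's packages do the same thing with
# importlib.resources.files(__package__) instead of an explicit name.
import importlib.resources


def read_version_constants(package: str) -> tuple[str, str]:
    # VERSION is always bundled with the package, so this read should succeed.
    version = (
        importlib.resources.files(package).joinpath("VERSION").read_text().strip()
    )
    # GIT_COMMIT is only present in built distributions (written at build time,
    # per the rapids-build-backend commit-files setting); fall back to "" in,
    # for example, an editable install from a source checkout.
    try:
        commit = (
            importlib.resources.files(package)
            .joinpath("GIT_COMMIT")
            .read_text()
            .strip()
        )
    except FileNotFoundError:
        commit = ""
    return version, commit


if __name__ == "__main__":
    # Example: inspect an installed package that ships a VERSION file,
    # e.g. pylibraft (requires it to be installed in this environment).
    version, commit = read_version_constants("pylibraft")
    print(version, commit or "<GIT_COMMIT not present in this install>")
```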
@@ -51,3 +51,27 @@ def raft_cagra_search_constraints(params, build_params, k, batch_size): def hnswlib_search_constraints(params, build_params, k, batch_size): if "ef" in params: return params["ef"] >= k + + +def faiss_gpu_ivf_pq_build_constraints(params, dims): + ret = True + # M must be defined + ret = params["M"] <= dims and dims % params["M"] == 0 + if "use_raft" in params and params["use_raft"]: + return ret + pq_bits = 8 + if "bitsPerCode" in params: + pq_bits = params["bitsPerCode"] + lookup_table_size = 4 + if "useFloat16" in params and params["useFloat16"]: + lookup_table_size = 2 + # FAISS constraint to check if lookup table fits in shared memory + # for now hard code maximum shared memory per block to 49 kB (the value for A100 and V100) + return ret and lookup_table_size * params["M"] * (2**pq_bits) <= 49152 + + +def faiss_gpu_ivf_pq_search_constraints(params, build_params, k, batch_size): + ret = True + if "nlist" in build_params and "nprobe" in params: + ret = ret and build_params["nlist"] >= params["nprobe"] + return ret diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml new file mode 100644 index 0000000000..29c145f86d --- /dev/null +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml @@ -0,0 +1,10 @@ +name: faiss_cpu_ivf_flat +groups: + base: + build: + nlist: [2048] + ratio: [10] + useFloat16: [False] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml new file mode 100644 index 0000000000..a531ec8294 --- /dev/null +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml @@ -0,0 +1,18 @@ +name: faiss_cpu_ivf_pq +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192] + M: [48, 32, 16] + ratio: [10] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [1, 5, 10, 50, 100, 200] + large: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [10] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml index ed237becb3..e4abc35f5c 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml @@ -3,8 +3,19 @@ groups: base: build: nlist: [2048] - ratio: [1, 4, 10] - useFloat16: [False] + ratio: [10] + useFloat16: [False, True] + use_raft: [False] search: - nprobe: [2048] + nprobe: [1, 5, 10, 50, 100, 200] refine_ratio: [1] +groups: + baseraft: + build: + nlist: [2048] + ratio: [10] + useFloat16: [False, True] + use_raft: [True] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml index 87c3afc727..7560ceaa9c 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml @@ -1,12 +1,77 @@ name: faiss_gpu_ivf_pq +constraints: + build: 
raft-ann-bench.constraints.faiss_gpu_ivf_pq_build_constraints + search: raft-ann-bench.constraints.faiss_gpu_ivf_pq_search_constraints groups: base: build: nlist: [1024, 2048, 4096, 8192] - M: [8, 16] - ratio: [10, 25] + M: [64, 32, 16] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1, 2, 4] + baseraft: + build: + nlist: [1024, 2048, 4096, 8192] + M: [64, 32, 16] + ratio: [10] usePrecomputed: [False] - useFloat16: [False] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] search: nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1, 2, 4] + large: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [4] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2, 4] + largeraft: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [4] + usePrecomputed: [False] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2, 4] + 100M: + build: + nlist: [50000] + M: [48] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1] + 100Mraft: + build: + nlist: [50000] + M: [48] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml index 7eaec2b77b..bcdcde42a2 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml @@ -6,12 +6,36 @@ groups: base: build: nlist: [1024, 2048, 4096, 8192] - pq_dim: [64, 32] + pq_dim: [64, 32, 16] pq_bits: [8, 6, 5, 4] - ratio: [10, 25] + ratio: [10] niter: [25] search: nprobe: [1, 5, 10, 50, 100, 200] internalDistanceDtype: ["float"] smemLutDtype: ["float", "fp8", "half"] - refine_ratio: [1, 2, 4] \ No newline at end of file + refine_ratio: [1, 2, 4] + large: + build: + nlist: [8192, 16384, 32768, 65536] + pq_dim: [48, 32, 16] + pq_bits: [8, 6, 5, 4] + ratio: [4] + niter: [20] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + internalDistanceDtype: ["float"] + smemLutDtype: ["float", "fp8", "half"] + refine_ratio: [1, 2, 4] + 100M: + build: + nlist: [50000] + pq_dim: [48] + pq_bits: [8, 6, 5, 4] + ratio: [10] + niter: [10] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + internalDistanceDtype: ["float"] + smemLutDtype: ["float", "fp8", "half"] + refine_ratio: [1] diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 2c629f3b73..197ddae05f 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -51,6 +51,7 @@ if(NOT raft_FOUND) set(RAFT_COMPILE_DIST_LIBRARY OFF) set(RAFT_COMPILE_NN_LIBRARY OFF) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) set(RAFT_DASK_UCXX_STATIC ON) add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL) diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake 
b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake index 8e340eec73..de6a4b109c 100644 --- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake +++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake @@ -47,9 +47,9 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft -find_and_configure_ucxx(VERSION 0.38 +find_and_configure_ucxx(VERSION 0.39 FORK rapidsai - PINNED_TAG branch-0.38 + PINNED_TAG branch-0.39 EXCLUDE_FROM_ALL YES UCXX_STATIC ${RAFT_DASK_UCXX_STATIC} ) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index e8ded4cd4a..29c0db0048 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -14,12 +14,9 @@ [build-system] -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" requires = [ - "cmake>=3.26.4", - "cython>=3.0.0", - "libucx==1.15.0", - "ninja", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -34,14 +31,14 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==24.6.*", - "distributed-ucxx==0.38.*", + "dask-cuda==24.8.*", + "distributed-ucxx==0.39.*", "joblib>=0.11", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibraft==24.6.*", - "rapids-dask-dependency==24.6.*", - "ucx-py==0.38.*", + "pylibraft==24.8.*", + "rapids-dask-dependency==24.8.*", + "ucx-py==0.39.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -118,3 +115,14 @@ wheel.packages = ["raft_dask"] provider = "scikit_build_core.metadata.regex" input = "raft_dask/VERSION" regex = "(?P.*)" + +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +requires = [ + "cmake>=3.26.4,!=3.30.0", + "cython>=3.0.0", + "libucx==1.15.0", + "ninja", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/raft-dask/raft_dask/_version.py b/python/raft-dask/raft_dask/_version.py index 03cddef557..0fa0ba80bc 100644 --- a/python/raft-dask/raft_dask/_version.py +++ b/python/raft-dask/raft_dask/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,22 @@ # limitations under the License. 
# - import importlib.resources __version__ = ( - importlib.resources.files("raft_dask") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/raft-dask/raft_dask/test/test_version.py b/python/raft-dask/raft_dask/test/test_version.py new file mode 100644 index 0000000000..94ba33d051 --- /dev/null +++ b/python/raft-dask/raft_dask/test/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import raft_dask + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(raft_dask.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(raft_dask.__version__, str) + assert len(raft_dask.__version__) > 0
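> Editor's note: the new `faiss_gpu_ivf_pq_build_constraints` / `faiss_gpu_ivf_pq_search_constraints` functions are wired into `faiss_gpu_ivf_pq.yaml` through the added top-level `constraints:` keys, so invalid parameter combinations can be dropped before any index is built. The sketch below shows how such a constraint can prune a YAML-style build grid; the filter loop and helper names are hypothetical (not the actual raft-ann-bench runner), and only the constraint logic mirrors the function added in `raft_ann_bench/constraints/__init__.py`. The search-side counterpart simply rejects combinations where `nprobe` exceeds the index's `nlist`.

```python
# Illustrative sketch: pruning a FAISS GPU IVF-PQ build-parameter grid with
# the constraint logic introduced in this PR.
import itertools

MAX_SMEM_PER_BLOCK = 49152  # bytes; the A100/V100 figure hard-coded in the diff


def build_constraint(params: dict, dims: int) -> bool:
    # M must not exceed, and must evenly divide, the dataset dimensionality.
    ok = params["M"] <= dims and dims % params["M"] == 0
    if params.get("use_raft"):
        return ok
    pq_bits = params.get("bitsPerCode", 8)
    lookup_table_size = 2 if params.get("useFloat16") else 4
    # FAISS requires the PQ lookup table to fit in shared memory per block.
    return ok and lookup_table_size * params["M"] * (2 ** pq_bits) <= MAX_SMEM_PER_BLOCK


def expand_group(group: dict, dims: int):
    # Cartesian product of the "build" lists, keeping only valid combinations.
    keys = list(group["build"])
    for values in itertools.product(*(group["build"][k] for k in keys)):
        params = dict(zip(keys, values))
        if build_constraint(params, dims):
            yield params


if __name__ == "__main__":
    # A trimmed version of the "base" group from faiss_gpu_ivf_pq.yaml.
    base = {"build": {"nlist": [1024], "M": [64, 32, 16], "bitsPerCode": [8],
                      "useFloat16": [False, True], "use_raft": [False]}}
    for cfg in expand_group(base, dims=96):  # e.g. deep-image-96-angular
        print(cfg)
```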