diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000..9d35e3f97f
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,34 @@
+# syntax=docker/dockerfile:1.5
+
+# BASE: image that both candidate stages below are built from.
+# PYTHON_PACKAGE_MANAGER: selects the "<value>-base" stage used for the final image (default: conda).
+ARG BASE
+ARG PYTHON_PACKAGE_MANAGER=conda
+
+FROM ${BASE} AS pip-base
+
+ENV DEFAULT_VIRTUAL_ENV=rapids
+
+FROM ${BASE} AS conda-base
+
+ENV DEFAULT_CONDA_ENV=rapids
+
+FROM ${PYTHON_PACKAGE_MANAGER}-base
+
+ARG CUDA
+ENV CUDAARCHS="RAPIDS"
+# Keep any CUDA_VERSION already defined in the environment; otherwise fall back to the CUDA build arg.
+ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
+
+# An ARG declared before the first FROM is not visible inside a stage until it is re-declared.
+ARG PYTHON_PACKAGE_MANAGER
+ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}"
+
+ENV PYTHONSAFEPATH="1"
+ENV PYTHONUNBUFFERED="1"
+ENV PYTHONDONTWRITEBYTECODE="1"
+
+ENV SCCACHE_REGION="us-east-2"
+ENV SCCACHE_BUCKET="rapids-sccache-devs"
+ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV HISTFILE="/home/coder/.cache/._bash_history"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000000..3c76b8963d
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,65 @@
+# RAFT Development Containers
+
+This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces).
+
+This container is a turnkey development environment for building and testing the RAFT C++ and Python libraries.
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+* [Using the devcontainer](#using-the-devcontainer)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/raft`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the RAFT repo and select the "Reopen in Container" button in the bottom right:<br/><img src="https://user-images.githubusercontent.com/178183/221771999-97ab29d5-e718-4e5f-b32f-2cdd51bba25c.png"/>
+
+Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command.
+
+## Using the devcontainer
+
+On startup, the devcontainer creates or updates the conda/pip environment using `raft/dependencies.yaml`.
+
+The container includes convenience functions to clean, configure, and build the various RAFT components:
+
+```shell
+$ clean-raft-cpp # only cleans the C++ build dir
+$ clean-pylibraft-python # only cleans the Python build dir
+$ clean-raft # cleans both C++ and Python build dirs
+
+$ configure-raft-cpp # only configures raft C++ lib
+
+$ build-raft-cpp # only builds raft C++ lib
+$ build-pylibraft-python # only builds raft Python lib
+$ build-raft # builds both C++ and Python libs
+```
+
+* The C++ build script is a small wrapper around `cmake -S ~/raft/cpp -B ~/raft/cpp/build` and `cmake --build ~/raft/cpp/build`
+* The Python build script is a small wrapper around `pip install --editable ~/raft/python/pylibraft`
+
+Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots:
+
+```shell
+$ cmake -S ~/raft/cpp -B ~/raft/cpp/build
+$ CMAKE_ARGS="-Draft_ROOT=~/raft/cpp/build" \
+  pip install -e ~/raft/python/pylibraft  # <-- the CMAKE_ARGS value is injected automatically
+```
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
new file mode 100644
index 0000000000..8da9b5428a
--- /dev/null
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
new file mode 100644
index 0000000000..0b3ec79e37
--- /dev/null
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -0,0 +1,38 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/ucx",
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json
new file mode 100644
index 0000000000..f5af166b46
--- /dev/null
+++ b/.devcontainer/cuda12.0-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json
new file mode 100644
index 0000000000..9f28002d38
--- /dev/null
+++ b/.devcontainer/cuda12.0-pip/devcontainer.json
@@ -0,0 +1,38 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/ucx",
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 00004c4e4d..107823d5ee 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -62,7 +62,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4437e0dc85..e539877851 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -22,6 +22,7 @@ jobs:
       - wheel-tests-pylibraft
       - wheel-build-raft-dask
       - wheel-tests-raft-dask
+      - devcontainer
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
   checks:
@@ -62,7 +63,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibraft:
     needs: checks
@@ -92,3 +93,11 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
+  devcontainer:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10
+    with:
+      build_command: |
+        sccache -z;
+        build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose;
+        sccache -s;
diff --git a/.gitignore b/.gitignore
index 7939fc1622..11b7bc3eba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,3 +62,7 @@ _xml
 # sphinx
 _html
 _text
+
+# clang tooling
+compile_commands.json
+.clangd/
diff --git a/build.sh b/build.sh
index 071820ba93..6200e6a2fa 100755
--- a/build.sh
+++ b/build.sh
@@ -78,8 +78,8 @@ INSTALL_TARGET=install
 BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 
-TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
-BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
+TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;NEIGHBORS_ANN_NN_DESCENT_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
+BENCH_TARGETS="CLUSTER_BENCH;CORE_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
 
 CACHE_ARGS=""
 NVTX=ON
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index d2d2d08b99..a41f81152d 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 set -euo pipefail
 
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 6a7e319f5d..a867a71f68 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -47,10 +47,6 @@ sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cma
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pylibraft/__init__.py
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/raft_dask/__init__.py
 
-# Python pyproject.toml updates
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pyproject.toml
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/pyproject.toml
-
 # Wheel testing script
 sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh
 
@@ -74,6 +70,7 @@ for FILE in python/*/pyproject.toml; do
   for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
   done
+  sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" "${FILE}"
   sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE}
 done
 
@@ -94,3 +91,10 @@ sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxy
 sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md
 sed_runner "/GIT_TAG.*branch-/ s|branch-.*|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md
 sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md
+
+# .devcontainer files: bump the base-image tag and the devcontainer feature versions
+find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
+    sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@g" "${filename}"
+done
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index 676d642de9..fd9668e968 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -12,7 +12,7 @@ RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 
 # Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10
+python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/raft_dask*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 65b4232d83..739e1e9785 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,7 +10,7 @@ dependencies:
 - breathe
 - c-compiler
 - clang-tools=16.0.6
-- clang=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-python>=11.7.1,<12.0a0
@@ -19,10 +19,10 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.7.1
+- dask-core==2023.9.2
 - dask-cuda==23.10.*
-- dask>=2023.7.1
-- distributed>=2023.7.1
+- dask==2023.9.2
+- distributed==2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -43,6 +43,8 @@ dependencies:
 - numba>=0.57
 - numpy>=1.21
 - numpydoc
+- nvcc_linux-64=11.8
+- pre-commit
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 9db38ed1de..321c17bf4f 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -10,19 +10,20 @@ dependencies:
 - breathe
 - c-compiler
 - clang-tools=16.0.6
-- clang=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-cudart-dev
+- cuda-nvcc
 - cuda-profiler-api
 - cuda-python>=12.0,<13.0a0
 - cuda-version=12.0
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.7.1
+- dask-core==2023.9.2
 - dask-cuda==23.10.*
-- dask>=2023.7.1
-- distributed>=2023.7.1
+- dask==2023.9.2
+- distributed==2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -39,6 +40,7 @@ dependencies:
 - numba>=0.57
 - numpy>=1.21
 - numpydoc
+- pre-commit
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 5a9ef5bd32..4f1df12dfa 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -10,7 +10,7 @@ dependencies:
 - benchmark>=1.8.2
 - c-compiler
 - clang-tools=16.0.6
-- clang=16.0.6
+- clang==16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-version=11.8
@@ -34,6 +34,7 @@ dependencies:
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
+- nvcc_linux-64=11.8
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
index 91d0fdb729..a2ab0af643 100644
--- a/conda/recipes/raft-ann-bench/meta.yaml
+++ b/conda/recipes/raft-ann-bench/meta.yaml
@@ -78,11 +78,11 @@ requirements:
     - h5py {{ h5py_version }}
     - benchmark
     - matplotlib
-    # rmm is needed to determine if package is gpu-enabled
-    - rmm ={{ minor_version }}
     - python
     - pandas
     - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
 
   run:
     - python
@@ -104,6 +104,8 @@ requirements:
     - python
     - pandas
     - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
 about:
   home: https://rapids.ai/
   license: Apache-2.0
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index c9caa4dd9b..04dfef5063 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -60,10 +60,10 @@ requirements:
     - cudatoolkit
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
-    - dask >=2023.7.1
-    - dask-core >=2023.7.1
+    - dask ==2023.9.2
+    - dask-core ==2023.9.2
     - dask-cuda ={{ minor_version }}
-    - distributed >=2023.7.1
+    - distributed ==2023.9.2
     - joblib >=0.11
     - nccl >=2.9.9
     - pylibraft {{ version }}
diff --git a/cpp/.clangd b/cpp/.clangd
new file mode 100644
index 0000000000..7c4fe036dd
--- /dev/null
+++ b/cpp/.clangd
@@ -0,0 +1,65 @@
+# https://clangd.llvm.org/config
+
+# Apply a config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+
+---
+
+# Apply a config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+
+---
+
+# Apply a config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Add:
+    - "-x"
+    - "cuda"
+    # No error on unknown CUDA versions
+    - "-Wno-unknown-cuda-version"
+    # Allow variadic CUDA functions
+    - "-Xclang=-fcuda-allow-variadic-functions"
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Add:
+    # report all errors
+    - "-ferror-limit=0"
+    - "-fmacro-backtrace-limit=0"
+    - "-ftemplate-backtrace-limit=0"
+    # Skip the CUDA version check
+    - "--no-cuda-version-check"
+  Remove:
+    # remove gcc's -fcoroutines
+    - -fcoroutines
+    # remove nvc++ flags unknown to clang
+    - "-gpu=*"
+    - "-stdpar*"
+    # remove nvcc flags unknown to clang
+    - "-arch*"
+    - "-gencode*"
+    - "--generate-code*"
+    - "-ccbin*"
+    - "-t=*"
+    - "--threads*"
+    - "-Xptxas*"
+    - "-Xcudafe*"
+    - "-Xfatbin*"
+    - "-Xcompiler*"
+    - "--diag-suppress*"
+    - "--diag_suppress*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d93b19f784..7d63751906 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -22,7 +22,8 @@ include(rapids-find)
 
 option(BUILD_CPU_ONLY "Build CPU only components. Applies to RAFT ANN benchmarks currently" OFF)
 
-# workaround for rapids_cuda_init_architectures not working for arch detection with enable_language(CUDA)
+# workaround for rapids_cuda_init_architectures not working for arch detection with
+# enable_language(CUDA)
 set(lang_list "CXX")
 
 if(NOT BUILD_CPU_ONLY)
@@ -286,7 +287,8 @@ endif()
 set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled)
 
 if(RAFT_COMPILE_LIBRARY)
-  add_library(raft_objs OBJECT
+  add_library(
+    raft_objs OBJECT
     src/core/logger.cpp
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -331,6 +333,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
     src/neighbors/brute_force_knn_int_float_int.cu
     src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
+    src/neighbors/brute_force_knn_index_float.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
     src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
@@ -452,18 +455,21 @@ if(RAFT_COMPILE_LIBRARY)
     src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
     src/util/memory_pool.cpp
-    )
+  )
   set_target_properties(
     raft_objs
     PROPERTIES CXX_STANDARD 17
                CXX_STANDARD_REQUIRED ON
                CUDA_STANDARD 17
                CUDA_STANDARD_REQUIRED ON
-               POSITION_INDEPENDENT_CODE ON)
+               POSITION_INDEPENDENT_CODE ON
+  )
 
   target_compile_definitions(raft_objs PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
-  target_compile_options(raft_objs PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
-                                           "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>")
+  target_compile_options(
+    raft_objs PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
+                      "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
+  )
 
   add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
   add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
@@ -477,13 +483,15 @@ if(RAFT_COMPILE_LIBRARY)
   )
 
   foreach(target raft_lib raft_lib_static raft_objs)
-    target_link_libraries(${target} PUBLIC
-      raft::raft
-      ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this
-                                    # will just be cublas
-      $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>)
+    target_link_libraries(
+      ${target}
+      PUBLIC raft::raft
+             ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this
+                                           # will just be cublas
+             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    )
 
-    #So consumers know when using libraft.so/libraft.a
+    # So consumers know when using libraft.so/libraft.a
     target_compile_definitions(${target} PUBLIC "RAFT_COMPILED")
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index 7ba381ab0a..a9ff6c2922 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -147,6 +147,13 @@ void parse_build_param(const nlohmann::json& conf,
   if (conf.contains("intermediate_graph_degree")) {
     param.intermediate_graph_degree = conf.at("intermediate_graph_degree");
   }
+  if (conf.contains("graph_build_algo")) {
+    if (conf.at("graph_build_algo") == "IVF_PQ") {
+      param.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ;
+    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
+      param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT;
+    }
+  }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index e8d4739384..ca4b0f099d 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -77,6 +77,7 @@ if(BUILD_PRIMS_BENCH)
     NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
     bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
+  ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp)
 
   ConfigureBench(
     NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
@@ -155,4 +156,5 @@ if(BUILD_PRIMS_BENCH)
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
+
 endif()
diff --git a/cpp/bench/prims/core/bitset.cu b/cpp/bench/prims/core/bitset.cu
new file mode 100644
index 0000000000..5f44aa9af5
--- /dev/null
+++ b/cpp/bench/prims/core/bitset.cu
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/random/rng.cuh>
+#include <rmm/device_uvector.hpp>
+
+#include <cstdint>
+#include <vector>
+
+namespace raft::bench::core {
+
+/** Sizes (in elements) that parameterize one bitset benchmark case. */
+struct bitset_inputs {
+  uint32_t bitset_len;  // length handed to the bitset constructor
+  uint32_t mask_len;    // number of mask indices handed to the bitset constructor
+  uint32_t query_len;   // number of indices handed to bitset::test()
+};  // struct bitset_inputs
+
+/**
+ * Benchmark fixture that times constructing a raft::core::bitset from a list of mask indices
+ * followed by a bulk `test()` of query indices.
+ *
+ * @tparam bitset_t first template argument forwarded to raft::core::bitset
+ * @tparam index_t  type of the mask/query indices
+ */
+template <typename bitset_t, typename index_t>
+struct bitset_bench : public fixture {
+  bitset_bench(const bitset_inputs& p)
+    : params(p),
+      mask{raft::make_device_vector<index_t, index_t>(res, p.mask_len)},
+      queries{raft::make_device_vector<index_t, index_t>(res, p.query_len)},
+      outputs{raft::make_device_vector<bool, index_t>(res, p.query_len)}
+  {
+    raft::random::RngState state{42};
+    raft::random::uniformInt(res, state, mask.view(), index_t{0}, index_t{p.bitset_len});
+    // Fill the queries as well: left uninitialized they would hold arbitrary values, which are
+    // not guaranteed to be valid positions inside the bitset.
+    raft::random::uniformInt(res, state, queries.view(), index_t{0}, index_t{p.bitset_len});
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Each timed iteration covers both the bitset construction and the query pass.
+    loop_on_state(state, [this]() {
+      auto my_bitset = raft::core::bitset<bitset_t, index_t>(
+        this->res, raft::make_const_mdspan(mask.view()), params.bitset_len);
+      my_bitset.test(res, raft::make_const_mdspan(queries.view()), outputs.view());
+    });
+  }
+
+ private:
+  raft::resources res;  // declared first: the vectors below are allocated through it
+  bitset_inputs params;
+  raft::device_vector<index_t, index_t> mask, queries;
+  raft::device_vector<bool, index_t> outputs;
+};  // struct bitset_bench
+
+const std::vector<bitset_inputs> bitset_input_vecs{
+  {256 * 1024 * 1024, 64 * 1024 * 1024, 256 * 1024 * 1024},    // Standard Bench
+  {256 * 1024 * 1024, 64 * 1024 * 1024, 1024 * 1024 * 1024},   // Extra queries
+  {128 * 1024 * 1024, 1024 * 1024 * 1024, 256 * 1024 * 1024},  // Extra mask to test atomics impact
+};
+
+// Aliases are named <bitset_t width>_<index_t width>.
+using Uint8_32  = bitset_bench<uint8_t, uint32_t>;
+using Uint16_64 = bitset_bench<uint16_t, uint64_t>;
+using Uint32_32 = bitset_bench<uint32_t, uint32_t>;
+using Uint32_64 = bitset_bench<uint32_t, uint64_t>;
+
+RAFT_BENCH_REGISTER(Uint8_32, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint16_64, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint32_32, "", bitset_input_vecs);
+RAFT_BENCH_REGISTER(Uint32_64, "", bitset_input_vecs);
+
+}  // namespace raft::bench::core
diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh
index bb405088bb..63f6c14686 100644
--- a/cpp/bench/prims/neighbors/cagra_bench.cuh
+++ b/cpp/bench/prims/neighbors/cagra_bench.cuh
@@ -18,8 +18,10 @@
 
 #include <common/benchmark.hpp>
 #include <raft/neighbors/cagra.cuh>
+#include <raft/neighbors/sample_filter.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
+#include <thrust/sequence.h>
 
 #include <optional>
 
@@ -40,6 +42,8 @@ struct params {
   int block_size;
   int search_width;
   int max_iterations;
+  /** Ratio of removed indices. */
+  double removed_ratio;
 };
 
 template <typename T, typename IdxT>
@@ -49,7 +53,8 @@ struct CagraBench : public fixture {
       params_(ps),
       queries_(make_device_matrix<T, int64_t>(handle, ps.n_queries, ps.n_dims)),
       dataset_(make_device_matrix<T, int64_t>(handle, ps.n_samples, ps.n_dims)),
-      knn_graph_(make_device_matrix<IdxT, int64_t>(handle, ps.n_samples, ps.degree))
+      knn_graph_(make_device_matrix<IdxT, int64_t>(handle, ps.n_samples, ps.degree)),
+      removed_indices_bitset_(handle, ps.n_samples)
   {
     // Generate random dataset and queriees
     raft::random::RngState state{42};
@@ -74,6 +79,13 @@ struct CagraBench : public fixture {
 
     auto metric = raft::distance::DistanceType::L2Expanded;
 
+    auto removed_indices =
+      raft::make_device_vector<IdxT, int64_t>(handle, ps.removed_ratio * ps.n_samples);
+    thrust::sequence(
+      resource::get_thrust_policy(handle),
+      thrust::device_pointer_cast(removed_indices.data_handle()),
+      thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0)));
+    removed_indices_bitset_.set(handle, removed_indices.view());
     index_.emplace(raft::neighbors::cagra::index<T, IdxT>(
       handle, metric, make_const_mdspan(dataset_.view()), make_const_mdspan(knn_graph_.view())));
   }
@@ -95,10 +107,18 @@ struct CagraBench : public fixture {
       distances.data_handle(), params_.n_queries, params_.k);
 
     auto queries_v = make_const_mdspan(queries_.view());
-    loop_on_state(state, [&]() {
-      raft::neighbors::cagra::search(
-        this->handle, search_params, *this->index_, queries_v, ind_v, dist_v);
-    });
+    if (params_.removed_ratio > 0) {
+      auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view());
+      loop_on_state(state, [&]() {
+        raft::neighbors::cagra::search_with_filtering(
+          this->handle, search_params, *this->index_, queries_v, ind_v, dist_v, filter);
+      });
+    } else {
+      loop_on_state(state, [&]() {
+        raft::neighbors::cagra::search(
+          this->handle, search_params, *this->index_, queries_v, ind_v, dist_v);
+      });
+    }
 
     double data_size  = params_.n_samples * params_.n_dims * sizeof(T);
     double graph_size = params_.n_samples * params_.degree * sizeof(IdxT);
@@ -120,6 +140,7 @@ struct CagraBench : public fixture {
     state.counters["block_size"]    = params_.block_size;
     state.counters["search_width"]  = params_.search_width;
     state.counters["iterations"]    = iterations;
+    state.counters["removed_ratio"] = params_.removed_ratio;
   }
 
  private:
@@ -128,6 +149,7 @@ struct CagraBench : public fixture {
   raft::device_matrix<T, int64_t, row_major> queries_;
   raft::device_matrix<T, int64_t, row_major> dataset_;
   raft::device_matrix<IdxT, int64_t, row_major> knn_graph_;
+  raft::core::bitset<std::uint32_t, IdxT> removed_indices_bitset_;
 };
 
 inline const std::vector<params> generate_inputs()
@@ -141,7 +163,8 @@ inline const std::vector<params> generate_inputs()
                                            {64},                   // itopk_size
                                            {0},                    // block_size
                                            {1},                    // search_width
-                                           {0}                     // max_iterations
+                                           {0},                    // max_iterations
+                                           {0.0}                   // removed_ratio
     );
   auto inputs2 = raft::util::itertools::product<params>({2000000ull, 10000000ull},  // n_samples
                                                         {128},                      // dataset dim
@@ -151,7 +174,22 @@ inline const std::vector<params> generate_inputs()
                                                         {64},  // itopk_size
                                                         {64, 128, 256, 512, 1024},  // block_size
                                                         {1},                        // search_width
-                                                        {0}  // max_iterations
+                                                        {0},   // max_iterations
+                                                        {0.0}  // removed_ratio
+  );
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  inputs2 = raft::util::itertools::product<params>(
+    {2000000ull, 10000000ull},                 // n_samples
+    {128},                                     // dataset dim
+    {1, 10, 10000},                            // n_queries
+    {255},                                     // k
+    {64},                                      // knn graph degree
+    {300},                                     // itopk_size
+    {256},                                     // block_size
+    {2},                                       // search_width
+    {0},                                       // max_iterations
+    {0.0, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64}  // removed_ratio
   );
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
   return inputs;
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
new file mode 100644
index 0000000000..6747c5fab0
--- /dev/null
+++ b/cpp/include/raft/core/bitset.cuh
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_container_policy.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <thrust/for_each.h>
+
+namespace raft::core {
+/**
+ * @defgroup bitset Bitset
+ * @{
+ */
+/**
+ * @brief View of a RAFT Bitset.
+ *
+ * This lightweight structure stores a pointer to a bitset in device memory together with its
+ * length in bits. It provides a test() device function to check if a given index is set.
+ *
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset_view {
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Create a bitset view from a device vector view of the bitset.
+   *
+   * @param bitset_span Device vector view of the bitset elements (only its data pointer is kept)
+   * @param bitset_len Number of bits in the bitset
+   */
+  _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
+                                index_t bitset_len)
+    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Device function to test if a given index is set in the bitset.
+   *
+   * @param sample_index Single index to test (not checked against the bitset length)
+   * @return bool True if the bit at sample_index is set
+   */
+  inline _RAFT_DEVICE auto test(const index_t sample_index) const -> bool
+  {
+    const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
+    const index_t bit_index    = sample_index % bitset_element_size;
+    const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+    return is_bit_set;
+  }
+
+  /**
+   * @brief Get the device pointer to the first element of the bitset.
+   */
+  inline _RAFT_HOST_DEVICE auto data_handle() -> bitset_t* { return bitset_ptr_; }
+  inline _RAFT_HOST_DEVICE auto data_handle() const -> const bitset_t* { return bitset_ptr_; }
+  /**
+   * @brief Get the number of bits of the bitset representation.
+   */
+  inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; }
+
+  /**
+   * @brief Get the number of bitset_t elements, i.e. ceildiv(size(), bitset_element_size).
+   */
+  inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t
+  {
+    return raft::ceildiv(bitset_len_, bitset_element_size);
+  }
+
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+  inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+
+ private:
+  bitset_t* bitset_ptr_;
+  index_t bitset_len_;
+};
+
+/**
+ * @brief RAFT Bitset.
+ *
+ * This structure encapsulates a bitset in device memory. It provides a view() method to get a
+ * device-usable lightweight view of the bitset.
+ * Each index is represented by a single bit in the bitset. Storage is rounded up to whole
+ * bitset_t elements, i.e. ceildiv(bitset_len, 8 * sizeof(bitset_t)) * sizeof(bitset_t) bytes.
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset {
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+
+  /**
+   * @brief Construct a bitset where only the listed indices differ from default_value.
+   *
+   * @param res RAFT resources
+   * @param mask_index List of indices whose bits are set to !default_value
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res,
+         raft::device_vector_view<const index_t, index_t> mask_index,
+         index_t bitset_len,
+         bool default_value = true)
+    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+              raft::resource::get_cuda_stream(res)},
+      bitset_len_{bitset_len},
+      default_value_{default_value}
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+    set(res, mask_index, !default_value);
+  }
+
+  /**
+   * @brief Construct a new bitset object with every bit equal to default_value.
+   *
+   * @param res RAFT resources
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res, index_t bitset_len, bool default_value = true)
+    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+              resource::get_cuda_stream(res)},
+      bitset_len_{bitset_len},
+      default_value_{default_value}
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+  }
+  // Non-copyable, movable
+  bitset(const bitset&)            = delete;
+  bitset(bitset&&)                 = default;
+  bitset& operator=(const bitset&) = delete;
+  bitset& operator=(bitset&&)      = default;
+
+  /**
+   * @brief Create a device-usable view of the bitset.
+   *
+   * @return bitset_view<bitset_t, index_t>
+   */
+  inline auto view() -> raft::core::bitset_view<bitset_t, index_t>
+  {
+    return bitset_view<bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+  [[nodiscard]] inline auto view() const -> raft::core::bitset_view<const bitset_t, index_t>
+  {
+    return bitset_view<const bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+
+  /**
+   * @brief Get the device pointer to the bitset.
+   */
+  inline auto data_handle() -> bitset_t* { return bitset_.data(); }
+  inline auto data_handle() const -> const bitset_t* { return bitset_.data(); }
+  /**
+   * @brief Get the number of bits of the bitset representation.
+   */
+  inline auto size() const -> index_t { return bitset_len_; }
+
+  /**
+   * @brief Get the number of bitset_t elements, i.e. ceildiv(size(), bitset_element_size).
+   */
+  inline auto n_elements() const -> index_t
+  {
+    return raft::ceildiv(bitset_len_, bitset_element_size);
+  }
+
+  /** @brief Get an mdspan view of the current bitset */
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+  [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+
+  /** @brief Resize the bitset. Elements added when growing are filled with the default value;
+   * bits of elements that already existed keep their current state. */
+  void resize(const raft::resources& res, index_t new_bitset_len)
+  {
+    auto old_size = raft::ceildiv(bitset_len_, bitset_element_size);
+    auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size);
+    bitset_.resize(new_size);
+    bitset_len_ = new_bitset_len;
+    if (old_size < new_size) {
+      // Only the freshly appended elements are initialized; surface any CUDA error
+      RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data() + old_size,
+                                    default_value_ ? 0xff : 0x00,
+                                    (new_size - old_size) * sizeof(bitset_t),
+                                    resource::get_cuda_stream(res)));
+    }
+  }
+
+  /**
+   * @brief Test a list of indices in a bitset.
+   *
+   * @tparam output_t Output type of the test. Default is bool.
+   * @param res RAFT resources
+   * @param queries List of indices to test
+   * @param output List of outputs, one per query (must have the same extent as queries)
+   */
+  template <typename output_t = bool>
+  void test(const raft::resources& res,
+            raft::device_vector_view<const index_t, index_t> queries,
+            raft::device_vector_view<output_t, index_t> output) const
+  {
+    RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size");
+    auto bitset_view = view();
+    raft::linalg::map(
+      res,
+      output,
+      [bitset_view] __device__(index_t query) { return output_t(bitset_view.test(query)); },
+      queries);
+  }
+  /**
+   * @brief Set a list of indices in a bitset to set_value.
+   *
+   * @param res RAFT resources
+   * @param mask_index indices whose bits are written (atomically, one bit per index)
+   * @param set_value Value to set the bits to (true or false)
+   */
+  void set(const raft::resources& res,
+           raft::device_vector_view<const index_t, index_t> mask_index,
+           bool set_value = false)
+  {
+    auto* bitset_ptr = this->data_handle();
+    thrust::for_each_n(resource::get_thrust_policy(res),
+                       mask_index.data_handle(),
+                       mask_index.extent(0),
+                       [bitset_ptr, set_value] __device__(const index_t sample_index) {
+                         const index_t bit_element = sample_index / bitset_element_size;
+                         const index_t bit_index   = sample_index % bitset_element_size;
+                         const bitset_t bitmask    = bitset_t{1} << bit_index;
+                         if (set_value) {
+                           atomicOr(bitset_ptr + bit_element, bitmask);
+                         } else {
+                           const bitset_t bitmask2 = ~bitmask;
+                           atomicAnd(bitset_ptr + bit_element, bitmask2);
+                         }
+                       });
+  }
+  /**
+   * @brief Flip all the bits in a bitset (unused bits of the last element are flipped too).
+   *
+   * @param res RAFT resources
+   */
+  void flip(const raft::resources& res)
+  {
+    auto bitset_span = this->to_mdspan();
+    raft::linalg::map(
+      res,
+      bitset_span,
+      [] __device__(bitset_t element) { return bitset_t(~element); },
+      raft::make_const_mdspan(bitset_span));
+  }
+  /**
+   * @brief Reset every bit of the bitset to the default value given at construction.
+   *
+   * @param res RAFT resources
+   */
+  void reset(const raft::resources& res)
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(bitset_.data(),
+                                  default_value_ ? 0xff : 0x00,
+                                  n_elements() * sizeof(bitset_t),
+                                  resource::get_cuda_stream(res)));
+  }
+
+ private:
+  raft::device_uvector<bitset_t> bitset_;
+  index_t bitset_len_;
+  bool default_value_;
+};
+
+/** @} */
+}  // end namespace raft::core
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
index 862db75866..b8c00616da 100644
--- a/cpp/include/raft/neighbors/brute_force-ext.cuh
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -22,7 +22,8 @@
 #include <raft/core/operators.hpp>           // raft::identity_op
 #include <raft/core/resources.hpp>           // raft::resources
 #include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
-#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
+#include <raft/neighbors/brute_force_types.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
@@ -38,6 +39,19 @@ inline void knn_merge_parts(
   size_t n_samples,
   std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt) RAFT_EXPLICIT;
 
+template <typename T, typename Accessor>
+index<T> build(raft::resources const& res,
+               mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+               raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+               T metric_arg                        = 0.0) RAFT_EXPLICIT;
+
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const index<T>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances) RAFT_EXPLICIT;
+
 template <typename idx_t,
           typename value_t,
           typename matrix_idx,
@@ -93,6 +107,29 @@ instantiate_raft_neighbors_brute_force_knn(
 
 #undef instantiate_raft_neighbors_brute_force_knn
 
+namespace raft::neighbors::brute_force {
+
+extern template void search<float, int>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+extern template void search<float, int64_t>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int64_t, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+extern template raft::neighbors::brute_force::index<float> build<float>(
+  raft::resources const& res,
+  raft::device_matrix_view<const float, int64_t, row_major> dataset,
+  raft::distance::DistanceType metric,
+  float metric_arg);
+}  // namespace raft::neighbors::brute_force
+
 #define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
   value_t, idx_t, idx_layout, query_layout)                             \
   extern template void raft::neighbors::brute_force::fused_l2_knn(      \
diff --git a/cpp/include/raft/neighbors/brute_force-inl.cuh b/cpp/include/raft/neighbors/brute_force-inl.cuh
index bc9e09e5b0..88439a738b 100644
--- a/cpp/include/raft/neighbors/brute_force-inl.cuh
+++ b/cpp/include/raft/neighbors/brute_force-inl.cuh
@@ -19,6 +19,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/brute_force_types.hpp>
 #include <raft/neighbors/detail/knn_brute_force.cuh>
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 
@@ -280,6 +281,101 @@ void fused_l2_knn(raft::resources const& handle,
                                          metric);
 }
 
-/** @} */  // end group brute_force_knn
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * @tparam T data element type
+ * @tparam Accessor accessor type of the dataset mdspan
+ * @param[in] res raft resources
+ * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This
+ *           is ignored if the metric_type is not Minkowski.
+ * NOTE(review): norms are computed on `dataset` as passed in; confirm host input is valid here.
+ * @return the constructed brute force index
+ */
+template <typename T, typename Accessor>
+index<T> build(raft::resources const& res,
+               mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+               raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+               T metric_arg                        = 0.0)
+{
+  // Expanded L2 / cosine metrics can reuse per-row norms of the index dataset, so they are
+  // pre-calculated here once instead of at every query
+  std::optional<device_vector<T, int64_t>> norms;
+  if (metric == raft::distance::DistanceType::L2Expanded ||
+      metric == raft::distance::DistanceType::L2SqrtExpanded ||
+      metric == raft::distance::DistanceType::CosineExpanded) {
+    norms = make_device_vector<T, int64_t>(res, dataset.extent(0));
+    // cosine needs the L2 norm (hence sqrt_op), whereas the L2 distances need the squared norm
+    if (metric == raft::distance::DistanceType::CosineExpanded) {
+      raft::linalg::norm(res,
+                         dataset,
+                         norms->view(),
+                         raft::linalg::NormType::L2Norm,
+                         raft::linalg::Apply::ALONG_ROWS,
+                         raft::sqrt_op{});
+    } else {
+      raft::linalg::norm(res,
+                         dataset,
+                         norms->view(),
+                         raft::linalg::NormType::L2Norm,
+                         raft::linalg::Apply::ALONG_ROWS);
+    }
+  }
+
+  return index<T>(res, dataset, std::move(norms), metric, metric_arg);
+}
 
+/**
+ * @brief Brute Force search using the constructed index.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] res raft resources
+ * @param[in] idx brute force index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
+ * k]; both outputs must have exactly n_queries rows and the same k
+ */
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const index<T>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances)
+{
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), "Value of k must match for outputs");
+  RAFT_EXPECTS(idx.dataset().extent(1) == queries.extent(1),
+               "Number of columns in queries must match brute force index");
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances must equal the number of queries");
+
+  std::vector<T*> dataset    = {const_cast<T*>(idx.dataset().data_handle())};
+  std::vector<int64_t> sizes = {idx.dataset().extent(0)};
+  std::vector<T*> norms;
+  if (idx.has_norms()) { norms.push_back(const_cast<T*>(idx.norms().data_handle())); }
+
+  detail::brute_force_knn_impl<int64_t, IdxT, T>(res,
+                                                 dataset,
+                                                 sizes,
+                                                 idx.dataset().extent(1),  // dimensionality
+                                                 const_cast<T*>(queries.data_handle()),
+                                                 queries.extent(0),
+                                                 neighbors.data_handle(),
+                                                 distances.data_handle(),
+                                                 neighbors.extent(1),  // k
+                                                 true,
+                                                 true,
+                                                 nullptr,
+                                                 idx.metric(),
+                                                 idx.metric_arg(),
+                                                 raft::identity_op(),
+                                                 norms.size() ? &norms : nullptr);
+}
+/** @} */  // end group brute_force_knn
 }  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp
new file mode 100644
index 0000000000..19dd6b8350
--- /dev/null
+++ b/cpp/include/raft/neighbors/brute_force_types.hpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+
+#include <raft/core/logger.hpp>
+
+namespace raft::neighbors::brute_force {
+/**
+ * @addtogroup brute_force
+ * @{
+ */
+
+/**
+ * @brief Brute Force index.
+ *
+ * The index refers to a dataset, and optionally its precomputed norms, in device memory.
+ *
+ * @tparam T data element type
+ */
+template <typename T>
+struct index : ann::index {
+ public:
+  /** Distance metric used for retrieval */
+  [[nodiscard]] constexpr inline raft::distance::DistanceType metric() const noexcept
+  {
+    return metric_;
+  }
+
+  /** Total length of the index (number of vectors). */
+  [[nodiscard]] constexpr inline int64_t size() const noexcept { return dataset_view_.extent(0); }
+
+  /** Dimensionality of the data. */
+  [[nodiscard]] constexpr inline uint32_t dim() const noexcept { return dataset_view_.extent(1); }
+
+  /** Dataset [size, dim] */
+  [[nodiscard]] inline auto dataset() const noexcept
+    -> device_matrix_view<const T, int64_t, row_major>
+  {
+    return dataset_view_;
+  }
+
+  /** Dataset norms; only valid when has_norms() is true (optional::value() throws otherwise) */
+  [[nodiscard]] inline auto norms() const -> device_vector_view<const T, int64_t, row_major>
+  {
+    return norms_view_.value();
+  }
+
+  /** Whether or not this index has dataset norms */
+  [[nodiscard]] inline bool has_norms() const noexcept { return norms_view_.has_value(); }
+
+  [[nodiscard]] inline T metric_arg() const noexcept { return metric_arg_; }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&)                    = delete;
+  index(index&&)                         = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index&      = default;
+  ~index()                               = default;
+
+  /** Construct a brute force index from dataset
+   *
+   * Constructs a brute force index from a dataset. This lets us precompute norms for
+   * the dataset, providing a speed benefit over doing this at query time.
+   *
+   * If the dataset is already in GPU memory, then this class stores a non-owning reference to
+   * the dataset. If the dataset is in host memory, it will be copied to the device and the
+   * index will own the device memory. Ownership of `norms` (if any) moves into the index.
+   */
+  template <typename data_accessor>
+  index(raft::resources const& res,
+        mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset,
+        std::optional<raft::device_vector<T, int64_t>>&& norms,
+        raft::distance::DistanceType metric,
+        T metric_arg = 0.0)
+    : ann::index(),
+      metric_(metric),
+      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      norms_(std::move(norms)),
+      metric_arg_(metric_arg)
+  {
+    if (norms_) { norms_view_ = make_const_mdspan(norms_.value().view()); }
+    update_dataset(res, dataset);
+    resource::sync_stream(res);
+  }
+
+  /** Construct a brute force index from dataset
+   *
+   * This class stores a non-owning reference to the dataset and norms here.
+   * Having precomputed norms gives us a performance advantage at query time.
+   */
+  index(raft::resources const& res,
+        raft::device_matrix_view<const T, int64_t, row_major> dataset_view,
+        std::optional<raft::device_vector_view<const T, int64_t>> norms_view,
+        raft::distance::DistanceType metric,
+        T metric_arg = 0.0)
+    : ann::index(),
+      metric_(metric),
+      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      norms_view_(norms_view),
+      dataset_view_(dataset_view),
+      metric_arg_(metric_arg)
+  {
+  }
+
+ private:
+  /**
+   * Replace the dataset with a new device dataset; only the (non-owning) view is stored.
+   */
+  void update_dataset(raft::resources const& res,
+                      raft::device_matrix_view<const T, int64_t, row_major> dataset)
+  {
+    dataset_view_ = dataset;
+  }
+
+  /**
+   * Replace the dataset with a new dataset.
+   *
+   * We create a copy of the dataset on the device. The index manages the lifetime of this copy.
+   */
+  void update_dataset(raft::resources const& res,
+                      raft::host_matrix_view<const T, int64_t, row_major> dataset)
+  {
+    dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), dataset.extent(1));
+    raft::copy(dataset_.data_handle(),
+               dataset.data_handle(),
+               dataset.size(),
+               resource::get_cuda_stream(res));
+    dataset_view_ = make_const_mdspan(dataset_.view());
+  }
+
+  raft::distance::DistanceType metric_;
+  raft::device_matrix<T, int64_t, row_major> dataset_;
+  std::optional<raft::device_vector<T, int64_t>> norms_;
+  std::optional<raft::device_vector_view<const T, int64_t>> norms_view_;
+  raft::device_matrix_view<const T, int64_t, row_major> dataset_view_;
+  T metric_arg_;
+};
+
+/** @} */
+
+}  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index 903d0571dc..f9682a973f 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -35,12 +35,11 @@ namespace raft::neighbors::cagra {
  */
 
 /**
- * @brief Build a kNN graph.
+ * @brief Build a kNN graph using IVF-PQ.
  *
  * The kNN graph is the first building block for CAGRA index.
- * This function uses the IVF-PQ method to build a kNN graph.
  *
- * The output is a dense matrix that stores the neighbor indices for each pont in the dataset.
+ * The output is a dense matrix that stores the neighbor indices for each point in the dataset.
  * Each point has the same number of neighbors.
  *
  * See [cagra::build](#cagra::build) for an alternative method.
@@ -52,16 +51,16 @@ namespace raft::neighbors::cagra {
  * @code{.cpp}
  *   using namespace raft::neighbors;
  *   // use default index parameters
- *   cagra::index_params build_params;
- *   cagra::search_params search_params
+ *   ivf_pq::index_params build_params;
+ *   ivf_pq::search_params search_params;
  *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
  *   // create knn graph
  *   cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
- *   auto optimized_gaph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 64);
+ *   auto optimized_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 64);
  *   cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view());
  *   // Construct an index from dataset and optimized knn_graph
  *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
- * optimized_graph.view());
+ *                                      optimized_graph.view());
  * @endcode
  *
  * @tparam DataT data element type
@@ -70,7 +69,7 @@ namespace raft::neighbors::cagra {
  * @param[in] res raft resources
  * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
  * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree]
- * @param[in] refine_rate refinement rate for ivf-pq search
+ * @param[in] refine_rate (optional) refinement rate for ivf-pq search
  * @param[in] build_params (optional) ivf_pq index building parameters for knn graph
  * @param[in] search_params (optional) ivf_pq search parameters
  */
@@ -95,6 +94,58 @@ void build_knn_graph(raft::resources const& res,
     res, dataset_internal, knn_graph_internal, refine_rate, build_params, search_params);
 }
 
+/**
+ * @brief Build a kNN graph using NN-descent.
+ *
+ * The kNN graph is the first building block for CAGRA index.
+ *
+ * The output is a dense matrix that stores the neighbor indices for each point in the dataset.
+ * Each point has the same number of neighbors.
+ *
+ * See [cagra::build](#cagra::build) for an alternative method.
+ *
+ * The following distance metrics are supported:
+ * - L2Expanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params build_params;
+ *   build_params.graph_degree = 128;
+ *   auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), 128);
+ *   // create knn graph
+ *   cagra::build_knn_graph(res, dataset, knn_graph.view(), build_params);
+ *   auto optimized_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), 64);
+ *   cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view());
+ *   // Construct an index from dataset and optimized knn_graph
+ *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
+ *                                      optimized_graph.view());
+ * @endcode
+ *
+ * @tparam DataT data element type
+ * @tparam IdxT type of the dataset vector indices
+ * @tparam accessor host or device accessor_type for the dataset
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] dataset input raft::host/device_matrix_view that can be located
+ *                in host or device memory
+ * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree]
+ * @param[in] build_params an instance of experimental::nn_descent::index_params holding the
+ *                     parameters used to run the nn-descent algorithm
+ */
+template <typename DataT,
+          typename IdxT = uint32_t,
+          typename accessor =
+            host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>>
+void build_knn_graph(raft::resources const& res,
+                     mdspan<const DataT, matrix_extent<int64_t>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, int64_t, row_major> knn_graph,
+                     experimental::nn_descent::index_params build_params)
+{
+  detail::build_knn_graph<DataT, IdxT>(res, dataset, knn_graph, build_params);
+}
+
 /**
  * @brief Sort a KNN graph index.
  * Preprocessing step for `cagra::optimize`: If a KNN graph is not built using
@@ -106,7 +157,7 @@ void build_knn_graph(raft::resources const& res,
  * @code{.cpp}
  *   using namespace raft::neighbors;
  *   cagra::index_params build_params;
- *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
+ *   auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
  *   // build KNN graph not using `cagra::build_knn_graph`
  *   // build(knn_graph, dataset, ...);
  *   // sort graph index
@@ -115,7 +166,7 @@ void build_knn_graph(raft::resources const& res,
  *   cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view());
  *   // Construct an index from dataset and optimized knn_graph
  *   auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset,
- * optimized_graph.view());
+ *                                      optimized_graph.view());
  * @endcode
  *
  * @tparam DataT type of the data in the source dataset
@@ -259,7 +310,16 @@ index<T, IdxT> build(raft::resources const& res,
   std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
     raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
 
-  build_knn_graph(res, dataset, knn_graph->view());
+  if (params.build_algo == graph_build_algo::IVF_PQ) {
+    build_knn_graph(res, dataset, knn_graph->view());
+
+  } else {
+    // Use nn-descent to build CAGRA knn graph
+    auto nn_descent_params                      = experimental::nn_descent::index_params();
+    nn_descent_params.graph_degree              = intermediate_degree;
+    nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree;
+    build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), nn_descent_params);
+  }
 
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
@@ -316,9 +376,88 @@ void search(raft::resources const& res,
   auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
     distances.data_handle(), distances.extent(0), distances.extent(1));
 
-  cagra::detail::search_main<T, internal_IdxT, IdxT>(
-    res, params, idx, queries_internal, neighbors_internal, distances_internal);
+  cagra::detail::search_main<T,
+                             internal_IdxT,
+                             decltype(raft::neighbors::filtering::none_cagra_sample_filter()),
+                             IdxT>(res,
+                                   params,
+                                   idx,
+                                   queries_internal,
+                                   neighbors_internal,
+                                   distances_internal,
+                                   raft::neighbors::filtering::none_cagra_sample_filter());
+}
+
+/**
+ * @brief Search ANN using the constructed index with the given sample filter.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *   // use default search parameters
+ *   cagra::search_params search_params;
+ *   // create a bitset to filter the search
+ *   auto removed_indices = raft::make_device_vector<IdxT>(res, n_removed_indices);
+ *   raft::core::bitset<std::uint32_t, IdxT> removed_indices_bitset(
+ *     res, removed_indices.view(), dataset.extent(0));
+ *   // search K nearest neighbours according to a bitset
+ *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
+ *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
+ *   cagra::search_with_filtering(res, search_params, index, queries, neighbors, distances,
+ *     filtering::bitset_filter(removed_indices_bitset.view()));
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ * @tparam CagraSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t sample_ix) -> bool`
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the search
+ * @param[in] idx cagra index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ *                       [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors
+ *                       [n_queries, k]
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
+ */
+template <typename T, typename IdxT, typename CagraSampleFilterT>
+void search_with_filtering(raft::resources const& res,
+                           const search_params& params,
+                           const index<T, IdxT>& idx,
+                           raft::device_matrix_view<const T, int64_t, row_major> queries,
+                           raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, row_major> distances,
+                           CagraSampleFilterT sample_filter = CagraSampleFilterT())
+{
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
+               "Number of columns in output neighbors and distances matrices must equal k");
+  RAFT_EXPECTS(queries.extent(1) == idx.dim(),
+               "Number of query dimensions should equal number of dimensions in the index.");
+  // search_main works on the unsigned counterpart of IdxT; re-wrap the views accordingly
+  using internal_IdxT   = typename std::make_unsigned<IdxT>::type;
+  auto queries_internal = raft::make_device_matrix_view<const T, int64_t, row_major>(
+    queries.data_handle(), queries.extent(0), queries.extent(1));
+  auto neighbors_internal = raft::make_device_matrix_view<internal_IdxT, int64_t, row_major>(
+    reinterpret_cast<internal_IdxT*>(neighbors.data_handle()),
+    neighbors.extent(0),
+    neighbors.extent(1));
+  auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
+    distances.data_handle(), distances.extent(0), distances.extent(1));
+
+  cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
+    res, params, idx, queries_internal, neighbors_internal, distances_internal, sample_filter);
 }
+
 /** @} */  // end group cagra
 
 }  // namespace raft::neighbors::cagra
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 02e3f5338e..5061d6082d 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -40,11 +40,24 @@ namespace raft::neighbors::cagra {
  * @{
  */
 
+/**
+ * @brief ANN algorithm used by CAGRA to build the knn graph.
+ *        Selected through `index_params::build_algo`.
+ */
+enum class graph_build_algo {
+  /** Use IVF-PQ to build all-neighbors knn graph */
+  IVF_PQ,
+  /** Experimental, use NN-Descent to build all-neighbors knn graph */
+  NN_DESCENT
+};
+
 struct index_params : ann::index_params {
   /** Degree of input graph for pruning. */
   size_t intermediate_graph_degree = 128;
   /** Degree of output graph. */
   size_t graph_degree = 64;
+  /** ANN algorithm used to build the knn graph (IVF-PQ by default). */
+  graph_build_algo build_algo = graph_build_algo::IVF_PQ;
 };
 
 enum class search_algo {
@@ -165,9 +178,10 @@ struct index : ann::index {
   ~index()                               = default;
 
   /** Construct an empty index. */
-  index(raft::resources const& res)
+  index(raft::resources const& res,
+        raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
-      metric_(raft::distance::DistanceType::L2Expanded),
+      metric_(metric),
       dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
@@ -296,7 +310,11 @@ struct index : ann::index {
                     raft::host_matrix_view<const IdxT, int64_t, row_major> knn_graph)
   {
     RAFT_LOG_DEBUG("Copying CAGRA knn graph from host to device");
-    graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
+    if ((graph_.extent(0) != knn_graph.extent(0)) || (graph_.extent(1) != knn_graph.extent(1))) {
+      // clear existing memory before allocating to prevent OOM errors on large graphs
+      if (graph_.size()) { graph_ = make_device_matrix<IdxT, int64_t>(res, 0, 0); }
+      graph_ = make_device_matrix<IdxT, int64_t>(res, knn_graph.extent(0), knn_graph.extent(1));
+    }
     raft::copy(graph_.data_handle(),
                knn_graph.data_handle(),
                knn_graph.size(),
@@ -311,7 +329,13 @@ struct index : ann::index {
                    mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset)
   {
     size_t padded_dim = round_up_safe<size_t>(dataset.extent(1) * sizeof(T), 16) / sizeof(T);
-    dataset_          = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);
+
+    if ((dataset_.extent(0) != dataset.extent(0)) ||
+        (static_cast<size_t>(dataset_.extent(1)) != padded_dim)) {
+      // clear existing memory before allocating to prevent OOM errors on large datasets
+      if (dataset_.size()) { dataset_ = make_device_matrix<T, int64_t>(res, 0, 0); }
+      dataset_ = make_device_matrix<T, int64_t>(res, dataset.extent(0), padded_dim);
+    }
     if (dataset_.extent(1) == dataset.extent(1)) {
       raft::copy(dataset_.data_handle(),
                  dataset.data_handle(),
@@ -351,6 +375,7 @@ struct index : ann::index {
 
 // TODO: Remove deprecated experimental namespace in 23.12 release
 namespace raft::neighbors::experimental::cagra {
+using raft::neighbors::cagra::graph_build_algo;
 using raft::neighbors::cagra::hash_mode;
 using raft::neighbors::cagra::index;
 using raft::neighbors::cagra::index_params;
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index d19d7e7904..40024a3deb 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -28,12 +28,14 @@
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <raft/neighbors/detail/refine.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
+#include <raft/neighbors/nn_descent.cuh>
 #include <raft/neighbors/refine.cuh>
 
 namespace raft::neighbors::cagra::detail {
@@ -46,6 +48,7 @@ void build_knn_graph(raft::resources const& res,
                      std::optional<ivf_pq::index_params> build_params   = std::nullopt,
                      std::optional<ivf_pq::search_params> search_params = std::nullopt)
 {
+  resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::build");
   RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded,
                "Currently only L2Expanded metric is supported");
 
@@ -238,4 +241,27 @@ void build_knn_graph(raft::resources const& res,
   if (!first) RAFT_LOG_DEBUG("# Finished building kNN graph");
 }
 
+template <typename DataT, typename IdxT, typename accessor>
+void build_knn_graph(raft::resources const& res,
+                     mdspan<const DataT, matrix_extent<int64_t>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, int64_t, row_major> knn_graph,
+                     experimental::nn_descent::index_params build_params)
+{
+  auto nn_descent_idx = experimental::nn_descent::index<IdxT>(res, knn_graph);
+  experimental::nn_descent::build<DataT, IdxT>(res, build_params, dataset, nn_descent_idx);
+  // Re-type the index's graph with the unsigned counterpart of IdxT, keeping its memory type.
+  using internal_IdxT = typename std::make_unsigned<IdxT>::type;
+  using g_accessor    = typename decltype(nn_descent_idx.graph())::accessor_type;
+  using g_accessor_internal =
+    host_device_accessor<std::experimental::default_accessor<internal_IdxT>, g_accessor::mem_type>;
+
+  auto knn_graph_internal =
+    mdspan<internal_IdxT, matrix_extent<int64_t>, row_major, g_accessor_internal>(
+      reinterpret_cast<internal_IdxT*>(nn_descent_idx.graph().data_handle()),
+      nn_descent_idx.graph().extent(0),
+      nn_descent_idx.graph().extent(1));
+  // NOTE(review): presumably graph() aliases `knn_graph`, so the sort is done in place - confirm.
+  graph::sort_knn_graph(res, dataset, knn_graph_internal);
+}
+
 }  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index 8190817b5b..81e714dc4e 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -18,10 +18,13 @@
 
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/neighbors/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/detail/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -32,6 +35,48 @@
 
 namespace raft::neighbors::cagra::detail {
 
+template <class CagraSampleFilterT>
+struct CagraSampleFilterWithQueryIdOffset {
+  const uint32_t offset;      // added to each query id before calling `filter`
+  CagraSampleFilterT filter;  // wrapped filter, stored by value
+
+  CagraSampleFilterWithQueryIdOffset(const uint32_t offset, const CagraSampleFilterT filter)
+    : offset(offset), filter(filter)
+  {
+  }
+
+  _RAFT_DEVICE auto operator()(const uint32_t query_id, const uint32_t sample_id)
+  {
+    return filter(query_id + offset, sample_id);  // forward with the shifted query id
+  }
+};
+
+template <class CagraSampleFilterT>
+struct CagraSampleFilterT_Selector {
+  using type = CagraSampleFilterWithQueryIdOffset<CagraSampleFilterT>;  // wrap with offset
+};
+template <>
+struct CagraSampleFilterT_Selector<raft::neighbors::filtering::none_cagra_sample_filter> {
+  using type = raft::neighbors::filtering::none_cagra_sample_filter;  // kept as-is, no wrapper
+};
+
+// Wrap `filter` in its selector type so that `offset` is added to each query id it receives
+template <class CagraSampleFilterT>
+inline typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type set_offset(
+  CagraSampleFilterT filter, const uint32_t offset)
+{
+  typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type new_filter(offset, filter);
+  return new_filter;
+}
+template <>
+inline
+  typename CagraSampleFilterT_Selector<raft::neighbors::filtering::none_cagra_sample_filter>::type
+  set_offset<raft::neighbors::filtering::none_cagra_sample_filter>(
+    raft::neighbors::filtering::none_cagra_sample_filter filter, const uint32_t)
+{
+  return filter;  // none filter is returned unwrapped; the offset argument is discarded
+}
+
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -52,27 +97,37 @@ namespace raft::neighbors::cagra::detail {
  * k]
  */
 
-template <typename T, typename internal_IdxT, typename IdxT = uint32_t, typename DistanceT = float>
+template <typename T,
+          typename internal_IdxT,
+          typename CagraSampleFilterT,
+          typename IdxT      = uint32_t,
+          typename DistanceT = float>
 void search_main(raft::resources const& res,
                  search_params params,
                  const index<T, IdxT>& index,
                  raft::device_matrix_view<const T, int64_t, row_major> queries,
                  raft::device_matrix_view<internal_IdxT, int64_t, row_major> neighbors,
-                 raft::device_matrix_view<DistanceT, int64_t, row_major> distances)
+                 raft::device_matrix_view<DistanceT, int64_t, row_major> distances,
+                 CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
+  resource::detail::warn_non_pool_workspace(res, "raft::neighbors::cagra::search");
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
                  static_cast<size_t>(index.dataset().extent(0)),
                  static_cast<size_t>(index.dataset().extent(1)));
   RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
                  static_cast<size_t>(queries.extent(0)),
                  static_cast<size_t>(queries.extent(1)));
-  RAFT_EXPECTS(queries.extent(1) == index.dim(), "Querise and index dim must match");
+  RAFT_EXPECTS(queries.extent(1) == index.dim(), "Queries and index dim must match");
   const uint32_t topk = neighbors.extent(1);
 
   if (params.max_queries == 0) { params.max_queries = queries.extent(0); }
 
-  std::unique_ptr<search_plan_impl<T, internal_IdxT, DistanceT>> plan =
-    factory<T, internal_IdxT, DistanceT>::create(
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, index.dim());
+
+  using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
+  std::unique_ptr<search_plan_impl<T, internal_IdxT, DistanceT, CagraSampleFilterT_s>> plan =
+    factory<T, internal_IdxT, DistanceT, CagraSampleFilterT_s>::create(
       res, params, index.dim(), index.graph_degree(), topk);
 
   plan->check(neighbors.extent(1));
@@ -113,7 +168,8 @@ void search_main(raft::resources const& res,
             n_queries,
             _seed_ptr,
             _num_executed_iterations,
-            topk);
+            topk,
+            set_offset(sample_filter, qid));
   }
 
   static_assert(std::is_same_v<DistanceT, float>,
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index 2c9cbd2563..8261f637e1 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/core/mdarray.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/core/serialize.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 
@@ -54,6 +55,8 @@ void serialize(raft::resources const& res,
                const index<T, IdxT>& index_,
                bool include_dataset)
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::serialize");
+
   RAFT_LOG_DEBUG(
     "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
 
@@ -113,6 +116,8 @@ void serialize(raft::resources const& res,
 template <typename T, typename IdxT>
 auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
 {
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::deserialize");
+
   char dtype_string[4];
   is.read(dtype_string, 4);
 
@@ -125,15 +130,22 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
   auto graph_degree = deserialize_scalar<std::uint32_t>(res, is);
   auto metric       = deserialize_scalar<raft::distance::DistanceType>(res, is);
 
-  auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
-  auto graph   = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
+  auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
   deserialize_mdspan(res, is, graph.view());
 
   bool has_dataset = deserialize_scalar<bool>(res, is);
-  if (has_dataset) { deserialize_mdspan(res, is, dataset.view()); }
-
-  return index<T, IdxT>(
-    res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
+  if (has_dataset) {
+    auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
+    deserialize_mdspan(res, is, dataset.view());
+    return index<T, IdxT>(
+      res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
+  } else {
+    // create a new index with no dataset - the user must supply one via update_dataset
+    // later (this avoids allocating GPU memory in the meantime)
+    index<T, IdxT> idx(res, metric);
+    idx.update_graph(res, raft::make_const_mdspan(graph.view()));
+    return idx;
+  }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
index 47e976e252..55b7b47508 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
@@ -133,7 +133,6 @@ _RAFT_DEVICE void compute_distance_to_random_nodes(
 }
 
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
           unsigned MAX_DATASET_DIM,
           unsigned MAX_N_FRAGS,
           class LOAD_T,
@@ -155,17 +154,20 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in
                                                   INDEX_T* const visited_hashmap_ptr,
                                                   const std::uint32_t hash_bitlen,
                                                   const INDEX_T* const parent_indices,
+                                                  const INDEX_T* const internal_topk_list,
                                                   const std::uint32_t search_width)
 {
-  const INDEX_T invalid_index = utils::get_max_value<INDEX_T>();
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
 
   // Read child indices of parents from knn graph and check if the distance
   // computaiton is necessary.
-  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += BLOCK_SIZE) {
-    const INDEX_T parent_id = parent_indices[i / knn_k];
-    INDEX_T child_id        = invalid_index;
-    if (parent_id != invalid_index) {
-      child_id = knn_graph[(i % knn_k) + ((uint64_t)knn_k * parent_id)];
+  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
+    const INDEX_T smem_parent_id = parent_indices[i / knn_k];
+    INDEX_T child_id             = invalid_index;
+    if (smem_parent_id != invalid_index) {
+      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;
+      child_id             = knn_graph[(i % knn_k) + ((uint64_t)knn_k * parent_id)];
     }
     if (child_id != invalid_index) {
       if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
@@ -205,7 +207,8 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in
   // Compute the distance to child nodes
   std::uint32_t max_i = knn_k * search_width;
   if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-  for (std::uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += BLOCK_SIZE / TEAM_SIZE) {
+  for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) {
+    const auto i       = tid / TEAM_SIZE;
     const bool valid_i = (i < (knn_k * search_width));
     INDEX_T child_id   = invalid_index;
     if (valid_i) { child_id = result_child_indices_ptr[i]; }
diff --git a/cpp/include/raft/neighbors/detail/cagra/factory.cuh b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
index 625040194b..78111a9310 100644
--- a/cpp/include/raft/neighbors/detail/cagra/factory.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
@@ -20,20 +20,25 @@
 #include "search_multi_kernel.cuh"
 #include "search_plan.cuh"
 #include "search_single_cta.cuh"
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail {
 
-template <typename T, typename IdxT = uint32_t, typename DistanceT = float>
+template <typename T,
+          typename IdxT               = uint32_t,
+          typename DistanceT          = float,
+          typename CagraSampleFilterT = raft::neighbors::filtering::none_cagra_sample_filter>
 class factory {
  public:
   /**
    * Create a search structure for dataset with dim features.
    */
-  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> create(raft::resources const& res,
-                                                                      search_params const& params,
-                                                                      int64_t dim,
-                                                                      int64_t graph_degree,
-                                                                      uint32_t topk)
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>> create(
+    raft::resources const& res,
+    search_params const& params,
+    int64_t dim,
+    int64_t graph_degree,
+    uint32_t topk)
   {
     search_plan_impl_base plan(params, dim, graph_degree, topk);
     switch (plan.max_dim) {
@@ -63,26 +68,29 @@ class factory {
         break;
       default: RAFT_LOG_DEBUG("Incorrect max_dim (%lu)\n", plan.max_dim);
     }
-    return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>();
+    return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>();
   }
 
  private:
   template <unsigned MAX_DATASET_DIM, unsigned TEAM_SIZE>
-  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> dispatch_kernel(
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>> dispatch_kernel(
     raft::resources const& res, search_plan_impl_base& plan)
   {
     if (plan.algo == search_algo::SINGLE_CTA) {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new single_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new single_cta_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     } else if (plan.algo == search_algo::MULTI_CTA) {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new multi_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new multi_cta_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     } else {
-      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
-        new multi_kernel_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
-          res, plan, plan.dim, plan.graph_degree, plan.topk));
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT, CagraSampleFilterT>>(
+        new multi_kernel_search::
+          search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT, CagraSampleFilterT>(
+            res, plan, plan.dim, plan.graph_degree, plan.topk));
     }
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 18d451be60..8845e37973 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -244,7 +244,7 @@ void sort_knn_graph(raft::resources const& res,
   const uint32_t input_graph_degree = knn_graph.extent(1);
   IdxT* const input_graph_ptr       = knn_graph.data_handle();
 
-  auto d_input_graph = raft::make_device_matrix<IdxT, IdxT>(res, graph_size, input_graph_degree);
+  auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
 
   //
   // Sorting kNN graph
diff --git a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
index 346bbeaa9e..ed4763e475 100644
--- a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
@@ -28,8 +28,8 @@ namespace hashmap {
 
 _RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
 
-template <unsigned FIRST_TID = 0, class IdxT = void>
-_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen)
+template <class IdxT>
+_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0)
 {
   if (threadIdx.x < FIRST_TID) return;
   for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) {
@@ -37,15 +37,6 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen)
   }
 }
 
-template <unsigned FIRST_TID, unsigned LAST_TID, class IdxT>
-_RAFT_DEVICE inline void init(IdxT* const table, const uint32_t bitlen)
-{
-  if ((FIRST_TID > 0 && threadIdx.x < FIRST_TID) || threadIdx.x >= LAST_TID) return;
-  for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += LAST_TID - FIRST_TID) {
-    table[i] = utils::get_max_value<IdxT>();
-  }
-}
-
 template <class IdxT>
 _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
 {
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
index 6ea1e34032..c6478bef84 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
@@ -48,42 +48,43 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-
-struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+
+struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   uint32_t num_cta_per_query;
   rmm::device_uvector<INDEX_T> intermediate_indices;
@@ -96,7 +97,8 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk),
       intermediate_indices(0, resource::get_cuda_stream(res)),
       intermediate_distances(0, resource::get_cuda_stream(res)),
       topk_workspace(0, resource::get_cuda_stream(res))
@@ -107,9 +109,10 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
 
   void set_params(raft::resources const& res, const search_params& params)
   {
-    this->itopk_size   = 32;
-    search_width       = 1;
-    num_cta_per_query  = max(params.search_width, params.itopk_size / 32);
+    constexpr unsigned multi_cta_itopk_size = 32;  // itopk_size is pinned to this value
+    this->itopk_size                        = multi_cta_itopk_size;
+    search_width                            = 1;
+    num_cta_per_query  = max(params.search_width, params.itopk_size / multi_cta_itopk_size);
     result_buffer_size = itopk_size + search_width * graph_degree;
     typedef raft::Pow2<32> AlignBytes;
     unsigned result_buffer_size_32 = AlignBytes::roundUp(result_buffer_size);
@@ -196,7 +199,8 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
                   const uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
                   uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = resource::get_cuda_stream(res);
 
@@ -223,6 +227,7 @@ struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       search_width,
       min_iterations,
       max_iterations,
+      sample_filter,
       stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
index de83acbb64..ee525587d7 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
@@ -15,7 +15,8 @@
  */
 #pragma once
 
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <raft/neighbors/sample_filter_types.hpp>  // none_cagra_sample_filter
+#include <raft/util/raft_explicit.hpp>             // RAFT_EXPLICIT
 
 namespace raft::neighbors::cagra::detail {
 namespace multi_cta_search {
@@ -26,7 +27,8 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 void select_and_run(raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
                     raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
                     INDEX_T* const topk_indices_ptr,
@@ -49,47 +51,63 @@ void select_and_run(raft::device_matrix_view<const DATA_T, int64_t, layout_strid
                     size_t search_width,
                     size_t min_iterations,
                     size_t max_iterations,
+                    SAMPLE_FILTER_T sample_filter,
                     cudaStream_t stream) RAFT_EXPLICIT;
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)   \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint32_t, float);
-instantiate_kernel_selection(8, 128, float, uint32_t, float);
-instantiate_kernel_selection(16, 256, float, uint32_t, float);
-instantiate_kernel_selection(32, 512, float, uint32_t, float);
-instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float);
-instantiate_kernel_selection(8, 128, int8_t, uint32_t, float);
-instantiate_kernel_selection(16, 256, int8_t, uint32_t, float);
-instantiate_kernel_selection(32, 512, int8_t, uint32_t, float);
-instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float);
-instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float);
-instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float);
-instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 }  // namespace multi_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 4fc051ac09..358a183971 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -26,6 +26,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 #include <vector>
 
@@ -75,7 +76,7 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [sea
     if (new_parent) {
       const auto i = __popc(ballot_mask & ((1 << lane_id) - 1)) + num_new_parents;
       if (i < search_width) {
-        next_parent_indices[i] = index;
+        next_parent_indices[i] = j;
         itopk_indices[j] |= index_msb_1_mask;  // set most significant bit as used node
       }
     }
@@ -124,15 +125,14 @@ __device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
 // multiple CTAs per single query
 //
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
-          unsigned BLOCK_COUNT,
           unsigned MAX_ELEMENTS,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class DISTANCE_T,
           class INDEX_T,
-          class LOAD_T>
-__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
+          class LOAD_T,
+          class SAMPLE_FILTER_T>
+__launch_bounds__(1024, 1) __global__ void search_kernel(
   INDEX_T* const result_indices_ptr,       // [num_queries, num_cta_per_query, itopk_size]
   DISTANCE_T* const result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
   const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
@@ -152,10 +152,9 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
   const uint32_t search_width,
   const uint32_t min_iteration,
   const uint32_t max_iteration,
-  uint32_t* const num_executed_iterations /* stats */
-)
+  uint32_t* const num_executed_iterations, /* stats */
+  SAMPLE_FILTER_T sample_filter)
 {
-  assert(blockDim.x == BLOCK_SIZE);
   assert(dataset_dim <= MAX_DATASET_DIM);
 
   const auto num_queries       = gridDim.y;
@@ -207,7 +206,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
     }
 #endif
   const DATA_T* const query_ptr = queries_ptr + (dataset_dim * query_id);
-  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += blockDim.x) {
     unsigned j = device::swizzling(i);
     if (i < dataset_dim) {
       query_buffer[j] = spatial::knn::detail::utils::mapping<float>{}(query_ptr[i]);
@@ -274,27 +273,70 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
     _CLK_START();
     // constexpr unsigned max_n_frags = 16;
     constexpr unsigned max_n_frags = 0;
-    device::
-      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
-        result_indices_buffer + itopk_size,
-        result_distances_buffer + itopk_size,
-        query_buffer,
-        dataset_ptr,
-        dataset_dim,
-        dataset_ld,
-        knn_graph,
-        graph_degree,
-        local_visited_hashmap_ptr,
-        hash_bitlen,
-        parent_indices_buffer,
-        search_width);
+    device::compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+      result_indices_buffer + itopk_size,
+      result_distances_buffer + itopk_size,
+      query_buffer,
+      dataset_ptr,
+      dataset_dim,
+      dataset_ld,
+      knn_graph,
+      graph_degree,
+      local_visited_hashmap_ptr,
+      hash_bitlen,
+      parent_indices_buffer,
+      result_indices_buffer,
+      search_width);
     _CLK_REC(clk_compute_distance);
     __syncthreads();
 
+    // Filtering
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+      const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+      for (unsigned p = threadIdx.x; p < search_width; p += blockDim.x) {
+        if (parent_indices_buffer[p] != invalid_index) {
+          const auto parent_id =
+            result_indices_buffer[parent_indices_buffer[p]] & ~index_msb_1_mask;
+          if (!sample_filter(query_id, parent_id)) {
+            // If the parent must not be in the resulting top-k list, invalidate its result entry
+            result_distances_buffer[parent_indices_buffer[p]] = utils::get_max_value<DISTANCE_T>();
+            result_indices_buffer[parent_indices_buffer[p]]   = invalid_index;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
     iter++;
   }
 
-  for (uint32_t i = threadIdx.x; i < itopk_size; i += BLOCK_SIZE) {
+  // Post process for filtering
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+    const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+    for (unsigned i = threadIdx.x; i < itopk_size + search_width * graph_degree; i += blockDim.x) {
+      const auto node_id = result_indices_buffer[i] & ~index_msb_1_mask;
+      if (node_id != (invalid_index & ~index_msb_1_mask) && !sample_filter(query_id, node_id)) {
+        // If the node must not be in the resulting top-k list, invalidate its result entry
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+        result_indices_buffer[i]   = invalid_index;
+      }
+    }
+
+    __syncthreads();
+    topk_by_bitonic_sort<MAX_ELEMENTS, INDEX_T>(result_distances_buffer,
+                                                result_indices_buffer,
+                                                itopk_size + (search_width * graph_degree),
+                                                itopk_size);
+    __syncthreads();
+  }
+
+  for (uint32_t i = threadIdx.x; i < itopk_size; i += blockDim.x) {
     uint32_t j = i + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
     if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[i]; }
     constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
@@ -361,88 +403,52 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 struct search_kernel_config {
   // Search kernel function type. Note that the actual values for the template value
   // parameters do not matter, because they are not part of the function signature. The
-  // second to fourth value parameters will be selected by the choose_* functions below.
+  // second value parameter (MAX_ELEMENTS) will be selected by choose_buffer_size below.
   using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           64,
-                                           16,
                                            128,
                                            MAX_DATASET_DIM,
                                            DATA_T,
                                            DISTANCE_T,
                                            INDEX_T,
-                                           device::LOAD_128BIT_T>);
+                                           device::LOAD_128BIT_T,
+                                           SAMPLE_FILTER_T>);
 
   static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
   {
     if (result_buffer_size <= 64) {
-      return choose_max_elements<64>(block_size);
-    } else if (result_buffer_size <= 128) {
-      return choose_max_elements<128>(block_size);
-    } else if (result_buffer_size <= 256) {
-      return choose_max_elements<256>(block_size);
-    }
-    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
-  }
-
-  template <unsigned MAX_ELEMENTS>
-  // Todo: rename this to choose block_size
-  static auto choose_max_elements(unsigned block_size) -> kernel_t
-  {
-    if (block_size == 64) {
       return search_kernel<TEAM_SIZE,
                            64,
-                           16,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 128) {
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 128) {
       return search_kernel<TEAM_SIZE,
                            128,
-                           8,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 256) {
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
+    } else if (result_buffer_size <= 256) {
       return search_kernel<TEAM_SIZE,
                            256,
-                           4,
-                           MAX_ELEMENTS,
-                           MAX_DATASET_DIM,
-                           DATA_T,
-                           DISTANCE_T,
-                           INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else if (block_size == 512) {
-      return search_kernel<TEAM_SIZE,
-                           512,
-                           2,
-                           MAX_ELEMENTS,
-                           MAX_DATASET_DIM,
-                           DATA_T,
-                           DISTANCE_T,
-                           INDEX_T,
-                           device::LOAD_128BIT_T>;
-    } else {
-      return search_kernel<TEAM_SIZE,
-                           1024,
-                           1,
-                           MAX_ELEMENTS,
                            MAX_DATASET_DIM,
                            DATA_T,
                            DISTANCE_T,
                            INDEX_T,
-                           device::LOAD_128BIT_T>;
+                           device::LOAD_128BIT_T,
+                           SAMPLE_FILTER_T>;
     }
+    THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
   }
 };
 
@@ -450,7 +456,8 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
@@ -475,10 +482,12 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream)
 {
-  auto kernel = search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>::
-    choose_buffer_size(result_buffer_size, block_size);
+  auto kernel =
+    search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::
+      choose_buffer_size(result_buffer_size, block_size);
 
   RAFT_CUDA_TRY(
     cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
@@ -489,7 +498,7 @@ void select_and_run(  // raft::resources const& res,
 
   dim3 block_dims(block_size, 1, 1);
   dim3 grid_dims(num_cta_per_query, num_queries, 1);
-  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %lu smem",
+  RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %u smem",
                  block_size,
                  num_cta_per_query,
                  num_queries,
@@ -513,7 +522,8 @@ void select_and_run(  // raft::resources const& res,
                                                        search_width,
                                                        min_iterations,
                                                        max_iterations,
-                                                       num_executed_iterations);
+                                                       num_executed_iterations,
+                                                       sample_filter);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
index f312226f42..9392bde440 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <vector>
@@ -242,7 +243,7 @@ __global__ void pickup_next_parents_kernel(
       if (new_parent) {
         const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents;
         if (i < parent_list_size) {
-          parent_list_ptr[i + (ldd * query_id)] = index;
+          parent_list_ptr[i + (ldd * query_id)] = j;
           parent_candidates_ptr[j + (lds * query_id)] |=
             index_msb_1_mask;  // set most significant bit as used node
         }
@@ -253,7 +254,7 @@ __global__ void pickup_next_parents_kernel(
     if ((num_new_parents > 0) && (threadIdx.x == 0)) { *terminate_flag = 0; }
   } else if (small_hash_bitlen) {
     // reset small-hash
-    hashmap::init<32>(visited_hashmap_ptr + (ldb * query_id), hash_bitlen);
+    hashmap::init(visited_hashmap_ptr + (ldb * query_id), hash_bitlen, 32);
   }
 
   if (small_hash_bitlen) {
@@ -306,9 +307,13 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 __global__ void compute_distance_to_child_nodes_kernel(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
+  INDEX_T* const parent_candidates_ptr,   // [num_queries, lds]
+  DISTANCE_T* const parent_distance_ptr,  // [num_queries, lds]
+  const std::size_t lds,
   const std::uint32_t search_width,
   const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
@@ -321,16 +326,25 @@ __global__ void compute_distance_to_child_nodes_kernel(
   const std::uint32_t hash_bitlen,
   INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
   DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
-  const std::uint32_t ldd                  // (*) ldd >= search_width * graph_degree
-)
+  const std::uint32_t ldd,                 // (*) ldd >= search_width * graph_degree
+  SAMPLE_FILTER_T sample_filter)
 {
   const uint32_t ldb        = hashmap::get_size(hash_bitlen);
   const auto tid            = threadIdx.x + blockDim.x * blockIdx.x;
   const auto global_team_id = tid / TEAM_SIZE;
+  const auto query_id       = blockIdx.y;
+
   if (global_team_id >= search_width * graph_degree) { return; }
 
-  const std::size_t parent_index =
+  const std::size_t parent_list_index =
     parent_node_list[global_team_id / graph_degree + (search_width * blockIdx.y)];
+
+  if (parent_list_index == utils::get_max_value<INDEX_T>()) { return; }
+
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  const auto parent_index =
+    parent_candidates_ptr[parent_list_index + (lds * query_id)] & ~index_msb_1_mask;
+
   if (parent_index == utils::get_max_value<INDEX_T>()) {
     result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     return;
@@ -361,15 +375,28 @@ __global__ void compute_distance_to_child_nodes_kernel(
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     }
   }
+
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    if (!sample_filter(query_id, parent_index)) {
+      parent_candidates_ptr[parent_list_index + (lds * query_id)] = utils::get_max_value<INDEX_T>();
+      parent_distance_ptr[parent_list_index + (lds * query_id)] =
+        utils::get_max_value<DISTANCE_T>();
+    }
+  }
 }
 
 template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class INDEX_T,
-          class DISTANCE_T>
+          class DISTANCE_T,
+          class SAMPLE_FILTER_T>
 void compute_distance_to_child_nodes(
   const INDEX_T* const parent_node_list,  // [num_queries, search_width]
+  INDEX_T* const parent_candidates_ptr,   // [num_queries, lds]
+  DISTANCE_T* const parent_distance_ptr,  // [num_queries, lds]
+  const std::size_t lds,
   const uint32_t search_width,
   const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
   const std::uint32_t data_dim,
@@ -384,6 +411,7 @@ void compute_distance_to_child_nodes(
   INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
   DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
   const std::uint32_t ldd,                 // (*) ldd >= search_width * graph_degree
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t cuda_stream = 0)
 {
   const auto block_size = 128;
@@ -392,6 +420,9 @@ void compute_distance_to_child_nodes(
     num_queries);
   compute_distance_to_child_nodes_kernel<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>
     <<<grid_size, block_size, 0, cuda_stream>>>(parent_node_list,
+                                                parent_candidates_ptr,
+                                                parent_distance_ptr,
+                                                lds,
                                                 search_width,
                                                 dataset_ptr,
                                                 data_dim,
@@ -404,7 +435,8 @@ void compute_distance_to_child_nodes(
                                                 hash_bitlen,
                                                 result_indices_ptr,
                                                 result_distances_ptr,
-                                                ldd);
+                                                ldd,
+                                                sample_filter);
 }
 
 template <class INDEX_T>
@@ -436,6 +468,52 @@ void remove_parent_bit(const std::uint32_t num_queries,
     num_queries, num_topk, topk_indices_ptr, ld);
 }
 
+// Called after `remove_parent_bit`: sets entries rejected by `sample_filter` to max index/distance
+template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
+__global__ void apply_filter_kernel(INDEX_T* const result_indices_ptr,
+                                    DISTANCE_T* const result_distances_ptr,
+                                    const std::size_t lds,  // row stride of both result buffers
+                                    const std::uint32_t result_buffer_size,  // entries checked per row
+                                    const std::uint32_t num_queries,
+                                    const INDEX_T query_id_offset,  // added to the row id passed to the filter
+                                    SAMPLE_FILTER_T sample_filter)
+{
+  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+  const auto tid                     = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid >= result_buffer_size * num_queries) { return; }  // one thread per (row, slot) pair
+  const auto i     = tid % result_buffer_size;  // slot within the row
+  const auto j     = tid / result_buffer_size;  // row handled by this thread
+  const auto index = i + j * lds;  // flat offset into both buffers
+
+  if (result_indices_ptr[index] != ~index_msb_1_mask &&  // ~mask entries are never filtered; presumably empty slots - TODO confirm
+      !sample_filter(query_id_offset + j, result_indices_ptr[index])) {
+    result_indices_ptr[index]   = utils::get_max_value<INDEX_T>();
+    result_distances_ptr[index] = utils::get_max_value<DISTANCE_T>();
+  }
+}
+
+template <class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>  // launcher for `apply_filter_kernel`
+void apply_filter(INDEX_T* const result_indices_ptr,
+                  DISTANCE_T* const result_distances_ptr,
+                  const std::size_t lds,
+                  const std::uint32_t result_buffer_size,
+                  const std::uint32_t num_queries,
+                  const INDEX_T query_id_offset,
+                  SAMPLE_FILTER_T sample_filter,
+                  cudaStream_t cuda_stream)  // stream the kernel is launched on
+{
+  const std::uint32_t block_size = 256;  // threads per block
+  const std::uint32_t grid_size  = ceildiv(num_queries * result_buffer_size, block_size);  // blocks needed for one thread per element
+
+  apply_filter_kernel<<<grid_size, block_size, 0, cuda_stream>>>(result_indices_ptr,  // no dynamic shared memory
+                                                                 result_distances_ptr,
+                                                                 lds,
+                                                                 result_buffer_size,
+                                                                 num_queries,
+                                                                 query_id_offset,
+                                                                 sample_filter);
+}
+
 template <class T>
 __global__ void batched_memcpy_kernel(T* const dst,  // [batch_size, ld_dst]
                                       const uint64_t ld_dst,
@@ -508,41 +586,42 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   size_t result_buffer_allocation_size;
   rmm::device_uvector<INDEX_T> result_indices;  // results_indices_buffer
@@ -557,7 +636,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk),
       result_indices(0, resource::get_cuda_stream(res)),
       result_distances(0, resource::get_cuda_stream(res)),
       parent_node_list(0, resource::get_cuda_stream(res)),
@@ -602,7 +682,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
                   const uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
                   uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     // Init hashmap
     cudaStream_t stream      = resource::get_cuda_stream(res);
@@ -684,6 +765,9 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       // Compute distance to child nodes that are adjacent to the parent node
       compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM>(
         parent_node_list.data(),
+        result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
+        result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size,
+        result_buffer_allocation_size,
         search_width,
         dataset.data_handle(),
         dataset.extent(1),
@@ -698,22 +782,60 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
         result_indices.data() + itopk_size,
         result_distances.data() + itopk_size,
         result_buffer_allocation_size,
+        sample_filter,
         stream);
 
       iter++;
     }  // while ( 1 )
+    auto result_indices_ptr   = result_indices.data() + (iter & 0x1) * result_buffer_size;
+    auto result_distances_ptr = result_distances.data() + (iter & 0x1) * result_buffer_size;
+
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      // Remove parent bit in search results
+      remove_parent_bit(num_queries,
+                        result_buffer_size,
+                        result_indices.data() + (iter & 0x1) * itopk_size,
+                        result_buffer_allocation_size,
+                        stream);
+
+      apply_filter<INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        result_indices.data() + (iter & 0x1) * itopk_size,
+        result_distances.data() + (iter & 0x1) * itopk_size,
+        result_buffer_allocation_size,
+        result_buffer_size,
+        num_queries,
+        0,
+        sample_filter,
+        stream);
 
-    // Remove parent bit in search results
-    remove_parent_bit(num_queries,
-                      itopk_size,
-                      result_indices.data() + (iter & 0x1) * result_buffer_size,
-                      result_buffer_allocation_size,
-                      stream);
+      result_indices_ptr   = result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size;
+      result_distances_ptr = result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size;
+      _cuann_find_topk(itopk_size,
+                       num_queries,
+                       result_buffer_size,
+                       result_distances.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_indices.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_distances_ptr,
+                       result_buffer_allocation_size,
+                       result_indices_ptr,
+                       result_buffer_allocation_size,
+                       topk_workspace.data(),
+                       true,
+                       topk_hint.data(),
+                       stream);
+    } else {
+      // Remove parent bit in search results
+      remove_parent_bit(
+        num_queries, itopk_size, result_indices_ptr, result_buffer_allocation_size, stream);
+    }
 
     // Copy results from working buffer to final buffer
     batched_memcpy(topk_indices_ptr,
                    topk,
-                   result_indices.data() + (iter & 0x1) * result_buffer_size,
+                   result_indices_ptr,
                    result_buffer_allocation_size,
                    topk,
                    num_queries,
@@ -721,7 +843,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
     if (topk_distances_ptr) {
       batched_memcpy(topk_distances_ptr,
                      topk,
-                     result_distances.data() + (iter & 0x1) * result_buffer_size,
+                     result_distances_ptr,
                      result_buffer_allocation_size,
                      topk,
                      num_queries,
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
index 33c77db61e..147b8b753d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -38,12 +38,13 @@ struct search_plan_impl_base : public search_params {
   {
     set_max_dim_team(dim);
     if (algo == search_algo::AUTO) {
-      if (itopk_size <= 512) {
+      const size_t num_sm = raft::getMultiProcessorCount();
+      if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) {
         algo = search_algo::SINGLE_CTA;
         RAFT_LOG_DEBUG("Auto strategy: selecting single-cta");
       } else {
-        algo = search_algo::MULTI_KERNEL;
-        RAFT_LOG_DEBUG("Auto strategy: selecting multi-kernel");
+        algo = search_algo::MULTI_CTA;
+        RAFT_LOG_DEBUG("Auto strategy: selecting multi-cta");
       }
     }
   }
@@ -65,7 +66,7 @@ struct search_plan_impl_base : public search_params {
   }
 };
 
-template <class DATA_T, class INDEX_T, class DISTANCE_T>
+template <class DATA_T, class INDEX_T, class DISTANCE_T, class SAMPLE_FILTER_T>
 struct search_plan_impl : public search_plan_impl_base {
   int64_t hash_bitlen;
 
@@ -113,7 +114,8 @@ struct search_plan_impl : public search_plan_impl_base {
                           const std::uint32_t num_queries,
                           const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
                           std::uint32_t* const num_executed_iterations,  // [num_queries]
-                          uint32_t topk){};
+                          uint32_t topk,
+                          SAMPLE_FILTER_T sample_filter){};
 
   void adjust_search_params()
   {
@@ -129,13 +131,13 @@ struct search_plan_impl : public search_plan_impl_base {
     if (max_iterations < min_iterations) { _max_iterations = min_iterations; }
     if (max_iterations < _max_iterations) {
       RAFT_LOG_DEBUG(
-        "# max_iterations is increased from %u to %u.", max_iterations, _max_iterations);
+        "# max_iterations is increased from %lu to %u.", max_iterations, _max_iterations);
       max_iterations = _max_iterations;
     }
     if (itopk_size % 32) {
       uint32_t itopk32 = itopk_size;
       itopk32 += 32 - (itopk_size % 32);
-      RAFT_LOG_DEBUG("# internal_topk is increased from %u to %u, as it must be multiple of 32.",
+      RAFT_LOG_DEBUG("# internal_topk is increased from %lu to %u, as it must be multiple of 32.",
                      itopk_size,
                      itopk32);
       itopk_size = itopk32;
@@ -289,6 +291,14 @@ struct search_plan_impl : public search_plan_impl_base {
         "`hashmap_max_fill_rate` must be equal to or greater than 0.1 and smaller than 0.9. " +
         std::to_string(hashmap_max_fill_rate) + " has been given.";
     }
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      if (hashmap_mode == hash_mode::SMALL) {
+        error_message += "`SMALL` hash is not available when filtering";
+      } else {
+        hashmap_mode = hash_mode::HASH;
+      }
+    }
     if (algo == search_algo::MULTI_CTA) {
       if (hashmap_mode == hash_mode::SMALL) {
         error_message += "`small_hash` is not available when 'search_mode' is \"multi-cta\"";
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
index 45dd535e1d..b36bc6f77b 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
@@ -49,41 +49,42 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
-struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::search_width;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
-
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
-  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::search_width;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::smem_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::num_seeds;
 
   uint32_t num_itopk_candidates;
 
@@ -92,7 +93,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
          int64_t dim,
          int64_t graph_degree,
          uint32_t topk)
-    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk)
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk)
   {
     set_params(res);
   }
@@ -111,7 +113,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
     RAFT_EXPECTS(itopk_size <= max_itopk, "itopk_size cannot be larger than %u", max_itopk);
 
     RAFT_LOG_DEBUG("# num_itopk_candidates: %u", num_itopk_candidates);
-    RAFT_LOG_DEBUG("# num_itopk: %u", itopk_size);
+    RAFT_LOG_DEBUG("# num_itopk: %lu", itopk_size);
     //
     // Determine the thread block size
     //
@@ -129,11 +131,9 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       // Tentatively calculate the required share memory size when radix
       // sort based topk is used, assuming the block size is the maximum.
       if (itopk_size <= 256) {
-        smem_size +=
-          topk_by_radix_sort<256, max_block_size, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t);
       } else {
-        smem_size +=
-          topk_by_radix_sort<512, max_block_size, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t);
       }
     }
 
@@ -186,34 +186,10 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       smem_size = base_smem_size;
       if (itopk_size <= 256) {
         constexpr unsigned MAX_ITOPK = 256;
-        if (block_size == 256) {
-          constexpr unsigned BLOCK_SIZE = 256;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else if (block_size == 512) {
-          constexpr unsigned BLOCK_SIZE = 512;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else {
-          constexpr unsigned BLOCK_SIZE = 1024;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        }
+        smem_size += topk_by_radix_sort<MAX_ITOPK, INDEX_T>::smem_size * sizeof(std::uint32_t);
       } else {
         constexpr unsigned MAX_ITOPK = 512;
-        if (block_size == 256) {
-          constexpr unsigned BLOCK_SIZE = 256;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else if (block_size == 512) {
-          constexpr unsigned BLOCK_SIZE = 512;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        } else {
-          constexpr unsigned BLOCK_SIZE = 1024;
-          smem_size +=
-            topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>::smem_size * sizeof(std::uint32_t);
-        }
+        smem_size += topk_by_radix_sort<MAX_ITOPK, INDEX_T>::smem_size * sizeof(std::uint32_t);
       }
     }
     RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
@@ -234,7 +210,8 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
                   const std::uint32_t num_queries,
                   const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
                   std::uint32_t* const num_executed_iterations,  // [num_queries]
-                  uint32_t topk)
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = resource::get_cuda_stream(res);
     select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(
@@ -261,6 +238,7 @@ struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
       search_width,
       min_iterations,
       max_iterations,
+      sample_filter,
       stream);
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
index 5f5df1a818..35d239563a 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
@@ -15,7 +15,9 @@
  */
 #pragma once
 
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
 namespace raft::neighbors::cagra::detail {
 namespace single_cta_search {
 
@@ -25,7 +27,8 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
@@ -50,50 +53,65 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream) RAFT_EXPLICIT;
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                                      \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float);
-instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 81325fd5da..3a5501f545 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <rmm/device_uvector.hpp>
 #include <vector>
@@ -78,7 +79,7 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
     if (new_parent) {
       const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents;
       if (i < search_width) {
-        next_parent_indices[i] = index;
+        next_parent_indices[i] = jj;
         // set most significant bit as used node
         internal_topk_indices[jj] |= index_msb_1_mask;
       }
@@ -89,11 +90,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
   if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; }
 }
 
-template <unsigned MAX_CANDIDATES, unsigned MULTI_WARPS = 0, class IdxT = void>
+template <unsigned MAX_CANDIDATES, class IdxT = void>
 __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  // [num_candidates]
                                                 IdxT* candidate_indices,     // [num_candidates]
                                                 const std::uint32_t num_candidates,
-                                                const std::uint32_t num_itopk)
+                                                const std::uint32_t num_itopk,
+                                                unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -191,7 +193,7 @@ __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  //
   }
 }
 
-template <unsigned MAX_ITOPK, unsigned MULTI_WARPS = 0, class IdxT = void>
+template <unsigned MAX_ITOPK, class IdxT = void>
 __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num_itopk]
                                                 IdxT* itopk_indices,     // [num_itopk]
                                                 const std::uint32_t num_itopk,
@@ -199,7 +201,8 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num
                                                 IdxT* candidate_indices,     // [num_candidates]
                                                 const std::uint32_t num_candidates,
                                                 std::uint32_t* work_buf,
-                                                const bool first)
+                                                const bool first,
+                                                unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -398,8 +401,6 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num
 
 template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
-          unsigned MULTI_WARPS_1,
-          unsigned MULTI_WARPS_2,
           class IdxT>
 __device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
                                      IdxT* itopk_indices,     // [num_itopk]
@@ -408,33 +409,37 @@ __device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
                                      IdxT* candidate_indices,     // [num_candidates]
                                      const std::uint32_t num_candidates,
                                      std::uint32_t* work_buf,
-                                     const bool first)
+                                     const bool first,
+                                     const unsigned MULTI_WARPS_1,
+                                     const unsigned MULTI_WARPS_2)
 {
   // The results in candidate_distances/indices are sorted by bitonic sort.
-  topk_by_bitonic_sort_1st<MAX_CANDIDATES, MULTI_WARPS_1, IdxT>(
-    candidate_distances, candidate_indices, num_candidates, num_itopk);
+  topk_by_bitonic_sort_1st<MAX_CANDIDATES, IdxT>(
+    candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1);
 
   // The results sorted above are merged with the internal intermediate top-k
   // results so far using bitonic merge.
-  topk_by_bitonic_sort_2nd<MAX_ITOPK, MULTI_WARPS_2, IdxT>(itopk_distances,
-                                                           itopk_indices,
-                                                           num_itopk,
-                                                           candidate_distances,
-                                                           candidate_indices,
-                                                           num_candidates,
-                                                           work_buf,
-                                                           first);
+  topk_by_bitonic_sort_2nd<MAX_ITOPK, IdxT>(itopk_distances,
+                                            itopk_indices,
+                                            num_itopk,
+                                            candidate_distances,
+                                            candidate_indices,
+                                            num_candidates,
+                                            work_buf,
+                                            first,
+                                            MULTI_WARPS_2);
 }
 
-template <unsigned FIRST_TID, unsigned LAST_TID, class INDEX_T>
+template <class INDEX_T>
 __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
                                        const size_t hashmap_bitlen,
                                        const INDEX_T* itopk_indices,
-                                       uint32_t itopk_size)
+                                       const uint32_t itopk_size,
+                                       const uint32_t first_tid = 0)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-  if (threadIdx.x < FIRST_TID || threadIdx.x >= LAST_TID) return;
-  for (unsigned i = threadIdx.x - FIRST_TID; i < itopk_size; i += LAST_TID - FIRST_TID) {
+  if (threadIdx.x < first_tid) return;
+  for (unsigned i = threadIdx.x - first_tid; i < itopk_size; i += blockDim.x - first_tid) {
     auto key = itopk_indices[i] & ~index_msb_1_mask;  // clear most significant bit
     hashmap::insert(hashmap_ptr, hashmap_bitlen, key);
   }
@@ -450,16 +455,15 @@ __device__ inline void set_value_device(T* const ptr, const T fill, const std::u
 
 // One query one thread block
 template <unsigned TEAM_SIZE,
-          unsigned BLOCK_SIZE,
-          unsigned BLOCK_COUNT,
           unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           unsigned TOPK_BY_BITONIC_SORT,
           unsigned MAX_DATASET_DIM,
           class DATA_T,
           class DISTANCE_T,
-          class INDEX_T>
-__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
+          class INDEX_T,
+          class SAMPLE_FILTER_T>
+__launch_bounds__(1024, 1) __global__
   void search_kernel(INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
                      DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
                      const std::uint32_t top_k,
@@ -482,7 +486,8 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
                      std::uint32_t* const num_executed_iterations,  // [num_queries]
                      const std::uint32_t hash_bitlen,
                      const std::uint32_t small_hash_bitlen,
-                     const std::uint32_t small_hash_reset_interval)
+                     const std::uint32_t small_hash_reset_interval,
+                     SAMPLE_FILTER_T sample_filter)
 {
   using LOAD_T        = device::LOAD_128BIT_T;
   const auto query_id = blockIdx.y;
@@ -527,8 +532,11 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   auto terminate_flag     = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
   auto smem_working_ptr   = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
 
+  // A flag for filtering. NOTE: it aliases terminate_flag (same pointer, same storage).
+  auto filter_flag = terminate_flag;
+
   const DATA_T* const query_ptr = queries_ptr + query_id * dataset_dim;
-  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += blockDim.x) {
     unsigned j = device::swizzling(i);
     if (i < dataset_dim) {
       query_buffer[j] = spatial::knn::detail::utils::mapping<float>{}(query_ptr[i]);
@@ -548,7 +556,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   } else {
     local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id);
   }
-  hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+  hashmap::init(local_visited_hashmap_ptr, hash_bitlen, 0);
   __syncthreads();
   _CLK_REC(clk_init);
 
@@ -576,7 +584,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
   std::uint32_t iter = 0;
   while (1) {
     // sort
-    if (TOPK_BY_BITONIC_SORT) {
+    if constexpr (TOPK_BY_BITONIC_SORT) {
       // [Notice]
       // It is good to use multiple warps in topk_by_bitonic_sort() when
       // batch size is small (short-latency), but it might not be always good
@@ -584,8 +592,8 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
       // topk_by_bitonic_sort() consists of two operations:
       // if MAX_CANDIDATES is greater than 128, the first operation uses two warps;
       // if MAX_ITOPK is greater than 256, the second operation used two warps.
-      constexpr unsigned multi_warps_1 = ((BLOCK_SIZE >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
-      constexpr unsigned multi_warps_2 = ((BLOCK_SIZE >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
+      const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
+      const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
 
       // reset small-hash table.
       if ((iter + 1) % small_hash_reset_interval == 0) {
@@ -594,41 +602,56 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
         // the small hash and whether they are performed in overlap with
         // topk_by_bitonic_sort().
         _CLK_START();
-        if (BLOCK_SIZE == 32) {
-          hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
-        } else if (BLOCK_SIZE == 64) {
+        unsigned hash_start_tid;
+        if (blockDim.x == 32) {
+          hash_start_tid = 0;
+        } else if (blockDim.x == 64) {
           if (multi_warps_1 || multi_warps_2) {
-            hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 0;
           } else {
-            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 32;
           }
         } else {
           if (multi_warps_1 || multi_warps_2) {
-            hashmap::init<64, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 64;
           } else {
-            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+            hash_start_tid = 32;
           }
         }
+        hashmap::init(local_visited_hashmap_ptr, hash_bitlen, hash_start_tid);
         _CLK_REC(clk_reset_hash);
       }
 
       // topk with bitonic sort
       _CLK_START();
-      topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES, multi_warps_1, multi_warps_2>(
-        result_distances_buffer,
-        result_indices_buffer,
-        internal_topk,
-        result_distances_buffer + internal_topk,
-        result_indices_buffer + internal_topk,
-        search_width * graph_degree,
-        topk_ws,
-        (iter == 0));
+      if (std::is_same<SAMPLE_FILTER_T,
+                       raft::neighbors::filtering::none_cagra_sample_filter>::value ||
+          *filter_flag == 0) {
+        topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES>(result_distances_buffer,
+                                                        result_indices_buffer,
+                                                        internal_topk,
+                                                        result_distances_buffer + internal_topk,
+                                                        result_indices_buffer + internal_topk,
+                                                        search_width * graph_degree,
+                                                        topk_ws,
+                                                        (iter == 0),
+                                                        multi_warps_1,
+                                                        multi_warps_2);
+        __syncthreads();
+      } else {
+        topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
+          result_distances_buffer,
+          result_indices_buffer,
+          internal_topk + search_width * graph_degree,
+          internal_topk,
+          false);
+        if (threadIdx.x == 0) { *terminate_flag = 0; }
+      }
       _CLK_REC(clk_topk);
-
     } else {
       _CLK_START();
       // topk with radix block sort
-      topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE, INDEX_T>{}(
+      topk_by_radix_sort<MAX_ITOPK, INDEX_T>{}(
         internal_topk,
         gridDim.x,
         result_buffer_size,
@@ -645,7 +668,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
       // reset small-hash table
       if ((iter + 1) % small_hash_reset_interval == 0) {
         _CLK_START();
-        hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+        hashmap::init(local_visited_hashmap_ptr, hash_bitlen);
         _CLK_REC(clk_reset_hash);
       }
     }
@@ -667,10 +690,10 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
 
     // restore small-hash table by putting internal-topk indices in it
     if ((iter + 1) % small_hash_reset_interval == 0) {
-      constexpr unsigned first_tid = ((BLOCK_SIZE <= 32) ? 0 : 32);
+      const unsigned first_tid = ((blockDim.x <= 32) ? 0 : 32);
       _CLK_START();
-      hashmap_restore<first_tid, BLOCK_SIZE>(
-        local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk);
+      hashmap_restore(
+        local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk, first_tid);
       _CLK_REC(clk_restore_hash);
     }
     __syncthreads();
@@ -680,26 +703,75 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
     // compute the norms between child nodes and query node
     _CLK_START();
     constexpr unsigned max_n_frags = 16;
-    device::
-      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
-        result_indices_buffer + internal_topk,
-        result_distances_buffer + internal_topk,
-        query_buffer,
-        dataset_ptr,
-        dataset_dim,
-        dataset_ld,
-        knn_graph,
-        graph_degree,
-        local_visited_hashmap_ptr,
-        hash_bitlen,
-        parent_list_buffer,
-        search_width);
+    device::compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+      result_indices_buffer + internal_topk,
+      result_distances_buffer + internal_topk,
+      query_buffer,
+      dataset_ptr,
+      dataset_dim,
+      dataset_ld,
+      knn_graph,
+      graph_degree,
+      local_visited_hashmap_ptr,
+      hash_bitlen,
+      parent_list_buffer,
+      result_indices_buffer,
+      search_width);
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
+    // Filtering
+    if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                                raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+      if (threadIdx.x == 0) { *filter_flag = 0; }
+      __syncthreads();
+
+      constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+      const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+      for (unsigned p = threadIdx.x; p < search_width; p += blockDim.x) {
+        if (parent_list_buffer[p] != invalid_index) {
+          const auto parent_id = result_indices_buffer[parent_list_buffer[p]] & ~index_msb_1_mask;
+          if (!sample_filter(query_id, parent_id)) {
+            // Parent rejected by the filter: invalidate its entry in the internal top-k list
+            result_distances_buffer[parent_list_buffer[p]] = utils::get_max_value<DISTANCE_T>();
+            result_indices_buffer[parent_list_buffer[p]]   = invalid_index;
+            *filter_flag                                   = 1;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
     iter++;
   }
-  for (std::uint32_t i = threadIdx.x; i < top_k; i += BLOCK_SIZE) {
+
+  // Post process for filtering
+  if constexpr (!std::is_same<SAMPLE_FILTER_T,
+                              raft::neighbors::filtering::none_cagra_sample_filter>::value) {
+    constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
+    const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
+
+    for (unsigned i = threadIdx.x; i < internal_topk + search_width * graph_degree;
+         i += blockDim.x) {
+      const auto node_id = result_indices_buffer[i] & ~index_msb_1_mask;
+      if (node_id != (invalid_index & ~index_msb_1_mask) && !sample_filter(query_id, node_id)) {
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+        result_indices_buffer[i]   = invalid_index;
+      }
+    }
+
+    __syncthreads();
+    topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
+      result_distances_buffer,
+      result_indices_buffer,
+      internal_topk + search_width * graph_degree,
+      top_k,
+      false);
+    __syncthreads();
+  }
+
+  for (std::uint32_t i = threadIdx.x; i < top_k; i += blockDim.x) {
     unsigned j  = i + (top_k * query_id);
     unsigned ii = i;
     if (TOPK_BY_BITONIC_SORT) { ii = device::swizzling(i); }
@@ -737,36 +809,53 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
 #endif
 }
 
-template <unsigned TEAM_SIZE, unsigned MX_DIM, typename T, typename IdxT, typename DistT>
+template <unsigned TEAM_SIZE,
+          unsigned MX_DIM,
+          typename T,
+          typename IdxT,
+          typename DistT,
+          typename SAMPLE_FILTER_T>
 struct search_kernel_config {
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE, 64, 16, 64, 64, 0, MX_DIM, T, DistT, IdxT>);
+  using kernel_t =
+    decltype(&search_kernel<TEAM_SIZE, 64, 64, 0, MX_DIM, T, DistT, IdxT, SAMPLE_FILTER_T>);
 
-  template <unsigned MAX_ITOPK, unsigned CANDIDATES, unsigned USE_BITONIC_SORT>
-  static auto choose_block_size(unsigned block_size) -> kernel_t
+  template <unsigned MAX_CANDIDATES, unsigned USE_BITONIC_SORT>
+  static auto choose_search_kernel(unsigned itopk_size) -> kernel_t
   {
-    constexpr unsigned BS = USE_BITONIC_SORT;
-    if constexpr (BS) {
-      if (block_size == 64) {
-        return search_kernel<TEAM_SIZE, 64, 16, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 128) {
-        return search_kernel<TEAM_SIZE, 128, 8, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 256) {
-        return search_kernel<TEAM_SIZE, 256, 4, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 512) {
-        return search_kernel<TEAM_SIZE, 512, 2, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else {
-        return search_kernel<TEAM_SIZE, 1024, 1, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      }
-
-    } else {
-      if (block_size == 256) {
-        return search_kernel<TEAM_SIZE, 256, 4, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else if (block_size == 512) {
-        return search_kernel<TEAM_SIZE, 512, 2, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      } else {
-        return search_kernel<TEAM_SIZE, 1024, 1, MAX_ITOPK, CANDIDATES, BS, MX_DIM, T, DistT, IdxT>;
-      }
+    if (itopk_size <= 64) {
+      return search_kernel<TEAM_SIZE, 64, MAX_CANDIDATES, USE_BITONIC_SORT, MX_DIM, T, DistT, IdxT>;
+    } else if (itopk_size <= 128) {
+      return search_kernel<TEAM_SIZE,
+                           128,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
+    } else if (itopk_size <= 256) {
+      return search_kernel<TEAM_SIZE,
+                           256,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
+    } else if (itopk_size <= 512) {
+      return search_kernel<TEAM_SIZE,
+                           512,
+                           MAX_CANDIDATES,
+                           USE_BITONIC_SORT,
+                           MX_DIM,
+                           T,
+                           DistT,
+                           IdxT,
+                           SAMPLE_FILTER_T>;
     }
+    THROW("No kernel for parameters itopk_size %u, max_candidates %u", itopk_size, MAX_CANDIDATES);
   }
 
   static auto choose_itopk_and_mx_candidates(unsigned itopk_size,
@@ -775,45 +864,18 @@ struct search_kernel_config {
   {
     if (num_itopk_candidates <= 64) {
       // use bitonic sort based topk
-      constexpr unsigned max_candidates = 64;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<64, 1>(itopk_size);
     } else if (num_itopk_candidates <= 128) {
-      constexpr unsigned max_candidates = 128;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<128, 1>(itopk_size);
     } else if (num_itopk_candidates <= 256) {
-      constexpr unsigned max_candidates = 256;
-      if (itopk_size <= 64) {
-        return choose_block_size<64, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 128) {
-        return choose_block_size<128, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 1>(block_size);
-      } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 1>(block_size);
-      }
+      return choose_search_kernel<256, 1>(itopk_size);
     } else {
       // Radix-based topk is used
       constexpr unsigned max_candidates = 32;  // to avoid build failure
       if (itopk_size <= 256) {
-        return choose_block_size<256, max_candidates, 0>(block_size);
+        return search_kernel<TEAM_SIZE, 256, max_candidates, 0, MX_DIM, T, DistT, IdxT>;
       } else if (itopk_size <= 512) {
-        return choose_block_size<512, max_candidates, 0>(block_size);
+        return search_kernel<TEAM_SIZE, 512, max_candidates, 0, MX_DIM, T, DistT, IdxT>;
       }
     }
     THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u",
@@ -826,7 +888,8 @@ template <unsigned TEAM_SIZE,
           unsigned MAX_DATASET_DIM,
           typename DATA_T,
           typename INDEX_T,
-          typename DISTANCE_T>
+          typename DISTANCE_T,
+          typename SAMPLE_FILTER_T>
 void select_and_run(  // raft::resources const& res,
   raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
   raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,
@@ -851,16 +914,18 @@ void select_and_run(  // raft::resources const& res,
   size_t search_width,
   size_t min_iterations,
   size_t max_iterations,
+  SAMPLE_FILTER_T sample_filter,
   cudaStream_t stream)
 {
-  auto kernel = search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>::
-    choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size);
+  auto kernel =
+    search_kernel_config<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>::
+      choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size);
   RAFT_CUDA_TRY(
     cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   dim3 thread_dims(block_size, 1, 1);
   dim3 block_dims(1, num_queries, 1);
   RAFT_LOG_DEBUG(
-    "Launching kernel with %u threads, %u block %lu smem", block_size, num_queries, smem_size);
+    "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
   kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
                                                          topk_distances_ptr,
                                                          topk,
@@ -883,7 +948,8 @@ void select_and_run(  // raft::resources const& res,
                                                          num_executed_iterations,
                                                          hash_bitlen,
                                                          small_hash_bitlen,
-                                                         small_hash_reset_interval);
+                                                         small_hash_reset_interval,
+                                                         sample_filter);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 }  // namespace single_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
index a1b7f930d3..6a6a3cddf4 100644
--- a/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh
@@ -26,14 +26,11 @@ struct topk_by_radix_sort_base {
   static constexpr std::uint32_t state_bit_lenght = 0;
   static constexpr std::uint32_t vecLen           = 2;  // TODO
 };
-template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT, class = void>
+template <unsigned MAX_INTERNAL_TOPK, class IdxT, class = void>
 struct topk_by_radix_sort : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {};
 
-template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT>
-struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
-                          BLOCK_SIZE,
-                          IdxT,
-                          std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
+template <unsigned MAX_INTERNAL_TOPK, class IdxT>
+struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
   : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
   __device__ void operator()(uint32_t topk,
                              uint32_t batch_size,
@@ -48,8 +45,7 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
                              uint32_t* _smem)
   {
     std::uint8_t* const state = reinterpret_cast<std::uint8_t*>(work);
-    topk_cta_11_core<BLOCK_SIZE,
-                     topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
+    topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
                      topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::vecLen,
                      64,
                      32,
@@ -58,10 +54,9 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
 };
 
 #define TOP_FUNC_PARTIAL_SPECIALIZATION(V)                                           \
-  template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class IdxT>             \
+  template <unsigned MAX_INTERNAL_TOPK, class IdxT>                                  \
   struct topk_by_radix_sort<                                                         \
     MAX_INTERNAL_TOPK,                                                               \
-    BLOCK_SIZE,                                                                      \
     IdxT,                                                                            \
     std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>>     \
     : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {                                   \
@@ -77,10 +72,9 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
                                bool sort,                                            \
                                uint32_t* _smem)                                      \
     {                                                                                \
-      assert(BLOCK_SIZE >= V / 4);                                                   \
+      assert(blockDim.x >= V / 4);                                                   \
       std::uint8_t* state = (std::uint8_t*)work;                                     \
-      topk_cta_11_core<BLOCK_SIZE,                                                   \
-                       topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght, \
+      topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght, \
                        topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::vecLen,           \
                        V,                                                            \
                        V / 4,                                                        \
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
index 0fcfe2cc16..fd4aeb9bb3 100644
--- a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
@@ -22,8 +22,6 @@
 #include <stdio.h>
 
 namespace raft::neighbors::cagra::detail {
-using namespace cub;
-
 //
 __device__ inline uint32_t convert(uint32_t x)
 {
@@ -174,8 +172,46 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i
   return xi;
 }
 
+template <typename T>  // block-wide inclusive sum of `input`; dispatches on runtime blockDim.x
+__device__ inline void block_scan(const T input, T& output)
+{
+  switch (blockDim.x) {  // cub::BlockScan takes the thread count as a template argument
+    case 32: {
+      typedef cub::BlockScan<T, 32> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    case 64: {
+      typedef cub::BlockScan<T, 64> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    case 128: {
+      typedef cub::BlockScan<T, 128> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    case 256: {
+      typedef cub::BlockScan<T, 256> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    case 512: {
+      typedef cub::BlockScan<T, 512> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    case 1024: {
+      typedef cub::BlockScan<T, 1024> BlockScanT;
+      __shared__ typename BlockScanT::TempStorage temp_storage;
+      BlockScanT(temp_storage).InclusiveSum(input, output);
+    } break;
+    default: assert(false); break;  // unsupported block size: output would be left unwritten
+  }
+}
+
 //
-template <typename T, int blockDim_x, int stateBitLen, int vecLen>
+template <typename T, int stateBitLen, int vecLen>
 __device__ inline void update_histogram(int itr,
                                         uint32_t thread_id,
                                         uint32_t num_threads,
@@ -220,7 +256,7 @@ __device__ inline void update_histogram(int itr,
     return;
   }
   if (itr > 0) {
-    for (int i = threadIdx.x; i < num_bins; i += blockDim_x) {
+    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
       hist[i] = 0;
     }
     __syncthreads();
@@ -285,8 +321,53 @@ __device__ inline void update_histogram(int itr,
   __syncthreads();
 }
 
+template <unsigned blockDim_x>  // compile-time thread count used to instantiate cub::BlockScan
+__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index,
+                                                                 uint32_t& my_csum,
+                                                                 const unsigned num_bins,
+                                                                 const uint32_t* const hist,
+                                                                 const uint32_t nx_below_threshold,
+                                                                 const uint32_t max_threshold,
+                                                                 const uint32_t threshold,
+                                                                 const uint32_t shift,
+                                                                 const uint32_t topk)
+{
+  typedef cub::BlockScan<uint32_t, blockDim_x> BlockScanT;
+  __shared__ typename BlockScanT::TempStorage temp_storage;
+  if (num_bins == 2048) {  // each thread owns 2048 / blockDim_x consecutive histogram bins
+    constexpr int n_data = 2048 / blockDim_x;
+    uint32_t csum[n_data];
+    for (int i = 0; i < n_data; i++) {
+      csum[i] = hist[i + (n_data * threadIdx.x)];
+    }
+    BlockScanT(temp_storage).InclusiveSum(csum, csum);  // in-place inclusive prefix sum
+    for (int i = n_data - 1; i >= 0; i--) {  // walk this thread's bins from highest to lowest
+      if (nx_below_threshold + csum[i] > topk) continue;  // would select more than topk
+      const uint32_t index = i + (n_data * threadIdx.x);
+      if (threshold + (index << shift) > max_threshold) continue;  // beyond max_threshold
+      my_index = index;
+      my_csum  = csum[i];
+      break;  // keep only the highest bin of this thread that passes both checks
+    }
+  } else if (num_bins == 1024) {  // same procedure with 1024 / blockDim_x bins per thread
+    constexpr int n_data = 1024 / blockDim_x;
+    uint32_t csum[n_data];
+    for (int i = 0; i < n_data; i++) {
+      csum[i] = hist[i + (n_data * threadIdx.x)];
+    }
+    BlockScanT(temp_storage).InclusiveSum(csum, csum);  // in-place inclusive prefix sum
+    for (int i = n_data - 1; i >= 0; i--) {  // walk this thread's bins from highest to lowest
+      if (nx_below_threshold + csum[i] > topk) continue;  // would select more than topk
+      const uint32_t index = i + (n_data * threadIdx.x);
+      if (threshold + (index << shift) > max_threshold) continue;  // beyond max_threshold
+      my_index = index;
+      my_csum  = csum[i];
+      break;  // keep only the highest bin of this thread that passes both checks
+    }
+  }  // any other num_bins value leaves my_index / my_csum untouched
+}
+
 //
-template <int blockDim_x>
 __device__ inline void select_best_index_for_next_threshold(
   const uint32_t topk,
   const uint32_t threshold,
@@ -302,15 +383,12 @@ __device__ inline void select_best_index_for_next_threshold(
   // index under the condition that the sum of the number of elements found
   // so far ('nx_below_threshold') and the csum value does not exceed the
   // topk value.
-  typedef BlockScan<uint32_t, blockDim_x> BlockScanT;
-  __shared__ typename BlockScanT::TempStorage temp_storage;
-
   uint32_t my_index = 0xffffffff;
   uint32_t my_csum  = 0;
-  if (num_bins <= blockDim_x) {
+  if (num_bins <= blockDim.x) {
     uint32_t csum = 0;
     if (threadIdx.x < num_bins) { csum = hist[threadIdx.x]; }
-    BlockScanT(temp_storage).InclusiveSum(csum, csum);
+    detail::block_scan(csum, csum);
     if (threadIdx.x < num_bins) {
       const uint32_t index = threadIdx.x;
       if ((nx_below_threshold + csum <= topk) && (threshold + (index << shift) <= max_threshold)) {
@@ -319,36 +397,62 @@ __device__ inline void select_best_index_for_next_threshold(
       }
     }
   } else {
-    if (num_bins == 2048) {
-      constexpr int n_data = 2048 / blockDim_x;
-      uint32_t csum[n_data];
-      for (int i = 0; i < n_data; i++) {
-        csum[i] = hist[i + (n_data * threadIdx.x)];
-      }
-      BlockScanT(temp_storage).InclusiveSum(csum, csum);
-      for (int i = n_data - 1; i >= 0; i--) {
-        if (nx_below_threshold + csum[i] > topk) continue;
-        const uint32_t index = i + (n_data * threadIdx.x);
-        if (threshold + (index << shift) > max_threshold) continue;
-        my_index = index;
-        my_csum  = csum[i];
+    switch (blockDim.x) {
+      case 64:
+        select_best_index_for_next_threshold_core<64>(my_index,
+                                                      my_csum,
+                                                      num_bins,
+                                                      hist,
+                                                      nx_below_threshold,
+                                                      max_threshold,
+                                                      threshold,
+                                                      shift,
+                                                      topk);
         break;
-      }
-    } else if (num_bins == 1024) {
-      constexpr int n_data = 1024 / blockDim_x;
-      uint32_t csum[n_data];
-      for (int i = 0; i < n_data; i++) {
-        csum[i] = hist[i + (n_data * threadIdx.x)];
-      }
-      BlockScanT(temp_storage).InclusiveSum(csum, csum);
-      for (int i = n_data - 1; i >= 0; i--) {
-        if (nx_below_threshold + csum[i] > topk) continue;
-        const uint32_t index = i + (n_data * threadIdx.x);
-        if (threshold + (index << shift) > max_threshold) continue;
-        my_index = index;
-        my_csum  = csum[i];
+      case 128:
+        select_best_index_for_next_threshold_core<128>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 256:
+        select_best_index_for_next_threshold_core<256>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 512:
+        select_best_index_for_next_threshold_core<512>(my_index,
+                                                       my_csum,
+                                                       num_bins,
+                                                       hist,
+                                                       nx_below_threshold,
+                                                       max_threshold,
+                                                       threshold,
+                                                       shift,
+                                                       topk);
+        break;
+      case 1024:
+        select_best_index_for_next_threshold_core<1024>(my_index,
+                                                        my_csum,
+                                                        num_bins,
+                                                        hist,
+                                                        nx_below_threshold,
+                                                        max_threshold,
+                                                        threshold,
+                                                        shift,
+                                                        topk);
         break;
-      }
     }
   }
   if (threadIdx.x < num_bins) {
@@ -481,10 +585,14 @@ __device__ inline uint32_t max_value_of<uint32_t>()
   return ~0u;
 }
 
-template <int blockDim_x, int stateBitLen>
+template <int stateBitLen, unsigned BLOCK_SIZE = 0>
 __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 {
-  const uint32_t num_threads = blockDim_x;
+#ifdef __CUDA_ARCH__
+  const uint32_t num_threads = blockDim.x;
+#else
+  const uint32_t num_threads = BLOCK_SIZE;
+#endif
   if (stateBitLen == 8) {
     uint32_t numElements_perThread = (len_x + num_threads - 1) / num_threads;
     uint32_t numState_perThread    = (numElements_perThread + stateBitLen - 1) / stateBitLen;
@@ -494,7 +602,7 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 }
 
 //
-template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
+template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
 __device__ inline void topk_cta_11_core(uint32_t topk,
                                         uint32_t len_x,
                                         const uint32_t* _x,    // [size_batch, ld_x,]
@@ -511,7 +619,7 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
   uint32_t* const best_index    = &(_smem[2 * maxTopk + 2048]);
   uint32_t* const best_csum     = &(_smem[2 * maxTopk + 2048 + 3]);
 
-  const uint32_t num_threads = blockDim_x;
+  const uint32_t num_threads = blockDim.x;
   const uint32_t thread_id   = threadIdx.x;
   uint32_t nx                = len_x;
   const uint32_t* const x    = _x;
@@ -541,29 +649,29 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
   for (int j = 0; j < 3; j += 1) {
     uint32_t num_bins;
     uint32_t shift;
-    update_histogram<uint32_t, blockDim_x, stateBitLen, vecLen>(j,
-                                                                thread_id,
-                                                                num_threads,
-                                                                hint,
-                                                                threshold,
-                                                                num_bins,
-                                                                shift,
-                                                                x,
-                                                                nx,
-                                                                hist,
-                                                                state,
-                                                                smem_out_vals,
-                                                                output_count);
-
-    select_best_index_for_next_threshold<blockDim_x>(topk,
-                                                     threshold,
-                                                     hint,
-                                                     nx_below_threshold,
-                                                     num_bins,
-                                                     shift,
-                                                     hist,
-                                                     best_index + j,
-                                                     best_csum + j);
+
+    update_histogram<uint32_t, stateBitLen, vecLen>(j,
+                                                    thread_id,
+                                                    num_threads,
+                                                    hint,
+                                                    threshold,
+                                                    num_bins,
+                                                    shift,
+                                                    x,
+                                                    nx,
+                                                    hist,
+                                                    state,
+                                                    smem_out_vals,
+                                                    output_count);
+    select_best_index_for_next_threshold(topk,
+                                         threshold,
+                                         hint,
+                                         nx_below_threshold,
+                                         num_bins,
+                                         shift,
+                                         hist,
+                                         best_index + j,
+                                         best_csum + j);
 
     threshold += (best_index[j] << shift);
     nx_below_threshold += best_csum[j];
@@ -601,7 +709,7 @@ __device__ inline void topk_cta_11_core(uint32_t topk,
 #endif
 
   if (!sort) {
-    for (int k = thread_id; k < topk; k += blockDim_x) {
+    for (int k = thread_id; k < topk; k += blockDim.x) {
       const uint32_t i = smem_out_vals[k];
       if (y) { y[k] = x[i]; }
       if (out_vals) {
@@ -756,7 +864,7 @@ int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH)
 }
 }  // unnamed namespace
 
-template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
+template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
 __launch_bounds__(1024, 1) __global__
   void kern_topk_cta_11(uint32_t topk,
                         uint32_t size_batch,
@@ -781,14 +889,14 @@ __launch_bounds__(1024, 1) __global__
                 "maxTopk * sizeof(ValT) must be smaller or equal to 8192 byte");
   __shared__ uint32_t _smem[smem_len];
 
-  topk_cta_11_core<blockDim_x, stateBitLen, vecLen, maxTopk, numSortThreads, ValT>(
+  topk_cta_11_core<stateBitLen, vecLen, maxTopk, numSortThreads, ValT>(
     topk,
     len_x,
     (_x == NULL ? NULL : _x + i_batch * ld_x),
     (_in_vals == NULL ? NULL : _in_vals + i_batch * ld_iv),
     (_y == NULL ? NULL : _y + i_batch * ld_y),
     (_out_vals == NULL ? NULL : _out_vals + i_batch * ld_ov),
-    (_state == NULL ? NULL : _state + i_batch * get_state_size<blockDim_x, stateBitLen>(len_x)),
+    (_state == NULL ? NULL : _state + i_batch * get_state_size<stateBitLen>(len_x)),
     (_hints == NULL ? NULL : _hints + i_batch),
     sort,
     _smem);
@@ -808,7 +916,7 @@ size_t inline _cuann_find_topk_bufferSize(uint32_t topK,
   // state
   if (stateBitLen == 8) {
     workspaceSize = _cuann_aligned(
-      sizeof(uint8_t) * get_state_size<numThreads, stateBitLen>(numElements) * sizeBatch);
+      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
   }
 
   return workspaceSize;
@@ -862,12 +970,12 @@ inline void _cuann_find_topk(uint32_t topK,
                      bool) = nullptr;
 
   // V:vecLen, K:maxTopk, T:numSortThreads
-#define SET_KERNEL_VKT(V, K, T, ValT)                                      \
-  do {                                                                     \
-    assert(numThreads >= T);                                               \
-    assert((K % T) == 0);                                                  \
-    assert((K / T) <= 4);                                                  \
-    cta_kernel = kern_topk_cta_11<numThreads, stateBitLen, V, K, T, ValT>; \
+#define SET_KERNEL_VKT(V, K, T, ValT)                          \
+  do {                                                         \
+    assert(numThreads >= T);                                   \
+    assert((K % T) == 0);                                      \
+    assert((K / T) <= 4);                                      \
+    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
   } while (0)
 
   // V: vecLen
diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
index 123a902ef9..be05d5545f 100644
--- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -64,10 +64,11 @@ void tiled_brute_force_knn(const raft::resources& handle,
                            ElementType* distances,  // size (m, k)
                            IndexType* indices,      // size (m, k)
                            raft::distance::DistanceType metric,
-                           float metric_arg                   = 2.0,
-                           size_t max_row_tile_size           = 0,
-                           size_t max_col_tile_size           = 0,
-                           DistanceEpilogue distance_epilogue = raft::identity_op())
+                           float metric_arg                           = 2.0,
+                           size_t max_row_tile_size                   = 0,
+                           size_t max_col_tile_size                   = 0,
+                           DistanceEpilogue distance_epilogue         = raft::identity_op(),
+                           const ElementType* precomputed_index_norms = nullptr)
 {
   // Figure out the number of rows/cols to tile for
   size_t tile_rows   = 0;
@@ -97,7 +98,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
       metric == raft::distance::DistanceType::L2SqrtExpanded ||
       metric == raft::distance::DistanceType::CosineExpanded) {
     search_norms.resize(m, stream);
-    index_norms.resize(n, stream);
+    if (!precomputed_index_norms) { index_norms.resize(n, stream); }
     // cosine needs the l2norm, where as l2 distances needs the squared norm
     if (metric == raft::distance::DistanceType::CosineExpanded) {
       raft::linalg::rowNorm(search_norms.data(),
@@ -108,19 +109,24 @@ void tiled_brute_force_knn(const raft::resources& handle,
                             true,
                             stream,
                             raft::sqrt_op{});
-      raft::linalg::rowNorm(index_norms.data(),
-                            index,
-                            d,
-                            n,
-                            raft::linalg::NormType::L2Norm,
-                            true,
-                            stream,
-                            raft::sqrt_op{});
+      if (!precomputed_index_norms) {
+        raft::linalg::rowNorm(index_norms.data(),
+                              index,
+                              d,
+                              n,
+                              raft::linalg::NormType::L2Norm,
+                              true,
+                              stream,
+                              raft::sqrt_op{});
+      }
     } else {
       raft::linalg::rowNorm(
         search_norms.data(), search, d, m, raft::linalg::NormType::L2Norm, true, stream);
-      raft::linalg::rowNorm(
-        index_norms.data(), index, d, n, raft::linalg::NormType::L2Norm, true, stream);
+
+      if (!precomputed_index_norms) {
+        raft::linalg::rowNorm(
+          index_norms.data(), index, d, n, raft::linalg::NormType::L2Norm, true, stream);
+      }
     }
     pairwise_metric = raft::distance::DistanceType::InnerProduct;
   }
@@ -178,7 +184,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
       if (metric == raft::distance::DistanceType::L2Expanded ||
           metric == raft::distance::DistanceType::L2SqrtExpanded) {
         auto row_norms = search_norms.data();
-        auto col_norms = index_norms.data();
+        auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data();
         auto dist      = temp_distances.data();
 
         raft::linalg::map_offset(
@@ -200,7 +206,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
           });
       } else if (metric == raft::distance::DistanceType::CosineExpanded) {
         auto row_norms = search_norms.data();
-        auto col_norms = index_norms.data();
+        auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data();
         auto dist      = temp_distances.data();
 
         raft::linalg::map_offset(
@@ -330,7 +336,8 @@ void brute_force_knn_impl(
   std::vector<IdxType>* translations  = nullptr,
   raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
   float metricArg                     = 0,
-  DistanceEpilogue distance_epilogue  = raft::identity_op())
+  DistanceEpilogue distance_epilogue  = raft::identity_op(),
+  std::vector<value_t*>* input_norms  = nullptr)
 {
   auto userStream = resource::get_cuda_stream(handle);
 
@@ -424,7 +431,8 @@ void brute_force_knn_impl(
                  rowMajorIndex,
                  rowMajorQuery,
                  stream,
-                 metric);
+                 metric,
+                 input_norms ? (*input_norms)[i] : nullptr);
 
       // Perform necessary post-processing
       if (metric == raft::distance::DistanceType::L2SqrtExpanded ||
@@ -473,7 +481,8 @@ void brute_force_knn_impl(
                                                   metricArg,
                                                   0,
                                                   0,
-                                                  distance_epilogue);
+                                                  distance_epilogue,
+                                                  input_norms ? (*input_norms)[i] : nullptr);
           break;
       }
     }
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
new file mode 100644
index 0000000000..009ffd4684
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -0,0 +1,1452 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <mma.h>
+#include <omp.h>
+
+#include <cub/cub.cuh>
+#include <limits>
+#include <queue>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/host_vector.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/device_memory_resource.h>
+
+#include "../nn_descent_types.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/neighbors/detail/cagra/device_common.hpp>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/arch.cuh>  // raft::util::arch::SM_*
+#include <raft/util/cuda_dev_essentials.cuh>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+namespace raft::neighbors::experimental::nn_descent::detail {
+
+using pinned_memory_resource = thrust::universal_host_pinned_memory_resource;
+template <typename T>  // allocator type used by the thrust::host_vector members in this file
+using pinned_memory_allocator = thrust::mr::stateless_resource_allocator<T, pinned_memory_resource>;
+
+using DistData_t = float;  // type of the distances and norms handled in this file
+constexpr int DEGREE_ON_DEVICE{32};   // NOTE(review): use site not visible in this part - confirm
+constexpr int SEGMENT_SIZE{32};       // NOTE(review): use site not visible in this part - confirm
+constexpr int counter_interval{100};  // NOTE(review): use site not visible in this part - confirm
+template <typename Index_t>
+struct InternalID_t;
+
+// Node id whose sign carries a flag: values >= 0 are "new", an "old" id is stored as -(id + 1).
+template <>
+class InternalID_t<int> {
+ private:
+  using Index_t = int;
+  Index_t id_{std::numeric_limits<Index_t>::max()};  // default: largest id, flagged new
+
+ public:
+  __host__ __device__ bool is_new() const { return !(id_ < 0); }
+  __host__ __device__ Index_t& id_with_flag() { return id_; }  // raw value, flag included
+  __host__ __device__ Index_t id() const
+  {
+    // Strip the flag: an old entry holds -(id + 1).
+    return is_new() ? id_ : -id_ - 1;
+  }
+  __host__ __device__ void mark_old()
+  {
+    if (is_new()) { id_ = -id_ - 1; }  // already-old entries are left untouched
+  }
+  __host__ __device__ bool operator==(const InternalID_t<int>& other) const
+  {
+    return other.id() == id();  // the new/old flag does not take part in the comparison
+  }
+};
+
+template <typename Index_t>
+struct ResultItem;
+
+template <>
+class ResultItem<int> {  // (flagged id, distance) pair; the id uses the InternalID_t encoding
+ private:
+  using Index_t = int;
+  Index_t id_;
+  DistData_t dist_;
+
+ public:
+  __host__ __device__ ResultItem()  // default item: largest id and largest distance
+    : id_(std::numeric_limits<Index_t>::max()), dist_(std::numeric_limits<DistData_t>::max()){}
+  __host__ __device__ ResultItem(const Index_t id_with_flag, const DistData_t dist)
+    : id_(id_with_flag), dist_(dist){}
+  __host__ __device__ bool is_new() const { return !(id_ < 0); }
+  __host__ __device__ Index_t& id_with_flag() { return id_; }  // raw value, flag included
+  __host__ __device__ Index_t id() const
+  {
+    // Strip the flag: an old entry holds -(id + 1).
+    return is_new() ? id_ : -id_ - 1;
+  }
+  __host__ __device__ DistData_t& dist() { return dist_; }
+
+  __host__ __device__ void mark_old()
+  {
+    if (is_new()) { id_ = -id_ - 1; }  // already-old entries are left untouched
+  }
+
+  __host__ __device__ bool operator<(const ResultItem<Index_t>& other) const
+  {
+    if (dist_ != other.dist_) return dist_ < other.dist_;
+    return id() < other.id();  // equal distances: the smaller id goes first
+  }
+  __host__ __device__ bool operator==(const ResultItem<Index_t>& other) const
+  {
+    return other.id() == id();  // equality looks at the id only, not at the distance
+  }
+  __host__ __device__ bool operator>=(const ResultItem<Index_t>& other) const
+  {
+    return !operator<(other);
+  }
+  __host__ __device__ bool operator<=(const ResultItem<Index_t>& other) const
+  {
+    return operator<(other) || operator==(other);  // same id counts as "<=" at any distance
+  }
+  __host__ __device__ bool operator>(const ResultItem<Index_t>& other) const
+  {
+    return !operator<=(other);
+  }
+  __host__ __device__ bool operator!=(const ResultItem<Index_t>& other) const
+  {
+    return !operator==(other);
+  }
+};
+
+using align32 = raft::Pow2<32>;
+
+template <typename T>
+int get_batch_size(const int it_now, const T nrow, const int batch_size)
+{
+  const int num_batches = ceildiv(nrow, batch_size);  // batches needed to cover nrow rows
+  return (it_now != num_batches - 1) ? batch_size : nrow - it_now * batch_size;  // tail may be short
+}
+
+// Pads a row length (in elements of T) for avoiding bank conflict. Only T = float is supported.
+template <typename T>
+constexpr __host__ __device__ __forceinline__ int skew_dim(int ndim)
+{
+  // Reject unsupported types at compile time instead of flowing off the end without a return.
+  static_assert(std::is_same<T, float>::value, "skew_dim is only implemented for float");
+  // All "4"s are for alignment: round up to a multiple of 4, then add 4 more on multiples of 32.
+  ndim = ceildiv(ndim, 4) * 4;
+  return ndim + (ndim % 32 == 0) * 4;
+}
+
+template <typename T>
+__device__ __forceinline__ ResultItem<T> xor_swap(ResultItem<T> x, int mask, int dir)
+{
+  const auto full_mask = raft::warp_full_mask();
+  ResultItem<T> partner;  // item held by the lane this one is paired with by the xor shuffle
+  partner.dist()         = __shfl_xor_sync(full_mask, x.dist(), mask, raft::warp_size());
+  partner.id_with_flag() = __shfl_xor_sync(full_mask, x.id_with_flag(), mask, raft::warp_size());
+  return ((x < partner) == dir) ? partner : x;  // take the partner's item when (x < it) equals dir
+}
+
+__device__ __forceinline__ int xor_swap(int x, int mask, int dir)
+{
+  const int partner = __shfl_xor_sync(raft::warp_full_mask(), x, mask, raft::warp_size());
+  return ((x < partner) == dir) ? partner : x;  // take the partner's value when (x < it) equals dir
+}
+
+// Returns bit `pos` of `lane_id`. TODO: Move to RAFT utils https://github.com/rapidsai/raft/issues/1827
+__device__ __forceinline__ uint bfe(uint lane_id, uint pos)
+{
+  uint res;
+  asm("bfe.u32 %0,%1,%2,%3;" : "=r"(res) : "r"(lane_id), "r"(pos), "r"(1));  // PTX bit-field extract, length 1
+  return res;
+}
+
+template <typename T>
+__device__ __forceinline__ void warp_bitonic_sort(T* element_ptr, const int lane_id)
+{  // Bitonic sort of one value per lane; afterwards *element_ptr is ascending with lane_id.
+  static_assert(raft::warp_size() == 32);  // the 15 exchange steps below are written for 32 lanes
+  auto& element = *element_ptr;
+  element       = xor_swap(element, 0x01, bfe(lane_id, 1) ^ bfe(lane_id, 0));  // sorted runs of 2
+  element       = xor_swap(element, 0x02, bfe(lane_id, 2) ^ bfe(lane_id, 1));  // runs of 4 (2 steps)
+  element       = xor_swap(element, 0x01, bfe(lane_id, 2) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x04, bfe(lane_id, 3) ^ bfe(lane_id, 2));  // runs of 8 (3 steps)
+  element       = xor_swap(element, 0x02, bfe(lane_id, 3) ^ bfe(lane_id, 1));
+  element       = xor_swap(element, 0x01, bfe(lane_id, 3) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x08, bfe(lane_id, 4) ^ bfe(lane_id, 3));  // runs of 16 (4 steps)
+  element       = xor_swap(element, 0x04, bfe(lane_id, 4) ^ bfe(lane_id, 2));
+  element       = xor_swap(element, 0x02, bfe(lane_id, 4) ^ bfe(lane_id, 1));
+  element       = xor_swap(element, 0x01, bfe(lane_id, 4) ^ bfe(lane_id, 0));
+  element       = xor_swap(element, 0x10, bfe(lane_id, 4));  // final merge over all 32 lanes (5 steps)
+  element       = xor_swap(element, 0x08, bfe(lane_id, 3));  // lane with the mask bit clear keeps the
+  element       = xor_swap(element, 0x04, bfe(lane_id, 2));  // smaller value, the other the larger
+  element       = xor_swap(element, 0x02, bfe(lane_id, 1));
+  element       = xor_swap(element, 0x01, bfe(lane_id, 0));
+  return;
+}
+
+struct BuildConfig {  // Parameters passed to the GNND constructor.
+  size_t max_dataset_size;  // no default: must be set by the caller
+  size_t dataset_dim;       // no default: must be set by the caller
+  size_t node_degree{64};
+  size_t internal_node_degree{0};
+  // If internal_node_degree == 0, the value of node_degree will be assigned to it
+  size_t max_iterations{50};
+  float termination_threshold{0.0001};  // NOTE(review): how it is compared is not visible here
+};
+
+template <typename Index_t>
+class BloomFilter {  // Host-side Bloom filter: every list owns num_sets_per_list sets of 512 bits.
+ public:
+  BloomFilter(size_t nrow, size_t num_sets_per_list, size_t num_hashs)
+    : bitsets_(nrow * num_bits_per_set_ * num_sets_per_list),  // initializers in declaration order
+      nrow_(nrow),
+      num_sets_per_list_(num_sets_per_list),
+      num_hashs_(num_hashs)
+  {
+  }
+
+  void add(size_t list_id, Index_t key)  // records `key` in the filter of list `list_id`
+  {
+    if (is_cleared) { is_cleared = false; }
+    uint32_t hash         = hash_0(key);
+    size_t global_set_idx = list_id * num_bits_per_set_ * num_sets_per_list_ +
+                            key % num_sets_per_list_ * num_bits_per_set_;  // the set is chosen by key
+    bitsets_[global_set_idx + hash % num_bits_per_set_] = 1;
+    for (size_t i = 1; i < num_hashs_; i++) {
+      hash                                                = hash + hash_1(key);  // i-th probe
+      bitsets_[global_set_idx + hash % num_bits_per_set_] = 1;
+    }
+  }
+
+  bool check(size_t list_id, Index_t key)  // true only if every bit add() would set is already set
+  {
+    bool is_present       = true;
+    uint32_t hash         = hash_0(key);
+    size_t global_set_idx = list_id * num_bits_per_set_ * num_sets_per_list_ +
+                            key % num_sets_per_list_ * num_bits_per_set_;
+    is_present &= bitsets_[global_set_idx + hash % num_bits_per_set_];
+
+    if (!is_present) return false;
+    for (size_t i = 1; i < num_hashs_; i++) {
+      hash = hash + hash_1(key);
+      is_present &= bitsets_[global_set_idx + hash % num_bits_per_set_];
+      if (!is_present) return false;
+    }
+    return true;
+  }
+
+  void clear()  // resets every bit; does nothing if no key was added since the last clear
+  {
+    if (is_cleared) return;
+    // std::vector<bool> packs many bits into one word, so clearing single bits from several
+    // OpenMP threads races on the words shared by two threads' ranges. Reset the whole vector
+    // with one serial call instead.
+    bitsets_.assign(bitsets_.size(), false);
+    is_cleared = true;
+  }
+
+ private:
+  uint32_t hash_0(uint32_t value)  // first probe position
+  {
+    value *= 1103515245;
+    value += 12345;
+    value ^= value << 13;
+    value ^= value >> 17;
+    value ^= value << 5;
+    return value;
+  }
+
+  uint32_t hash_1(uint32_t value)  // step added to the running hash for each further probe
+  {
+    value *= 1664525;
+    value += 1013904223;
+    value ^= value << 13;
+    value ^= value >> 17;
+    value ^= value << 5;
+    return value;
+  }
+
+  static constexpr int num_bits_per_set_ = 512;
+  bool is_cleared{true};  // true while no bit has been set since construction or the last clear()
+  std::vector<bool> bitsets_;
+  size_t nrow_;
+  size_t num_sets_per_list_;
+  size_t num_hashs_;
+};
+
+template <typename Index_t>
+struct GnndGraph {  // Host-side graph state; member functions are only declared here.
+  static constexpr int segment_size = 32;
+  InternalID_t<Index_t>* h_graph;  // NOTE(review): raw pointer; allocation/ownership not visible here
+
+  size_t nrow;
+  size_t node_degree;
+  int num_samples;
+  int num_segments;
+
+  raft::host_matrix<DistData_t, size_t, raft::row_major> h_dists;
+
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_new;  // pinned allocator
+  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_new;
+
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old;  // pinned allocator
+  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_old;
+  BloomFilter<Index_t> bloom_filter;
+
+  GnndGraph(const GnndGraph&)            = delete;  // non-copyable
+  GnndGraph& operator=(const GnndGraph&) = delete;
+  GnndGraph(const size_t nrow,
+            const size_t node_degree,
+            const size_t internal_node_degree,
+            const size_t num_samples);
+  void init_random_graph();
+  // TODO: Create a generic bloom filter utility https://github.com/rapidsai/raft/issues/1827
+  // Use Bloom filter to sample "new" neighbors for local joining
+  void sample_graph_new(InternalID_t<Index_t>* new_neighbors, const size_t width);
+  void sample_graph(bool sample_new);
+  void update_graph(const InternalID_t<Index_t>* new_neighbors,
+                    const DistData_t* new_dists,
+                    const size_t width,
+                    std::atomic<int64_t>& update_counter);  // counter is passed by reference
+  void sort_lists();
+  void clear();
+  ~GnndGraph();
+};
+
+template <typename Data_t = float, typename Index_t = int>
+class GNND {  // Graph builder: constructed from a BuildConfig, driven through build(). Non-copyable.
+ public:
+  GNND(raft::resources const& res, const BuildConfig& build_config);
+  GNND(const GNND&)            = delete;
+  GNND& operator=(const GNND&) = delete;
+
+  void build(Data_t* data, const Index_t nrow, Index_t* output_graph);  // declared only in this class body
+  ~GNND()    = default;
+  using ID_t = InternalID_t<Index_t>;
+
+ private:
+  void add_reverse_edges(Index_t* graph_ptr,
+                         Index_t* h_rev_graph_ptr,
+                         Index_t* d_rev_graph_ptr,
+                         int2* list_sizes,
+                         cudaStream_t stream = 0);
+  void local_join(cudaStream_t stream = 0);
+
+  raft::resources const& res;  // reference member: the resources object must outlive this GNND
+
+  BuildConfig build_config_;
+  GnndGraph<Index_t> graph_;
+  std::atomic<int64_t> update_counter_;
+
+  Index_t nrow_;
+  const int ndim_;
+
+  raft::device_matrix<__half, Index_t, raft::row_major> d_data_;  // device data is held as __half
+  raft::device_vector<DistData_t, Index_t> l2_norms_;
+
+  raft::device_matrix<ID_t, Index_t, raft::row_major> graph_buffer_;
+  raft::device_matrix<DistData_t, Index_t, raft::row_major> dists_buffer_;
+
+  // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827
+  thrust::host_vector<ID_t, pinned_memory_allocator<ID_t>> graph_host_buffer_;
+  thrust::host_vector<DistData_t, pinned_memory_allocator<DistData_t>> dists_host_buffer_;
+
+  raft::device_vector<int, Index_t> d_locks_;  // presumably the `locks` array of insert_to_global_graph - confirm
+
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_new_;
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old_;
+  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_old_;
+  // int2.x is the number of forward edges, int2.y is the number of reverse edges
+
+  raft::device_vector<int2, Index_t> d_list_sizes_new_;
+  raft::device_vector<int2, Index_t> d_list_sizes_old_;
+};
+
+constexpr int TILE_ROW_WIDTH = 64;   // NOTE(review): use site not visible in this part - confirm
+constexpr int TILE_COL_WIDTH = 128;  // vector elements staged per step in local_join_kernel
+
+constexpr int NUM_SAMPLES = 32;
+// For now, the max. number of samples is 32, so the sample cache size is fixed
+// to 64 (32 * 2).
+constexpr int MAX_NUM_BI_SAMPLES        = 64;
+constexpr int SKEWED_MAX_NUM_BI_SAMPLES = skew_dim<float>(MAX_NUM_BI_SAMPLES);  // 64 padded to 68
+constexpr int BLOCK_SIZE                = 512;  // threads per block assumed by local_join_kernel
+constexpr int WMMA_M                    = 16;   // WMMA_M/N/K: fragment shape of the wmma tiles
+constexpr int WMMA_N                    = 16;
+constexpr int WMMA_K                    = 16;
+
+template <typename Data_t>
+__device__ __forceinline__ void load_vec(Data_t* vec_buffer,
+                                         const Data_t* d_vec,
+                                         const int load_dims,
+                                         const int padding_dims,
+                                         const int lane_id)
+{  // Copies d_vec[0, load_dims) into vec_buffer and zero-fills [load_dims, padding_dims).
+  if constexpr (std::is_same_v<Data_t, float> or std::is_same_v<Data_t, uint8_t> or
+                std::is_same_v<Data_t, int8_t>) {
+    constexpr int num_load_elems_per_warp = raft::warp_size();  // one element per lane per step
+    for (int step = 0; step < ceildiv(padding_dims, num_load_elems_per_warp); step++) {
+      int idx = step * num_load_elems_per_warp + lane_id;
+      if (idx < load_dims) {
+        vec_buffer[idx] = d_vec[idx];
+      } else if (idx < padding_dims) {
+        vec_buffer[idx] = 0.0f;  // padding
+      }
+    }
+  }
+  if constexpr (std::is_same_v<Data_t, __half>) {
+    if ((size_t)d_vec % sizeof(float2) == 0 && (size_t)vec_buffer % sizeof(float2) == 0 &&
+        load_dims % 4 == 0 && padding_dims % 4 == 0) {  // both pointers float2-aligned, sizes in 4s
+      constexpr int num_load_elems_per_warp = raft::warp_size() * 4;  // four halves per lane per step
+#pragma unroll
+      for (int step = 0; step < ceildiv(padding_dims, num_load_elems_per_warp); step++) {
+        int idx_in_vec = step * num_load_elems_per_warp + lane_id * 4;
+        if (idx_in_vec + 4 <= load_dims) {
+          *(float2*)(vec_buffer + idx_in_vec) = *(float2*)(d_vec + idx_in_vec);  // 4 halves as one float2
+        } else if (idx_in_vec + 4 <= padding_dims) {
+          *(float2*)(vec_buffer + idx_in_vec) = float2({0.0f, 0.0f});  // padding, 4 halves at once
+        }
+      }
+    } else {  // fallback: element-wise copy, same scheme as the non-half branch above
+      constexpr int num_load_elems_per_warp = raft::warp_size();
+      for (int step = 0; step < ceildiv(padding_dims, num_load_elems_per_warp); step++) {
+        int idx = step * num_load_elems_per_warp + lane_id;
+        if (idx < load_dims) {
+          vec_buffer[idx] = d_vec[idx];
+        } else if (idx < padding_dims) {
+          vec_buffer[idx] = 0.0f;
+        }
+      }
+    }
+  }
+}
+
+// TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827
+/** Per row (one block each): sum of squares, then write the row to output_data as __half. */
+template <typename Data_t>
+__global__ void preprocess_data_kernel(const Data_t* input_data,
+                                       __half* output_data,
+                                       int dim,
+                                       DistData_t* l2_norms,  // nullptr selects the normalizing branch
+                                       size_t list_offset = 0)  // output row of this launch's block 0
+{
+  extern __shared__ char buffer[];
+  __shared__ float l2_norm;  // running sum of squares of this block's row
+  Data_t* s_vec  = (Data_t*)buffer;
+  size_t list_id = list_offset + blockIdx.x;
+
+  // Widen before multiplying: blockIdx.x * dim is otherwise evaluated in 32 bits and can wrap.
+  load_vec(s_vec, input_data + (size_t)blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size());
+  if (threadIdx.x == 0) { l2_norm = 0; }
+  __syncthreads();
+  int lane_id = threadIdx.x % raft::warp_size();
+  // NOTE(review): lane 0 of every warp adds into l2_norm - looks like one warp per block is assumed.
+  for (int step = 0; step < ceildiv(dim, raft::warp_size()); step++) {
+    int idx         = step * raft::warp_size() + lane_id;
+    float part_dist = 0;
+    if (idx < dim) {
+      part_dist = s_vec[idx];
+      part_dist = part_dist * part_dist;
+    }
+    __syncwarp();
+    for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
+      part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);  // lane 0 gets the sum
+    }
+    if (lane_id == 0) { l2_norm += part_dist; }
+    __syncwarp();
+  }
+  for (int step = 0; step < ceildiv(dim, raft::warp_size()); step++) {
+    int idx = step * raft::warp_size() + threadIdx.x;
+    if (idx < dim) {
+      if (l2_norms == nullptr) {  // no norm output requested: store the row divided by its L2 norm
+        output_data[list_id * dim + idx] =
+          (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm);
+      } else {  // store the row unchanged and record its sum of squares (the squared L2 norm)
+        output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx];
+        if (idx == 0) { l2_norms[list_id] = l2_norm; }
+      }
+    }
+  }
+}
+
+template <typename Index_t>
+__global__ void add_rev_edges_kernel(const Index_t* graph,
+                                     Index_t* rev_graph,
+                                     int num_samples,
+                                     int2* list_sizes)
+{
+  const size_t src        = blockIdx.x;  // one block per source list
+  const int num_fwd_edges = list_sizes[src].x;
+
+  for (int e = threadIdx.x; e < num_fwd_edges; e += blockDim.x) {
+    // Each list has room for num_samples forward and num_samples reverse edges.
+    const size_t dst = graph[src * num_samples + e];
+    // Claim the next slot of dst's reverse list.
+    const int slot = atomicAdd(&list_sizes[dst].y, 1);
+    if (slot < num_samples) {
+      rev_graph[dst * num_samples + slot] = src;
+    } else {
+      atomicExch(&list_sizes[dst].y, num_samples);  // reverse list is full: reset the count, drop edge
+    }
+  }
+}
+
+template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
+__device__ void insert_to_global_graph(ResultItem<Index_t> elem,
+                                       size_t list_id,
+                                       ID_t* graph,
+                                       DistData_t* dists,
+                                       int node_degree,
+                                       int* locks)
+{  // Warp-cooperative insert of `elem` into one 32-entry segment of list `list_id`, under a lock.
+  int tx                 = threadIdx.x;
+  int lane_id            = tx % raft::warp_size();
+  size_t global_idx_base = list_id * node_degree;
+  if (elem.id() == list_id) return;  // never insert a node into its own list
+
+  const int num_segments = ceildiv(node_degree, raft::warp_size());
+
+  int loop_flag = 0;
+  do {
+    int segment_id = elem.id() % num_segments;  // the target segment is chosen by the element id
+    if (lane_id == 0) {
+      loop_flag = atomicCAS(&locks[list_id * num_segments + segment_id], 0, 1) == 0;  // try to lock
+    }
+
+    loop_flag = __shfl_sync(raft::warp_full_mask(), loop_flag, 0);  // lane 0 tells the warp the outcome
+
+    if (loop_flag == 1) {
+      ResultItem<Index_t> knn_list_frag;  // this lane's entry of the segment (default item if past the end)
+      int local_idx     = segment_id * raft::warp_size() + lane_id;
+      size_t global_idx = global_idx_base + local_idx;
+      if (local_idx < node_degree) {
+        knn_list_frag.id_with_flag() = graph[global_idx].id_with_flag();
+        knn_list_frag.dist()         = dists[global_idx];
+      }
+
+      int pos_to_insert = -1;
+      ResultItem<Index_t> prev_elem;  // entry held by lane_id - 1
+
+      prev_elem.id_with_flag() =
+        __shfl_up_sync(raft::warp_full_mask(), knn_list_frag.id_with_flag(), 1);
+      prev_elem.dist() = __shfl_up_sync(raft::warp_full_mask(), knn_list_frag.dist(), 1);
+
+      if (lane_id == 0) {  // lane 0 has no predecessor: use the smallest id and lowest distance
+        prev_elem = ResultItem<Index_t>{std::numeric_limits<Index_t>::min(),
+                                        std::numeric_limits<DistData_t>::lowest()};
+      }
+      if (elem > prev_elem && elem < knn_list_frag) {
+        pos_to_insert = segment_id * raft::warp_size() + lane_id;  // elem belongs right before this entry
+      } else if (elem == prev_elem || elem == knn_list_frag) {
+        pos_to_insert = -2;  // this lane saw an entry with the same id
+      }
+      uint mask = __ballot_sync(raft::warp_full_mask(), pos_to_insert >= 0);
+      if (mask) {
+        uint set_lane_id = __fns(mask, 0, 1);  // lowest lane that found an insert position
+        pos_to_insert    = __shfl_sync(raft::warp_full_mask(), pos_to_insert, set_lane_id);
+      }
+
+      if (pos_to_insert >= 0) {
+        int local_idx = segment_id * raft::warp_size() + lane_id;
+        if (local_idx > pos_to_insert) {
+          local_idx++;  // entries behind the insert position move one slot to the right
+        } else if (local_idx == pos_to_insert) {
+          graph[global_idx_base + local_idx].id_with_flag() = elem.id_with_flag();
+          dists[global_idx_base + local_idx]                = elem.dist();
+          local_idx++;  // the entry that was here moves right as well
+        }
+        size_t global_pos = global_idx_base + local_idx;
+        if (local_idx < (segment_id + 1) * raft::warp_size() && local_idx < node_degree) {
+          graph[global_pos].id_with_flag() = knn_list_frag.id_with_flag();  // else: shifted out, dropped
+          dists[global_pos]                = knn_list_frag.dist();
+        }
+      }
+      __threadfence();  // fence between the list writes above and the lock release below
+      if (loop_flag && lane_id == 0) { atomicExch(&locks[list_id * num_segments + segment_id], 0); }
+    }
+  } while (!loop_flag);  // retry until the lock was taken once
+}
+
+template <typename Index_t>
+__device__ ResultItem<Index_t> get_min_item(const Index_t id,
+                                            const int idx_in_list,
+                                            const Index_t* neighbs,
+                                            const DistData_t* distances,
+                                            const bool find_in_row = true)
+{  // Warp-wide argmin over row (or column) idx_in_list of `distances`, skipping entries equal to id.
+  int lane_id = threadIdx.x % raft::warp_size();
+
+  static_assert(MAX_NUM_BI_SAMPLES == 64);  // each lane owns exactly two candidates
+  int idx[MAX_NUM_BI_SAMPLES / raft::warp_size()];
+  float dist[MAX_NUM_BI_SAMPLES / raft::warp_size()] = {std::numeric_limits<DistData_t>::max(),
+                                                        std::numeric_limits<DistData_t>::max()};
+  idx[0]                                             = lane_id;
+  idx[1]                                             = raft::warp_size() + lane_id;
+
+  if (neighbs[idx[0]] != id) {  // a candidate equal to `id` keeps the max distance and cannot win
+    dist[0] = find_in_row ? distances[idx_in_list * SKEWED_MAX_NUM_BI_SAMPLES + lane_id]
+                          : distances[idx_in_list + lane_id * SKEWED_MAX_NUM_BI_SAMPLES];
+  }
+
+  if (neighbs[idx[1]] != id) {
+    dist[1] =
+      find_in_row
+        ? distances[idx_in_list * SKEWED_MAX_NUM_BI_SAMPLES + raft::warp_size() + lane_id]
+        : distances[idx_in_list + (raft::warp_size() + lane_id) * SKEWED_MAX_NUM_BI_SAMPLES];
+  }
+
+  if (dist[1] < dist[0]) {  // keep the better of this lane's two candidates in slot 0
+    dist[0] = dist[1];
+    idx[0]  = idx[1];
+  }
+  __syncwarp();
+  for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
+    int other_idx    = __shfl_down_sync(raft::warp_full_mask(), idx[0], offset);  // index stays an int
+    float other_dist = __shfl_down_sync(raft::warp_full_mask(), dist[0], offset);
+    if (other_dist < dist[0]) {
+      dist[0] = other_dist;
+      idx[0]  = other_idx;
+    }
+  }
+
+  ResultItem<Index_t> result;  // lane 0 holds the minimum; broadcast it to every lane
+  result.dist()         = __shfl_sync(raft::warp_full_mask(), dist[0], 0);
+  result.id_with_flag() = neighbs[__shfl_sync(raft::warp_full_mask(), idx[0], 0)];
+  return result;
+}
+
+template <typename T>
+__device__ __forceinline__ void remove_duplicates(
+  T* list_a, int list_a_size, T* list_b, int list_b_size, int& unique_counter, int execute_warp_id)
+{  // One warp sorts list_a, then appends to it the list_b entries that list_a does not contain.
+  static_assert(raft::warp_size() == 32);
+  if (!(threadIdx.x >= execute_warp_id * raft::warp_size() &&
+        threadIdx.x < execute_warp_id * raft::warp_size() + raft::warp_size())) {
+    return;  // only warp `execute_warp_id` of the block does the work
+  }
+  int lane_id = threadIdx.x % raft::warp_size();
+  T elem      = std::numeric_limits<T>::max();  // max() marks a lane without an entry
+  if (lane_id < list_a_size) { elem = list_a[lane_id]; }
+  warp_bitonic_sort(&elem, lane_id);
+
+  if (elem != std::numeric_limits<T>::max()) { list_a[lane_id] = elem; }  // write back sorted list_a
+
+  T elem_b = std::numeric_limits<T>::max();
+
+  if (lane_id < list_b_size) { elem_b = list_b[lane_id]; }  // read before any append below
+  __syncwarp();
+
+  int idx_l    = 0;
+  int idx_r    = list_a_size;
+  bool existed = false;
+  while (idx_l < idx_r) {  // binary search for this lane's list_b entry in the sorted list_a
+    int idx          = (idx_l + idx_r) / 2;
+    const T mid_elem = list_a[idx];  // typed as T: no narrowing to int, no shadowing of `elem`
+    if (mid_elem == elem_b) {
+      existed = true;
+      break;
+    }
+    if (elem_b > mid_elem) {
+      idx_l = idx + 1;
+    } else {
+      idx_r = idx;
+    }
+  }
+  if (!existed && elem_b != std::numeric_limits<T>::max()) {
+    int idx                   = atomicAdd(&unique_counter, 1);  // counts the appended entries
+    list_a[list_a_size + idx] = elem_b;  // appended right after the original list_a entries
+  }
+}
+
+// launch_bounds here denote BLOCK_SIZE = 512 and MIN_BLOCKS_PER_SM = 4
+// Per
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications,
+// MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
+// For architectures 750 and 860, the values for MAX_RESIDENT_THREAD_PER_SM
+// is 1024 and 1536 respectively, which means the bounds don't work anymore
+template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
+__global__ void
+#ifdef __CUDA_ARCH__
+#if (__CUDA_ARCH__) == 750 || (__CUDA_ARCH__) == 860
+__launch_bounds__(BLOCK_SIZE)
+#else
+__launch_bounds__(BLOCK_SIZE, 4)
+#endif
+#endif
+  local_join_kernel(const Index_t* graph_new,
+                    const Index_t* rev_graph_new,
+                    const int2* sizes_new,
+                    const Index_t* graph_old,
+                    const Index_t* rev_graph_old,
+                    const int2* sizes_old,
+                    const int width,
+                    const __half* data,
+                    const int data_dim,
+                    ID_t* graph,
+                    DistData_t* dists,
+                    int graph_width,
+                    int* locks,
+                    DistData_t* l2_norms)
+{
+#if (__CUDA_ARCH__ >= 700)
+  using namespace nvcuda;
+  __shared__ int s_list[MAX_NUM_BI_SAMPLES * 2];
+
+  constexpr int APAD = 8;
+  constexpr int BPAD = 8;
+  __shared__ __half s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD];  // New vectors
+  __shared__ __half s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD];  // Old vectors
+  static_assert(sizeof(float) * MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES <=
+                sizeof(__half) * MAX_NUM_BI_SAMPLES * (TILE_COL_WIDTH + BPAD));
+  // s_distances: MAX_NUM_BI_SAMPLES x SKEWED_MAX_NUM_BI_SAMPLES, reuse the space of s_ov
+  float* s_distances    = (float*)&s_ov[0][0];
+  int* s_unique_counter = (int*)&s_ov[0][0];
+
+  if (threadIdx.x == 0) {
+    s_unique_counter[0] = 0;
+    s_unique_counter[1] = 0;
+  }
+
+  Index_t* new_neighbors = s_list;
+  Index_t* old_neighbors = s_list + MAX_NUM_BI_SAMPLES;
+
+  size_t list_id      = blockIdx.x;
+  int2 list_new_size2 = sizes_new[list_id];
+  int list_new_size   = list_new_size2.x + list_new_size2.y;
+  int2 list_old_size2 = sizes_old[list_id];
+  int list_old_size   = list_old_size2.x + list_old_size2.y;
+
+  if (!list_new_size) return;
+  int tx = threadIdx.x;
+
+  if (tx < list_new_size2.x) {
+    new_neighbors[tx] = graph_new[list_id * width + tx];
+  } else if (tx >= list_new_size2.x && tx < list_new_size) {
+    new_neighbors[tx] = rev_graph_new[list_id * width + tx - list_new_size2.x];
+  }
+
+  if (tx < list_old_size2.x) {
+    old_neighbors[tx] = graph_old[list_id * width + tx];
+  } else if (tx >= list_old_size2.x && tx < list_old_size) {
+    old_neighbors[tx] = rev_graph_old[list_id * width + tx - list_old_size2.x];
+  }
+
+  __syncthreads();
+
+  remove_duplicates(new_neighbors,
+                    list_new_size2.x,
+                    new_neighbors + list_new_size2.x,
+                    list_new_size2.y,
+                    s_unique_counter[0],
+                    0);
+
+  remove_duplicates(old_neighbors,
+                    list_old_size2.x,
+                    old_neighbors + list_old_size2.x,
+                    list_old_size2.y,
+                    s_unique_counter[1],
+                    1);
+  __syncthreads();
+  list_new_size = list_new_size2.x + s_unique_counter[0];
+  list_old_size = list_old_size2.x + s_unique_counter[1];
+
+  int warp_id             = threadIdx.x / raft::warp_size();
+  int lane_id             = threadIdx.x % raft::warp_size();
+  constexpr int num_warps = BLOCK_SIZE / raft::warp_size();
+
+  int warp_id_y = warp_id / 4;
+  int warp_id_x = warp_id % 4;
+
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
+  wmma::fill_fragment(c_frag, 0.0);
+  for (int step = 0; step < ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    int num_load_elems = (step == ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
+#pragma unroll
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_new_size) {
+        size_t neighbor_id = new_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_nv[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
+      }
+    }
+    __syncthreads();
+
+    for (int i = 0; i < TILE_COL_WIDTH / WMMA_K; i++) {
+      wmma::load_matrix_sync(a_frag, s_nv[warp_id_y * WMMA_M] + i * WMMA_K, TILE_COL_WIDTH + APAD);
+      wmma::load_matrix_sync(b_frag, s_nv[warp_id_x * WMMA_N] + i * WMMA_K, TILE_COL_WIDTH + BPAD);
+      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      __syncthreads();
+    }
+  }
+
+  wmma::store_matrix_sync(
+    s_distances + warp_id_y * WMMA_M * SKEWED_MAX_NUM_BI_SAMPLES + warp_id_x * WMMA_N,
+    c_frag,
+    SKEWED_MAX_NUM_BI_SAMPLES,
+    wmma::mem_row_major);
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size &&
+        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+      if (l2_norms == nullptr) {
+        s_distances[i] = -s_distances[i];
+      } else {
+        s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
+                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
+                         2.0 * s_distances[i];
+      }
+    } else {
+      s_distances[i] = std::numeric_limits<float>::max();
+    }
+  }
+  __syncthreads();
+
+  for (int step = 0; step < ceildiv(list_new_size, num_warps); step++) {
+    int idx_in_list = step * num_warps + tx / raft::warp_size();
+    if (idx_in_list >= list_new_size) continue;
+    auto min_elem = get_min_item(s_list[idx_in_list], idx_in_list, new_neighbors, s_distances);
+    if (min_elem.id() < gridDim.x) {
+      insert_to_global_graph(min_elem, s_list[idx_in_list], graph, dists, graph_width, locks);
+    }
+  }
+
+  if (!list_old_size) return;
+
+  __syncthreads();
+
+  wmma::fill_fragment(c_frag, 0.0);
+  for (int step = 0; step < ceildiv(data_dim, TILE_COL_WIDTH); step++) {
+    int num_load_elems = (step == ceildiv(data_dim, TILE_COL_WIDTH) - 1)
+                           ? data_dim - step * TILE_COL_WIDTH
+                           : TILE_COL_WIDTH;
+    if (TILE_COL_WIDTH < data_dim) {
+#pragma unroll
+      for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+        int idx = i * num_warps + warp_id;
+        if (idx < list_new_size) {
+          size_t neighbor_id = new_neighbors[idx];
+          size_t idx_in_data = neighbor_id * data_dim;
+          load_vec(s_nv[idx],
+                   data + idx_in_data + step * TILE_COL_WIDTH,
+                   num_load_elems,
+                   TILE_COL_WIDTH,
+                   lane_id);
+        }
+      }
+    }
+#pragma unroll
+    for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) {
+      int idx = i * num_warps + warp_id;
+      if (idx < list_old_size) {
+        size_t neighbor_id = old_neighbors[idx];
+        size_t idx_in_data = neighbor_id * data_dim;
+        load_vec(s_ov[idx],
+                 data + idx_in_data + step * TILE_COL_WIDTH,
+                 num_load_elems,
+                 TILE_COL_WIDTH,
+                 lane_id);
+      }
+    }
+    __syncthreads();
+
+    for (int i = 0; i < TILE_COL_WIDTH / WMMA_K; i++) {
+      wmma::load_matrix_sync(a_frag, s_nv[warp_id_y * WMMA_M] + i * WMMA_K, TILE_COL_WIDTH + APAD);
+      wmma::load_matrix_sync(b_frag, s_ov[warp_id_x * WMMA_N] + i * WMMA_K, TILE_COL_WIDTH + BPAD);
+      wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      __syncthreads();
+    }
+  }
+
+  wmma::store_matrix_sync(
+    s_distances + warp_id_y * WMMA_M * SKEWED_MAX_NUM_BI_SAMPLES + warp_id_x * WMMA_N,
+    c_frag,
+    SKEWED_MAX_NUM_BI_SAMPLES,
+    wmma::mem_row_major);
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
+    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size &&
+        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+      if (l2_norms == nullptr) {
+        s_distances[i] = -s_distances[i];
+      } else {
+        s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
+                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
+                         2.0 * s_distances[i];
+      }
+    } else {
+      s_distances[i] = std::numeric_limits<float>::max();
+    }
+  }
+  __syncthreads();
+
+  for (int step = 0; step < ceildiv(MAX_NUM_BI_SAMPLES * 2, num_warps); step++) {
+    int idx_in_list = step * num_warps + tx / raft::warp_size();
+    if (idx_in_list >= list_new_size && idx_in_list < MAX_NUM_BI_SAMPLES) continue;
+    if (idx_in_list >= MAX_NUM_BI_SAMPLES + list_old_size && idx_in_list < MAX_NUM_BI_SAMPLES * 2)
+      continue;
+    ResultItem<Index_t> min_elem{std::numeric_limits<Index_t>::max(),
+                                 std::numeric_limits<DistData_t>::max()};
+    if (idx_in_list < MAX_NUM_BI_SAMPLES) {
+      auto temp_min_item =
+        get_min_item(s_list[idx_in_list], idx_in_list, old_neighbors, s_distances);
+      if (temp_min_item.dist() < min_elem.dist()) { min_elem = temp_min_item; }
+    } else {
+      auto temp_min_item = get_min_item(
+        s_list[idx_in_list], idx_in_list - MAX_NUM_BI_SAMPLES, new_neighbors, s_distances, false);
+      if (temp_min_item.dist() < min_elem.dist()) { min_elem = temp_min_item; }
+    }
+
+    if (min_elem.id() < gridDim.x) {
+      insert_to_global_graph(min_elem, s_list[idx_in_list], graph, dists, graph_width, locks);
+    }
+  }
+#endif
+}
+
+namespace {
+/**
+ * @brief Try to insert a candidate neighbor into a fixed-width list.
+ *
+ * The insertion slot is the first position whose stored distance is strictly greater than
+ * `dist`, so the list is presumably kept sorted by ascending distance (confirm with callers).
+ * The candidate is rejected when its distance exceeds the last stored distance, when an entry
+ * with the same id is found anywhere in the list, or when no slot qualifies. On insertion the
+ * entries from the slot onward move one position to the right and the last entry is dropped.
+ *
+ * @param[in,out] list      neighbor entries, `width` elements
+ * @param[in,out] dist_list distances matching `list`, `width` elements
+ * @param[in]     width     number of elements in both arrays
+ * @param[in]     neighb_id candidate neighbor
+ * @param[in]     dist      candidate distance
+ * @return the slot the candidate was written to, or `width` when nothing was inserted
+ */
+template <typename Index_t>
+int insert_to_ordered_list(InternalID_t<Index_t>* list,
+                           DistData_t* dist_list,
+                           const int width,
+                           const InternalID_t<Index_t> neighb_id,
+                           const DistData_t dist)
+{
+  // Cheap rejection: the candidate is farther than the current tail.
+  if (dist_list[width - 1] < dist) { return width; }
+
+  // `width` doubles as the "no slot chosen yet" sentinel.
+  int slot = width;
+  for (int pos = 0; pos < width; ++pos) {
+    // The scan never stops early, so a duplicate id is caught wherever it sits.
+    if (neighb_id.id() == list[pos].id()) { return width; }
+    if (slot == width && dist < dist_list[pos]) { slot = pos; }
+  }
+  if (slot == width) { return width; }
+
+  // Open a gap at `slot`; the element at index `width - 1` falls off the end.
+  const int num_shifted = width - slot - 1;
+  memmove(list + slot + 1, list + slot, sizeof(*list) * num_shifted);
+  memmove(dist_list + slot + 1, dist_list + slot, sizeof(*dist_list) * num_shifted);
+
+  list[slot]      = neighb_id;
+  dist_list[slot] = dist;
+  return slot;
+}
+
+}  // namespace
+
+/**
+ * @brief Set up the host-side bookkeeping for a graph with `nrow` nodes.
+ *
+ * Allocates the distance matrix (`nrow` x `node_degree`), the new/old sample lists
+ * (`nrow * num_samples` entries each) and their per-row size arrays (`nrow` entries each).
+ * The bloom filter is constructed from `nrow`, `internal_node_degree / segment_size` and the
+ * constant 3; the meaning of those arguments is defined by the filter type, which is not
+ * visible here.
+ *
+ * The neighbor storage itself (`h_graph`) is NOT allocated; it is left null and has to be
+ * supplied by the caller before the graph is used.
+ *
+ * @param[in] nrow                 number of nodes
+ * @param[in] node_degree          neighbors stored per node; must be a multiple of segment_size
+ * @param[in] internal_node_degree degree used to size the bloom filter; must be a multiple of
+ *                                 segment_size
+ * @param[in] num_samples          capacity of each per-node sample list
+ */
+template <typename Index_t>
+GnndGraph<Index_t>::GnndGraph(const size_t nrow,
+                              const size_t node_degree,
+                              const size_t internal_node_degree,
+                              const size_t num_samples)
+  : nrow(nrow),
+    node_degree(node_degree),
+    num_samples(num_samples),
+    bloom_filter(nrow, internal_node_degree / segment_size, 3),
+    h_dists{raft::make_host_matrix<DistData_t, size_t, raft::row_major>(nrow, node_degree)},
+    h_graph_new{nrow * num_samples},
+    h_list_sizes_new{nrow},
+    h_graph_old{nrow * num_samples},
+    h_list_sizes_old{nrow}
+{
+  // node_degree must be a multiple of segment_size;
+  // NOTE: these are plain asserts, so they vanish when NDEBUG is defined.
+  assert(node_degree % segment_size == 0);
+  assert(internal_node_degree % segment_size == 0);
+
+  // Each neighbor list is split into `num_segments` segments of `segment_size` entries.
+  num_segments = node_degree / segment_size;
+  // To save the CPU memory, graph should be allocated by external function
+  h_graph = nullptr;
+}
+
+// This is the only operation on the CPU that cannot be overlapped.
+// So it should be as fast as possible.
+//
+// Rebuilds the per-row "new" sample lists from `new_neighbors`, a row-major array with `width`
+// candidates per row. A candidate is sampled only if the bloom filter has not seen it for this
+// row; sampled candidates are added to the filter and marked old in `new_neighbors`. A row is
+// finished when a candidate id is not below `nrow`, when `num_samples` candidates have been
+// taken, or when all `width` candidates have been visited.
+template <typename Index_t>
+void GnndGraph<Index_t>::sample_graph_new(InternalID_t<Index_t>* new_neighbors, const size_t width)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    h_list_sizes_new[row].x = 0;
+    h_list_sizes_new[row].y = 0;
+
+    auto candidates = new_neighbors + row * width;
+    auto samples    = h_graph_new.data() + row * num_samples;
+
+    for (size_t col = 0; col < width; col++) {
+      auto candidate_id = candidates[col].id();
+      // The first id that is not a valid row index ends the scan of this row.
+      if ((size_t)candidate_id >= nrow) break;
+      // Only candidates the filter has not recorded for this row are sampled.
+      if (!bloom_filter.check(row, candidate_id)) {
+        bloom_filter.add(row, candidate_id);
+        candidates[col].mark_old();
+        samples[h_list_sizes_new[row].x++] = candidate_id;
+        if (h_list_sizes_new[row].x == num_samples) break;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Fill every neighbor list with pseudo-random ids and reset all distances to the maximum.
+ *
+ * The lists are processed one segment at a time. For segment `seg_idx` a shuffled sequence of
+ * `nrow / num_segments` values is generated, and each slot receives
+ * `value * num_segments + seg_idx`, so every id stored in segment `seg_idx` satisfies
+ * `id % num_segments == seg_idx`. When the chosen id equals the row's own index, the entry
+ * `segment_size` positions further along the sequence is used instead.
+ *
+ * Requires `h_graph` to point at caller-provided storage (it is dereferenced here).
+ */
+template <typename Index_t>
+void GnndGraph<Index_t>::init_random_graph()
+{
+  for (size_t seg_idx = 0; seg_idx < static_cast<size_t>(num_segments); seg_idx++) {
+    // random sequence (range: 0 ~ nrow / num_segments)
+    // segment_x stores neighbors which id % num_segments == x
+    // NOTE(review): rand_seq is empty when nrow < num_segments, which would make the
+    // `% rand_seq.size()` below a modulo by zero - confirm callers rule this out.
+    std::vector<Index_t> rand_seq(nrow / num_segments);
+    std::iota(rand_seq.begin(), rand_seq.end(), 0);
+    // NOTE(review): std::random_shuffle was deprecated in C++14 and removed in C++17;
+    // std::shuffle with an explicit engine is the replacement (needs <random>).
+    std::random_shuffle(rand_seq.begin(), rand_seq.end());
+
+#pragma omp parallel for
+    for (size_t i = 0; i < nrow; i++) {
+      // Offset of this row's slice of segment `seg_idx` inside the flat graph/distance arrays.
+      size_t base_idx      = i * node_degree + seg_idx * segment_size;
+      auto h_neighbor_list = h_graph + base_idx;
+      auto h_dist_list     = h_dists.data_handle() + base_idx;
+      for (size_t j = 0; j < static_cast<size_t>(segment_size); j++) {
+        size_t idx = base_idx + j;
+        Index_t id = rand_seq[idx % rand_seq.size()] * num_segments + seg_idx;
+        // Avoid a self-edge by taking a different entry of the sequence.
+        if ((size_t)id == i) {
+          id = rand_seq[(idx + segment_size) % rand_seq.size()] * num_segments + seg_idx;
+        }
+        // Writes the raw id-with-flag word; presumably this leaves the entry flagged as
+        // "new" - confirm against InternalID_t.
+        h_neighbor_list[j].id_with_flag() = id;
+        h_dist_list[j]                    = std::numeric_limits<DistData_t>::max();
+      }
+    }
+  }
+}
+
+/**
+ * @brief Rebuild the per-row "old" (and optionally "new") sample lists from the current graph.
+ *
+ * Each row's neighbor list is walked rank by rank across all segments (entry `j` of every
+ * segment before entry `j + 1` of any segment). Entries whose id is not below `nrow` are
+ * ignored. Entries not flagged as new go to the old list; entries flagged as new go to the new
+ * list and are marked old in the graph, but only when `sample_new` is set. Each list holds at
+ * most `num_samples` entries, and the walk stops once both lists are full.
+ *
+ * @param[in] sample_new whether entries flagged as new are sampled (and marked old) as well
+ */
+template <typename Index_t>
+void GnndGraph<Index_t>::sample_graph(bool sample_new)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    h_list_sizes_new[row].x = 0;
+    h_list_sizes_new[row].y = 0;
+    h_list_sizes_old[row].x = 0;
+    h_list_sizes_old[row].y = 0;
+
+    auto row_graph   = h_graph + row * node_degree;
+    auto old_samples = h_graph_old.data() + row * num_samples;
+    auto new_samples = h_graph_new.data() + row * num_samples;
+
+    for (int rank = 0; rank < segment_size; rank++) {
+      for (int seg = 0; seg < num_segments; seg++) {
+        const int pos = seg * segment_size + rank;
+        // Work on a copy; the flag of the stored entry may be changed below.
+        auto neighbor = row_graph[pos];
+        if ((size_t)neighbor.id() >= nrow) continue;
+
+        if (neighbor.is_new()) {
+          if (sample_new && h_list_sizes_new[row].x < num_samples) {
+            row_graph[pos].mark_old();
+            new_samples[h_list_sizes_new[row].x++] = neighbor.id();
+          }
+        } else if (h_list_sizes_old[row].x < num_samples) {
+          old_samples[h_list_sizes_old[row].x++] = neighbor.id();
+        }
+
+        if (h_list_sizes_old[row].x == num_samples && h_list_sizes_new[row].x == num_samples) {
+          break;
+        }
+      }
+      // Propagate the early exit out of the rank loop as well.
+      if (h_list_sizes_old[row].x == num_samples && h_list_sizes_new[row].x == num_samples) {
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Merge candidate neighbors into the host graph.
+ *
+ * `new_neighbors` / `new_dists` are row-major with `width` candidates per row. For each row the
+ * candidates are visited in order until one carries the maximum `DistData_t` value, which ends
+ * the row. Candidates equal to the row's own index are skipped. Every other candidate is offered
+ * to the segment selected by `id % num_segments` through `insert_to_ordered_list`.
+ *
+ * @param[in]     new_neighbors  candidate ids, `nrow * width` entries
+ * @param[in]     new_dists      candidate distances, `nrow * width` entries
+ * @param[in]     width          candidates per row
+ * @param[in,out] update_counter incremented for each successful insertion, but only on rows
+ *                               whose index is a multiple of `counter_interval`
+ */
+template <typename Index_t>
+void GnndGraph<Index_t>::update_graph(const InternalID_t<Index_t>* new_neighbors,
+                                      const DistData_t* new_dists,
+                                      const size_t width,
+                                      std::atomic<int64_t>& update_counter)
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    const auto* row_candidates = new_neighbors + row * width;
+    const auto* row_cand_dists = new_dists + row * width;
+    auto row_graph             = h_graph + row * node_degree;
+    auto row_dists             = h_dists.data_handle() + row * node_degree;
+
+    for (size_t col = 0; col < width; col++) {
+      auto candidate = row_candidates[col];
+      auto cand_dist = row_cand_dists[col];
+      // A max-valued distance ends this row's candidates.
+      if (cand_dist == std::numeric_limits<DistData_t>::max()) break;
+      // Never store a node as its own neighbor.
+      if ((size_t)candidate.id() == row) continue;
+
+      int seg_idx    = candidate.id() % num_segments;
+      int insert_pos = insert_to_ordered_list(row_graph + seg_idx * segment_size,
+                                              row_dists + seg_idx * segment_size,
+                                              segment_size,
+                                              candidate,
+                                              cand_dist);
+      // A return value of segment_size means the candidate was not inserted.
+      if (insert_pos != segment_size && row % counter_interval == 0) { ++update_counter; }
+    }
+  }
+}
+
+/**
+ * @brief Sort every row of the graph by ascending (distance, id).
+ *
+ * Each row's `node_degree` entries are gathered as (distance, id) pairs, sorted with the
+ * default pair ordering, and written back. The id is stored through `id_with_flag()`, so the
+ * whole id/flag word of each entry is overwritten with the plain id.
+ */
+template <typename Index_t>
+void GnndGraph<Index_t>::sort_lists()
+{
+#pragma omp parallel for
+  for (size_t row = 0; row < nrow; row++) {
+    auto row_graph = h_graph + row * node_degree;
+    auto row_dists = h_dists.data_handle() + row * node_degree;
+
+    std::vector<std::pair<DistData_t, Index_t>> entries;
+    entries.reserve(node_degree);
+    for (size_t col = 0; col < node_degree; col++) {
+      entries.emplace_back(row_dists[col], row_graph[col].id());
+    }
+
+    std::sort(entries.begin(), entries.end());
+
+    for (size_t col = 0; col < node_degree; col++) {
+      row_dists[col]                = entries[col].first;
+      row_graph[col].id_with_flag() = entries[col].second;
+    }
+  }
+}
+
+/**
+ * @brief Reset the graph's bloom filter.
+ *
+ * Only the filter is cleared; the neighbor lists, distances and sample lists are untouched.
+ */
+template <typename Index_t>
+void GnndGraph<Index_t>::clear()
+{
+  bloom_filter.clear();
+}
+
+/**
+ * @brief Destructor.
+ *
+ * `h_graph` is externally provided storage and is never freed here. The assert (debug builds
+ * only) checks that it was reset to null before the graph is destroyed.
+ */
+template <typename Index_t>
+GnndGraph<Index_t>::~GnndGraph()
+{
+  assert(h_graph == nullptr);
+}
+
+/**
+ * @brief Allocate all host and device buffers needed to build a graph of up to
+ *        `build_config.max_dataset_size` rows.
+ *
+ * The host graph is created with node degrees rounded up by `align32::roundUp`; when
+ * `build_config.internal_node_degree` is zero, `build_config.node_degree` is used in its place.
+ * After allocation the device distance buffer is filled with the maximum float, the device
+ * graph buffer with the maximum `Index_t`, and the lock array with zeros.
+ *
+ * @param[in] res          raft resources used for the device allocations
+ * @param[in] build_config dataset dimensions, degrees and iteration settings
+ */
+template <typename Data_t, typename Index_t>
+GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build_config)
+  : res(res),
+    build_config_(build_config),
+    graph_(build_config.max_dataset_size,
+           align32::roundUp(build_config.node_degree),
+           align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree
+                                                              : build_config.node_degree),
+           NUM_SAMPLES),
+    nrow_(build_config.max_dataset_size),
+    ndim_(build_config.dataset_dim),
+    d_data_{raft::make_device_matrix<__half, Index_t, raft::row_major>(
+      res, nrow_, build_config.dataset_dim)},
+    l2_norms_{raft::make_device_vector<DistData_t, Index_t>(res, nrow_)},
+    graph_buffer_{
+      raft::make_device_matrix<ID_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+    dists_buffer_{
+      raft::make_device_matrix<DistData_t, Index_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+    graph_host_buffer_{static_cast<size_t>(nrow_ * DEGREE_ON_DEVICE)},
+    dists_host_buffer_{static_cast<size_t>(nrow_ * DEGREE_ON_DEVICE)},
+    d_locks_{raft::make_device_vector<int, Index_t>(res, nrow_)},
+    h_rev_graph_new_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    h_graph_old_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    h_rev_graph_old_{static_cast<size_t>(nrow_ * NUM_SAMPLES)},
+    d_list_sizes_new_{raft::make_device_vector<int2, Index_t>(res, nrow_)},
+    d_list_sizes_old_{raft::make_device_vector<int2, Index_t>(res, nrow_)}
+{
+  static_assert(NUM_SAMPLES <= 32);
+
+  // NOTE(review): the products `nrow_ * DEGREE_ON_DEVICE` / `nrow_ * NUM_SAMPLES` above are
+  // evaluated before the cast to size_t - confirm they cannot overflow for the types involved.
+
+  // Device distance buffer starts at "infinitely far".
+  thrust::fill(thrust::device,
+               dists_buffer_.data_handle(),
+               dists_buffer_.data_handle() + dists_buffer_.size(),
+               std::numeric_limits<float>::max());
+  // Device graph buffer starts with the maximum Index_t in every slot; it is viewed as raw
+  // Index_t words for the fill.
+  thrust::fill(thrust::device,
+               reinterpret_cast<Index_t*>(graph_buffer_.data_handle()),
+               reinterpret_cast<Index_t*>(graph_buffer_.data_handle()) + graph_buffer_.size(),
+               std::numeric_limits<Index_t>::max());
+  // One int per row, all zero-initialized.
+  thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0);
+};
+
+/**
+ * @brief Launch `add_rev_edges_kernel` and copy its output into a host-side buffer.
+ *
+ * The kernel is launched with one block of `raft::warp_size()` threads per row (`nrow_` blocks)
+ * on `stream`. Afterwards `nrow_ * NUM_SAMPLES` elements are copied from `d_rev_graph_ptr` to
+ * `h_rev_graph_ptr`. No synchronization is performed here.
+ *
+ * @param[in]     graph_ptr       graph passed to the kernel as its first argument
+ * @param[out]    h_rev_graph_ptr destination of the copy
+ * @param[out]    d_rev_graph_ptr kernel output buffer and source of the copy
+ * @param[in,out] list_sizes      per-row list sizes handed to the kernel
+ * @param[in]     stream          stream for the kernel launch
+ */
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::add_reverse_edges(Index_t* graph_ptr,
+                                              Index_t* h_rev_graph_ptr,
+                                              Index_t* d_rev_graph_ptr,
+                                              int2* list_sizes,
+                                              cudaStream_t stream)
+{
+  add_rev_edges_kernel<<<nrow_, raft::warp_size(), 0, stream>>>(
+    graph_ptr, d_rev_graph_ptr, NUM_SAMPLES, list_sizes);
+  // NOTE(review): the copy is issued on the stream owned by `res`, not on the `stream`
+  // argument; the two are ordered only if they are the same stream - confirm at call sites.
+  raft::copy(
+    h_rev_graph_ptr, d_rev_graph_ptr, nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res));
+}
+
+/**
+ * @brief Reset the device distance buffer and launch `local_join_kernel`.
+ *
+ * `dists_buffer_` is refilled with the maximum float on `stream`, then the kernel is launched
+ * on the same stream with `nrow_` blocks of `BLOCK_SIZE` threads. The kernel receives the
+ * new/old sample lists with their reverse lists and sizes, the device dataset and its
+ * dimension, the device graph/distance buffers, the lock array and the L2 norms.
+ *
+ * @param[in] stream stream used for both the fill and the kernel launch
+ */
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
+{
+  thrust::fill(thrust::device.on(stream),
+               dists_buffer_.data_handle(),
+               dists_buffer_.data_handle() + dists_buffer_.size(),
+               std::numeric_limits<float>::max());
+  // NOTE(review): the sample lists are passed as raw pointers into host-side containers;
+  // presumably these live in device-accessible (pinned) memory - confirm their allocator.
+  local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(
+    thrust::raw_pointer_cast(graph_.h_graph_new.data()),
+    thrust::raw_pointer_cast(h_rev_graph_new_.data()),
+    d_list_sizes_new_.data_handle(),
+    thrust::raw_pointer_cast(h_graph_old_.data()),
+    thrust::raw_pointer_cast(h_rev_graph_old_.data()),
+    d_list_sizes_old_.data_handle(),
+    NUM_SAMPLES,
+    d_data_.data_handle(),
+    ndim_,
+    graph_buffer_.data_handle(),
+    dists_buffer_.data_handle(),
+    DEGREE_ON_DEVICE,
+    d_locks_.data_handle(),
+    l2_norms_.data_handle());
+}
+
+/**
+ * @brief Run NN-descent on `data` and write the resulting kNN graph to `output_graph`.
+ *
+ * Steps:
+ *  1. The dataset is preprocessed in batches by `preprocess_data_kernel` into `d_data_` and
+ *     `l2_norms_`.
+ *  2. The host graph is initialized randomly and sampled.
+ *  3. For up to `build_config_.max_iterations` iterations the sample lists are uploaded, reverse
+ *     edges are added and `local_join` runs on the GPU, while a host thread concurrently merges
+ *     the previous iteration's results into the host graph and re-samples it. The loop ends
+ *     early once the update counter falls below the termination threshold.
+ *  4. The last results are merged, the lists are sorted, and the first
+ *     `build_config_.node_degree` entries of every row are compacted into `output_graph`;
+ *     entries whose id is not below `nrow` are replaced by a pseudo-random id in `[0, nrow)`.
+ *
+ * `output_graph` also serves as working storage for the host graph during the build, so it is
+ * written with a row stride of `graph_.node_degree` before being compacted in place.
+ *
+ * @param[in]  data         input dataset, `nrow` x `build_config_.dataset_dim`
+ * @param[in]  nrow         number of rows in `data`
+ * @param[out] output_graph storage for the graph
+ *
+ * @throws raft exception (via THROW) when at least one iteration is requested and the kernels
+ *         were not compiled for an architecture with tensor (wmma) support.
+ */
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* output_graph)
+{
+  using input_t = typename std::remove_const<Data_t>::type;
+
+  // Tensor operations from `mma.h` are guarded with architecture
+  // __CUDA_ARCH__ >= 700. Since RAFT supports compilation for ARCH 600,
+  // we need to ensure that `local_join_kernel` (which uses tensor operations)
+  // is not only not compiled, but also that a runtime error is presented to the user.
+  //
+  // The check is done once, up front: throwing while the worker std::thread below is still
+  // joinable would call std::terminate instead of propagating the error, and throwing after
+  // `graph_.h_graph` has been bound would leave it dangling for the destructor's assert.
+  // It only applies when `local_join` would actually run, i.e. at least one iteration.
+  if (build_config_.max_iterations > 0) {
+    auto kernel       = preprocess_data_kernel<input_t>;
+    void* kernel_ptr  = reinterpret_cast<void*>(kernel);
+    auto runtime_arch = raft::util::arch::kernel_virtual_arch(kernel_ptr);
+    auto wmma_range =
+      raft::util::arch::SM_range(raft::util::arch::SM_70(), raft::util::arch::SM_future());
+    if (!wmma_range.contains(runtime_arch)) {
+      THROW("NN_DESCENT cannot be run for __CUDA_ARCH__ < 700");
+    }
+  }
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(res);
+  nrow_               = nrow;
+  graph_.h_graph      = (InternalID_t<Index_t>*)output_graph;
+
+  // Data that is not device-accessible is staged through in batches of 100000 rows;
+  // otherwise a single batch covers the whole dataset.
+  cudaPointerAttributes data_ptr_attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&data_ptr_attr, data));
+  size_t batch_size = (data_ptr_attr.devicePointer == nullptr) ? 100000 : nrow_;
+
+  raft::spatial::knn::detail::utils::batch_load_iterator vec_batches{
+    data, static_cast<size_t>(nrow_), build_config_.dataset_dim, batch_size, stream};
+  for (auto const& batch : vec_batches) {
+    // One warp per row; dynamic shared memory holds one row padded up to a multiple of the
+    // warp size.
+    preprocess_data_kernel<<<
+      batch.size(),
+      raft::warp_size(),
+      sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast<size_t>(raft::warp_size())) *
+        raft::warp_size(),
+      stream>>>(batch.data(),
+                d_data_.data_handle(),
+                build_config_.dataset_dim,
+                l2_norms_.data_handle(),
+                batch.offset());
+  }
+
+  thrust::fill(thrust::device.on(stream),
+               (Index_t*)graph_buffer_.data_handle(),
+               (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(),
+               std::numeric_limits<Index_t>::max());
+
+  graph_.clear();
+  graph_.init_random_graph();
+  graph_.sample_graph(true);
+
+  // Host-side work that overlaps with the GPU: merge the previous iteration's results (skipped
+  // on the first iteration), decide on early termination, and re-sample the old lists.
+  // `update_counter_ == -1` signals the main loop to stop.
+  auto update_and_sample = [&](bool update_graph) {
+    if (update_graph) {
+      update_counter_ = 0;
+      graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+                          thrust::raw_pointer_cast(dists_host_buffer_.data()),
+                          DEGREE_ON_DEVICE,
+                          update_counter_);
+      if (update_counter_ < build_config_.termination_threshold * nrow_ *
+                              build_config_.dataset_dim / counter_interval) {
+        update_counter_ = -1;
+      }
+    }
+    graph_.sample_graph(false);
+  };
+
+  for (size_t it = 0; it < build_config_.max_iterations; it++) {
+    raft::copy(d_list_sizes_new_.data_handle(),
+               thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()),
+               nrow_,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()),
+               thrust::raw_pointer_cast(graph_.h_graph_old.data()),
+               nrow_ * NUM_SAMPLES,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(d_list_sizes_old_.data_handle(),
+               thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()),
+               nrow_,
+               raft::resource::get_cuda_stream(res));
+    // Also completes the asynchronous copy into dists_host_buffer_ issued at the end of the
+    // previous iteration, which the worker thread reads next.
+    raft::resource::sync_stream(res);
+
+    // `it` converts to bool: no merge on the first iteration, merge on all later ones.
+    std::thread update_and_sample_thread(update_and_sample, it);
+
+    RAFT_LOG_DEBUG("# GNND iteraton: %lu / %lu", it + 1, build_config_.max_iterations);
+
+    // Reuse dists_buffer_ to save GPU memory. graph_buffer_ cannot be reused, because it
+    // contains some information for local_join.
+    static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >=
+                  NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle())));
+    add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()),
+                      thrust::raw_pointer_cast(h_rev_graph_new_.data()),
+                      (Index_t*)dists_buffer_.data_handle(),
+                      d_list_sizes_new_.data_handle(),
+                      stream);
+    add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()),
+                      thrust::raw_pointer_cast(h_rev_graph_old_.data()),
+                      (Index_t*)dists_buffer_.data_handle(),
+                      d_list_sizes_old_.data_handle(),
+                      stream);
+
+    // Architecture support was verified before the loop.
+    local_join(stream);
+
+    update_and_sample_thread.join();
+
+    if (update_counter_ == -1) { break; }
+    raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+               graph_buffer_.data_handle(),
+               nrow_ * DEGREE_ON_DEVICE,
+               raft::resource::get_cuda_stream(res));
+    // graph_host_buffer_ must be complete before sample_graph_new reads it below.
+    raft::resource::sync_stream(res);
+    raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()),
+               dists_buffer_.data_handle(),
+               nrow_ * DEGREE_ON_DEVICE,
+               raft::resource::get_cuda_stream(res));
+
+    graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE);
+  }
+
+  // When the loop ran to max_iterations, the copy into dists_host_buffer_ issued in the last
+  // iteration may still be in flight; wait for it BEFORE the host reads that buffer.
+  raft::resource::sync_stream(res);
+  graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+                      thrust::raw_pointer_cast(dists_host_buffer_.data()),
+                      DEGREE_ON_DEVICE,
+                      update_counter_);
+  graph_.sort_lists();
+
+  // Reuse graph_.h_dists as the buffer for shrinking the lists in graph
+  static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t));
+  Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle();
+
+  // Compact each row from stride graph_.node_degree to stride build_config_.node_degree,
+  // replacing ids that are not below nrow_ with a pseudo-random id in [0, nrow_).
+#pragma omp parallel for
+  for (size_t i = 0; i < (size_t)nrow_; i++) {
+    for (size_t j = 0; j < build_config_.node_degree; j++) {
+      size_t idx = i * graph_.node_degree + j;
+      Index_t id = graph_.h_graph[idx].id();
+      if (id < nrow_) {
+        graph_shrink_buffer[i * build_config_.node_degree + j] = id;
+      } else {
+        graph_shrink_buffer[i * build_config_.node_degree + j] =
+          raft::neighbors::cagra::detail::device::xorshift64(idx) % nrow_;
+      }
+    }
+  }
+  // The graph no longer owns a view of the caller's storage.
+  graph_.h_graph = nullptr;
+
+#pragma omp parallel for
+  for (size_t i = 0; i < (size_t)nrow_; i++) {
+    for (size_t j = 0; j < build_config_.node_degree; j++) {
+      output_graph[i * build_config_.node_degree + j] =
+        graph_shrink_buffer[i * build_config_.node_degree + j];
+    }
+  }
+}
+
+/**
+ * @brief Run NN-descent on `dataset` and store the resulting graph in `idx`.
+ *
+ * The requested degrees are first made consistent with the dataset: the intermediate degree is
+ * reduced to `n_rows - 1` when it is not smaller than the number of rows, and the graph degree
+ * is reduced to the intermediate degree when it exceeds it. Both are then enlarged (x1.3 when
+ * above 32) and rounded up by `align32::roundUp` for the internal build. Finally the first
+ * `graph_degree` columns of every internal row are copied into `idx.graph()`, which is written
+ * with a row stride of `graph_degree`.
+ *
+ * @param[in]  res     raft resources
+ * @param[in]  params  nn-descent parameters (degrees, iterations, termination threshold)
+ * @param[in]  dataset input dataset, [n_rows, dim]
+ * @param[out] idx     index whose graph receives the result
+ */
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+void build(raft::resources const& res,
+           const index_params& params,
+           mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+           index<IdxT>& idx)
+{
+  RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits<int>::max() - 1,
+               "The dataset size for GNND should be less than %d",
+               std::numeric_limits<int>::max() - 1);
+
+  const auto n_rows          = dataset.extent(0);
+  size_t graph_degree        = params.graph_degree;
+  size_t intermediate_degree = params.intermediate_graph_degree;
+
+  if (intermediate_degree >= static_cast<size_t>(n_rows)) {
+    RAFT_LOG_WARN(
+      "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu",
+      n_rows);
+    intermediate_degree = n_rows - 1;
+  }
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  // The elements in each knn-list are partitioned into different buckets, and more buckets are
+  // needed to mitigate bucket collisions. It is fine for `intermediate_degree` to be larger
+  // than `extended_graph_degree`.
+  auto extend_degree = [](size_t degree) {
+    return align32::roundUp(static_cast<size_t>(degree * (degree <= 32 ? 1.0 : 1.3)));
+  };
+  size_t extended_graph_degree        = extend_degree(graph_degree);
+  size_t extended_intermediate_degree = extend_degree(intermediate_degree);
+
+  auto int_graph = raft::make_host_matrix<int, int64_t, row_major>(
+    n_rows, static_cast<int64_t>(extended_graph_degree));
+
+  BuildConfig build_config{.max_dataset_size      = static_cast<size_t>(n_rows),
+                           .dataset_dim           = static_cast<size_t>(dataset.extent(1)),
+                           .node_degree           = extended_graph_degree,
+                           .internal_node_degree  = extended_intermediate_degree,
+                           .max_iterations        = params.max_iterations,
+                           .termination_threshold = params.termination_threshold};
+
+  GNND<const T, int> nnd(res, build_config);
+  nnd.build(dataset.data_handle(), n_rows, int_graph.data_handle());
+
+  // Keep only the first `graph_degree` columns of each (wider) internal row.
+  auto out_graph = idx.graph().data_handle();
+  auto in_graph  = int_graph.data_handle();
+#pragma omp parallel for
+  for (size_t row = 0; row < static_cast<size_t>(n_rows); row++) {
+    for (size_t col = 0; col < graph_degree; col++) {
+      out_graph[row * graph_degree + col] = in_graph[row * extended_graph_degree + col];
+    }
+  }
+}
+
+/**
+ * @brief Allocate an index sized for `dataset`, run NN-descent and return the index.
+ *
+ * The graph degree used to size the index is derived exactly as in the overload that fills an
+ * existing index: the intermediate degree is first limited to `n_rows - 1` when it is not
+ * smaller than the number of rows, and the graph degree is then limited to the intermediate
+ * degree. The row width of the allocated graph must equal the row stride that overload writes
+ * with; otherwise small datasets end up with a mis-strided, partly uninitialized graph.
+ *
+ * @param[in] res     raft resources
+ * @param[in] params  nn-descent parameters
+ * @param[in] dataset input dataset, [n_rows, dim]
+ * @return index holding the graph, [n_rows, effective graph degree]
+ */
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+index<IdxT> build(raft::resources const& res,
+                  const index_params& params,
+                  mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset)
+{
+  size_t intermediate_degree = params.intermediate_graph_degree;
+  size_t graph_degree        = params.graph_degree;
+
+  // Same clamp as the filling overload (which also logs the warning for it); done silently
+  // here to avoid reporting the same adjustment twice.
+  if (intermediate_degree >= static_cast<size_t>(dataset.extent(0))) {
+    intermediate_degree = dataset.extent(0) - 1;
+  }
+
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  index<IdxT> idx{res, dataset.extent(0), static_cast<int64_t>(graph_degree)};
+
+  build(res, params, dataset, idx);
+
+  return idx;
+}
+
+}  // namespace raft::neighbors::experimental::nn_descent::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index a18ee065bf..6641346a67 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -342,7 +342,7 @@ void extend(raft::resources const& handle,
 /** @} */
 
 /**
- * @brief Search ANN using the constructed index.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
  *
@@ -374,6 +374,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -386,7 +388,7 @@ void extend(raft::resources const& handle,
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
  * @param[in] mr an optional memory resource to use across the searches (you can provide a large
  * enough memory pool here to avoid memory allocations within search).
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
@@ -475,7 +477,7 @@ void search(raft::resources const& handle,
  */
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
  *
@@ -501,6 +503,8 @@ void search(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -509,7 +513,7 @@ void search(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index ccf8717486..9f203d92fb 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -134,7 +134,7 @@ void extend(raft::resources const& handle,
 }
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
  *
@@ -148,6 +148,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -157,7 +159,7 @@ void extend(raft::resources const& handle,
  * [n_queries, k]
  * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
  * k]
- * @param[in] sample_filter a filter the greenlights samples for a given query.
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query.
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
@@ -343,7 +345,7 @@ void extend(raft::resources const& handle,
 }
 
 /**
- * @brief Search ANN using the constructed index using the given filter.
+ * @brief Search ANN using the constructed index with the given filter.
  *
  * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
  *
@@ -372,6 +374,8 @@ void extend(raft::resources const& handle,
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
+ * @tparam IvfSampleFilterT Device filter function, with the signature
+ *         `(uint32_t query_ix, uint32_t cluster_ix, uint32_t sample_ix) -> bool`
  *
  * @param[in] handle
  * @param[in] params configure the search
@@ -382,7 +386,7 @@ void extend(raft::resources const& handle,
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] sample_filter a filter the greenlights samples for a given query
+ * @param[in] sample_filter a device filter function that greenlights samples for a given query
  */
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
diff --git a/cpp/include/raft/neighbors/nn_descent.cuh b/cpp/include/raft/neighbors/nn_descent.cuh
new file mode 100644
index 0000000000..ceb5ae5643
--- /dev/null
+++ b/cpp/include/raft/neighbors/nn_descent.cuh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/nn_descent.cuh"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+namespace raft::neighbors::experimental::nn_descent {
+
+/**
+ * @defgroup nn-descent CUDA gradient descent nearest neighbor
+ * @{
+ */
+
+/**
+ * @brief Build nn-descent Index with dataset in device memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // create and fill the index from a [N, D] raft::device_matrix_view dataset
+ *   auto index = nn_descent::build(res, index_params, dataset);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, graph_degree] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::device_matrix_view input dataset expected to be located
+ *                in device memory
+ * @return index<IdxT> index containing all-neighbors knn graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+index<IdxT> build(raft::resources const& res,
+                  index_params const& params,
+                  raft::device_matrix_view<const T, int64_t, row_major> dataset)
+{
+  return detail::build<T, IdxT>(res, params, dataset);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in device memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // allocate the output graph, then fill it from a [N, D] raft::device_matrix_view dataset
+ *   auto knn_graph = raft::make_host_matrix<uint32_t, int64_t>(N, index_params.graph_degree);
+ *   auto index = nn_descent::index{res, knn_graph.view()};
+ *   nn_descent::build(res, index_params, dataset, index);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, graph_degree] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::device_matrix_view input dataset expected to be located
+ *                in device memory
+ * @param[out] idx raft::neighbors::experimental::nn_descent::index containing all-neighbors knn
+ * graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+void build(raft::resources const& res,
+           index_params const& params,
+           raft::device_matrix_view<const T, int64_t, row_major> dataset,
+           index<IdxT>& idx)
+{
+  detail::build<T, IdxT>(res, params, dataset, idx);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in host memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // create and fill the index from a [N, D] raft::host_matrix_view dataset
+ *   auto index = nn_descent::build(res, index_params, dataset);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, graph_degree] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::host_matrix_view input dataset expected to be located
+ *                in host memory
+ * @return index<IdxT> index containing all-neighbors knn graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+index<IdxT> build(raft::resources const& res,
+                  index_params const& params,
+                  raft::host_matrix_view<const T, int64_t, row_major> dataset)
+{
+  return detail::build<T, IdxT>(res, params, dataset);
+}
+
+/**
+ * @brief Build nn-descent Index with dataset in host memory
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   nn_descent::index_params index_params;
+ *   // allocate the output graph, then fill it from a [N, D] raft::host_matrix_view dataset
+ *   auto knn_graph = raft::make_host_matrix<uint32_t, int64_t>(N, index_params.graph_degree);
+ *   auto index = nn_descent::index{res, knn_graph.view()};
+ *   nn_descent::build(res, index_params, dataset, index);
+ *   // index.graph() provides a raft::host_matrix_view of an
+ *   // all-neighbors knn graph of dimensions [N, graph_degree] of the input
+ *   // dataset
+ * @endcode
+ *
+ * @tparam T data-type of the input dataset
+ * @tparam IdxT data-type for the output index
+ * @param[in] res raft::resources is an object managing resources
+ * @param[in] params an instance of nn_descent::index_params that are parameters
+ *               to run the nn-descent algorithm
+ * @param[in] dataset raft::host_matrix_view input dataset expected to be located
+ *                in host memory
+ * @param[out] idx raft::neighbors::experimental::nn_descent::index containing all-neighbors knn
+ * graph in host memory
+ */
+template <typename T, typename IdxT = uint32_t>
+void build(raft::resources const& res,
+           index_params const& params,
+           raft::host_matrix_view<const T, int64_t, row_major> dataset,
+           index<IdxT>& idx)
+{
+  detail::build<T, IdxT>(res, params, dataset, idx);
+}
+
+/** @} */  // end group nn-descent
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp
new file mode 100644
index 0000000000..64e464c618
--- /dev/null
+++ b/cpp/include/raft/neighbors/nn_descent_types.hpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::neighbors::experimental::nn_descent {
+/**
+ * @ingroup nn-descent
+ * @{
+ */
+
+/**
+ * @brief Parameters used to build an nn-descent index
+ *
+ * - `graph_degree`: For an input dataset of dimensions (N, D),
+ *   determines the final dimensions of the all-neighbors knn graph,
+ *   which is of dimensions (N, graph_degree).
+ * - `intermediate_graph_degree`: Internally, nn-descent builds an
+ *   all-neighbors knn graph of dimensions (N, intermediate_graph_degree)
+ *   before selecting the final `graph_degree` neighbors. It's recommended
+ *   that `intermediate_graph_degree` >= 1.5 * `graph_degree`.
+ * - `max_iterations`: The number of iterations that nn-descent will refine
+ *   the graph for. More iterations produce a better quality graph at the cost of performance.
+ * - `termination_threshold`: The delta at which nn-descent will terminate its iterations.
+ *
+ */
+struct index_params : ann::index_params {
+  size_t graph_degree              = 64;      // Degree of the output graph.
+  size_t intermediate_graph_degree = 128;     // Degree of the intermediate graph before pruning.
+  size_t max_iterations            = 20;      // Maximum number of nn-descent iterations.
+  float termination_threshold      = 0.0001;  // Termination threshold of nn-descent.
+};
+
+/**
+ * @brief nn-descent index
+ *
+ * The index contains an all-neighbors graph of the input dataset
+ * stored in host memory of dimensions (n_rows, n_cols)
+ * @tparam IdxT dtype to be used for constructing knn-graph
+ */
+template <typename IdxT>
+struct index : ann::index {
+ public:
+  /**
+   * @brief Construct a new index object that owns its knn-graph
+   *
+   * This constructor creates an nn-descent index which is a knn-graph in host memory.
+   * The knn-graph is allocated here as a dense raft::host_matrix of dimensions
+   * (n_rows, n_cols); the metric is set to L2Expanded.
+   *
+   * @param res raft::resources is an object managing resources
+   * @param n_rows number of rows in knn-graph
+   * @param n_cols number of cols in knn-graph
+   */
+  index(raft::resources const& res, int64_t n_rows, int64_t n_cols)
+    : ann::index(),
+      res_{res},
+      metric_{raft::distance::DistanceType::L2Expanded},
+      graph_{raft::make_host_matrix<IdxT, int64_t, row_major>(n_rows, n_cols)},
+      graph_view_{graph_.view()}
+  {
+  }
+
+  /**
+   * @brief Construct a new index object around a user provided knn-graph
+   *
+   * This constructor creates an nn-descent index using a user allocated host memory knn-graph.
+   * Only the view is stored (the owned matrix stays 0 x 0), so the memory behind
+   * graph_view must outlive the index; the metric is set to L2Expanded.
+   *
+   * @param res raft::resources is an object managing resources
+   * @param graph_view raft::host_matrix_view<IdxT, int64_t, raft::row_major> for storing knn-graph
+   */
+  index(raft::resources const& res,
+        raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view)
+    : ann::index(),
+      res_{res},
+      metric_{raft::distance::DistanceType::L2Expanded},
+      graph_{raft::make_host_matrix<IdxT, int64_t, row_major>(0, 0)},
+      graph_view_{graph_view}
+  {
+  }
+
+  /** Distance metric of the index (both constructors set it to L2Expanded). */
+  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
+  {
+    return metric_;
+  }
+
+  /** Total length of the index (number of vectors), i.e. the number of rows of the graph. */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
+  {
+    return graph_view_.extent(0);
+  }
+
+  /** Graph degree, i.e. the number of columns of the graph. */
+  [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t
+  {
+    return graph_view_.extent(1);
+  }
+
+  /** neighborhood graph [size, graph-degree] */
+  [[nodiscard]] inline auto graph() noexcept -> host_matrix_view<IdxT, int64_t, row_major>
+  {
+    return graph_view_;
+  }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&)                    = delete;
+  index(index&&)                         = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index&      = default;
+  ~index()                               = default;
+
+ private:
+  raft::resources const& res_;  // NOTE(review): reference member makes move-assignment deleted
+  raft::distance::DistanceType metric_;
+  raft::host_matrix<IdxT, int64_t, row_major> graph_;  // owned graph; 0 x 0 if user provided
+  raft::host_matrix_view<IdxT, int64_t, row_major>
+    graph_view_;  // view of graph_ or of the user provided matrix
+};
+
+/** @} */
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/include/raft/neighbors/sample_filter.cuh b/cpp/include/raft/neighbors/sample_filter.cuh
new file mode 100644
index 0000000000..9182d72da9
--- /dev/null
+++ b/cpp/include/raft/neighbors/sample_filter.cuh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <raft/core/bitset.cuh>
+
+namespace raft::neighbors::filtering {
+/**
+ * @brief Filter an index with a bitset: operator() returns `bitset_view_.test(sample_ix)`
+ * @tparam bitset_t first template argument of raft::core::bitset_view
+ * @tparam index_t Indexing type
+ */
+template <typename bitset_t, typename index_t>
+struct bitset_filter {
+  // View of the bitset to use as a filter
+  const raft::core::bitset_view<bitset_t, index_t> bitset_view_;
+
+  bitset_filter(const raft::core::bitset_view<bitset_t, index_t> bitset_for_filtering)
+    : bitset_view_{bitset_for_filtering}
+  {
+  }
+  inline _RAFT_HOST_DEVICE bool operator()(
+    // query index (unused: the same bitset is applied to every query)
+    const uint32_t query_ix,
+    // the index of the current sample, tested against the bitset
+    const uint32_t sample_ix) const
+  {
+    return bitset_view_.test(sample_ix);
+  }
+};
+}  // namespace raft::neighbors::filtering
diff --git a/cpp/include/raft/neighbors/sample_filter_types.hpp b/cpp/include/raft/neighbors/sample_filter_types.hpp
index 5a301e9d2f..10c5e99372 100644
--- a/cpp/include/raft/neighbors/sample_filter_types.hpp
+++ b/cpp/include/raft/neighbors/sample_filter_types.hpp
@@ -37,6 +37,18 @@ struct none_ivf_sample_filter {
   }
 };
 
+/* A CAGRA sample filter that filters nothing (always returns true). This is the default. */
+struct none_cagra_sample_filter {
+  inline _RAFT_HOST_DEVICE bool operator()(
+    // query index (unused)
+    const uint32_t query_ix,
+    // the index of the current sample (unused)
+    const uint32_t sample_ix) const
+  {
+    return true;
+  }
+};
+
 /**
  * If the filtering depends on the index of a sample, then the following
  * filter template can be used:
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index 390436939f..1a48e1adde 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -36,7 +36,9 @@ void fusedL2Knn(size_t D,
                 bool rowMajorIndex,
                 bool rowMajorQuery,
                 cudaStream_t stream,
-                raft::distance::DistanceType metric) RAFT_EXPLICIT;
+                raft::distance::DistanceType metric,
+                const value_t* index_norms = NULL,
+                const value_t* query_norms = NULL) RAFT_EXPLICIT;
 
 }  // namespace raft::spatial::knn::detail
 
@@ -56,7 +58,9 @@ void fusedL2Knn(size_t D,
     bool rowMajorIndex,                                                                     \
     bool rowMajorQuery,                                                                     \
     cudaStream_t stream,                                                                    \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                    \
+    const Mvalue_t* index_norms,                                                            \
+    const Mvalue_t* query_norms);
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
index 4a571c1447..67abab3d1e 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
@@ -706,6 +706,8 @@ template <typename DataT,
           bool isRowMajor>
 void fusedL2ExpKnnImpl(const DataT* x,
                        const DataT* y,
+                       const DataT* xn,
+                       const DataT* yn,
                        IdxT m,
                        IdxT n,
                        IdxT k,
@@ -787,19 +789,25 @@ void fusedL2ExpKnnImpl(const DataT* x,
       }
     }
 
-    DataT* xn = (DataT*)workspace;
-    DataT* yn = (DataT*)workspace;
-
-    if (x != y) {
-      yn += m;
-      raft::linalg::rowNorm(
-        xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-      raft::linalg::rowNorm(
-        yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-    } else {
+    // calculate norms if they haven't been passed in
+    if (!xn) {
+      DataT* xn_ = (DataT*)workspace;
+      workspace  = xn_ + m;
       raft::linalg::rowNorm(
-        xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+        xn_, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+      xn = xn_;
     }
+    if (!yn) {
+      if (x == y) {
+        yn = xn;
+      } else {
+        DataT* yn_ = (DataT*)(workspace);
+        raft::linalg::rowNorm(
+          yn_, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
+        yn = yn_;
+      }
+    }
+
     fusedL2ExpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(x,
                                                                 y,
                                                                 xn,
@@ -836,6 +844,8 @@ void fusedL2ExpKnn(IdxT m,
                    IdxT ldd,
                    const DataT* x,
                    const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
                    bool sqrt,
                    OutT* out_dists,
                    IdxT* out_inds,
@@ -850,6 +860,8 @@ void fusedL2ExpKnn(IdxT m,
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), usePrevTopKs, isRowMajor>(
       x,
       y,
+      xn,
+      yn,
       m,
       n,
       k,
@@ -867,6 +879,8 @@ void fusedL2ExpKnn(IdxT m,
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), usePrevTopKs, isRowMajor>(
       x,
       y,
+      xn,
+      yn,
       m,
       n,
       k,
@@ -883,6 +897,8 @@ void fusedL2ExpKnn(IdxT m,
   } else {
     fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(x,
                                                                             y,
+                                                                            xn,
+                                                                            yn,
                                                                             m,
                                                                             n,
                                                                             k,
@@ -927,7 +943,9 @@ void fusedL2Knn(size_t D,
                 bool rowMajorIndex,
                 bool rowMajorQuery,
                 cudaStream_t stream,
-                raft::distance::DistanceType metric)
+                raft::distance::DistanceType metric,
+                const value_t* index_norms = NULL,
+                const value_t* query_norms = NULL)
 {
   // Validate the input data
   ASSERT(k > 0, "l2Knn: k must be > 0");
@@ -968,6 +986,8 @@ void fusedL2Knn(size_t D,
                                                                               ldd,
                                                                               query,
                                                                               index,
+                                                                              query_norms,
+                                                                              index_norms,
                                                                               sqrt,
                                                                               out_dists,
                                                                               out_inds,
@@ -985,6 +1005,8 @@ void fusedL2Knn(size_t D,
                                                                                 ldd,
                                                                                 query,
                                                                                 index,
+                                                                                query_norms,
+                                                                                index_norms,
                                                                                 sqrt,
                                                                                 out_dists,
                                                                                 out_inds,
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
index 070c8f4e30..ad94ee0096 100644
--- a/cpp/include/raft/util/memory_pool-inl.hpp
+++ b/cpp/include/raft/util/memory_pool-inl.hpp
@@ -25,6 +25,10 @@
 
 namespace raft {
 
+/**
+ * @defgroup memory_pool Memory Pool
+ * @{
+ */
 /**
  * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
  * unique pointer.
@@ -73,4 +77,5 @@ RAFT_INLINE_CONDITIONAL std::unique_ptr<rmm::mr::device_memory_resource> get_poo
   return pool_res;
 }
 
+/** @} */
 }  // namespace raft
diff --git a/cpp/src/neighbors/brute_force_knn_index_float.cu b/cpp/src/neighbors/brute_force_knn_index_float.cu
new file mode 100644
index 0000000000..f2fda93a97
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_index_float.cu
@@ -0,0 +1,39 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+// Explicit instantiations of brute_force::search (int and int64_t neighbors) and build for float.
+template void raft::neighbors::brute_force::search<float, int>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+template void raft::neighbors::brute_force::search<float, int64_t>(
+  raft::resources const& res,
+  const raft::neighbors::brute_force::index<float>& idx,
+  raft::device_matrix_view<const float, int64_t, row_major> queries,
+  raft::device_matrix_view<int64_t, int64_t, row_major> neighbors,
+  raft::device_matrix_view<float, int64_t, row_major> distances);
+
+template raft::neighbors::brute_force::index<float> raft::neighbors::brute_force::build<float>(
+  raft::resources const& res,
+  raft::device_matrix_view<const float, int64_t, row_major> dataset,
+  raft::distance::DistanceType metric,
+  float metric_arg);
\ No newline at end of file
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
index 784d116503..15eb0a9e65 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
@@ -39,41 +39,45 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \\
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \\
-      raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
-      raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
-      INDEX_T* const topk_indices_ptr,                                                        \\
-      DISTANCE_T* const topk_distances_ptr,                                                   \\
-      const DATA_T* const queries_ptr,                                                        \\
-      const uint32_t num_queries,                                                             \\
-      const INDEX_T* dev_seed_ptr,                                                            \\
-      uint32_t* const num_executed_iterations,                                                \\
-      uint32_t topk,                                                                          \\
-      uint32_t block_size,                                                                    \\
-      uint32_t result_buffer_size,                                                            \\
-      uint32_t smem_size,                                                                     \\
-      int64_t hash_bitlen,                                                                    \\
-      INDEX_T* hashmap_ptr,                                                                   \\
-      uint32_t num_cta_per_query,                                                             \\
-      uint32_t num_random_samplings,                                                          \\
-      uint64_t rand_xor_mask,                                                                 \\
-      uint32_t num_seeds,                                                                     \\
-      size_t itopk_size,                                                                      \\
-      size_t search_width,                                                                     \\
-      size_t min_iterations,                                                                  \\
-      size_t max_iterations,                                                                  \\
-      cudaStream_t stream);
+#define instantiate_kernel_selection(                                                       \\
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \\
+  template void                                                                             \\
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \\
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
+    INDEX_T* const topk_indices_ptr,                                                        \\
+    DISTANCE_T* const topk_distances_ptr,                                                   \\
+    const DATA_T* const queries_ptr,                                                        \\
+    const uint32_t num_queries,                                                             \\
+    const INDEX_T* dev_seed_ptr,                                                            \\
+    uint32_t* const num_executed_iterations,                                                \\
+    uint32_t topk,                                                                          \\
+    uint32_t block_size,                                                                    \\
+    uint32_t result_buffer_size,                                                            \\
+    uint32_t smem_size,                                                                     \\
+    int64_t hash_bitlen,                                                                    \\
+    INDEX_T* hashmap_ptr,                                                                   \\
+    uint32_t num_cta_per_query,                                                             \\
+    uint32_t num_random_samplings,                                                          \\
+    uint64_t rand_xor_mask,                                                                 \\
+    uint32_t num_seeds,                                                                     \\
+    size_t itopk_size,                                                                      \\
+    size_t search_width,                                                                    \\
+    size_t min_iterations,                                                                  \\
+    size_t max_iterations,                                                                  \\
+    SAMPLE_FILTER_T sample_filter,                                                          \\
+    cudaStream_t stream);
 
 """
 
 trailer = """
 #undef instantiate_kernel_selection
 
-} // namespace raft::neighbors::cagra::detail::namespace multi_cta_search
+}  // namespace raft::neighbors::cagra::detail::multi_cta_search
 """
 
 mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
@@ -97,7 +101,7 @@
         with open(path, "w") as f:
             f.write(header)
             f.write(
-                f"instantiate_kernel_selection({team}, {mxdim}, {data_t}, {idx_t}, {distance_t});\n"
+                f"instantiate_kernel_selection(\n  {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}, raft::neighbors::filtering::none_cagra_sample_filter);\n"
             )
             f.write(trailer)
             # For pasting into CMakeLists.txt
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
index 2a4e7ac607..1a3b2284bd 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
index 115ce3b48b..36e86d9ed6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, float, uint32_t, float);
+instantiate_kernel_selection(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
index c5e704a85f..6f1af2d93f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, float, uint32_t, float);
+instantiate_kernel_selection(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
index 3469facf39..1279f8e415 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, float, uint32_t, float);
+instantiate_kernel_selection(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
index 327bfc73b4..0dabff0df5 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, float, uint64_t, float);
+instantiate_kernel_selection(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
index 1abe0cd8af..72bb74cdb8 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, float, uint64_t, float);
+instantiate_kernel_selection(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
index dd61810d06..dceea10b5d 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, float, uint64_t, float);
+instantiate_kernel_selection(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
index 8e12bab514..acb8bd6a12 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, float, uint64_t, float);
+instantiate_kernel_selection(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
index d946ac9c79..0254f09ff0 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
index e4d7b44d1e..2b67e7e968 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
index b8dc3b38a8..17d6722e58 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
index 749b35bad6..38f02812e2 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, int8_t, uint32_t, float);
+instantiate_kernel_selection(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
index 428d460ba8..fa111196c6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_widthh,                                                                     \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void /* explicit instantiation definition of select_and_run */                   \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter, /* filter argument, taken by value */                    \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float);
+instantiate_kernel_selection(  // arguments follow the macro's parameter order
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
index 28a20b865e..1ef3c28aa3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void /* explicit instantiation definition of select_and_run */                   \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter, /* filter argument, taken by value */                    \
     cudaStream_t stream);
 
-instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float);
+instantiate_kernel_selection(  // arguments follow the macro's parameter order
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
index e85a84ae8e..d26cb44843 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void /* explicit instantiation definition of select_and_run */                   \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter, /* filter argument, taken by value */                    \
     cudaStream_t stream);
 
-instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float);
+instantiate_kernel_selection(  // arguments follow the macro's parameter order
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
index 232b62ebcd..4d4322f261 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
@@ -25,36 +25,41 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(      \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                   \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                        \
-    INDEX_T* const topk_indices_ptr,                                                          \
-    DISTANCE_T* const topk_distances_ptr,                                                     \
-    const DATA_T* const queries_ptr,                                                          \
-    const uint32_t num_queries,                                                               \
-    const INDEX_T* dev_seed_ptr,                                                              \
-    uint32_t* const num_executed_iterations,                                                  \
-    uint32_t topk,                                                                            \
-    uint32_t block_size,                                                                      \
-    uint32_t result_buffer_size,                                                              \
-    uint32_t smem_size,                                                                       \
-    int64_t hash_bitlen,                                                                      \
-    INDEX_T* hashmap_ptr,                                                                     \
-    uint32_t num_cta_per_query,                                                               \
-    uint32_t num_random_samplings,                                                            \
-    uint64_t rand_xor_mask,                                                                   \
-    uint32_t num_seeds,                                                                       \
-    size_t itopk_size,                                                                        \
-    size_t search_width,                                                                      \
-    size_t min_iterations,                                                                    \
-    size_t max_iterations,                                                                    \
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void /* explicit instantiation definition of select_and_run */                   \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter, /* filter argument, taken by value */                    \
     cudaStream_t stream);
 
-instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float);
+instantiate_kernel_selection(  // arguments follow the macro's parameter order
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_kernel_selection
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
index cf61a45b4a..249555082e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
@@ -39,35 +39,38 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \\
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \\
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \\
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \\
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \\
-    INDEX_T* const topk_indices_ptr,                                                     \\
-    DISTANCE_T* const topk_distances_ptr,                                                \\
-    const DATA_T* const queries_ptr,                                                     \\
-    const uint32_t num_queries,                                                          \\
-    const INDEX_T* dev_seed_ptr,                                                         \\
-    uint32_t* const num_executed_iterations,                                             \\
-    uint32_t topk,                                                                       \\
-    uint32_t num_itopk_candidates,                                                       \\
-    uint32_t block_size,                                                                 \\
-    uint32_t smem_size,                                                                  \\
-    int64_t hash_bitlen,                                                                 \\
-    INDEX_T* hashmap_ptr,                                                                \\
-    size_t small_hash_bitlen,                                                           \\
-    size_t small_hash_reset_interval,                                                    \\
-    uint32_t num_random_samplings,                                                       \\
-    uint64_t rand_xor_mask,                                                              \\
-    uint32_t num_seeds,                                                                  \\
-    size_t itopk_size,                                                                   \\
-    size_t search_width,                                                                  \\
-    size_t min_iterations,                                                               \\
-    size_t max_iterations,                                                               \\
+#define instantiate_single_cta_select_and_run(                                              \\
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \\
+  template void                                                                             \\
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \\
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \\
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \\
+    INDEX_T* const topk_indices_ptr,                                                        \\
+    DISTANCE_T* const topk_distances_ptr,                                                   \\
+    const DATA_T* const queries_ptr,                                                        \\
+    const uint32_t num_queries,                                                             \\
+    const INDEX_T* dev_seed_ptr,                                                            \\
+    uint32_t* const num_executed_iterations,                                                \\
+    uint32_t topk,                                                                          \\
+    uint32_t num_itopk_candidates,                                                          \\
+    uint32_t block_size,                                                                    \\
+    uint32_t smem_size,                                                                     \\
+    int64_t hash_bitlen,                                                                    \\
+    INDEX_T* hashmap_ptr,                                                                   \\
+    size_t small_hash_bitlen,                                                               \\
+    size_t small_hash_reset_interval,                                                       \\
+    uint32_t num_random_samplings,                                                          \\
+    uint64_t rand_xor_mask,                                                                 \\
+    uint32_t num_seeds,                                                                     \\
+    size_t itopk_size,                                                                      \\
+    size_t search_width,                                                                    \\
+    size_t min_iterations,                                                                  \\
+    size_t max_iterations,                                                                  \\
+    SAMPLE_FILTER_T sample_filter,                                                          \\
     cudaStream_t stream);
 
 """
@@ -75,7 +78,8 @@
 trailer = """
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
-} // namespace raft::neighbors::cagra::detail::single_cta_search
+}  // namespace raft::neighbors::cagra::detail::single_cta_search
 """
+# NOTE: the #undef in `trailer` must name the macro that the emitted header template #defines.
 
 mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
@@ -102,7 +106,7 @@
         with open(path, "w") as f:
             f.write(header)
             f.write(
-                f"instantiate_single_cta_select_and_run({team}, {mxdim},{data_t}, {idx_t}, {distance_t});\n"
+                f"instantiate_single_cta_select_and_run(\n  {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}, raft::neighbors::filtering::none_cagra_sample_filter);\n"
             )
 
             f.write(trailer)
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
index eb45d4ff08..b8c23103ba 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
index 049715aa20..8ab1897119 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
index 6028c283db..9fd36b4cb9 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
index 2566e9cbd9..a9ee2c864b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
index 4cd96ad9c0..dadc574b65 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
index 822a2efb2f..30e043f47e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
index 80d1f76b9b..089e4c930f 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
index 06c3eaf10b..3e8ffb8bf8 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
index b4c30ac943..279587738e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
index c8d0df3ac4..ef127d3f7d 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
index 19ecee91af..7fcfdcc28e 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single template configuration; the trailing
+// SAMPLE_FILTER_T macro argument is the type of the `sample_filter` parameter.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
index 52c4eb7d6b..a6c606d99b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single template configuration; the trailing
+// SAMPLE_FILTER_T macro argument is the type of the `sample_filter` parameter.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
index 4675e17084..0b8be56614 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single template configuration; the trailing
+// SAMPLE_FILTER_T macro argument is the type of the `sample_filter` parameter.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
index e73e1071ee..4c193b9408 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single template configuration; the trailing
+// SAMPLE_FILTER_T macro argument is the type of the `sample_filter` parameter.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
index 01e26b5f29..bdf16d2f03 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
@@ -25,38 +25,44 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+// Explicit instantiation of select_and_run for a single template configuration; the trailing
+// SAMPLE_FILTER_T macro argument is the type of the `sample_filter` parameter.
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
-#undef instantiate_single_cta_search_kernel
+#undef instantiate_single_cta_select_and_run
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
index b0534b555f..93624df4aa 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
@@ -25,38 +25,42 @@
  */
 
 #include <raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
 
 namespace raft::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_single_cta_select_and_run(                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                               \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                   \
-    INDEX_T* const topk_indices_ptr,                                                     \
-    DISTANCE_T* const topk_distances_ptr,                                                \
-    const DATA_T* const queries_ptr,                                                     \
-    const uint32_t num_queries,                                                          \
-    const INDEX_T* dev_seed_ptr,                                                         \
-    uint32_t* const num_executed_iterations,                                             \
-    uint32_t topk,                                                                       \
-    uint32_t num_itopk_candidates,                                                       \
-    uint32_t block_size,                                                                 \
-    uint32_t smem_size,                                                                  \
-    int64_t hash_bitlen,                                                                 \
-    INDEX_T* hashmap_ptr,                                                                \
-    size_t small_hash_bitlen,                                                            \
-    size_t small_hash_reset_interval,                                                    \
-    uint32_t num_random_samplings,                                                       \
-    uint64_t rand_xor_mask,                                                              \
-    uint32_t num_seeds,                                                                  \
-    size_t itopk_size,                                                                   \
-    size_t search_width,                                                                 \
-    size_t min_iterations,                                                               \
-    size_t max_iterations,                                                               \
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  template void                                                                             \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
     cudaStream_t stream);
 
-instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float);
+instantiate_single_cta_select_and_run(
+  32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
 
 #undef instantiate_single_cta_search_kernel
 
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
index 67b08655e6..b73cf31c58 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
index 3c0d13710e..35ef37c984 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, true);
 instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, false);
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
index e799c5181f..ff23d9c41b 100644
--- a/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
@@ -32,7 +32,9 @@
     bool rowMajorIndex,                                                                      \
     bool rowMajorQuery,                                                                      \
     cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
+    raft::distance::DistanceType metric,                                                     \
+    const Mvalue_t* index_norms,                                                             \
+    const Mvalue_t* query_norms)
 
 // These are used by brute_force_knn:
 instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, true);
diff --git a/cpp/template/CMakeLists.txt b/cpp/template/CMakeLists.txt
index 44b06e1b5f..538eac07ef 100644
--- a/cpp/template/CMakeLists.txt
+++ b/cpp/template/CMakeLists.txt
@@ -34,5 +34,8 @@ rapids_cpm_init()
 include(cmake/thirdparty/get_raft.cmake)
 
 # -------------- compile tasks ----------------- #
-add_executable(TEST_RAFT src/test_vector_search.cu)
-target_link_libraries(TEST_RAFT PRIVATE raft::raft raft::compiled)
+add_executable(CAGRA_EXAMPLE src/cagra_example.cu)
+target_link_libraries(CAGRA_EXAMPLE PRIVATE raft::raft raft::compiled)
+
+add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
+target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE raft::raft raft::compiled)
diff --git a/cpp/template/src/cagra_example.cu b/cpp/template/src/cagra_example.cu
new file mode 100644
index 0000000000..7f3a7d6676
--- /dev/null
+++ b/cpp/template/src/cagra_example.cu
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/neighbors/cagra.cuh>
+#include <raft/random/make_blobs.cuh>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include "common.cuh"
+
+void cagra_build_search_simple(raft::device_resources const& dev_resources,
+                               raft::device_matrix_view<const float, int64_t> dataset,
+                               raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  int64_t topk      = 12;
+  int64_t n_queries = queries.extent(0);
+
+  // create output arrays
+  auto neighbors = raft::make_device_matrix<uint32_t>(dev_resources, n_queries, topk);
+  auto distances = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+
+  // use default index parameters
+  cagra::index_params index_params;
+
+  std::cout << "Building CAGRA index (search graph)" << std::endl;
+  auto index = cagra::build<float, uint32_t>(dev_resources, index_params, dataset);
+
+  std::cout << "CAGRA index has " << index.size() << " vectors" << std::endl;
+  std::cout << "CAGRA graph has degree " << index.graph_degree() << ", graph size ["
+            << index.graph().extent(0) << ", " << index.graph().extent(1) << "]" << std::endl;
+
+  // use default search parameters
+  cagra::search_params search_params;
+  // search K nearest neighbors
+  cagra::search<float, uint32_t>(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to cagra::search is asynchronous. Before accessing the data, sync by calling
+  // raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  // Set pool memory resource with 1 GiB initial pool size. All allocations use the same pool.
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(
+    rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(&pool_mr);
+
+  // Alternatively, one could define a pool allocator for temporary arrays (used within RAFT
+  // algorithms). In that case only the internal arrays would use the pool, any other allocation
+  // uses the default RMM memory resource. Here is how to change the workspace memory resource to
+  // a pool with 2 GiB upper limit.
+  // raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * 1024 * 1024 * 1024ull);
+
+  // Create input arrays.
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 90;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  // Simple build and search example.
+  cagra_build_search_simple(dev_resources,
+                            raft::make_const_mdspan(dataset.view()),
+                            raft::make_const_mdspan(queries.view()));
+}
diff --git a/cpp/template/src/common.cuh b/cpp/template/src/common.cuh
new file mode 100644
index 0000000000..0b72d3bf3b
--- /dev/null
+++ b/cpp/template/src/common.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+
+// Fill dataset and queries with synthetic data.
+void generate_dataset(raft::device_resources const& dev_resources,
+                      raft::device_matrix_view<float, int64_t> dataset,
+                      raft::device_matrix_view<float, int64_t> queries)
+{
+  auto labels = raft::make_device_vector<int64_t, int64_t>(dev_resources, dataset.extent(0));
+  raft::random::make_blobs(dev_resources, dataset, labels.view());
+  raft::random::RngState r(1234ULL);
+  raft::random::uniform(dev_resources,
+                        r,
+                        raft::make_device_vector_view(queries.data_handle(), queries.size()),
+                        -1.0f,
+                        1.0f);
+}
+
+// Copy the results to host and print the neighbors/distances of up to the first two queries.
+template <typename IdxT>
+void print_results(raft::device_resources const& dev_resources,
+                   raft::device_matrix_view<IdxT, int64_t> neighbors,
+                   raft::device_matrix_view<float, int64_t> distances)
+{
+  int64_t topk        = neighbors.extent(1);
+  auto neighbors_host = raft::make_host_matrix<IdxT, int64_t>(neighbors.extent(0), topk);
+  auto distances_host = raft::make_host_matrix<float, int64_t>(distances.extent(0), topk);
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources);
+
+  raft::copy(neighbors_host.data_handle(), neighbors.data_handle(), neighbors.size(), stream);
+  raft::copy(distances_host.data_handle(), distances.data_handle(), distances.size(), stream);
+
+  // The calls to RAFT algorithms and raft::copy are asynchronous.
+  // We need to sync the stream before accessing the data.
+  raft::resource::sync_stream(dev_resources, stream);
+
+  for (int64_t query_id = 0; query_id < 2 && query_id < neighbors.extent(0); query_id++) {
+    std::cout << "Query " << query_id << " neighbor indices: ";
+    raft::print_host_vector("", &neighbors_host(query_id, 0), topk, std::cout);
+    std::cout << "Query " << query_id << " neighbor distances: ";
+    raft::print_host_vector("", &distances_host(query_id, 0), topk, std::cout);
+  }
+}
+
+/** Subsample the dataset to create a training set. */
+raft::device_matrix<float, int64_t> subsample(
+  raft::device_resources const& dev_resources,
+  raft::device_matrix_view<const float, int64_t> dataset,
+  raft::device_vector_view<const int64_t, int64_t> data_indices,
+  float fraction)
+{
+  int64_t n_samples = dataset.extent(0);
+  int64_t n_dim     = dataset.extent(1);
+  int64_t n_train   = n_samples * fraction;
+  auto trainset     = raft::make_device_matrix<float, int64_t>(dev_resources, n_train, n_dim);
+
+  int seed = 137;
+  raft::random::RngState rng(seed);
+  auto train_indices = raft::make_device_vector<int64_t>(dev_resources, n_train);
+
+  raft::random::sample_without_replacement(
+    dev_resources, rng, data_indices, std::nullopt, train_indices.view(), std::nullopt);
+
+  raft::matrix::copy_rows(
+    dev_resources, dataset, trainset.view(), raft::make_const_mdspan(train_indices.view()));
+
+  return trainset;
+}
diff --git a/cpp/template/src/ivf_flat_example.cu b/cpp/template/src/ivf_flat_example.cu
new file mode 100644
index 0000000000..5d91f8fe8b
--- /dev/null
+++ b/cpp/template/src/ivf_flat_example.cu
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <optional>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/neighbors/ivf_flat.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include "common.cuh"
+
+void ivf_flat_build_search_simple(raft::device_resources const& dev_resources,
+                                  raft::device_matrix_view<const float, int64_t> dataset,
+                                  raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  ivf_flat::index_params index_params;
+  index_params.n_lists                  = 1024;
+  index_params.kmeans_trainset_fraction = 0.1;
+  index_params.metric                   = raft::distance::DistanceType::L2Expanded;
+
+  std::cout << "Building IVF-Flat index" << std::endl;
+  auto index = ivf_flat::build(dev_resources, index_params, dataset);
+
+  std::cout << "Number of clusters " << index.n_lists() << ", number of vectors added to index "
+            << index.size() << std::endl;
+
+  // Create output arrays.
+  int64_t topk      = 10;
+  int64_t n_queries = queries.extent(0);
+  auto neighbors    = raft::make_device_matrix<int64_t>(dev_resources, n_queries, topk);
+  auto distances    = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+
+  // Set search parameters.
+  ivf_flat::search_params search_params;
+  search_params.n_probes = 50;
+
+  // Search K nearest neighbors for each of the queries.
+  ivf_flat::search(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to ivf_flat::search is asynchronous. Before accessing the data, sync by calling
+  // raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+void ivf_flat_build_extend_search(raft::device_resources const& dev_resources,
+                                  raft::device_matrix_view<const float, int64_t> dataset,
+                                  raft::device_matrix_view<const float, int64_t> queries)
+{
+  using namespace raft::neighbors;
+
+  // Define dataset indices.
+  auto data_indices = raft::make_device_vector<int64_t, int64_t>(dev_resources, dataset.extent(0));
+  thrust::counting_iterator<int64_t> first(0);
+  thrust::device_ptr<int64_t> ptr(data_indices.data_handle());
+  thrust::copy(
+    raft::resource::get_thrust_policy(dev_resources), first, first + dataset.extent(0), ptr);
+
+  // Sub-sample the dataset to create a training set.
+  auto trainset =
+    subsample(dev_resources, dataset, raft::make_const_mdspan(data_indices.view()), 0.1);
+
+  ivf_flat::index_params index_params;
+  index_params.n_lists           = 100;
+  index_params.metric            = raft::distance::DistanceType::L2Expanded;
+  index_params.add_data_on_build = false;
+
+  std::cout << "\nRun k-means clustering using the training set" << std::endl;
+  auto index =
+    ivf_flat::build(dev_resources, index_params, raft::make_const_mdspan(trainset.view()));
+
+  std::cout << "Number of clusters " << index.n_lists() << ", number of vectors added to index "
+            << index.size() << std::endl;
+
+  std::cout << "Filling index with the dataset vectors" << std::endl;
+  index = ivf_flat::extend(dev_resources,
+                           dataset,
+                           std::make_optional(raft::make_const_mdspan(data_indices.view())),
+                           index);
+
+  std::cout << "Index size after adding dataset vectors " << index.size() << std::endl;
+
+  // Set search parameters.
+  ivf_flat::search_params search_params;
+  search_params.n_probes = 10;
+
+  // Create output arrays.
+  int64_t topk      = 10;
+  int64_t n_queries = queries.extent(0);
+  auto neighbors    = raft::make_device_matrix<int64_t, int64_t>(dev_resources, n_queries, topk);
+  auto distances    = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, topk);
+
+  // Search K nearest neighbors for each query.
+  ivf_flat::search(
+    dev_resources, search_params, index, queries, neighbors.view(), distances.view());
+
+  // The call to ivf_flat::search is asynchronous. Before accessing the data, sync using:
+  // raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  // Set pool memory resource with 1 GiB initial pool size. All allocations use the same pool.
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(
+    rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(&pool_mr);
+
+  // Alternatively, one could define a pool allocator for temporary arrays (used within RAFT
+  // algorithms). In that case only the internal arrays would use the pool, any other allocation
+  // uses the default RMM memory resource. Here is how to change the workspace memory resource to
+  // a pool with 2 GiB upper limit.
+  // raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * 1024 * 1024 * 1024ull);
+
+  // Create input arrays.
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 3;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  // Simple build and search example.
+  ivf_flat_build_search_simple(dev_resources,
+                               raft::make_const_mdspan(dataset.view()),
+                               raft::make_const_mdspan(queries.view()));
+
+  // Build and extend example.
+  ivf_flat_build_extend_search(dev_resources,
+                               raft::make_const_mdspan(dataset.view()),
+                               raft::make_const_mdspan(queries.view()));
+}
diff --git a/cpp/template/src/test_vector_search.cu b/cpp/template/src/test_vector_search.cu
deleted file mode 100644
index f54cfc03e7..0000000000
--- a/cpp/template/src/test_vector_search.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/neighbors/cagra.cuh>
-#include <raft/random/make_blobs.cuh>
-
-int main()
-{
-  using namespace raft::neighbors;
-  raft::device_resources dev_resources;
-  // Use 5 GB of pool memory
-  raft::resource::set_workspace_to_pool_resource(
-    dev_resources, std::make_optional<std::size_t>(5 * 1024 * 1024 * 1024ull));
-
-  int64_t n_samples = 50000;
-  int64_t n_dim     = 90;
-  int64_t topk      = 12;
-  int64_t n_queries = 1;
-
-  // create input and output arrays
-  auto input     = raft::make_device_matrix<float>(dev_resources, n_samples, n_dim);
-  auto labels    = raft::make_device_vector<int64_t>(dev_resources, n_samples);
-  auto queries   = raft::make_device_matrix<float>(dev_resources, n_queries, n_dim);
-  auto neighbors = raft::make_device_matrix<int64_t>(dev_resources, n_queries, topk);
-  auto distances = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
-
-  raft::random::make_blobs(dev_resources, input.view(), labels.view());
-
-  // use default index parameters
-  cagra::index_params index_params;
-  // create and fill the index from a [n_samples, n_dim] input
-  auto index = cagra::build<float, int64_t>(
-    dev_resources, index_params, raft::make_const_mdspan(input.view()));
-  // use default search parameters
-  cagra::search_params search_params;
-  // search K nearest neighbors
-  cagra::search<float, int64_t>(dev_resources,
-                                search_params,
-                                index,
-                                raft::make_const_mdspan(queries.view()),
-                                neighbors.view(),
-                                distances.view());
-}
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index db4c59c807..0651ccac86 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -105,6 +105,7 @@ if(BUILD_TESTS)
     NAME
     CORE_TEST
     PATH
+    test/core/bitset.cu
     test/core/device_resources_manager.cpp
     test/core/device_setter.cpp
     test/core/logger.cpp
@@ -379,6 +380,21 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_NN_DESCENT_TEST
+    PATH
+    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
   ConfigureTest(
     NAME NEIGHBORS_SELECTION_TEST PATH test/neighbors/selection.cu LIB EXPLICIT_INSTANTIATE_ONLY
     GPUS 1 PERCENT 50
diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu
new file mode 100644
index 0000000000..215de98aaf
--- /dev/null
+++ b/cpp/test/core/bitset.cu
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/bitset.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/random/rng.cuh>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <numeric>
+
+namespace raft::core {
+
+struct test_spec_bitset {
+  uint64_t bitset_len;
+  uint64_t mask_len;
+  uint64_t query_len;
+};
+
+auto operator<<(std::ostream& os, const test_spec_bitset& ss) -> std::ostream&
+{
+  os << "bitset{bitset_len: " << ss.bitset_len << ", mask_len: " << ss.mask_len
+     << ", query_len: " << ss.query_len << "}";
+  return os;
+}
+
+template <typename bitset_t, typename index_t>
+void add_cpu_bitset(std::vector<bitset_t>& bitset, const std::vector<index_t>& mask_idx)
+{
+  static size_t constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  for (size_t i = 0; i < mask_idx.size(); i++) {
+    auto idx = mask_idx[i];
+    bitset[idx / bitset_element_size] &= ~(bitset_t{1} << (idx % bitset_element_size));
+  }
+}
+
+template <typename bitset_t, typename index_t>
+void create_cpu_bitset(std::vector<bitset_t>& bitset, const std::vector<index_t>& mask_idx)
+{
+  for (size_t i = 0; i < bitset.size(); i++) {
+    bitset[i] = ~bitset_t(0x00);
+  }
+  add_cpu_bitset(bitset, mask_idx);
+}
+
+template <typename bitset_t, typename index_t>
+void test_cpu_bitset(const std::vector<bitset_t>& bitset,
+                     const std::vector<index_t>& queries,
+                     std::vector<uint8_t>& result)
+{
+  static size_t constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  for (size_t i = 0; i < queries.size(); i++) {
+    result[i] = uint8_t((bitset[queries[i] / bitset_element_size] &
+                         (bitset_t{1} << (queries[i] % bitset_element_size))) != 0);
+  }
+}
+
+template <typename bitset_t>
+void flip_cpu_bitset(std::vector<bitset_t>& bitset)
+{
+  for (size_t i = 0; i < bitset.size(); i++) {
+    bitset[i] = ~bitset[i];
+  }
+}
+
+template <typename bitset_t, typename index_t>
+class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
+ protected:
+  index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8;
+  const test_spec_bitset spec;
+  std::vector<bitset_t> bitset_result;
+  std::vector<bitset_t> bitset_ref;
+  raft::resources res;
+
+ public:
+  explicit BitsetTest()
+    : spec(testing::TestWithParam<test_spec_bitset>::GetParam()),
+      bitset_result(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))),
+      bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size)))
+  {
+  }
+
+  void run()
+  {
+    auto stream = resource::get_cuda_stream(res);
+
+    // generate input and mask
+    raft::random::RngState rng(42);
+    auto mask_device = raft::make_device_vector<index_t, index_t>(res, spec.mask_len);
+    std::vector<index_t> mask_cpu(spec.mask_len);
+    raft::random::uniformInt(res, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
+    resource::sync_stream(res, stream);
+
+    // calculate the results
+    auto my_bitset = raft::core::bitset<bitset_t, index_t>(
+      res, raft::make_const_mdspan(mask_device.view()), index_t(spec.bitset_len));
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+
+    // calculate the reference
+    create_cpu_bitset(bitset_ref, mask_cpu);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+
+    auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
+    auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
+    auto query_cpu     = std::vector<index_t>(spec.query_len);
+    auto result_cpu    = std::vector<uint8_t>(spec.query_len);
+    auto result_ref    = std::vector<uint8_t>(spec.query_len);
+
+    // Create queries and verify the test results
+    raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream);
+    my_bitset.test(res, raft::make_const_mdspan(query_device.view()), result_device.view());
+    update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+    test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare<uint8_t>()));
+
+    // Add more samples to the bitset and re-test
+    raft::random::uniformInt(res, rng, mask_device.view(), index_t(0), index_t(spec.bitset_len));
+    update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream);
+    resource::sync_stream(res, stream);
+    my_bitset.set(res, mask_device.view());
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+
+    add_cpu_bitset(bitset_ref, mask_cpu);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+
+    // Flip the bitset and re-test
+    my_bitset.flip(res);
+    update_host(bitset_result.data(), my_bitset.data_handle(), bitset_result.size(), stream);
+    flip_cpu_bitset(bitset_ref);
+    resource::sync_stream(res, stream);
+    ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
+  }
+};
+
+auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10},
+                                       test_spec_bitset{100, 30, 10},
+                                       test_spec_bitset{1024, 55, 100},
+                                       test_spec_bitset{10000, 1000, 1000},
+                                       test_spec_bitset{1 << 15, 1 << 3, 1 << 12},
+                                       test_spec_bitset{1 << 15, 1 << 24, 1 << 13},
+                                       test_spec_bitset{1 << 25, 1 << 23, 1 << 14});
+
+using Uint16_32 = BitsetTest<uint16_t, uint32_t>;
+TEST_P(Uint16_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint16_32, inputs_bitset);
+
+using Uint32_32 = BitsetTest<uint32_t, uint32_t>;
+TEST_P(Uint32_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint32_32, inputs_bitset);
+
+using Uint64_32 = BitsetTest<uint64_t, uint32_t>;
+TEST_P(Uint64_32, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint64_32, inputs_bitset);
+
+using Uint8_64 = BitsetTest<uint8_t, uint64_t>;
+TEST_P(Uint8_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint8_64, inputs_bitset);
+
+using Uint32_64 = BitsetTest<uint32_t, uint64_t>;
+TEST_P(Uint32_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint32_64, inputs_bitset);
+
+using Uint64_64 = BitsetTest<uint64_t, uint64_t>;
+TEST_P(Uint64_64, Run) { run(); }
+INSTANTIATE_TEST_CASE_P(BitsetTest, Uint64_64, inputs_bitset);
+
+}  // namespace raft::core
diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu
index 15c7b2b33a..8e3a9df01b 100644
--- a/cpp/test/core/math_device.cu
+++ b/cpp/test/core/math_device.cu
@@ -21,7 +21,9 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/device_scalar.hpp>
 
-#if _RAFT_HAS_CUDA
+#include <cuda/std/type_traits>
+
+#ifdef _RAFT_HAS_CUDA
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
@@ -35,7 +37,7 @@ __global__ void math_eval_kernel(OutT* out, OpT op, Args... args)
 template <typename OpT, typename... Args>
 auto math_eval(OpT op, Args&&... args)
 {
-  typedef decltype(op(args...)) OutT;
+  using OutT  = cuda::std::invoke_result_t<OpT, Args...>;
   auto stream = rmm::cuda_stream_default;
   rmm::device_scalar<OutT> result(stream);
   math_eval_kernel<<<1, 1, 0, stream>>>(result.data(), op, std::forward<Args>(args)...);
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index eadc88085f..e6c3873063 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Search with filter instantiation
+
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
@@ -25,8 +27,10 @@
 #include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/add.cuh>
 #include <raft/neighbors/cagra.cuh>
 #include <raft/neighbors/cagra_serialize.cuh>
+#include <raft/neighbors/sample_filter.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
 
@@ -41,8 +45,22 @@
 #include <string>
 #include <vector>
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 namespace {
+
+/* Test filter: accepts a sample only if its index is >= `offset` (rejects all below). */
+struct test_cagra_sample_filter {
+  static constexpr unsigned offset = 300;  // first sample index that passes the filter
+  inline _RAFT_HOST_DEVICE auto operator()(
+    // query index (unused: the decision does not depend on the query)
+    const uint32_t query_ix,
+    // index of the candidate sample; the tests pass dataset row indices here
+    const uint32_t sample_ix) const
+  {
+    return sample_ix >= offset;  // true = keep the sample
+  }
+};
+
 // For sort_knn_graph test
 template <typename IdxT>
 void RandomSuffle(raft::host_matrix_view<IdxT, int64_t> index)
@@ -130,6 +148,7 @@ struct AnnCagraInputs {
   int n_rows;
   int dim;
   int k;
+  graph_build_algo build_algo;
   search_algo algo;
   int max_queries;
   int team_size;
@@ -144,12 +163,13 @@ struct AnnCagraInputs {
 
 inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
 {
-  std::vector<std::string> algo = {"single-cta", "multi_cta", "multi_kernel", "auto"};
+  std::vector<std::string> algo       = {"single-cta", "multi_cta", "multi_kernel", "auto"};
+  std::vector<std::string> build_algo = {"IVF_PQ", "NN_DESCENT"};
   os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim
      << ", k=" << p.k << ", " << algo.at((int)p.algo) << ", max_queries=" << p.max_queries
      << ", itopk_size=" << p.itopk_size << ", search_width=" << p.search_width
-     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device") << '}'
-     << std::endl;
+     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device")
+     << ", build_algo=" << build_algo.at((int)p.build_algo) << '}' << std::endl;
   return os;
 }
 
@@ -199,6 +219,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
         cagra::index_params index_params;
         index_params.metric = ps.metric;  // Note: currently ony the cagra::index_params metric is
                                           // not used for knn_graph building.
+        index_params.build_algo = ps.build_algo;
         cagra::search_params search_params;
         search_params.algo        = ps.algo;
         search_params.max_queries = ps.max_queries;
@@ -323,11 +344,25 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
       auto knn_graph =
         raft::make_host_matrix<IdxT, int64_t>(ps.n_rows, index_params.intermediate_graph_degree);
 
-      if (ps.host_dataset) {
-        cagra::build_knn_graph<DataT, IdxT>(handle_, database_host_view, knn_graph.view());
+      if (ps.build_algo == graph_build_algo::IVF_PQ) {
+        if (ps.host_dataset) {
+          cagra::build_knn_graph<DataT, IdxT>(handle_, database_host_view, knn_graph.view());
+        } else {
+          cagra::build_knn_graph<DataT, IdxT>(handle_, database_view, knn_graph.view());
+        }
       } else {
-        cagra::build_knn_graph<DataT, IdxT>(handle_, database_view, knn_graph.view());
-      };
+        auto nn_descent_idx_params                      = experimental::nn_descent::index_params{};
+        nn_descent_idx_params.graph_degree              = index_params.intermediate_graph_degree;
+        nn_descent_idx_params.intermediate_graph_degree = index_params.intermediate_graph_degree;
+
+        if (ps.host_dataset) {  // dataset resident in host memory
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_host_view, knn_graph.view(), nn_descent_idx_params);
+        } else {  // dataset resident in device memory: must use the device view
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_view, knn_graph.view(), nn_descent_idx_params);
+        }
+      }
 
       handle_.sync_stream();
       ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view()));
@@ -365,6 +400,275 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
   rmm::device_uvector<DataT> database;
 };
 
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
+ public:
+  AnnCagraFilterTest()
+    : stream_(resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnCagraInputs>::GetParam()),
+      database(0, stream_),
+      search_queries(0, stream_)
+  {
+  }
+
+ protected:
+  void testCagraFilter()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indices_Cagra(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_Cagra(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      auto* database_filtered_ptr = database.data() + test_cagra_sample_filter::offset * ps.dim;
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        search_queries.data(),
+                                        database_filtered_ptr,
+                                        ps.n_queries,
+                                        ps.n_rows - test_cagra_sample_filter::offset,
+                                        ps.dim,
+                                        ps.k,
+                                        ps.metric);
+      raft::linalg::addScalar(indices_naive_dev.data(),
+                              indices_naive_dev.data(),
+                              IdxT(test_cagra_sample_filter::offset),
+                              queries_size,
+                              stream_);
+      update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      resource::sync_stream(handle_);
+    }
+
+    {
+      rmm::device_uvector<DistanceT> distances_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+
+      {
+        cagra::index_params index_params;
+        index_params.metric     = ps.metric;      // Note: not yet used for knn_graph building.
+        index_params.build_algo = ps.build_algo;  // Build with the parameterized graph algo.
+        cagra::search_params search_params;
+        search_params.algo         = ps.algo;
+        search_params.max_queries  = ps.max_queries;
+        search_params.team_size    = ps.team_size;
+        search_params.hashmap_mode = cagra::hash_mode::HASH;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        cagra::index<DataT, IdxT> index(handle_);
+        if (ps.host_dataset) {
+          auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+          raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+          auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+            (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_host_view);
+        } else {
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_view);
+        }
+
+        if (!ps.include_serialized_dataset) { index.update_dataset(handle_, database_view); }
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          search_queries.data(), ps.n_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, int64_t>(indices_dev.data(), ps.n_queries, ps.k);
+        auto dists_out_view = raft::make_device_matrix_view<DistanceT, int64_t>(
+          distances_dev.data(), ps.n_queries, ps.k);
+
+        cagra::search_with_filtering(handle_,
+                                     search_params,
+                                     index,
+                                     search_queries_view,
+                                     indices_out_view,
+                                     dists_out_view,
+                                     test_cagra_sample_filter());
+        update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_);
+        update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
+        resource::sync_stream(handle_);
+      }
+
+      // Test filter
+      bool unacceptable_node = false;
+      for (int q = 0; q < ps.n_queries; q++) {
+        for (int i = 0; i < ps.k; i++) {
+          const auto n      = indices_Cagra[q * ps.k + i];
+          unacceptable_node = unacceptable_node | !test_cagra_sample_filter()(q, n);
+        }
+      }
+      EXPECT_FALSE(unacceptable_node);
+
+      double min_recall = ps.min_recall;
+      EXPECT_TRUE(eval_neighbours(indices_naive,
+                                  indices_Cagra,
+                                  distances_naive,
+                                  distances_Cagra,
+                                  ps.n_queries,
+                                  ps.k,
+                                  0.001,
+                                  min_recall));
+      EXPECT_TRUE(eval_distances(handle_,
+                                 database.data(),
+                                 search_queries.data(),
+                                 indices_dev.data(),
+                                 distances_dev.data(),
+                                 ps.n_rows,
+                                 ps.dim,
+                                 ps.n_queries,
+                                 ps.k,
+                                 ps.metric,
+                                 1.0e-4));
+    }
+  }
+
+  void testCagraRemoved()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indices_Cagra(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_Cagra(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      auto* database_filtered_ptr = database.data() + test_cagra_sample_filter::offset * ps.dim;
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        search_queries.data(),
+                                        database_filtered_ptr,
+                                        ps.n_queries,
+                                        ps.n_rows - test_cagra_sample_filter::offset,
+                                        ps.dim,
+                                        ps.k,
+                                        ps.metric);
+      raft::linalg::addScalar(indices_naive_dev.data(),
+                              indices_naive_dev.data(),
+                              IdxT(test_cagra_sample_filter::offset),
+                              queries_size,
+                              stream_);
+      update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      resource::sync_stream(handle_);
+    }
+
+    {
+      rmm::device_uvector<DistanceT> distances_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+
+      {
+        cagra::index_params index_params;
+        index_params.metric     = ps.metric;      // Note: not yet used for knn_graph building.
+        index_params.build_algo = ps.build_algo;  // Build with the parameterized graph algo.
+        cagra::search_params search_params;
+        search_params.algo         = ps.algo;
+        search_params.max_queries  = ps.max_queries;
+        search_params.team_size    = ps.team_size;
+        search_params.hashmap_mode = cagra::hash_mode::HASH;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        cagra::index<DataT, IdxT> index(handle_);
+        if (ps.host_dataset) {
+          auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+          raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+          auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+            (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_host_view);
+        } else {
+          index = cagra::build<DataT, IdxT>(handle_, index_params, database_view);
+        }
+
+        if (!ps.include_serialized_dataset) { index.update_dataset(handle_, database_view); }
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          search_queries.data(), ps.n_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, int64_t>(indices_dev.data(), ps.n_queries, ps.k);
+        auto dists_out_view = raft::make_device_matrix_view<DistanceT, int64_t>(
+          distances_dev.data(), ps.n_queries, ps.k);
+        auto removed_indices =
+          raft::make_device_vector<IdxT, int64_t>(handle_, test_cagra_sample_filter::offset);
+        thrust::sequence(
+          resource::get_thrust_policy(handle_),
+          thrust::device_pointer_cast(removed_indices.data_handle()),
+          thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0)));
+        resource::sync_stream(handle_);
+        raft::core::bitset<std::uint32_t, IdxT> removed_indices_bitset(
+          handle_, removed_indices.view(), ps.n_rows);
+        cagra::search_with_filtering(
+          handle_,
+          search_params,
+          index,
+          search_queries_view,
+          indices_out_view,
+          dists_out_view,
+          raft::neighbors::filtering::bitset_filter(removed_indices_bitset.view()));
+        update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_);
+        update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
+        resource::sync_stream(handle_);
+      }
+
+      double min_recall = ps.min_recall;
+      EXPECT_TRUE(eval_neighbours(indices_naive,
+                                  indices_Cagra,
+                                  distances_naive,
+                                  distances_Cagra,
+                                  ps.n_queries,
+                                  ps.k,
+                                  0.001,
+                                  min_recall));
+      EXPECT_TRUE(eval_distances(handle_,
+                                 database.data(),
+                                 search_queries.data(),
+                                 indices_dev.data(),
+                                 distances_dev.data(),
+                                 ps.n_rows,
+                                 ps.dim,
+                                 ps.n_queries,
+                                 ps.k,
+                                 ps.metric,
+                                 1.0e-4));
+    }
+  }
+
+  void SetUp() override
+  {
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    search_queries.resize(ps.n_queries * ps.dim, stream_);
+    raft::random::Rng r(1234ULL);
+    if constexpr (std::is_same<DataT, float>{}) {
+      r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_);
+      r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_);
+    } else {
+      r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_);
+    }
+    resource::sync_stream(handle_);
+  }
+
+  void TearDown() override
+  {
+    resource::sync_stream(handle_);
+    database.resize(0, stream_);
+    search_queries.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;
+  rmm::cuda_stream_view stream_;
+  AnnCagraInputs ps;
+  rmm::device_uvector<DataT> database;
+  rmm::device_uvector<DataT> search_queries;
+};
+
 inline std::vector<AnnCagraInputs> generate_inputs()
 {
   // TODO(tfeher): test MULTI_CTA kernel with search_width > 1 to allow multiple CTA per queries
@@ -373,6 +677,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {1000},
     {1, 8, 17},
     {1, 16},  // k
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL},
     {0, 1, 10, 100},  // query size
     {0},
@@ -388,6 +693,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {1000},
     {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
     {16},                                                         // k
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::AUTO},
     {10},
     {0},
@@ -398,68 +704,55 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {true},
     {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {1000},
-                                                   {64},
-                                                   {16},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0, 4, 8, 16, 32},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false},
-                                                   {false},
-                                                   {0.995});
-  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
-
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {1000},
-                                                   {64},
-                                                   {16},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {32, 64, 128, 256, 512, 768},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false},
-                                                   {true},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {1000},
+    {64},
+    {16},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0, 4, 8, 16, 32},  // team_size
+    {64},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false},
+    {false},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {10000, 20000},
-                                                   {32},
-                                                   {10},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false, true},
-                                                   {false},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {1000},
+    {64},
+    {16},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0},  // team_size
+    {32, 64, 128, 256, 512, 768},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false},
+    {true},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {10000, 20000},
-                                                   {32},
-                                                   {10},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},  // team_size
-                                                   {64},
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false, true},
-                                                   {true},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {10000, 20000},
+    {32},
+    {10},
+    {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0},  // team_size
+    {64},
+    {1},
+    {raft::distance::DistanceType::L2Expanded},
+    {false, true},
+    {false},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   return inputs;
@@ -467,4 +760,4 @@ inline std::vector<AnnCagraInputs> generate_inputs()
 
 const std::vector<AnnCagraInputs> inputs = generate_inputs();
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
index f61e476652..175e4ef483 100644
--- a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
+++ b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
@@ -1,93 +1,107 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-namespace raft::neighbors::cagra::detail {
-
-namespace multi_cta_search {
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)   \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
-    cudaStream_t stream);
-
-instantiate_kernel_selection(32, 1024, float, uint64_t, float);
-instantiate_kernel_selection(8, 128, float, uint64_t, float);
-instantiate_kernel_selection(16, 256, float, uint64_t, float);
-instantiate_kernel_selection(32, 512, float, uint64_t, float);
-
-#undef instantiate_kernel_selection
-}  // namespace multi_cta_search
-
-namespace single_cta_search {
-
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)                                      \
-  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \
-    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                     \
-    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                          \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_random_samplings,                                                              \
-    uint64_t rand_xor_mask,                                                                     \
-    uint32_t num_seeds,                                                                         \
-    size_t itopk_size,                                                                          \
-    size_t search_width,                                                                        \
-    size_t min_iterations,                                                                      \
-    size_t max_iterations,                                                                      \
-    cudaStream_t stream);
-
-instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float);
-instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float);
-instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float);
-instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float);
-
-}  // namespace single_cta_search
-}  // namespace raft::neighbors::cagra::detail
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/neighbors/sample_filter_types.hpp>  // none_cagra_sample_filter
+#include <raft/util/raft_explicit.hpp>             // RAFT_EXPLICIT
+
+namespace raft::neighbors::cagra::detail {
+
+namespace multi_cta_search {
+#define instantiate_kernel_selection(                                                       \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t result_buffer_size,                                                            \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    uint32_t num_cta_per_query,                                                             \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
+    cudaStream_t stream);
+
+instantiate_kernel_selection(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+
+#undef instantiate_kernel_selection
+}  // namespace multi_cta_search
+
+namespace single_cta_search {
+// extern-template declarations of select_and_run for the uint64_t index type (no sample filtering).
+#define instantiate_single_cta_select_and_run(                                              \
+  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                 \
+  extern template void                                                                      \
+  select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T>( \
+    raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,                 \
+    raft::device_matrix_view<const INDEX_T, int64_t, row_major> graph,                      \
+    INDEX_T* const topk_indices_ptr,                                                        \
+    DISTANCE_T* const topk_distances_ptr,                                                   \
+    const DATA_T* const queries_ptr,                                                        \
+    const uint32_t num_queries,                                                             \
+    const INDEX_T* dev_seed_ptr,                                                            \
+    uint32_t* const num_executed_iterations,                                                \
+    uint32_t topk,                                                                          \
+    uint32_t num_itopk_candidates,                                                          \
+    uint32_t block_size,                                                                    \
+    uint32_t smem_size,                                                                     \
+    int64_t hash_bitlen,                                                                    \
+    INDEX_T* hashmap_ptr,                                                                   \
+    size_t small_hash_bitlen,                                                               \
+    size_t small_hash_reset_interval,                                                       \
+    uint32_t num_random_samplings,                                                          \
+    uint64_t rand_xor_mask,                                                                 \
+    uint32_t num_seeds,                                                                     \
+    size_t itopk_size,                                                                      \
+    size_t search_width,                                                                    \
+    size_t min_iterations,                                                                  \
+    size_t max_iterations,                                                                  \
+    SAMPLE_FILTER_T sample_filter,                                                          \
+    cudaStream_t stream);
+
+instantiate_single_cta_select_and_run(
+  32, 1024, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  8, 128, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  16, 256, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+instantiate_single_cta_select_and_run(
+  32, 512, float, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter);
+#undef instantiate_single_cta_select_and_run
+}  // namespace single_cta_search
+}  // namespace raft::neighbors::cagra::detail
\ No newline at end of file
diff --git a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
index fa3d76d066..6f9e8dbd43 100644
--- a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu
@@ -19,11 +19,11 @@
 #include "../ann_cagra.cuh"
 #include "search_kernel_uint64_t.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, float, std::int64_t> AnnCagraTestF_I64;
 TEST_P(AnnCagraTestF_I64, AnnCagra) { this->testCagra(); }
 
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF_I64, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
index dbaf4dedd9..944c2cbc89 100644
--- a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
@@ -18,7 +18,7 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, float, std::uint32_t> AnnCagraTestF_U32;
 TEST_P(AnnCagraTestF_U32, AnnCagra) { this->testCagra(); }
@@ -26,7 +26,15 @@ TEST_P(AnnCagraTestF_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, float, std::uint32_t> AnnCagraSortTestF_U32;
 TEST_P(AnnCagraSortTestF_U32, AnnCagraSort) { this->testCagraSort(); }
 
+typedef AnnCagraFilterTest<float, float, std::uint32_t> AnnCagraFilterTestF_U32;
+TEST_P(AnnCagraFilterTestF_U32, AnnCagraFilter)
+{
+  this->testCagraFilter();
+  this->testCagraRemoved();
+}
+
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestF_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestF_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
index ba60131677..3d9dc76953 100644
--- a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
@@ -18,14 +18,21 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, std::int8_t, std::uint32_t> AnnCagraTestI8_U32;
 TEST_P(AnnCagraTestI8_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, std::int8_t, std::uint32_t> AnnCagraSortTestI8_U32;
 TEST_P(AnnCagraSortTestI8_U32, AnnCagraSort) { this->testCagraSort(); }
+typedef AnnCagraFilterTest<float, std::int8_t, std::uint32_t> AnnCagraFilterTestI8_U32;
+TEST_P(AnnCagraFilterTestI8_U32, AnnCagraFilter)
+{
+  this->testCagraFilter();
+  this->testCagraRemoved();
+}
 
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestI8_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestI8_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestI8_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
index cc172e4833..c5b1b1704b 100644
--- a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
@@ -18,7 +18,7 @@
 
 #include "../ann_cagra.cuh"
 
-namespace raft::neighbors::experimental::cagra {
+namespace raft::neighbors::cagra {
 
 typedef AnnCagraTest<float, std::uint8_t, std::uint32_t> AnnCagraTestU8_U32;
 TEST_P(AnnCagraTestU8_U32, AnnCagra) { this->testCagra(); }
@@ -26,7 +26,15 @@ TEST_P(AnnCagraTestU8_U32, AnnCagra) { this->testCagra(); }
 typedef AnnCagraSortTest<float, std::uint8_t, std::uint32_t> AnnCagraSortTestU8_U32;
 TEST_P(AnnCagraSortTestU8_U32, AnnCagraSort) { this->testCagraSort(); }
 
+typedef AnnCagraFilterTest<float, std::uint8_t, std::uint32_t> AnnCagraFilterTestU8_U32;
+TEST_P(AnnCagraFilterTestU8_U32, AnnCagraFilter)
+{
+  this->testCagraFilter();
+  this->testCagraRemoved();
+}
+
 INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestU8_U32, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestU8_U32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestU8_U32, ::testing::ValuesIn(inputs));
 
-}  // namespace raft::neighbors::experimental::cagra
+}  // namespace raft::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
new file mode 100644
index 0000000000..948323cf6e
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../test_utils.cuh"
+#include "ann_utils.cuh"
+
+#include <raft_internal/neighbors/naive_knn.cuh>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/neighbors/nn_descent.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace raft::neighbors::experimental::nn_descent {
+
+struct AnnNNDescentInputs {
+  int n_rows;                           // number of rows in the generated dataset
+  int dim;                              // number of columns per dataset row
+  int graph_degree;                     // degree of the built graph; also k for the naive kNN reference
+  raft::distance::DistanceType metric;  // metric given to both NN-descent and the naive kNN
+  bool host_dataset;                    // true: build from a host copy; false: build from the device view
+  double min_recall;                    // minimum recall forwarded to eval_recall
+};
+
+inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p)
+{
+  os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree
+     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device")
+     << std::endl;
+  return os;
+}
+
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
+ public:
+  AnnNNDescentTest()
+    : stream_(resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnNNDescentInputs>::GetParam()),
+      database(0, stream_)
+  {
+  }
+  // Fixture: builds an NN-descent graph and checks its recall against a naive_knn reference.
+ protected:
+  void testNNDescent()
+  {
+    size_t queries_size = static_cast<size_t>(ps.n_rows) * ps.graph_degree;  // widen before multiply
+    std::vector<IdxT> indices_NNDescent(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    // Reference neighbours: naive_knn of the dataset against itself with k = graph_degree.
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        database.data(),
+                                        database.data(),
+                                        ps.n_rows,
+                                        ps.n_rows,
+                                        ps.dim,
+                                        ps.graph_degree,
+                                        ps.metric);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      resource::sync_stream(handle_);
+    }
+    // Graph under test: nn_descent::build on either the device view or a host copy of the data.
+    {
+      {
+        nn_descent::index_params index_params;
+        index_params.metric                    = ps.metric;
+        index_params.graph_degree              = ps.graph_degree;
+        index_params.intermediate_graph_degree = 2 * ps.graph_degree;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        {
+          if (ps.host_dataset) {
+            auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+            raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+            auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+              (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+            auto index = nn_descent::build<DataT, IdxT>(handle_, index_params, database_host_view);
+            update_host(
+              indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+          } else {
+            auto index = nn_descent::build<DataT, IdxT>(handle_, index_params, database_view);
+            update_host(
+              indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+          }
+        }
+        resource::sync_stream(handle_);
+      }
+      // Compare the NN-descent neighbour lists with the reference, row by row.
+      double min_recall = ps.min_recall;
+      EXPECT_TRUE(eval_recall(
+        indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall));
+    }
+  }
+
+  void SetUp() override
+  {
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    raft::random::Rng r(1234ULL);  // fixed seed
+    if constexpr (std::is_same<DataT, float>{}) {
+      r.normal(database.data(), database.size(), DataT(0.1), DataT(2.0), stream_);
+    } else {
+      r.uniformInt(database.data(), database.size(), DataT(1), DataT(20), stream_);
+    }
+    resource::sync_stream(handle_);
+  }
+
+  void TearDown() override
+  {
+    resource::sync_stream(handle_);
+    database.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;
+  rmm::cuda_stream_view stream_;        // stream obtained from handle_ in the constructor
+  AnnNNDescentInputs ps;                // parameters of the current test instance
+  rmm::device_uvector<DataT> database;  // n_rows * dim device buffer, filled in SetUp
+};
+
+const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<AnnNNDescentInputs>(
+  {1000, 2000},                                              // n_rows
+  {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
+  {32, 64},                                                  // graph_degree
+  {raft::distance::DistanceType::L2Expanded},                // metric
+  {false, true},                                             // host_dataset
+  {0.92});                                                   // min_recall
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
new file mode 100644
index 0000000000..13bff6ac90
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+typedef AnnNNDescentTest<float, float, std::uint32_t> AnnNNDescentTestF_U32;
+TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); }
+
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
new file mode 100644
index 0000000000..5895303e09
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+typedef AnnNNDescentTest<float, int8_t, std::uint32_t> AnnNNDescentTestI8_U32;
+TEST_P(AnnNNDescentTestI8_U32, AnnNNDescent) { this->testNNDescent(); }
+
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestI8_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
new file mode 100644
index 0000000000..a034e84074
--- /dev/null
+++ b/cpp/test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_nn_descent.cuh"
+
+namespace raft::neighbors::experimental::nn_descent {
+
+typedef AnnNNDescentTest<float, uint8_t, std::uint32_t> AnnNNDescentTestUI8_U32;
+TEST_P(AnnNNDescentTestUI8_U32, AnnNNDescent) { this->testNNDescent(); }
+
+INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestUI8_U32, ::testing::ValuesIn(inputs));
+
+}  // namespace raft::neighbors::experimental::nn_descent
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 0e54e29c01..be60ec5b6d 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -123,6 +123,49 @@ struct idx_dist_pair {
   idx_dist_pair(IdxT x, DistT y, CompareDist op) : idx(x), dist(y), eq_compare(op) {}
 };
 
+template <typename T>
+auto eval_recall(const std::vector<T>& expected_idx,
+                 const std::vector<T>& actual_idx,
+                 size_t rows,
+                 size_t cols,
+                 double eps,
+                 double min_recall) -> testing::AssertionResult
+{
+  // Recall is the share of actual entries that also occur somewhere in the same row of the
+  // expected indices; both vectors are read as row-major rows x cols arrays.
+  const size_t total_count = rows * cols;
+  size_t match_count       = 0;
+  for (size_t row = 0; row < rows; ++row) {
+    const size_t row_begin = row * cols;
+    for (size_t k = 0; k < cols; ++k) {
+      const auto act_idx = actual_idx[row_begin + k];
+      for (size_t j = 0; j < cols; ++j) {
+        if (act_idx == expected_idx[row_begin + j]) {
+          ++match_count;
+          break;  // count each actual entry at most once
+        }
+      }
+    }
+  }
+  const double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+  // Signed distance of the recall from the threshold, relative to max(1 - min_recall, eps).
+  const double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps);
+  RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).",
+                actual_recall,
+                match_count,
+                total_count,
+                std::abs(error_margin * 100.0),
+                error_margin < 0 ? "above" : "below",
+                eps);
+  // Fail only when the recall is lower than min_recall by more than eps.
+  if (!(actual_recall < min_recall - eps)) { return testing::AssertionSuccess(); }
+  return testing::AssertionFailure()
+         << "actual recall (" << actual_recall << ") is lower than the minimum expected recall ("
+         << min_recall << "); eps = " << eps << ". ";
+}
+
+/** same as eval_recall, but in case indices do not match,
+ * then check distances as well, and accept match if actual dist is equal to expected_dist */
 template <typename T, typename DistT>
 auto eval_neighbours(const std::vector<T>& expected_idx,
                      const std::vector<T>& actual_idx,
diff --git a/cpp/test/neighbors/tiled_knn.cu b/cpp/test/neighbors/tiled_knn.cu
index 2ab82b845e..ebde8e6d35 100644
--- a/cpp/test/neighbors/tiled_knn.cu
+++ b/cpp/test/neighbors/tiled_knn.cu
@@ -180,6 +180,36 @@ class TiledKNNTest : public ::testing::TestWithParam<TiledKNNInputs> {
                                                        float(0.001),
                                                        stream_,
                                                        true));
+
+    // Also test out the 'index' api - where we can use precomputed norms
+    if (params_.row_major) {
+      auto idx =
+        raft::neighbors::brute_force::build<T>(handle_,
+                                               raft::make_device_matrix_view<const T, int64_t>(
+                                                 database.data(), params_.num_db_vecs, params_.dim),
+                                               metric,
+                                               metric_arg);
+
+      raft::neighbors::brute_force::search<T, int>(
+        handle_,
+        idx,
+        raft::make_device_matrix_view<const T, int64_t>(
+          search_queries.data(), params_.num_queries, params_.dim),
+        raft::make_device_matrix_view<int, int64_t>(
+          raft_indices_.data(), params_.num_queries, params_.k),
+        raft::make_device_matrix_view<T, int64_t>(
+          raft_distances_.data(), params_.num_queries, params_.k));
+
+      ASSERT_TRUE(raft::spatial::knn::devArrMatchKnnPair(ref_indices_.data(),
+                                                         raft_indices_.data(),
+                                                         ref_distances_.data(),
+                                                         raft_distances_.data(),
+                                                         num_queries,
+                                                         k_,
+                                                         float(0.001),
+                                                         stream_,
+                                                         true));
+    }
   }
 
   void SetUp() override
diff --git a/dependencies.yaml b/dependencies.yaml
index 700a6db1bf..3ad51a6377 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,12 +10,15 @@ files:
       - build_pylibraft
       - cudatoolkit
       - develop
+      - checks
+      - build_wheels
       - test_libraft
       - docs
       - run_raft_dask
       - run_pylibraft
       - test_python_common
       - test_pylibraft
+      - cupy
   bench_ann:
     output: conda
     matrix:
@@ -38,6 +41,7 @@ files:
       - py_version
       - test_python_common
       - test_pylibraft
+      - cupy
   checks:
     output: none
     includes:
@@ -47,6 +51,7 @@ files:
     output: none
     includes:
       - test_pylibraft
+      - cupy
       - cudatoolkit
       - docs
       - py_version
@@ -75,6 +80,7 @@ files:
     includes:
       - test_python_common
       - test_pylibraft
+      - cupy
   py_build_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -145,11 +151,37 @@ dependencies:
             packages:
               - gcc_linux-aarch64=11.*
               - sysroot_linux-aarch64==2.17
+      - output_types: conda
+        matrices:
+          - matrix: {cuda: "12.0"}
+            packages: [cuda-version=12.0, cuda-nvcc]
+          - matrix: {cuda: "11.8", arch: x86_64}
+            packages: [nvcc_linux-64=11.8]
+          - matrix: {cuda: "11.8", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.8]
+          - matrix: {cuda: "11.5", arch: x86_64}
+            packages: [nvcc_linux-64=11.5]
+          - matrix: {cuda: "11.5", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.5]
+          - matrix: {cuda: "11.4", arch: x86_64}
+            packages: [nvcc_linux-64=11.4]
+          - matrix: {cuda: "11.4", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.4]
+          - matrix: {cuda: "11.2", arch: x86_64}
+            packages: [nvcc_linux-64=11.2]
+          - matrix: {cuda: "11.2", arch: aarch64}
+            packages: [nvcc_linux-aarch64=11.2]
+
   build_pylibraft:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda]
         packages:
-          - &rmm rmm==23.10.*
+          - &rmm_conda rmm==23.10.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -160,6 +192,20 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &build_pylibraft_packages_cu12
+              - &rmm_cu12 rmm-cu12==23.10.*
+          - {matrix: {cuda: "12.1"}, packages: *build_pylibraft_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *build_pylibraft_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &build_pylibraft_packages_cu11
+              - &rmm_cu11 rmm-cu11==23.10.*
+          - {matrix: {cuda: "11.5"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *build_pylibraft_packages_cu11}
+          - {matrix: null, packages: [*rmm_conda] }
   checks:
     common:
       - output_types: [conda, requirements]
@@ -167,11 +213,9 @@ dependencies:
           - pre-commit
   develop:
     common:
-      - output_types: [conda, requirements]
-        packages:
-          - clang=16.0.6
-      - output_types: [conda]
+      - output_types: conda
         packages:
+          - clang==16.0.6
           - clang-tools=16.0.6
   nn_bench:
     common:
@@ -265,6 +309,45 @@ dependencies:
               - *libcusolver114
               - *libcusparse_dev114
               - *libcusparse114
+
+  cupy:
+    common:
+      - output_types: conda
+        packages:
+          - cupy>=12.0.0
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          # All CUDA 12 + x86_64 versions
+          - matrix: {cuda: "12.2", arch: x86_64}
+            packages: &cupy_packages_cu12_x86_64
+              - &cupy_cu12_x86_64 cupy-cuda12x>=12.0.0
+          - {matrix: {cuda: "12.1", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
+          - {matrix: {cuda: "12.0", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
+          # All CUDA 12 + aarch64 versions
+          - matrix: {cuda: "12.2", arch: aarch64}
+            packages: &cupy_packages_cu12_aarch64
+              - &cupy_cu12_aarch64 cupy-cuda12x>=12.0.0 -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
+          - {matrix: {cuda: "12.1", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
+          - {matrix: {cuda: "12.0", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
+
+          # All CUDA 11 + x86_64 versions
+          - matrix: {cuda: "11.8", arch: x86_64}
+            packages: &cupy_packages_cu11_x86_64
+              - cupy-cuda11x>=12.0.0
+          - {matrix: {cuda: "11.5", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+          - {matrix: {cuda: "11.4", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+          - {matrix: {cuda: "11.2", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
+
+          # All CUDA 11 + aarch64 versions
+          - matrix: {cuda: "11.8", arch: aarch64}
+            packages: &cupy_packages_cu11_aarch64
+              - cupy-cuda11x>=12.0.0 -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
+          - {matrix: {cuda: "11.5", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: {cuda: "11.4", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: {cuda: "11.2", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
+          - {matrix: null, packages: [cupy-cuda11x>=12.0.0]}
+
   test_libraft:
     common:
       - output_types: [conda]
@@ -287,7 +370,7 @@ dependencies:
           - sphinx-markdown-tables
   build_wheels:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           - wheel
           - setuptools
@@ -311,7 +394,14 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - &numpy numpy>=1.21
-          - *rmm
+      - output_types: [conda]
+        packages:
+          - *rmm_conda
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -322,25 +412,61 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - *cuda_python11
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &run_pylibraft_packages_cu12
+              - *rmm_cu12
+          - {matrix: {cuda: "12.1"}, packages: *run_pylibraft_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *run_pylibraft_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &run_pylibraft_packages_cu11
+              - *rmm_cu11
+          - {matrix: {cuda: "11.5"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *run_pylibraft_packages_cu11}
+          - {matrix: null, packages: [*rmm_conda]}
   run_raft_dask:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask>=2023.7.1
+          - dask==2023.9.2
           - dask-cuda==23.10.*
-          - distributed>=2023.7.1
+          - distributed==2023.9.2
           - joblib>=0.11
           - numba>=0.57
           - *numpy
-          - ucx-py==0.34.*
       - output_types: conda
         packages:
-          - dask-core>=2023.7.1
+          - dask-core==2023.9.2
           - ucx>=1.13.0
           - ucx-proc=*=gpu
+          - &ucx_py_conda ucx-py==0.34.*
       - output_types: pyproject
         packages:
-          - pylibraft==23.10.*
+          - &pylibraft_conda pylibraft==23.10.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for cudf and rmm.
+          - --extra-index-url=https://pypi.nvidia.com
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.2"}
+            packages: &run_raft_dask_packages_cu12
+              - &pylibraft_cu12 pylibraft-cu12==23.10.*
+              - &ucx_py_cu12 ucx-py-cu12==0.34.*
+          - {matrix: {cuda: "12.1"}, packages: *run_raft_dask_packages_cu12}
+          - {matrix: {cuda: "12.0"}, packages: *run_raft_dask_packages_cu12}
+          - matrix: {cuda: "11.8"}
+            packages: &run_raft_dask_packages_cu11
+              - &pylibraft_cu11 pylibraft-cu11==23.10.*
+              - &ucx_py_cu11 ucx-py-cu11==0.34.*
+          - {matrix: {cuda: "11.5"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *run_raft_dask_packages_cu11}
+          - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]}
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -353,9 +479,3 @@ dependencies:
         packages:
           - scikit-learn
           - scipy
-      - output_types: conda
-        packages:
-          - cupy>=12.0.0
-      - output_types: pyproject
-        packages:
-          - cupy-cuda11x>=12.0.0
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index dd6090c5e2..433df2ae2f 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -48,7 +48,8 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g
 |-----------------------------|----------------|----------|----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `graph_degree`              | `build_param`  | N        | Positive Integer >0        | 64 | Degree of the final kNN graph index. |
 | `intermediate_graph_degree` | `build_param`  | N        | Positive Integer >0        | 128 | Degree of the intermediate kNN graph. |
-| `dataset_memory_type`       | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside?                                                                                                                                       |
+| `graph_build_algo`          | `build_param`  | N | ["IVF_PQ", "NN_DESCENT"]   | "IVF_PQ" | Algorithm to use for building the kNN graph |
+| `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside?                                                                                                                                       |
 | `query_memory_type`         | `search_params` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? |
 | `itopk`                     | `search_wdith`  | N        | Positive Integer >0        | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |
 | `search_width`              | `search_param`  | N        | Positive Integer >0        | 1 | Number of graph nodes to select as the starting point for the search in each iteration. |
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 0e82d81e35..e60ef4e697 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -18,4 +18,5 @@ C++ API
    cpp_api/random.rst
    cpp_api/solver.rst
    cpp_api/sparse.rst
-   cpp_api/stats.rst
\ No newline at end of file
+   cpp_api/stats.rst
+   cpp_api/utils.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index 7e69f92948..39e57fd69a 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -20,4 +20,5 @@ expose in public APIs.
    core_nvtx.rst
    core_interruptible.rst
    core_operators.rst
-   core_math.rst
\ No newline at end of file
+   core_math.rst
+   core_bitset.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/core_bitset.rst b/docs/source/cpp_api/core_bitset.rst
new file mode 100644
index 0000000000..af1cff6d37
--- /dev/null
+++ b/docs/source/cpp_api/core_bitset.rst
@@ -0,0 +1,15 @@
+Bitset
+======
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <raft/core/bitset.cuh>``
+
+namespace *raft::core*
+
+.. doxygengroup:: bitset
+    :project: RAFT
+    :members:
+    :content-only:
\ No newline at end of file
diff --git a/docs/source/cpp_api/utils.rst b/docs/source/cpp_api/utils.rst
new file mode 100644
index 0000000000..4471093c8b
--- /dev/null
+++ b/docs/source/cpp_api/utils.rst
@@ -0,0 +1,21 @@
+Utilities
+=========
+
+RAFT contains numerous utility functions and primitives that are easily usable.
+This page provides C++ API references for the publicly-exposed utility functions.
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Memory Pool
+-----------
+
+``#include <raft/utils/memory_pool.cuh>``
+
+namespace *raft*
+
+.. doxygengroup:: memory_pool
+    :project: RAFT
+    :members:
+    :content-only:
diff --git a/notebooks/ivf_flat_example.ipynb b/notebooks/ivf_flat_example.ipynb
new file mode 100644
index 0000000000..08b9d78169
--- /dev/null
+++ b/notebooks/ivf_flat_example.ipynb
@@ -0,0 +1,674 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4f49c5c4-1170-42a7-9d6a-b90acd00c3c3",
+   "metadata": {},
+   "source": [
+    "# RAFT IVF Flat Example Notebook"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bcfe810-f120-422c-b2bb-72cc43d0c4ca",
+   "metadata": {},
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "This notebook demonstrates how to run approximate nearest neighbor search using the RAFT IVF-Flat algorithm.\n",
+    "It builds and searches an index using a dataset from the ann-benchmarks million-scale datasets, saves/loads the index to disk, and explores important parameters for fine-tuning the search performance and accuracy of the index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "fe73ada7-7b7f-4005-9440-85428194311b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "from pylibraft.common import DeviceResources\n",
+    "from pylibraft.neighbors import ivf_flat\n",
+    "import matplotlib.pyplot as plt\n",
+    "import tempfile\n",
+    "from utils import BenchmarkTimer, calc_recall, load_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da9e8615-ea9f-4735-b70f-15ccab36c0d9",
+   "metadata": {},
+   "source": [
+    "For best performance it is recommended to use an RMM pooling allocator, to minimize the overheads of repeated CUDA allocations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5350e4d9-0993-406a-80af-29538b5677c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import rmm\n",
+    "from rmm.allocators.cupy import rmm_cupy_allocator\n",
+    "mr = rmm.mr.PoolMemoryResource(\n",
+    "     rmm.mr.CudaMemoryResource(),\n",
+    "     initial_pool_size=2**30\n",
+    ")\n",
+    "rmm.mr.set_current_device_resource(mr)\n",
+    "cp.cuda.set_allocator(rmm_cupy_allocator)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b0d935f2-ba24-44fc-bdfe-a769b7fcd8e6",
+   "metadata": {},
+   "source": [
+    "The following GPU is used for this notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a5daa4b4-96de-4e74-bfd6-505b13595f62",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Thu Sep 21 02:30:53 2023       \n",
+      "+---------------------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
+      "|-----------------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                                         |                      |               MIG M. |\n",
+      "|=========================================+======================+======================|\n",
+      "|   0  NVIDIA H100 PCIe               On  | 00000000:41:00.0 Off |                    0 |\n",
+      "| N/A   35C    P0              69W / 350W |   1487MiB / 81559MiB |      0%      Default |\n",
+      "|                                         |                      |             Disabled |\n",
+      "+-----------------------------------------+----------------------+----------------------+\n",
+      "                                                                                         \n",
+      "+---------------------------------------------------------------------------------------+\n",
+      "| Processes:                                                                            |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
+      "|        ID   ID                                                             Usage      |\n",
+      "|=======================================================================================|\n",
+      "|    0   N/A  N/A      3940      C   /opt/conda/envs/rapids/bin/python          1474MiB |\n",
+      "+---------------------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the GPU in use\n",
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88a654cc-6389-4526-a3e6-826de5606a09",
+   "metadata": {},
+   "source": [
+    "## Load dataset\n",
+    "\n",
+    "The ANN benchmarks website provides the datasets in HDF5 format.\n",
+    "\n",
+    "The list of prepared datasets can be found at https://github.com/erikbern/ann-benchmarks/#data-sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "5f529ad6-b0bd-495c-bf7c-43f10fb6aa14",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The index and data will be saved in /tmp/raft_example\n"
+     ]
+    }
+   ],
+   "source": [
+    "WORK_FOLDER = os.path.join(tempfile.gettempdir(), \"raft_example\")\n",
+    "f = load_dataset(\"http://ann-benchmarks.com/sift-128-euclidean.hdf5\", work_folder=WORK_FOLDER)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3d68a7db-bcf4-449c-96c3-1e8ab146c84d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded dataset of size (1000000, 128),  0.5 GiB; metric: 'euclidean'.\n",
+      "Number of test queries: 10000\n"
+     ]
+    }
+   ],
+   "source": [
+    "metric = f.attrs['distance']\n",
+    "\n",
+    "dataset = cp.array(f['train'])\n",
+    "queries = cp.array(f['test'])\n",
+    "gt_neighbors = cp.array(f['neighbors'])\n",
+    "gt_distances = cp.array(f['distances'])\n",
+    "\n",
+    "itemsize = dataset.dtype.itemsize \n",
+    "\n",
+    "print(f\"Loaded dataset of size {dataset.shape}, {dataset.size*itemsize/(1<<30):4.1f} GiB; metric: '{metric}'.\")\n",
+    "print(f\"Number of test queries: {queries.shape[0]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f463c50-d1d3-49be-bcfe-952602efa603",
+   "metadata": {},
+   "source": [
+    "## Build index\n",
+    "We set [IndexParams](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.IndexParams) and build the index. The index parameters will be discussed in more detail in later sections of this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "737f8841-93f9-4c8e-b2e1-787d4474ef94",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 120 ms, sys: 5.33 ms, total: 125 ms\n",
+      "Wall time: 124 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=1024,\n",
+    "        metric=\"euclidean\",\n",
+    "        kmeans_trainset_fraction=0.1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=True\n",
+    "    )\n",
+    "\n",
+    "index = ivf_flat.build(build_params, dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a16a0cf6-3b05-4afd-9bb8-54431e0d7439",
+   "metadata": {},
+   "source": [
+    "The index is built. We can print some basic information about the index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1aec7024-6e5d-4d2c-82e6-7b5734aec958",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(type=IVF-FLAT, metric=euclidean, size=1000000, dim=128, n_lists=1024, adaptive_centers=False)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(index)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df7d4958-56a3-48ea-bd64-3486fdb57fb7",
+   "metadata": {},
+   "source": [
+    "## Search neighbors"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89ba2eaa-4c85-4e1c-b07c-920394e55dce",
+   "metadata": {},
+   "source": [
+    "It is recommended to reuse [device resources](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/common/#pylibraft.common.DeviceResources) across multiple invocations of search, since constructing these can be time consuming. We will reuse the resources by passing the same handle to each RAFT API call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "46e0421b-9335-47a2-8451-a91f56c2f086",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "handle = DeviceResources()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6365229-18fd-468f-af30-e24b950cbd6e",
+   "metadata": {},
+   "source": [
+    "After setting [SearchParams](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.SearchParams) we search for `k=10` neighbors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "595454e1-7240-4b43-9a73-963d5670b00c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 171 ms, sys: 52.6 ms, total: 224 ms\n",
+      "Wall time: 236 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "n_queries=10000\n",
+    "# n_probes is the number of clusters we select in the first (coarse) search step. This is the only hyper parameter for search.\n",
+    "search_params = ivf_flat.SearchParams(n_probes=30)\n",
+    "\n",
+    "# Search 10 nearest neighbors.\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "# RAFT calls are asynchronous (when handle arg is provided), we need to sync before accessing the results.\n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43d20ca7-7b9e-4046-bb52-640a2744db75",
+   "metadata": {},
+   "source": [
+    "The returned arrays have shape [n_queries x 10] and store the distance values and the indices of the searched vectors. We check how accurate the search is. The accuracy of the search is quantified as `recall`, which is a value between 0 and 1 and tells us what fraction of the returned neighbors are actual k nearest neighbors. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "8cd9cd20-ca00-4a35-a0a0-86636521b31a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.97406"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cde5079c-9777-45a1-9545-cffbcc59988f",
+   "metadata": {},
+   "source": [
+    "## Save and load the index\n",
+    "You can serialize the index to file using [save](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.save), and [load](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#pylibraft.neighbors.ivf_flat.load) it later."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "bf94e45c-e7fb-4aa3-a611-ddaee7ac41ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_file = os.path.join(WORK_FOLDER, \"my_ivf_flat_index.bin\")\n",
+    "ivf_flat.save(index_file, index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "1622d9be-be41-4d25-be99-d348c5e54957",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = ivf_flat.load(index_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15d503e5-05e8-47ce-8501-e13fc512099c",
+   "metadata": {},
+   "source": [
+    "## Tune search parameters\n",
+    "Search has a single hyper parameter: `n_probes`, which describes how many neighboring clusters are searched (probed) for each query. Within a probed cluster, the distance is computed between all the vectors in the cluster and the query point, and the top-k neighbors are selected. Finally, the top-k neighbors are selected from all the neighbor candidates from the probed clusters.\n",
+    "\n",
+    "Let's see how search accuracy and latency change when we change the `n_probes` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "ace0c31f-af75-4352-a438-123a9a03612c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Benchmarking search with n_probes = 10\n",
+      "recall 0.86625\n",
+      "Average search time:   0.026 +/- 0.000259 s\n",
+      "Queries per second (QPS):   384968\n",
+      "\n",
+      "Benchmarking search with n_probes = 20\n",
+      "recall 0.94705\n",
+      "Average search time:   0.050 +/- 5.43e-05 s\n",
+      "Queries per second (QPS):   198880\n",
+      "\n",
+      "Benchmarking search with n_probes = 30\n",
+      "recall 0.97406\n",
+      "Average search time:   0.075 +/- 8.59e-05 s\n",
+      "Queries per second (QPS):   133954\n",
+      "\n",
+      "Benchmarking search with n_probes = 50\n",
+      "recall 0.99169\n",
+      "Average search time:   0.123 +/- 4.78e-05 s\n",
+      "Queries per second (QPS):    80997\n",
+      "\n",
+      "Benchmarking search with n_probes = 100\n",
+      "recall 0.99844\n",
+      "Average search time:   0.244 +/- 0.000249 s\n",
+      "Queries per second (QPS):    40934\n",
+      "\n",
+      "Benchmarking search with n_probes = 200\n",
+      "recall 0.99932\n",
+      "Average search time:   0.468 +/- 0.000367 s\n",
+      "Queries per second (QPS):    21382\n",
+      "\n",
+      "Benchmarking search with n_probes = 500\n",
+      "recall 0.99933\n",
+      "Average search time:   1.039 +/- 0.000209 s\n",
+      "Queries per second (QPS):     9625\n",
+      "\n",
+      "Benchmarking search with n_probes = 1024\n",
+      "recall 0.99935\n",
+      "Average search time:   0.701 +/- 0.00579 s\n",
+      "Queries per second (QPS):    14273\n"
+     ]
+    }
+   ],
+   "source": [
+    "n_probes = np.asarray([10, 20, 30, 50, 100, 200, 500, 1024]);\n",
+    "qps = np.zeros(n_probes.shape);\n",
+    "recall = np.zeros(n_probes.shape);\n",
+    "\n",
+    "for i in range(len(n_probes)):\n",
+    "    print(\"\\nBenchmarking search with n_probes =\", n_probes[i])\n",
+    "    timer = BenchmarkTimer(reps=1, warmup=1)\n",
+    "    for rep in timer.benchmark_runs():\n",
+    "        distances, neighbors = ivf_flat.search(\n",
+    "            ivf_flat.SearchParams(n_probes=n_probes[i]),\n",
+    "            index,\n",
+    "            cp.asarray(queries),\n",
+    "            k=10,\n",
+    "            handle=handle,\n",
+    "        )\n",
+    "        handle.sync()\n",
+    "    \n",
+    "    recall[i] = calc_recall(cp.asnumpy(neighbors), gt_neighbors)\n",
+    "    print(\"recall\", recall[i])\n",
+    "\n",
+    "    timings = np.asarray(timer.timings)\n",
+    "    avg_time = timings.mean()\n",
+    "    std_time = timings.std()\n",
+    "    qps[i] = queries.shape[0] / avg_time\n",
+    "    print(\"Average search time: {0:7.3f} +/- {1:7.3} s\".format(avg_time, std_time))\n",
+    "    print(\"Queries per second (QPS): {0:8.0f}\".format(qps[i]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20b2498c-7231-4211-990e-600d5c26a9a1",
+   "metadata": {},
+   "source": [
+    "The plots below illustrate how the accuracy (recall) and the throughput (queries per second) depend on the `n_probes` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1ac370f-91c8-4054-95c7-a749df5f16d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure(figsize=(12,3))\n",
+    "ax = fig.add_subplot(131)\n",
+    "ax.plot(n_probes, recall,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('n_probes')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('recall (@k=10)')\n",
+    "\n",
+    "ax = fig.add_subplot(132)\n",
+    "ax.plot(n_probes, qps,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('n_probes')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('queries per second');\n",
+    "\n",
+    "ax = fig.add_subplot(133)\n",
+    "ax.plot(recall, qps,'o-')\n",
+    "#ax.set_xticks(bench_k, bench_k)\n",
+    "ax.set_xlabel('recall')\n",
+    "ax.grid()\n",
+    "ax.set_ylabel('queries per second');\n",
+    "#ax.set_yscale('log')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81e7ad6a-bddc-45de-9cce-0fb913f91efe",
+   "metadata": {},
+   "source": [
+    "## Adjust build parameters\n",
+    "### n_lists\n",
+    "The number of clusters (or lists) is set by the `n_lists` parameter. Let's change it to 100 clusters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "addbfff3-7773-4290-9608-5489edf4886d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=100,\n",
+    "        metric=\"euclidean\",\n",
+    "        kmeans_trainset_fraction=1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=True\n",
+    "    )\n",
+    "\n",
+    "index = ivf_flat.build(build_params, dataset, handle=handle)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48db27f9-54c8-4dac-839b-af94ada8885f",
+   "metadata": {},
+   "source": [
+    "The ratio of `n_probes` / `n_lists` determines how large a fraction of the dataset is searched for each query. The right combination depends on the use case. Here we will search 10 of the clusters for each query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a0149ad-de38-4195-97a5-ce5d5d877036",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "n_queries=10000\n",
+    "\n",
+    "search_params = ivf_flat.SearchParams(n_probes=10)\n",
+    "\n",
+    "# Search 10 nearest neighbors.\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eedc3ec4-06af-42c5-8cdf-490a5c2bc49a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c44800f-1e9e-4f7b-87fe-0f25e6590faa",
+   "metadata": {},
+   "source": [
+    "### kmeans_trainset_fraction\n",
+    "During clustering we can sub-sample the dataset. The parameter `kmeans_trainset_fraction` determines what fraction to use. Often we get good results by using only 1/10th of the dataset for clustering. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a54d190-64d4-4cd4-a497-365cbffda871",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "build_params = ivf_flat.IndexParams( \n",
+    "        n_lists=100, \n",
+    "        metric=\"sqeuclidean\", \n",
+    "        kmeans_trainset_fraction=0.1, \n",
+    "        kmeans_n_iters=20 \n",
+    "    ) \n",
+    "index = ivf_flat.build(build_params, dataset, handle=handle)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d86a213-d6ae-4fca-9082-cb5a4d1dab36",
+   "metadata": {},
+   "source": [
+    "We see only a minimal change in the recall"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cc992e8-a5e5-4508-b790-0e934160b660",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_params = ivf_flat.SearchParams(n_probes=10)\n",
+    "\n",
+    "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n",
+    "    \n",
+    "handle.sync()\n",
+    "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)\n",
+    "calc_recall(neighbors, gt_neighbors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25289ebc-7d89-4fa6-bc62-e25b6e77750c",
+   "metadata": {},
+   "source": [
+    "### Add vectors on build\n",
+    "Currently you cannot configure how RAFT sub-samples the input. If you want fine-grained control over how the training set is selected, create the index in two steps:\n",
+    "1. Define cluster centers on a training set, but do not add any vector to the index\n",
+    "2. Add vectors to the index (extend)\n",
+    "\n",
+    "This workflow should be familiar to FAISS users. Note that RAFT does not require adding the data in batches; internal batching is used when necessary.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ebcf970-94ed-4825-9885-277bd984b90c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subsample the dataset\n",
+    "n_train = 10000\n",
+    "train_set = dataset[cp.random.choice(dataset.shape[0], n_train, replace=False),:]\n",
+    "\n",
+    "# build using training set\n",
+    "build_params = ivf_flat.IndexParams(\n",
+    "        n_lists=1024,\n",
+    "        metric=\"sqeuclidean\",\n",
+    "        kmeans_trainset_fraction=1,\n",
+    "        kmeans_n_iters=20,\n",
+    "        add_data_on_build=False\n",
+    "    )\n",
+    "index = ivf_flat.build(build_params, train_set)\n",
+    "\n",
+    "print(\"Index before adding vectors\", index)\n",
+    "\n",
+    "ivf_flat.extend(index, dataset, cp.arange(dataset.shape[0], dtype=cp.int64))\n",
+    "\n",
+    "print(\"Index after adding vectors\", index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "029d48a9-baf7-4263-af43-9e500ef3cce4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/tutorial_ivf_pq.ipynb b/notebooks/tutorial_ivf_pq.ipynb
index 6aa8cd6495..397e39bfba 100644
--- a/notebooks/tutorial_ivf_pq.ipynb
+++ b/notebooks/tutorial_ivf_pq.ipynb
@@ -79,6 +79,7 @@
     "from pylibraft.common import DeviceResources\n",
     "from pylibraft.neighbors import ivf_pq, refine\n",
     "from adjustText import adjust_text\n",
+    "from utils import calc_recall, load_dataset\n",
     "\n",
     "%matplotlib inline"
    ]
@@ -194,15 +195,18 @@
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The index and data will be saved in /tmp/raft_example\n"
+     ]
+    }
+   ],
    "source": [
     "DATASET_URL = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n",
-    "DATASET_FILENAME = DATASET_URL.split('/')[-1]\n",
-    "\n",
-    "## download the dataset\n",
-    "dataset_path = os.path.join(WORK_FOLDER, DATASET_FILENAME)\n",
-    "if not os.path.exists(dataset_path):\n",
-    "    urllib.request.urlretrieve(DATASET_URL, dataset_path)"
+    "f = load_dataset(DATASET_URL)"
    ]
   },
   {
@@ -227,8 +231,6 @@
     }
    ],
    "source": [
-    "f = h5py.File(dataset_path, \"r\")\n",
-    "\n",
     "metric = f.attrs['distance']\n",
     "\n",
     "dataset = cp.array(f['train'])\n",
@@ -456,28 +458,6 @@
     }
    ],
    "source": [
-    "## Check the quality of the prediction (recall)\n",
-    "def calc_recall(found_indices, ground_truth):\n",
-    "    found_indices = cp.asarray(found_indices)\n",
-    "    bs, k = found_indices.shape\n",
-    "    if bs != ground_truth.shape[0]:\n",
-    "        raise RuntimeError(\n",
-    "            \"Batch sizes do not match {} vs {}\".format(\n",
-    "                bs, ground_truth.shape[0])\n",
-    "        )\n",
-    "    if k > ground_truth.shape[1]:\n",
-    "        raise RuntimeError(\n",
-    "            \"Not enough indices in the ground truth ({} > {})\".format(\n",
-    "                k, ground_truth.shape[1])\n",
-    "        )\n",
-    "    n = 0\n",
-    "    # Go over the batch\n",
-    "    for i in range(bs):\n",
-    "        # Note, ivf-pq does not guarantee the ordered input, hence the use of intersect1d\n",
-    "        n += cp.intersect1d(found_indices[i, :k], ground_truth[i, :k]).size\n",
-    "    recall = n / found_indices.size\n",
-    "    return recall\n",
-    "\n",
     "recall_first_try = calc_recall(neighbors, gt_neighbors)\n",
     "print(f\"Got recall = {recall_first_try} with the default parameters (k = {k}).\")"
    ]
diff --git a/notebooks/utils.py b/notebooks/utils.py
new file mode 100644
index 0000000000..1c2e44a6ae
--- /dev/null
+++ b/notebooks/utils.py
@@ -0,0 +1,103 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import cupy as cp
+import h5py
+import os
+import tempfile
+import time
+import urllib
+
+## Check the quality of the prediction (recall)
+def calc_recall(found_indices, ground_truth):
+    """Share of returned ids found in the matching ground-truth row.
+
+    Rows are compared with intersect1d, so the order of ids within a row does
+    not matter. Raises RuntimeError when the batch sizes differ or when the
+    ground truth has fewer than k columns.
+    """
+    found = cp.asarray(found_indices)
+    n_rows, k = found.shape
+    if n_rows != ground_truth.shape[0]:
+        msg = "Batch sizes do not match {} vs {}"
+        raise RuntimeError(msg.format(n_rows, ground_truth.shape[0]))
+    if k > ground_truth.shape[1]:
+        msg = "Not enough indices in the ground truth ({} > {})"
+        raise RuntimeError(msg.format(k, ground_truth.shape[1]))
+
+    # ivf-pq does not guarantee the ordered input, hence the use of intersect1d
+    hits = sum(
+        cp.intersect1d(found[row, :k], ground_truth[row, :k]).size
+        for row in range(n_rows)
+    )
+    return hits / found.size
+
+
+class BenchmarkTimer:
+    """Times a loop body over `warmup + reps` runs of `benchmark_runs()`.
+
+    The first `warmup` runs are not recorded; the wall-clock seconds of each
+    of the following `reps` runs are appended to `timings`. Use like:
+        timer = BenchmarkTimer(reps=5)
+        for _ in timer.benchmark_runs():
+            ... do something ...
+        print(np.min(timer.timings))
+
+    This class is borrowed from the rapids/cuml benchmark suite
+    """
+
+    def __init__(self, reps=1, warmup=0):
+        self.warmup = warmup
+        self.reps = reps
+        self.timings = []
+
+    def benchmark_runs(self):
+        for r in range(self.reps + self.warmup):
+            t0 = time.time()
+            yield r
+            elapsed = time.time() - t0
+            if r >= self.warmup:
+                self.timings.append(elapsed)
+
+
+def load_dataset(dataset_url, work_folder=None):
+    """Download (unless already cached) and open an ann-benchmarks hdf5 file.
+
+    Parameters
+    ----------
+    dataset_url : address of the hdf5 file; its last path component is used
+        as the local file name
+    work_folder : local folder to store the dataset; defaults to
+        "raft_example" inside the system temporary directory
+
+    Returns
+    -------
+    h5py.File opened read-only; the caller is responsible for closing it
+    """
+    # "import urllib" alone does not guarantee the request submodule is loaded
+    import urllib.request
+
+    if work_folder is None:
+        work_folder = os.path.join(tempfile.gettempdir(), "raft_example")
+    os.makedirs(work_folder, exist_ok=True)
+    print("The index and data will be saved in", work_folder)
+
+    # Download only when the file is not already present in work_folder
+    dataset_path = os.path.join(work_folder, dataset_url.split("/")[-1])
+    if not os.path.exists(dataset_path):
+        urllib.request.urlretrieve(dataset_url, dataset_path)
+
+    return h5py.File(dataset_path, "r")
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
index e0c59a5ed3..c11d933b27 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -104,11 +104,13 @@ cdef class IndexParams:
 
     graph_degree : int, default = 64
 
-    add_data_on_build : bool, default = True
-        After training the coarse and fine quantizers, we will populate
-        the index with the dataset if add_data_on_build == True, otherwise
-        the index is left empty, and the extend method can be used
-        to add new vectors to the index.
+    build_algo: string denoting the graph building algorithm to use,
+                default = "ivf_pq"
+        Valid values for algo: ["ivf_pq", "nn_descent"], where
+        - ivf_pq will use the IVF-PQ algorithm for building the knn graph
+        - nn_descent (experimental) will use the NN-Descent algorithm for
+          building the knn graph. It is expected to be generally
+          faster than ivf_pq.
     """
     cdef c_cagra.index_params params
 
@@ -116,12 +118,15 @@ cdef class IndexParams:
                  metric="sqeuclidean",
                  intermediate_graph_degree=128,
                  graph_degree=64,
-                 add_data_on_build=True):
+                 build_algo="ivf_pq"):
         self.params.metric = _get_metric(metric)
         self.params.metric_arg = 0
         self.params.intermediate_graph_degree = intermediate_graph_degree
         self.params.graph_degree = graph_degree
-        self.params.add_data_on_build = add_data_on_build
+        if build_algo == "ivf_pq":
+            self.params.build_algo = c_cagra.graph_build_algo.IVF_PQ
+        elif build_algo == "nn_descent":
+            self.params.build_algo = c_cagra.graph_build_algo.NN_DESCENT
 
     @property
     def metric(self):
@@ -135,10 +140,6 @@ cdef class IndexParams:
     def graph_degree(self):
         return self.params.graph_degree
 
-    @property
-    def add_data_on_build(self):
-        return self.params.add_data_on_build
-
 
 cdef class Index:
     cdef readonly bool trained
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
index 0c683bcd9b..7e22f274e9 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd
@@ -51,9 +51,14 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (
 cdef extern from "raft/neighbors/cagra_types.hpp" \
         namespace "raft::neighbors::cagra" nogil:
 
+    ctypedef enum graph_build_algo:
+        IVF_PQ "raft::neighbors::cagra::graph_build_algo::IVF_PQ",
+        NN_DESCENT "raft::neighbors::cagra::graph_build_algo::NN_DESCENT"
+
     cpdef cppclass index_params(ann_index_params):
         size_t intermediate_graph_degree
         size_t graph_degree
+        graph_build_algo build_algo
 
     ctypedef enum search_algo:
         SINGLE_CTA "raft::neighbors::cagra::search_algo::SINGLE_CTA",
diff --git a/python/pylibraft/pylibraft/test/test_cagra.py b/python/pylibraft/pylibraft/test/test_cagra.py
index 74e9f53b91..24126c0c5a 100644
--- a/python/pylibraft/pylibraft/test/test_cagra.py
+++ b/python/pylibraft/pylibraft/test/test_cagra.py
@@ -52,6 +52,7 @@ def run_cagra_build_search_test(
     metric="euclidean",
     intermediate_graph_degree=128,
     graph_degree=64,
+    build_algo="ivf_pq",
     array_type="device",
     compare=True,
     inplace=True,
@@ -67,6 +68,7 @@ def run_cagra_build_search_test(
         metric=metric,
         intermediate_graph_degree=intermediate_graph_degree,
         graph_degree=graph_degree,
+        build_algo=build_algo,
     )
 
     if array_type == "device":
@@ -139,13 +141,17 @@ def run_cagra_build_search_test(
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("array_type", ["device", "host"])
-def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
+@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+def test_cagra_dataset_dtype_host_device(
+    dtype, array_type, inplace, build_algo
+):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
     run_cagra_build_search_test(
         dtype=dtype,
         inplace=inplace,
         array_type=array_type,
+        build_algo=build_algo,
     )
 
 
@@ -158,6 +164,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": True,
             "k": 1,
             "metric": "euclidean",
+            "build_algo": "ivf_pq",
         },
         {
             "intermediate_graph_degree": 32,
@@ -165,6 +172,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": False,
             "k": 5,
             "metric": "sqeuclidean",
+            "build_algo": "ivf_pq",
         },
         {
             "intermediate_graph_degree": 128,
@@ -172,6 +180,7 @@ def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace):
             "add_data_on_build": True,
             "k": 10,
             "metric": "inner_product",
+            "build_algo": "nn_descent",
         },
     ],
 )
@@ -184,6 +193,7 @@ def test_cagra_index_params(params):
         graph_degree=params["graph_degree"],
         intermediate_graph_degree=params["intermediate_graph_degree"],
         compare=False,
+        build_algo=params["build_algo"],
     )
 
 
@@ -241,7 +251,7 @@ def test_cagra_index_params(params):
             "search_width": 4,
             "min_iterations": 0,
             "thread_block_size": 0,
-            "hashmap_mode": "small",
+            "hashmap_mode": "auto",
             "hashmap_min_bitlen": 0,
             "hashmap_max_fill_rate": 0.5,
             "num_random_samplings": 1,
diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
index 198d0a2b14..233607c281 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -254,18 +254,18 @@ def create_plot_build(
     xn = "k-nn"
     yn = "qps"
 
-    # recall_85 = [-1] * len(linestyles)
     qps_85 = [-1] * len(linestyles)
     bt_85 = [0] * len(linestyles)
     i_85 = [-1] * len(linestyles)
-    # recall_90 = [-1] * len(linestyles)
+
     qps_90 = [-1] * len(linestyles)
     bt_90 = [0] * len(linestyles)
     i_90 = [-1] * len(linestyles)
-    # recall_95 = [-1] * len(linestyles)
+
     qps_95 = [-1] * len(linestyles)
     bt_95 = [0] * len(linestyles)
     i_95 = [-1] * len(linestyles)
+
     data = OrderedDict()
     colors = OrderedDict()
 
@@ -303,7 +303,7 @@ def mean_y(algo):
     plt.figure(figsize=(12, 9))
     ax = df.plot.bar(rot=0, color=colors)
     fig = ax.get_figure()
-    print(f"writing search output to {fn_out}")
+    print(f"writing build output to {fn_out}")
     plt.title("Build Time for Highest QPS")
     plt.suptitle(f"{dataset} k={k} batch_size={batch_size}")
     plt.ylabel("Build Time (s)")
@@ -313,35 +313,22 @@ def mean_y(algo):
 def load_lines(results_path, result_files, method, index_key):
     results = dict()
 
-    linebreaker = "name,iterations"
-
     for result_filename in result_files:
         if result_filename.endswith(".csv"):
             with open(os.path.join(results_path, result_filename), "r") as f:
                 lines = f.readlines()
                 lines = lines[:-1] if lines[-1] == "\n" else lines
-                idx = 0
-                for pos, line in enumerate(lines):
-                    if linebreaker in line:
-                        idx = pos
-                        break
 
                 if method == "build":
-                    if "hnswlib" in result_filename:
-                        key_idx = [2]
-                    else:
-                        key_idx = [10]
+                    key_idx = [2]
                 elif method == "search":
-                    if "hnswlib" in result_filename:
-                        key_idx = [10, 6]
-                    else:
-                        key_idx = [12, 10]
+                    key_idx = [2, 3]
 
-                for line in lines[idx + 1 :]:
+                for line in lines[1:]:
                     split_lines = line.split(",")
 
-                    algo_name = split_lines[0].split(".")[0].strip('"')
-                    index_name = split_lines[0].split("/")[0].strip('"')
+                    algo_name = split_lines[0]
+                    index_name = split_lines[1]
 
                     if index_key == "algo":
                         dict_key = algo_name
@@ -394,9 +381,7 @@ def main():
     )
     parser.add_argument(
         "--dataset-path",
-        help="path to dataset folder, by default will look in "
-        "RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets "
-        "subdirectory from the calling directory",
+        help="path to dataset folder",
         default=default_dataset_path,
     )
     parser.add_argument(
@@ -460,10 +445,12 @@ def main():
         search = args.search
 
     search_output_filepath = os.path.join(
-        args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png"
+        args.output_filepath,
+        f"search-{args.dataset}-k{k}-batch_size{batch_size}.png",
     )
     build_output_filepath = os.path.join(
-        args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png"
+        args.output_filepath,
+        f"build-{args.dataset}-k{k}-batch_size{batch_size}.png",
     )
 
     search_results = load_all_results(
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
index f1c033e415..ab82405439 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
@@ -3,6 +3,7 @@
     "name": "deep-image-96-inner",
     "base_file": "deep-image-96-inner/base.fbin",
     "query_file": "deep-image-96-inner/query.fbin",
+    "groundtruth_neighbors_file": "deep-image-96-inner/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
index 65f28fc81a..0efe1fc498 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
@@ -3,6 +3,7 @@
     "name": "fashion-mnist-784-euclidean",
     "base_file": "fashion-mnist-784-euclidean/base.fbin",
     "query_file": "fashion-mnist-784-euclidean/query.fbin",
+    "groundtruth_neighbors_file": "fashion-mnist-784-euclidean/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
index 526aef2db0..3595084d19 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
@@ -735,37 +735,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         }
@@ -785,55 +785,55 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 5,
+          "nprobe": 5,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
@@ -853,37 +853,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -903,37 +903,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -953,37 +953,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1003,37 +1003,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -1053,37 +1053,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1103,37 +1103,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1153,37 +1153,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         }
@@ -1203,37 +1203,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
index 7c95ceb439..8b9f1cfb35 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
@@ -3,711 +3,1349 @@
     "name": "glove-100-inner",
     "base_file": "glove-100-inner/base.fbin",
     "query_file": "glove-100-inner/query.fbin",
-    "groundtruth_neighbors_file": "glove-100-inner/groundtruth.neighbors.ibin",
-    "distance": "inner_product"
+    "distance": "euclidean"
   },
-
   "search_basic_param": {
-    "batch_size": 1,
-    "k": 10
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
   },
-
   "index": [
     {
-      "name": "hnswlib.M4",
-      "algo": "hnswlib",
-      "build_param": {"M":4, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M4",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M8",
-      "algo": "hnswlib",
-      "build_param": {"M":8, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M8",
-      "search_params": [
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M12",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M12"
     },
     {
-      "name": "hnswlib.M12",
-      "algo": "hnswlib",
-      "build_param": {"M":12, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M12",
-      "search_params": [
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M16",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M16"
     },
     {
-      "name": "hnswlib.M16",
-      "algo": "hnswlib",
-      "build_param": {"M":16, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M16",
-      "search_params": [
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M24",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M24"
     },
     {
-      "name": "hnswlib.M24",
-      "algo": "hnswlib",
-      "build_param": {"M":24, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M24",
-      "search_params": [
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-100-inner/hnswlib/M36",
+      "search_params" : [
         {"ef":10, "numThreads":1},
         {"ef":20, "numThreads":1},
         {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
         {"ef":80, "numThreads":1},
         {"ef":120, "numThreads":1},
         {"ef":200, "numThreads":1},
         {"ef":400, "numThreads":1},
         {"ef":600, "numThreads":1},
         {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M36",
-      "algo": "hnswlib",
-      "build_param": {"M":36, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M36",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M48",
-      "algo": "hnswlib",
-      "build_param": {"M":48, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M48",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
-    },
-    {
-      "name": "hnswlib.M64",
-      "algo": "hnswlib",
-      "build_param": {"M":64, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M64",
-      "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
+      ],
+      "search_result_file" : "result/glove-100-inner/hnswlib/M36"
     },
+
+
+
+
     {
-      "name": "hnswlib.M96",
-      "algo": "hnswlib",
-      "build_param": {"M":96, "efConstruction":500, "numThreads":4},
-      "file": "glove-100-inner/hnswlib/M96",
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/glove-100-inner/raft_bfknn/bfknn",
       "search_params": [
-        {"ef":10, "numThreads":1},
-        {"ef":20, "numThreads":1},
-        {"ef":40, "numThreads":1},
-        {"ef":80, "numThreads":1},
-        {"ef":120, "numThreads":1},
-        {"ef":200, "numThreads":1},
-        {"ef":400, "numThreads":1},
-        {"ef":600, "numThreads":1},
-        {"ef":800, "numThreads":1}
-      ]
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_bfknn/bfknn"
     },
     {
       "name": "faiss_ivf_flat.nlist1024",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":1024},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist1024",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist1024",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist1024"
     },
     {
       "name": "faiss_ivf_flat.nlist2048",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":2048},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist2048",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist2048",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist2048"
     },
     {
       "name": "faiss_ivf_flat.nlist4096",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":4096},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist4096",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist4096",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist4096"
     },
     {
       "name": "faiss_ivf_flat.nlist8192",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":8192},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist8192",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist8192",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist8192"
     },
     {
       "name": "faiss_ivf_flat.nlist16384",
       "algo": "faiss_gpu_ivf_flat",
-      "build_param": {"nlist":16384},
-      "file": "glove-100-inner/faiss_ivf_flat/nlist16384",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/glove-100-inner/faiss_ivf_flat/nlist16384",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_flat/nlist16384"
     },
-
-
-
     {
-      "name": "faiss_ivf_pq.M2-nlist1024",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist8192",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M2-nlist16384",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":2},
-      "file": "glove-100-inner/faiss_ivf_pq/M2-nlist16384",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist1024",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist8192",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M4-nlist16384",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":4},
-      "file": "glove-100-inner/faiss_ivf_pq/M4-nlist16384",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist1024",
+      "name": "faiss_ivf_pq.M64-nlist1024",
       "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":1024, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist1024",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist2048",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":2048, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist2048",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist4096",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":4096, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist4096",
-      "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist8192",
-      "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":8192, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist8192",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/glove-100-inner/faiss_ivf_pq/M64-nlist1024",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
-    },
-    {
-      "name": "faiss_ivf_pq.M20-nlist16384",
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
       "algo": "faiss_gpu_ivf_pq",
-      "build_param": {"nlist":16384, "M":20},
-      "file": "glove-100-inner/faiss_ivf_pq/M20-nlist16384",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/glove-100-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
     },
-
-
     {
       "name": "faiss_ivf_sq.nlist1024-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":1024, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist1024-fp16",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist1024-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist1024-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist2048-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":2048, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist2048-fp16",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist2048-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist2048-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist4096-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":4096, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist4096-fp16",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist4096-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist4096-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist8192-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":8192, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist8192-fp16",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist8192-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist8192-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist16384-fp16",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":16384, "quantizer_type":"fp16"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist16384-fp16",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist16384-fp16",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist16384-fp16"
     },
     {
       "name": "faiss_ivf_sq.nlist1024-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":1024, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist1024-int8",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist1024-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist1024-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist2048-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":2048, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist2048-int8",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist2048-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist2048-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist4096-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":4096, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist4096-int8",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist4096-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist4096-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist8192-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":8192, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist8192-int8",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist8192-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist8192-int8"
     },
     {
       "name": "faiss_ivf_sq.nlist16384-int8",
       "algo": "faiss_gpu_ivf_sq",
-      "build_param": {"nlist":16384, "quantizer_type":"int8"},
-      "file": "glove-100-inner/faiss_ivf_sq/nlist16384-int8",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-100-inner/faiss_ivf_sq/nlist16384-int8",
       "search_params": [
-        {"nprobe":1},
-        {"nprobe":5},
-        {"nprobe":10},
-        {"nprobe":50},
-        {"nprobe":100},
-        {"nprobe":200},
-        {"nprobe":500},
-        {"nprobe":1000},
-        {"nprobe":2000}
-      ]
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_ivf_sq/nlist16384-int8"
     },
     {
       "name": "faiss_flat",
       "algo": "faiss_gpu_flat",
       "build_param": {},
-      "file": "glove-100-inner/faiss_flat/flat",
-      "search_params": [{}]
+      "file": "index/glove-100-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/glove-100-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
     },
     {
-      "name": "ggnn.kbuild96-segment64-refine2-k10",
-      "algo": "ggnn",
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
       "build_param": {
-        "k_build": 96,
-        "segment_size": 64,
-        "refine_iterations": 2,
-        "dataset_size": 1183514,
-        "k": 10
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
       },
-      "file": "glove-100-inner/ggnn/kbuild96-segment64-refine2-k10",
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
       "search_params": [
-        {"tau":0.001, "block_dim":64, "sorted_size":32},
-        {"tau":0.005, "block_dim":64, "sorted_size":32},
-        {"tau":0.01,  "block_dim":64, "sorted_size":32},
-        {"tau":0.02,  "block_dim":64, "sorted_size":32},
-        {"tau":0.03,  "block_dim":64, "sorted_size":32},
-        {"tau":0.04,  "block_dim":64, "sorted_size":32},
-        {"tau":0.05,  "block_dim":64, "sorted_size":32},
-        {"tau":0.06,  "block_dim":64, "sorted_size":32},
-        {"tau":0.09,  "block_dim":64, "sorted_size":32},
-        {"tau":0.12,  "block_dim":64, "sorted_size":32},
-        {"tau":0.18,  "block_dim":64, "sorted_size":32},
-        {"tau":0.21,  "block_dim":64, "sorted_size":32},
-        {"tau":0.24,  "block_dim":64, "sorted_size":32},
-        {"tau":0.27,  "block_dim":64, "sorted_size":32},
-        {"tau":0.3,   "block_dim":64, "sorted_size":32},
-        {"tau":0.4,   "block_dim":64, "sorted_size":32},
-        {"tau":0.01, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.02, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.03, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.04, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.05, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.06, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.09, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.12, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.18, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.21, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.24, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.27, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.3,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.4,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32},
-        {"tau":0.5,  "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}
-      ]
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-100-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/glove-100-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-100-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/glove-100-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-100-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/glove-100-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-100-inner/raft_cagra/dim64"
     }
   ]
 }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
index 9b3f192c9f..0f02620cb2 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
@@ -735,37 +735,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "half"
         }
@@ -785,55 +785,55 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1,
+          "nprobe": 1,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 5,
+          "nprobe": 5,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
@@ -853,37 +853,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -903,37 +903,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -953,37 +953,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1003,37 +1003,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "half"
         }
@@ -1053,37 +1053,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1103,37 +1103,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "fp8"
         }
@@ -1153,37 +1153,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "half",
           "smemLutDtype": "float"
         }
@@ -1203,37 +1203,37 @@
       "search_params": [
         {
           "k": 10,
-          "numProbes": 10,
+          "nprobe": 10,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 50,
+          "nprobe": 50,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 100,
+          "nprobe": 100,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 200,
+          "nprobe": 200,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 500,
+          "nprobe": 500,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         },
         {
           "k": 10,
-          "numProbes": 1024,
+          "nprobe": 1024,
           "internalDistanceDtype": "float",
           "smemLutDtype": "float"
         }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json
new file mode 100644
index 0000000000..41dec5adb3
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json
@@ -0,0 +1,1351 @@
+{
+  "dataset": {
+    "name": "glove-50-inner",
+    "base_file": "glove-50-inner/base.fbin",
+    "query_file": "glove-50-inner/query.fbin",
+    "distance": "euclidean"
+  },
+  "search_basic_param": {
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
+  },
+  "index": [
+    {
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M12",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M12"
+    },
+    {
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M16",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M16"
+    },
+    {
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M24",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M24"
+    },
+    {
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/glove-50-inner/hnswlib/M36",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/glove-50-inner/hnswlib/M36"
+    },
+
+
+
+
+    {
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/glove-50-inner/raft_bfknn/bfknn",
+      "search_params": [
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_bfknn/bfknn"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist1024",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist1024"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist2048",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist2048",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist2048"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist4096",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist4096",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist4096"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist8192",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist8192",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist8192"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist16384",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/glove-50-inner/faiss_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_flat/nlist16384"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/glove-50-inner/faiss_ivf_pq/M64-nlist1024",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/glove-50-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist1024-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist1024-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist2048-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist2048-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist4096-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist4096-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist8192-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist8192-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist16384-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist16384-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist1024-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist1024-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist2048-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist2048-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist4096-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist4096-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist8192-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist8192-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/glove-50-inner/faiss_ivf_sq/nlist16384-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_ivf_sq/nlist16384-int8"
+    },
+    {
+      "name": "faiss_flat",
+      "algo": "faiss_gpu_flat",
+      "build_param": {},
+      "file": "index/glove-50-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/glove-50-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "nprobe": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "nprobe": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/glove-50-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/glove-50-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/glove-50-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/glove-50-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-50-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/glove-50-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/glove-50-inner/raft_cagra/dim64"
+    }
+  ]
+}
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
index 2a493edeed..343deb8927 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
@@ -3,6 +3,7 @@
     "name": "mnist-784-euclidean",
     "base_file": "mnist-784-euclidean/base.fbin",
     "query_file": "mnist-784-euclidean/query.fbin",
+    "groundtruth_neighbors_file": "mnist-784-euclidean/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
index 630b700ba5..e94a9969d9 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
@@ -3,6 +3,7 @@
     "name": "nytimes-256-angular",
     "base_file": "nytimes-256-angular/base.fbin",
     "query_file": "nytimes-256-angular/query.fbin",
+    "groundtruth_neighbors_file": "nytimes-256-angular/groundtruth.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json
new file mode 100644
index 0000000000..f849abad35
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json
@@ -0,0 +1,1352 @@
+{
+  "dataset": {
+    "name": "nytimes-256-inner",
+    "base_file": "nytimes-256-inner/base.fbin",
+    "query_file": "nytimes-256-inner/query.fbin",
+    "groundtruth_neighbors_file": "nytimes-256-inner/groundtruth.neighbors.ibin",
+    "distance": "inner_product"
+  },
+  "search_basic_param": {
+    "batch_size": 5000,
+    "k": 10,
+    "run_count": 3
+  },
+  "index": [
+    {
+      "name" : "hnswlib.M12",
+      "algo" : "hnswlib",
+      "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M12",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M12"
+    },
+    {
+      "name" : "hnswlib.M16",
+      "algo" : "hnswlib",
+      "build_param": {"M":16, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M16",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M16"
+    },
+    {
+      "name" : "hnswlib.M24",
+      "algo" : "hnswlib",
+      "build_param": {"M":24, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M24",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M24"
+    },
+    {
+      "name" : "hnswlib.M36",
+      "algo" : "hnswlib",
+      "build_param": {"M":36, "efConstruction":500, "numThreads":32},
+      "file" : "index/nytimes-256-inner/hnswlib/M36",
+      "search_params" : [
+        {"ef":10, "numThreads":1},
+        {"ef":20, "numThreads":1},
+        {"ef":40, "numThreads":1},
+        {"ef":60, "numThreads":1},
+        {"ef":80, "numThreads":1},
+        {"ef":120, "numThreads":1},
+        {"ef":200, "numThreads":1},
+        {"ef":400, "numThreads":1},
+        {"ef":600, "numThreads":1},
+        {"ef":800, "numThreads":1}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/hnswlib/M36"
+    },
+
+
+
+
+    {
+      "name": "raft_bfknn",
+      "algo": "raft_bfknn",
+      "build_param": {},
+      "file": "index/nytimes-256-inner/raft_bfknn/bfknn",
+      "search_params": [
+        {
+          "probe": 1
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_bfknn/bfknn"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist1024",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 1024
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist1024"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist2048",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 2048
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist2048",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist2048"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist4096",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 4096
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist4096",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist4096"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist8192",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 8192
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist8192",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist8192"
+    },
+    {
+      "name": "faiss_ivf_flat.nlist16384",
+      "algo": "faiss_gpu_ivf_flat",
+      "build_param": {
+        "nlist": 16384
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_flat/nlist16384"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": true
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024"
+    },
+    {
+      "name": "faiss_ivf_pq.M64-nlist1024.noprecomp",
+      "algo": "faiss_gpu_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "M": 64,
+        "useFloat16": true,
+        "usePrecomputed": false
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024.noprecomp",
+      "search_params": [
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_pq/M64-nlist1024.noprecomp"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist1024-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist1024-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist2048-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist2048-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist4096-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist4096-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist8192-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist8192-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-fp16",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "fp16"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist16384-fp16",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist16384-fp16"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist1024-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 1024,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist1024-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist1024-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist2048-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 2048,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist2048-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist2048-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist4096-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 4096,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist4096-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist4096-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist8192-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 8192,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist8192-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist8192-int8"
+    },
+    {
+      "name": "faiss_ivf_sq.nlist16384-int8",
+      "algo": "faiss_gpu_ivf_sq",
+      "build_param": {
+        "nlist": 16384,
+        "quantizer_type": "int8"
+      },
+      "file": "index/nytimes-256-inner/faiss_ivf_sq/nlist16384-int8",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_ivf_sq/nlist16384-int8"
+    },
+    {
+      "name": "faiss_flat",
+      "algo": "faiss_gpu_flat",
+      "build_param": {},
+      "file": "index/nytimes-256-inner/faiss_flat/flat",
+      "search_params": [
+        {}
+      ],
+      "search_result_file": "result/nytimes-256-inner/faiss_flat/flat"
+    },
+
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 5,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq64-cluster1024-float-half",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 64,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-half",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "half"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq64-cluster1024-float-half"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 32,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq32-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 16,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "fp8"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq16-cluster1024-float-fp8"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq128-cluster1024-half-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 128,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-half-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "half",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq128-cluster1024-half-float"
+    },
+    {
+      "name": "raft_ivf_pq.dimpq512-cluster1024-float-float",
+      "algo": "raft_ivf_pq",
+      "build_param": {
+        "nlist": 1024,
+        "pq_dim": 512,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_pq/dimpq512-cluster1024-float-float",
+      "search_params": [
+        {
+          "k": 10,
+          "numProbes": 10,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 50,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 100,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 200,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 500,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        },
+        {
+          "k": 10,
+          "numProbes": 1024,
+          "internalDistanceDtype": "float",
+          "smemLutDtype": "float"
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_pq/dimpq512-cluster1024-float-float"
+    },
+    {
+      "name": "raft_ivf_flat.nlist1024",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 1024,
+        "ratio": 1,
+        "niter": 25
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_flat/nlist1024",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_flat/nlist1024"
+    },
+    {
+      "name": "raft_ivf_flat.nlist16384",
+      "algo": "raft_ivf_flat",
+      "build_param": {
+        "nlist": 16384,
+        "ratio": 2,
+        "niter": 20
+      },
+      "file": "index/nytimes-256-inner/raft_ivf_flat/nlist16384",
+      "search_params": [
+        {
+          "nprobe": 1
+        },
+        {
+          "nprobe": 5
+        },
+        {
+          "nprobe": 10
+        },
+        {
+          "nprobe": 50
+        },
+        {
+          "nprobe": 100
+        },
+        {
+          "nprobe": 200
+        },
+        {
+          "nprobe": 500
+        },
+        {
+          "nprobe": 1000
+        },
+        {
+          "nprobe": 2000
+        }
+      ],
+      "search_result_file": "result/nytimes-256-inner/raft_ivf_flat/nlist16384"
+    },
+
+    {
+      "name" : "raft_cagra.dim32",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 32
+      },
+      "file" : "index/nytimes-256-inner/raft_cagra/dim32",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/raft_cagra/dim32"
+    },
+
+    {
+      "name" : "raft_cagra.dim64",
+      "algo" : "raft_cagra",
+      "build_param": {
+        "graph_degree" : 64
+      },
+      "file" : "index/nytimes-256-inner/raft_cagra/dim64",
+      "search_params" : [
+        {"itopk": 32},
+        {"itopk": 64},
+        {"itopk": 128}
+      ],
+      "search_result_file" : "result/nytimes-256-inner/raft_cagra/dim64"
+    }
+  ]
+}
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index f9a20b46bb..af98f31857 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -17,6 +17,8 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 set(raft_dask_version 23.10.00)
 
 include(../../fetch_rapids.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(raft-dask-python)
 
 project(
   raft-dask-python
@@ -25,7 +27,7 @@ project(
             # language to be enabled here. The test project that is built in scikit-build to verify
             # various linking options for the python library is hardcoded to build with C, so until
             # that is fixed we need to keep C.
-            C CXX
+            C CXX CUDA
 )
 
 option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
@@ -42,14 +44,6 @@ else()
 endif()
 
 if(NOT raft_FOUND)
-  # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required
-  # languages for the C++ project even if this project does not require those languages.
-  include(rapids-cuda)
-  rapids_cuda_init_architectures(raft-dask)
-  enable_language(CUDA)
-  # Since raft-dask only enables CUDA optionally we need to manually include the file that
-  # rapids_cuda_init_architectures relies on `project` including.
-  include("${CMAKE_PROJECT_raft-dask_INCLUDE}")
   find_package(ucx REQUIRED)
 
   # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index bdbcf61e0f..3e0ffc2848 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -35,8 +35,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==23.10.*",
-    "dask>=2023.7.1",
-    "distributed>=2023.7.1",
+    "dask==2023.9.2",
+    "distributed==2023.9.2",
     "joblib>=0.11",
     "numba>=0.57",
     "numpy>=1.21",