From 3da9f040345c93e96424945100ff98da48e2cd81 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Tue, 24 Sep 2024 14:12:39 -0500
Subject: [PATCH 1/6] Update update-version.sh to use packaging lib (#344)

This PR updates the update-version.sh script to use the packaging
library, given that setuptools is no longer included by default in
Python 3.12.
---
 ci/release/update-version.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index feb0a400c..4cf1f0617 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -26,8 +26,8 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
 NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
 
 # Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
+NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 

From 0a4298adfd5175a0dcbd792147d300f6dc57e30c Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 25 Sep 2024 18:06:11 +0200
Subject: [PATCH 2/6] CAGRA - separable compilation for distance computation
 (#296)

Factor the `compute_distance` function and related template parameters out of the CAGRA search kernels.
This reduces the total number of kernel instances, thus reducing the binary size and the compile time.

The change, however, has a few drawbacks:
 - CUDA separable compilation needs to be enabled to allow the `compute_distance` functions to be compiled in separate object files. I introduced a static library component for the affected sources to minimize the impact of the change.
 - Separable compilation and dynamic dispatch of the `compute_distance` function mean the compiler cannot optimize across the kernel-`compute_distance` boundary, which results in higher register usage and occasional register spilling. Most of the cases are optimized in this PR, but some compromises seem unavoidable.
 - The dynamic dispatch (constructing a dataset descriptor) requires an extra kernel call (`xxx_init_kernel`) to get the function pointer, which adds extra latency. This is mitigated to some extent by caching the constructed descriptor using a raft custom resource.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/296
---
 cpp/CMakeLists.txt                            | 194 +++---
 cpp/include/cuvs/neighbors/common.hpp         |  41 +-
 cpp/src/neighbors/detail/ann_utils.cuh        |  16 +-
 cpp/src/neighbors/detail/cagra/bitonic.hpp    |  37 +-
 .../neighbors/detail/cagra/cagra_search.cuh   | 152 +----
 .../detail/cagra/compute_distance-ext.cuh     | 511 +++++++++++++++
 .../detail/cagra/compute_distance.cu          |  82 +++
 .../detail/cagra/compute_distance.hpp         | 576 +++++++++--------
 .../cagra/compute_distance_00_generate.py     | 162 +++++
 .../cagra/compute_distance_standard-impl.cuh  | 279 +++++++++
 .../cagra/compute_distance_standard.hpp       |  80 +++
 ...rd_InnerProduct_float_uint32_dim128_t8.cu} |  24 +-
 ...d_InnerProduct_float_uint32_dim256_t16.cu} |  24 +-
 ...d_InnerProduct_float_uint32_dim512_t32.cu} |  24 +-
 ...dard_InnerProduct_half_uint32_dim128_t8.cu |  33 +
 ...rd_InnerProduct_half_uint32_dim256_t16.cu} |  24 +-
 ...ard_InnerProduct_half_uint32_dim512_t32.cu |  38 ++
 ...dard_InnerProduct_int8_uint32_dim128_t8.cu |  38 ++
 ...ard_InnerProduct_int8_uint32_dim256_t16.cu |  38 ++
 ...ard_InnerProduct_int8_uint32_dim512_t32.cu |  38 ++
 ...ard_InnerProduct_uint8_uint32_dim128_t8.cu |  38 ++
 ...rd_InnerProduct_uint8_uint32_dim256_t16.cu |  38 ++
 ...rd_InnerProduct_uint8_uint32_dim512_t32.cu |  38 ++
 ...ndard_L2Expanded_float_uint32_dim128_t8.cu |  33 +
 ...dard_L2Expanded_float_uint32_dim256_t16.cu |  33 +
 ...dard_L2Expanded_float_uint32_dim512_t32.cu |  33 +
 ...andard_L2Expanded_half_uint32_dim128_t8.cu |  33 +
 ...ndard_L2Expanded_half_uint32_dim256_t16.cu |  33 +
 ...ndard_L2Expanded_half_uint32_dim512_t32.cu |  33 +
 ...andard_L2Expanded_int8_uint32_dim128_t8.cu |  33 +
 ...ndard_L2Expanded_int8_uint32_dim256_t16.cu |  38 ++
 ...ndard_L2Expanded_int8_uint32_dim512_t32.cu |  38 ++
 ...ndard_L2Expanded_uint8_uint32_dim128_t8.cu |  38 ++
 ...dard_L2Expanded_uint8_uint32_dim256_t16.cu |  38 ++
 ...dard_L2Expanded_uint8_uint32_dim512_t32.cu |  38 ++
 .../cagra/compute_distance_vpq-impl.cuh       | 466 ++++++++++++++
 .../detail/cagra/compute_distance_vpq.cuh     | 231 -------
 .../detail/cagra/compute_distance_vpq.hpp     | 100 +++
 ..._float_uint32_dim128_t8_8pq_2subd_half.cu} |  27 +-
 ..._float_uint32_dim128_t8_8pq_4subd_half.cu} |  27 +-
 ..._float_uint32_dim256_t16_8pq_2subd_half.cu |  41 ++
 ..._float_uint32_dim256_t16_8pq_4subd_half.cu |  41 ++
 ..._float_uint32_dim512_t32_8pq_2subd_half.cu |  41 ++
 ..._float_uint32_dim512_t32_8pq_4subd_half.cu |  41 ++
 ...d_half_uint32_dim128_t8_8pq_2subd_half.cu} |  27 +-
 ...d_half_uint32_dim128_t8_8pq_4subd_half.cu} |  27 +-
 ...d_half_uint32_dim256_t16_8pq_2subd_half.cu |  41 ++
 ...d_half_uint32_dim256_t16_8pq_4subd_half.cu |  41 ++
 ...d_half_uint32_dim512_t32_8pq_2subd_half.cu |  41 ++
 ...d_half_uint32_dim512_t32_8pq_4subd_half.cu |  41 ++
 ...ed_int8_uint32_dim128_t8_8pq_2subd_half.cu |  41 ++
 ...ed_int8_uint32_dim128_t8_8pq_4subd_half.cu |  41 ++
 ...d_int8_uint32_dim256_t16_8pq_2subd_half.cu |  41 ++
 ...d_int8_uint32_dim256_t16_8pq_4subd_half.cu |  41 ++
 ...d_int8_uint32_dim512_t32_8pq_2subd_half.cu |  41 ++
 ...d_int8_uint32_dim512_t32_8pq_4subd_half.cu |  41 ++
 ...d_uint8_uint32_dim128_t8_8pq_2subd_half.cu |  41 ++
 ...d_uint8_uint32_dim128_t8_8pq_4subd_half.cu |  41 ++
 ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu |  41 ++
 ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu |  41 ++
 ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu |  41 ++
 ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu |  41 ++
 .../neighbors/detail/cagra/device_common.hpp  | 306 ++++++++-
 cpp/src/neighbors/detail/cagra/factory.cuh    | 175 ++++--
 cpp/src/neighbors/detail/cagra/graph_core.cuh |  94 +--
 cpp/src/neighbors/detail/cagra/hashmap.hpp    |  26 +-
 .../cagra/q_search_multi_cta_00_generate.py   |  83 ---
 ..._float_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._float_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._float_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._float_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ...float_uint64_dim1024_t32_8pq_2subd_half.cu |  36 --
 ...float_uint64_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_float_uint64_dim128_t8_8pq_2subd_half.cu |  36 --
 ...a_float_uint64_dim128_t8_8pq_4subd_half.cu |  36 --
 ..._float_uint64_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._float_uint64_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._float_uint64_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._float_uint64_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._half_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._half_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_half_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_half_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_half_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_half_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._half_uint64_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._half_uint64_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_half_uint64_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_half_uint64_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_half_uint64_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_half_uint64_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...ta_int8_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...ta_int8_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ...a_int8_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_int8_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_int8_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_int8_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_uint8_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...a_uint8_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 .../cagra/q_search_single_cta_00_generate.py  |  88 ---
 ...float_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ...float_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_float_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...a_float_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ..._float_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._float_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._float_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._float_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ...float_uint64_dim1024_t32_8pq_2subd_half.cu |  36 --
 ...float_uint64_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_float_uint64_dim128_t8_8pq_2subd_half.cu |  36 --
 ...a_float_uint64_dim128_t8_8pq_4subd_half.cu |  36 --
 ..._float_uint64_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._float_uint64_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._float_uint64_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._float_uint64_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._half_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._half_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...ta_half_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...ta_half_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ...a_half_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_half_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_half_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_half_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._half_uint64_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._half_uint64_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...ta_half_uint64_dim128_t8_8pq_2subd_half.cu |  36 --
 ...ta_half_uint64_dim128_t8_8pq_4subd_half.cu |  36 --
 ...a_half_uint64_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_half_uint64_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_half_uint64_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_half_uint64_dim512_t32_8pq_4subd_half.cu |  36 --
 ..._int8_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ..._int8_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...ta_int8_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...ta_int8_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ...a_int8_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ...a_int8_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ...a_int8_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ...a_int8_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 ...uint8_uint32_dim1024_t32_8pq_2subd_half.cu |  36 --
 ...uint8_uint32_dim1024_t32_8pq_4subd_half.cu |  36 --
 ...a_uint8_uint32_dim128_t8_8pq_2subd_half.cu |  36 --
 ...a_uint8_uint32_dim128_t8_8pq_4subd_half.cu |  36 --
 ..._uint8_uint32_dim256_t16_8pq_2subd_half.cu |  36 --
 ..._uint8_uint32_dim256_t16_8pq_4subd_half.cu |  36 --
 ..._uint8_uint32_dim512_t32_8pq_2subd_half.cu |  36 --
 ..._uint8_uint32_dim512_t32_8pq_4subd_half.cu |  36 --
 .../detail/cagra/search_multi_cta.cuh         | 166 +++--
 .../cagra/search_multi_cta_00_generate.py     |  22 +-
 ...t8.cu => search_multi_cta_float_uint32.cu} |  11 +-
 ...arch_multi_cta_float_uint32_dim1024_t32.cu |  37 --
 ...earch_multi_cta_float_uint32_dim256_t16.cu |  37 --
 ...earch_multi_cta_float_uint32_dim512_t32.cu |  37 --
 ...t8.cu => search_multi_cta_float_uint64.cu} |  11 +-
 ...arch_multi_cta_float_uint64_dim1024_t32.cu |  37 --
 ...search_multi_cta_float_uint64_dim128_t8.cu |  37 --
 ...earch_multi_cta_float_uint64_dim256_t16.cu |  37 --
 ...earch_multi_cta_float_uint64_dim512_t32.cu |  37 --
 ..._t8.cu => search_multi_cta_half_uint32.cu} |  11 +-
 ...earch_multi_cta_half_uint32_dim1024_t32.cu |  37 --
 ...search_multi_cta_half_uint32_dim512_t32.cu |  37 --
 ...t16.cu => search_multi_cta_half_uint64.cu} |  11 +-
 ...earch_multi_cta_half_uint64_dim1024_t32.cu |  37 --
 ...search_multi_cta_half_uint64_dim256_t16.cu |  37 --
 ...search_multi_cta_half_uint64_dim512_t32.cu |  37 --
 .../detail/cagra/search_multi_cta_inst.cuh    |  44 +-
 .../cagra/search_multi_cta_int8_uint32.cu     |  34 +
 ...earch_multi_cta_int8_uint32_dim1024_t32.cu |  37 --
 .../search_multi_cta_int8_uint32_dim128_t8.cu |  37 --
 ...search_multi_cta_int8_uint32_dim256_t16.cu |  37 --
 ...search_multi_cta_int8_uint32_dim512_t32.cu |  37 --
 .../cagra/search_multi_cta_kernel-ext.cuh     | 405 ------------
 .../cagra/search_multi_cta_kernel-inl.cuh     | 216 +++----
 .../detail/cagra/search_multi_cta_kernel.cuh  |  36 +-
 .../cagra/search_multi_cta_uint8_uint32.cu    |  34 +
 ...arch_multi_cta_uint8_uint32_dim1024_t32.cu |  37 --
 ...search_multi_cta_uint8_uint32_dim128_t8.cu |  37 --
 ...earch_multi_cta_uint8_uint32_dim256_t16.cu |  37 --
 ...earch_multi_cta_uint8_uint32_dim512_t32.cu |  37 --
 .../detail/cagra/search_multi_kernel.cuh      | 458 +++++---------
 .../neighbors/detail/cagra/search_plan.cuh    |  71 +--
 .../detail/cagra/search_single_cta.cuh        | 142 ++---
 .../cagra/search_single_cta_00_generate.py    |  22 +-
 ...8.cu => search_single_cta_float_uint32.cu} |  11 +-
 ...rch_single_cta_float_uint32_dim1024_t32.cu |  37 --
 ...arch_single_cta_float_uint32_dim256_t16.cu |  37 --
 ...arch_single_cta_float_uint32_dim512_t32.cu |  37 --
 ...8.cu => search_single_cta_float_uint64.cu} |  11 +-
 ...rch_single_cta_float_uint64_dim1024_t32.cu |  37 --
 ...earch_single_cta_float_uint64_dim128_t8.cu |  37 --
 ...arch_single_cta_float_uint64_dim256_t16.cu |  37 --
 ...arch_single_cta_float_uint64_dim512_t32.cu |  37 --
 ...t8.cu => search_single_cta_half_uint32.cu} |  11 +-
 ...arch_single_cta_half_uint32_dim1024_t32.cu |  37 --
 ...earch_single_cta_half_uint32_dim512_t32.cu |  37 --
 ...16.cu => search_single_cta_half_uint64.cu} |  11 +-
 ...arch_single_cta_half_uint64_dim1024_t32.cu |  37 --
 ...earch_single_cta_half_uint64_dim256_t16.cu |  37 --
 ...earch_single_cta_half_uint64_dim512_t32.cu |  37 --
 .../detail/cagra/search_single_cta_inst.cuh   |  46 +-
 .../cagra/search_single_cta_int8_uint32.cu    |  34 +
 ...arch_single_cta_int8_uint32_dim1024_t32.cu |  37 --
 ...search_single_cta_int8_uint32_dim128_t8.cu |  37 --
 ...earch_single_cta_int8_uint32_dim256_t16.cu |  37 --
 ...earch_single_cta_int8_uint32_dim512_t32.cu |  37 --
 .../cagra/search_single_cta_kernel-ext.cuh    | 588 ------------------
 .../cagra/search_single_cta_kernel-inl.cuh    | 299 ++++-----
 .../detail/cagra/search_single_cta_kernel.cuh |  36 +-
 .../cagra/search_single_cta_uint8_uint32.cu   |  34 +
 ...rch_single_cta_uint8_uint32_dim1024_t32.cu |  37 --
 ...earch_single_cta_uint8_uint32_dim128_t8.cu |  37 --
 ...arch_single_cta_uint8_uint32_dim256_t16.cu |  37 --
 ...arch_single_cta_uint8_uint32_dim512_t32.cu |  37 --
 .../neighbors/detail/cagra/topk_by_radix.cuh  |  44 +-
 .../detail/cagra/topk_for_cagra/topk.cu       | 171 +++++
 .../detail/cagra/topk_for_cagra/topk_core.cuh | 258 ++------
 cpp/src/neighbors/detail/cagra/utils.hpp      |  10 +-
 cpp/test/neighbors/ann_cagra.cuh              |   2 +-
 227 files changed, 5416 insertions(+), 7935 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu} (51%)
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu} (51%)
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu => compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu} (51%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu => compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu} (51%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
 delete mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu} (50%)
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu} (50%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu} (50%)
 rename cpp/src/neighbors/detail/cagra/{q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu => compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu} (50%)
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
 rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint64_dim128_t8.cu => search_multi_cta_float_uint32.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint32_dim128_t8.cu => search_multi_cta_float_uint64.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_multi_cta_float_uint32_dim128_t8.cu => search_multi_cta_half_uint32.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_multi_cta_half_uint32_dim256_t16.cu => search_multi_cta_half_uint64.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
 create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint64_dim128_t8.cu => search_single_cta_float_uint32.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint32_dim128_t8.cu => search_single_cta_float_uint64.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_single_cta_float_uint32_dim128_t8.cu => search_single_cta_half_uint32.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
 rename cpp/src/neighbors/detail/cagra/{search_single_cta_half_uint32_dim256_t16.cu => search_single_cta_half_uint64.cu} (80%)
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
 create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
 delete mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
 create mode 100644 cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fec1248bb..d8d554648 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -199,6 +199,96 @@ endif()
 
 # ##################################################################################################
 # * cuvs ---------------------------------------------------------------------
+add_library(
+  cuvs-cagra-search STATIC
+  src/neighbors/cagra_search_float.cu
+  src/neighbors/cagra_search_int8.cu
+  src/neighbors/cagra_search_uint8.cu
+  src/neighbors/detail/cagra/compute_distance.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
+  src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+  src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+  src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
+  src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
+  src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
+  src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
+  src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
+  src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
+)
+
+file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu")
+set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64)
+
+set_target_properties(
+  cuvs-cagra-search
+  PROPERTIES BUILD_RPATH "\$ORIGIN"
+             CXX_STANDARD 17
+             CXX_STANDARD_REQUIRED ON
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON
+             CUDA_SEPARABLE_COMPILATION ON
+             INTERFACE_POSITION_INDEPENDENT_CODE ON
+             POSITION_INDEPENDENT_CODE ON
+)
+target_link_libraries(cuvs-cagra-search PRIVATE raft::raft)
+target_include_directories(
+  cuvs-cagra-search PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+)
+target_compile_options(
+  cuvs-cagra-search PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
+)
 
 add_library(
   cuvs SHARED
@@ -266,109 +356,11 @@ add_library(
   src/neighbors/cagra_extend_int8.cu
   src/neighbors/cagra_extend_uint8.cu
   src/neighbors/cagra_optimize.cu
-  src/neighbors/cagra_search_float.cu
-  src/neighbors/cagra_search_int8.cu
-  src/neighbors/cagra_search_uint8.cu
   src/neighbors/cagra_serialize_float.cu
   src/neighbors/cagra_serialize_int8.cu
   src/neighbors/cagra_serialize_uint8.cu
   src/neighbors/detail/cagra/cagra_build.cpp
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
-  src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-  src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-  src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+  src/neighbors/detail/cagra/topk_for_cagra/topk.cu
   $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
   src/neighbors/ivf_flat_index.cpp
   src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu
@@ -463,7 +455,7 @@ if(NOT BUILD_CPU_ONLY)
   target_link_libraries(
     cuvs
     PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
-    PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> cuvs-cagra-search
   )
 endif()
 
@@ -539,7 +531,7 @@ target_compile_options(
                "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
 )
 # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
-target_link_options(cuvs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
+target_link_options(cuvs PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
 
 # ##################################################################################################
 # * cuvs_c -------------------------------------------------------------------------------
diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 414438067..8218b5f52 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -172,6 +172,22 @@ struct owning_dataset : public strided_dataset<DataT, IdxT> {
   };
 };
 
+template <typename DatasetT>
+struct is_strided_dataset : std::false_type {};
+
+template <typename DataT, typename IdxT>
+struct is_strided_dataset<strided_dataset<DataT, IdxT>> : std::true_type {};
+
+template <typename DataT, typename IdxT>
+struct is_strided_dataset<non_owning_dataset<DataT, IdxT>> : std::true_type {};
+
+template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy>
+struct is_strided_dataset<owning_dataset<DataT, IdxT, LayoutPolicy, ContainerPolicy>>
+  : std::true_type {};
+
+template <typename DatasetT>
+inline constexpr bool is_strided_dataset_v = is_strided_dataset<DatasetT>::value;
+
 /**
  * @brief Contstruct a strided matrix from any mdarray or mdspan.
  *
@@ -284,23 +300,25 @@ auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t
  */
 template <typename MathT, typename IdxT>
 struct vpq_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  using math_type  = MathT;
   /** Vector Quantization codebook - "coarse cluster centers". */
-  raft::device_matrix<MathT, uint32_t, raft::row_major> vq_code_book;
+  raft::device_matrix<math_type, uint32_t, raft::row_major> vq_code_book;
   /** Product Quantization codebook - "fine cluster centers".  */
-  raft::device_matrix<MathT, uint32_t, raft::row_major> pq_code_book;
+  raft::device_matrix<math_type, uint32_t, raft::row_major> pq_code_book;
   /** Compressed dataset.  */
-  raft::device_matrix<uint8_t, IdxT, raft::row_major> data;
+  raft::device_matrix<uint8_t, index_type, raft::row_major> data;
 
-  vpq_dataset(raft::device_matrix<MathT, uint32_t, raft::row_major>&& vq_code_book,
-              raft::device_matrix<MathT, uint32_t, raft::row_major>&& pq_code_book,
-              raft::device_matrix<uint8_t, IdxT, raft::row_major>&& data)
+  vpq_dataset(raft::device_matrix<math_type, uint32_t, raft::row_major>&& vq_code_book,
+              raft::device_matrix<math_type, uint32_t, raft::row_major>&& pq_code_book,
+              raft::device_matrix<uint8_t, index_type, raft::row_major>&& data)
     : vq_code_book{std::move(vq_code_book)},
       pq_code_book{std::move(pq_code_book)},
       data{std::move(data)}
   {
   }
 
-  [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); }
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return data.extent(0); }
   [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); }
   [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
 
@@ -354,6 +372,15 @@ struct vpq_dataset : public dataset<IdxT> {
   }
 };
 
+template <typename DatasetT>
+struct is_vpq_dataset : std::false_type {};
+
+template <typename MathT, typename IdxT>
+struct is_vpq_dataset<vpq_dataset<MathT, IdxT>> : std::true_type {};
+
+template <typename DatasetT>
+inline constexpr bool is_vpq_dataset_v = is_vpq_dataset<DatasetT>::value;
+
 namespace filtering {
 
 /* A filter that filters nothing. This is the default behavior. */
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 1db2dca64..29f790ec5 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -224,7 +224,7 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream)
 }
 
 template <typename T, typename IdxT>
-RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c)
+static __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c)
 {
   IdxT gid = threadIdx.x + blockDim.x * static_cast<IdxT>(blockIdx.x);
   IdxT i   = gid / len_b;
@@ -234,12 +234,12 @@ RAFT_KERNEL outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T*
 }
 
 template <typename T, typename IdxT>
-RAFT_KERNEL block_copy_kernel(const IdxT* in_offsets,
-                              const IdxT* out_offsets,
-                              IdxT n_blocks,
-                              const T* in_data,
-                              T* out_data,
-                              IdxT n_mult)
+static __global__ void block_copy_kernel(const IdxT* in_offsets,
+                                         const IdxT* out_offsets,
+                                         IdxT n_blocks,
+                                         const T* in_data,
+                                         T* out_data,
+                                         IdxT n_mult)
 {
   IdxT i = static_cast<IdxT>(blockDim.x) * static_cast<IdxT>(blockIdx.x) + threadIdx.x;
   // find the source offset using the binary search.
@@ -317,7 +317,7 @@ void outer_add(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c, rmm::cuda_s
 }
 
 template <typename T, typename S, typename IdxT, typename LabelT>
-RAFT_KERNEL copy_selected_kernel(
+static __global__ void copy_selected_kernel(
   IdxT n_rows, IdxT n_cols, const S* src, const LabelT* row_ids, IdxT ld_src, T* dst, IdxT ld_dst)
 {
   IdxT gid   = threadIdx.x + blockDim.x * static_cast<IdxT>(blockIdx.x);
diff --git a/cpp/src/neighbors/detail/cagra/bitonic.hpp b/cpp/src/neighbors/detail/cagra/bitonic.hpp
index 26195bd9c..ed609d6fd 100644
--- a/cpp/src/neighbors/detail/cagra/bitonic.hpp
+++ b/cpp/src/neighbors/detail/cagra/bitonic.hpp
@@ -26,7 +26,7 @@ namespace bitonic {
 namespace detail {
 
 template <class K, class V>
-_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc)
+RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc)
 {
   if ((k0 != k1) && ((k0 < k1) != asc)) {
     const auto tmp_k = k0;
@@ -39,7 +39,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool a
 }
 
 template <class K, class V>
-_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset, const bool asc)
+RAFT_DEVICE_INLINE_FUNCTION void swap_if_needed(K& k0,
+                                                V& v0,
+                                                const unsigned lane_offset,
+                                                const bool asc)
 {
   auto k1 = __shfl_xor_sync(~0u, k0, lane_offset);
   auto v1 = __shfl_xor_sync(~0u, v0, lane_offset);
@@ -51,7 +54,10 @@ _RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
 struct warp_merge_core {
-  _RAFT_DEVICE inline void operator()(K k[N], V v[N], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[N],
+                                              V v[N],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     const auto lane_id = threadIdx.x % warp_size;
 
@@ -93,7 +99,10 @@ struct warp_merge_core {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 6, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[6], V v[6], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[6],
+                                              V v[6],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 6;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -141,7 +150,10 @@ struct warp_merge_core<K, V, 6, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 3, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[3], V v[3], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[3],
+                                              V v[3],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 3;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -171,7 +183,10 @@ struct warp_merge_core<K, V, 3, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 2, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[2], V v[2], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[2],
+                                              V v[2],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     constexpr unsigned N = 2;
     const auto lane_id   = threadIdx.x % warp_size;
@@ -197,7 +212,10 @@ struct warp_merge_core<K, V, 2, warp_size> {
 
 template <class K, class V, unsigned warp_size>
 struct warp_merge_core<K, V, 1, warp_size> {
-  _RAFT_DEVICE inline void operator()(K k[1], V v[1], const std::uint32_t range, const bool asc)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[1],
+                                              V v[1],
+                                              const std::uint32_t range,
+                                              const bool asc)
   {
     const auto lane_id    = threadIdx.x % warp_size;
     const std::uint32_t b = range;
@@ -211,14 +229,15 @@ struct warp_merge_core<K, V, 1, warp_size> {
 }  // namespace detail
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
-__device__ void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true)
+RAFT_DEVICE_INLINE_FUNCTION void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true)
 {
   detail::warp_merge_core<K, V, N, warp_size>{}(k, v, range, asc);
 }
 
 template <class K, class V, unsigned N, unsigned warp_size = 32>
-__device__ void warp_sort(K k[N], V v[N], const bool asc = true)
+RAFT_DEVICE_INLINE_FUNCTION void warp_sort(K k[N], V v[N], const bool asc = true)
 {
+#pragma unroll
   for (std::uint32_t range = 1; range <= warp_size; range <<= 1) {
     warp_merge<K, V, N, warp_size>(k, v, range, asc);
   }
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index cfb5f7919..6dc601f32 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "compute_distance_vpq.cuh"
 #include "factory.cuh"
 #include "search_plan.cuh"
 #include "search_single_cta_inst.cuh"
@@ -85,29 +84,22 @@ inline
   return filter;
 }
 
-template <typename DatasetDescriptorT, typename CagraSampleFilterT>
-void search_main_core(
-  raft::resources const& res,
-  search_params params,
-  DatasetDescriptorT dataset_desc,
-  raft::device_matrix_view<const typename DatasetDescriptorT::INDEX_T, int64_t, raft::row_major>
-    graph,
-  raft::device_matrix_view<const typename DatasetDescriptorT::DATA_T, int64_t, raft::row_major>
-    queries,
-  raft::device_matrix_view<typename DatasetDescriptorT::INDEX_T, int64_t, raft::row_major>
-    neighbors,
-  raft::device_matrix_view<typename DatasetDescriptorT::DISTANCE_T, int64_t, raft::row_major>
-    distances,
-  CagraSampleFilterT sample_filter    = CagraSampleFilterT(),
-  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
+template <typename DataT, typename IndexT, typename DistanceT, typename CagraSampleFilterT>
+void search_main_core(raft::resources const& res,
+                      search_params params,
+                      const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+                      raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                      raft::device_matrix_view<const DataT, int64_t, raft::row_major> queries,
+                      raft::device_matrix_view<IndexT, int64_t, raft::row_major> neighbors,
+                      raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
+                      CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
-                 static_cast<size_t>(dataset_desc.size),
-                 static_cast<size_t>(dataset_desc.dim));
+                 static_cast<size_t>(graph.extent(0)),
+                 static_cast<size_t>(queries.extent(1)));
   RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
                  static_cast<size_t>(queries.extent(0)),
                  static_cast<size_t>(queries.extent(1)));
-  RAFT_EXPECTS(queries.extent(1) == dataset_desc.dim, "Queries and index dim must match");
   const uint32_t topk = neighbors.extent(1);
 
   cudaDeviceProp deviceProp = raft::resource::get_device_properties(res);
@@ -119,12 +111,12 @@ void search_main_core(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)",
     params.max_queries,
     topk,
-    dataset_desc.dim);
+    static_cast<size_t>(queries.extent(1)));  // %zu needs size_t; extent() returns int64_t
 
   using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
-  std::unique_ptr<search_plan_impl<DatasetDescriptorT, CagraSampleFilterT_s>> plan =
-    factory<DatasetDescriptorT, CagraSampleFilterT_s>::create(
-      res, params, dataset_desc.dim, graph.extent(1), topk, metric);
+  std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT_s>> plan =
+    factory<DataT, IndexT, DistanceT, CagraSampleFilterT_s>::create(
+      res, params, dataset_desc, queries.extent(1), graph.extent(1), topk);
 
   plan->check(topk);
 
@@ -134,21 +126,17 @@ void search_main_core(
 
   for (unsigned qid = 0; qid < queries.extent(0); qid += max_queries) {
     const uint32_t n_queries = std::min<std::size_t>(max_queries, queries.extent(0) - qid);
-    auto _topk_indices_ptr =
-      reinterpret_cast<typename DatasetDescriptorT::INDEX_T*>(neighbors.data_handle()) +
-      (topk * qid);
+    auto _topk_indices_ptr   = reinterpret_cast<IndexT*>(neighbors.data_handle()) + (topk * qid);
     auto _topk_distances_ptr = distances.data_handle() + (topk * qid);
     // todo(tfeher): one could keep distances optional and pass nullptr
     const auto* _query_ptr = queries.data_handle() + (query_dim * qid);
     const auto* _seed_ptr =
       plan->num_seeds > 0
-        ? reinterpret_cast<const typename DatasetDescriptorT::INDEX_T*>(plan->dev_seed.data()) +
-            (plan->num_seeds * qid)
+        ? reinterpret_cast<const IndexT*>(plan->dev_seed.data()) + (plan->num_seeds * qid)
         : nullptr;
     uint32_t* _num_executed_iterations = nullptr;
 
     (*plan)(res,
-            dataset_desc,
             graph,
             _topk_indices_ptr,
             _topk_distances_ptr,
@@ -161,77 +149,6 @@ void search_main_core(
   }
 }
 
-template <class T,
-          class DatasetT,
-          class DatasetIdxT,
-          class InternalIdxT,
-          class DistanceT,
-          class CagraSampleFilterT>
-void launch_vpq_search_main_core(
-  raft::resources const& res,
-  const vpq_dataset<DatasetT, DatasetIdxT>* vpq_dset,
-  search_params params,
-  raft::device_matrix_view<const InternalIdxT, int64_t, raft::row_major> graph,
-  raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
-  raft::device_matrix_view<InternalIdxT, int64_t, raft::row_major> neighbors,
-  raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
-  CagraSampleFilterT sample_filter,
-  const cuvs::distance::DistanceType metric)
-{
-  RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now");
-  RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4,
-               "Only pq_len 2 or 4 is supported for now");
-  RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0,
-               "dim must be a multiple of pq_dim at the moment");
-
-  const float vq_scale = 1.0f;
-  const float pq_scale = 1.0f;
-
-  if (vpq_dset->pq_bits() == 8) {
-    if (vpq_dset->pq_len() == 2) {
-      using dataset_desc_t = cagra_q_dataset_descriptor_t<T,
-                                                          DatasetT,
-                                                          8 /*PQ bit*/,
-                                                          2 /* Subspace dimension*/,
-                                                          DistanceT,
-                                                          InternalIdxT>;
-      dataset_desc_t dataset_desc(vpq_dset->data.data_handle(),
-                                  vpq_dset->encoded_row_length(),
-                                  vpq_dset->pq_dim(),
-                                  vpq_dset->vq_code_book.data_handle(),
-                                  vq_scale,
-                                  vpq_dset->pq_code_book.data_handle(),
-                                  pq_scale,
-                                  size_t(vpq_dset->n_rows()),
-                                  vpq_dset->dim());
-      search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
-    } else if (vpq_dset->pq_len() == 4) {
-      using dataset_desc_t = cagra_q_dataset_descriptor_t<T,
-                                                          DatasetT,
-                                                          8 /*PQ bit*/,
-                                                          4 /* Subspace dimension*/,
-                                                          DistanceT,
-                                                          InternalIdxT>;
-      dataset_desc_t dataset_desc(vpq_dset->data.data_handle(),
-                                  vpq_dset->encoded_row_length(),
-                                  vpq_dset->pq_dim(),
-                                  vpq_dset->vq_code_book.data_handle(),
-                                  vq_scale,
-                                  vpq_dset->pq_code_book.data_handle(),
-                                  pq_scale,
-                                  size_t(vpq_dset->n_rows()),
-                                  vpq_dset->dim());
-      search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
-    } else {
-      RAFT_FAIL("Subspace dimension must be 2 or 4");
-    }
-  } else {
-    RAFT_FAIL("Only 8-bit PQ is supported now");
-  }
-}
-
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -264,6 +181,7 @@ void search_main(raft::resources const& res,
                  raft::device_matrix_view<DistanceT, int64_t, raft::row_major> distances,
                  CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
+  auto stream         = raft::resource::get_cuda_stream(res);
   const auto& graph   = index.graph();
   auto graph_internal = raft::make_device_matrix_view<const InternalIdxT, int64_t, raft::row_major>(
     reinterpret_cast<const InternalIdxT*>(graph.data_handle()), graph.extent(0), graph.extent(1));
@@ -273,39 +191,21 @@ void search_main(raft::resources const& res,
   // Dispatch search parameters based on the dataset kind.
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
-    // Set TEAM_SIZE and DATASET_BLOCK_SIZE to zero tentatively since these parameters cannot be
-    // determined here. They are set just before kernel launch.
-    using dataset_desc_t = standard_dataset_descriptor_t<T, InternalIdxT, DistanceT>;
     // Search using a plain (strided) row-major dataset
-    const dataset_desc_t dataset_desc(strided_dset->view().data_handle(),
-                                      strided_dset->n_rows(),
-                                      strided_dset->dim(),
-                                      strided_dset->stride());
-    search_main_core<dataset_desc_t, CagraSampleFilterT>(res,
-                                                         params,
-                                                         dataset_desc,
-                                                         graph_internal,
-                                                         queries,
-                                                         neighbors,
-                                                         distances,
-                                                         sample_filter,
-                                                         index.metric());
+    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+      res, params, *strided_dset, index.metric());
+    search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
+      res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<float, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
     // Search using a compressed dataset
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
-    launch_vpq_search_main_core<T, half, ds_idx_type, InternalIdxT, DistanceT, CagraSampleFilterT>(
-      res,
-      vpq_dset,
-      params,
-      graph_internal,
-      queries,
-      neighbors,
-      distances,
-      sample_filter,
-      index.metric());
+    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+      res, params, *vpq_dset, index.metric());
+    search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
+      res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
   } else if (auto* empty_dset = dynamic_cast<const empty_dataset<ds_idx_type>*>(&index.data());
              empty_dset != nullptr) {
     // Forgot to add a dataset.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
new file mode 100644
index 000000000..8407ef055
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#pragma once
+
+#include "compute_distance_standard.hpp"
+#include "compute_distance_vpq.hpp"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                float,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           float,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                half,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           half,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                int8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           int8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                8,
+                                                128,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                8,
+                                                128,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           8,
+                                           128,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                16,
+                                                256,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           16,
+                                           256,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                                32,
+                                                512,
+                                                uint8_t,
+                                                uint32_t,
+                                                float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           2,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+extern template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                           32,
+                                           512,
+                                           8,
+                                           4,
+                                           half,
+                                           uint8_t,
+                                           uint32_t,
+                                           float>;
+
+extern template struct instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+using descriptor_instances = instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init(const cagra::search_params& params,
+                             const DatasetT& dataset,
+                             cuvs::distance::DistanceType metric,
+                             rmm::cuda_stream_view stream)
+  -> dataset_descriptor_host<DataT, IndexT, DistanceT>
+{  // Selects a compiled descriptor instance for the inputs and builds its host-side handle.
+  auto [make_descriptor, match_priority] =
+    descriptor_instances::select<DataT, IndexT, DistanceT>(params, dataset, metric);
+  if (match_priority < 0 || make_descriptor == nullptr) {  // nothing suitable was compiled in
+    RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
+  }
+  return make_descriptor(params, dataset, metric, stream);
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.cu b/cpp/src/neighbors/detail/cagra/compute_distance.cu
new file mode 100644
index 000000000..45316e59b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.cu
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance-ext.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+
+template struct instance_selector<
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, float, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, float, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, half, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, half, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, int8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, int8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 8, 128, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 16, 256, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 16, 256, 8, 4, half, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, uint8_t, uint32_t, float>,
+  standard_descriptor_spec<DistanceType::InnerProduct, 32, 512, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 2, half, uint8_t, uint32_t, float>,
+  vpq_descriptor_spec<DistanceType::L2Expanded, 32, 512, 8, 4, half, uint8_t, uint32_t, float>>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 2b0c750ff..4bed275ab 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -20,303 +20,363 @@
 #include "utils.hpp"
 
 #include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/common.hpp>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/operators.hpp>
 
 // TODO: This shouldn't be invoking spatial/knn
 #include "../ann_utils.cuh"
 
+#include <raft/util/device_loads_stores.cuh>
 #include <raft/util/vectorized.cuh>
 
+#include <functional>
+#include <memory>
 #include <type_traits>
 
 namespace cuvs::neighbors::cagra::detail {
-namespace device {
 
-// using LOAD_256BIT_T = ulonglong4;
-using LOAD_128BIT_T = uint4;
-using LOAD_64BIT_T  = uint64_t;
-
-template <class LOAD_T, class DATA_T>
-_RAFT_DEVICE constexpr unsigned get_vlen()
-{
-  return utils::size_of<LOAD_T>() / utils::size_of<DATA_T>();
-}
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class DATASET_DESCRIPTOR_T,
-          class DISTANCE_T,
-          class INDEX_T>
-_RAFT_DEVICE void compute_distance_to_random_nodes(
-  INDEX_T* const result_indices_ptr,       // [num_pickup]
-  DISTANCE_T* const result_distances_ptr,  // [num_pickup]
-  const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer,
-  const DATASET_DESCRIPTOR_T& dataset_desc,
-  const std::size_t num_pickup,
-  const unsigned num_distilation,
-  const uint64_t rand_xor_mask,
-  const INDEX_T* const seed_ptr,  // [num_seeds]
-  const uint32_t num_seeds,
-  INDEX_T* const visited_hash_ptr,
-  const uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric,
-  const uint32_t block_id   = 0,
-  const uint32_t num_blocks = 1)
-{
-  uint32_t max_i = num_pickup;
-  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-
-  for (uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += blockDim.x / TEAM_SIZE) {
-    const bool valid_i = (i < num_pickup);
-
-    INDEX_T best_index_team_local;
-    DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
-    for (uint32_t j = 0; j < num_distilation; j++) {
-      // Select a node randomly and compute the distance to it
-      INDEX_T seed_index;
-      if (valid_i) {
-        // uint32_t gid = i + (num_pickup * (j + (num_distilation * block_id)));
-        uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j)));
-        if (seed_ptr && (gid < num_seeds)) {
-          seed_index = seed_ptr[gid];
-        } else {
-          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size;
-        }
-      }
-
-      DISTANCE_T norm2;
-      switch (metric) {
-        case cuvs::distance::DistanceType::L2Expanded:
-          norm2 =
-            dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                     TEAM_SIZE,
-                                                     cuvs::distance::DistanceType::L2Expanded>(
-              query_buffer, seed_index, valid_i);
-          break;
-        case cuvs::distance::DistanceType::InnerProduct:
-          norm2 =
-            dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                     TEAM_SIZE,
-                                                     cuvs::distance::DistanceType::InnerProduct>(
-              query_buffer, seed_index, valid_i);
-          break;
-        default: break;
-      }
-
-      if (valid_i && (norm2 < best_norm2_team_local)) {
-        best_norm2_team_local = norm2;
-        best_index_team_local = seed_index;
+/**
+ * @brief Dataset and distance description.
+ *
+ * This is the base type for the dataset/distance descriptors.
+ * The actual implementations are hidden in `compute_distance_***-impl.cuh` files, which should be
+ * included only in `compute_distance_***.cu` files to enforce separable compilation.
+ *
+ * [Note: manual dispatch]
+ * The descriptor type hierarchy declared here resembles the usual C++ inheritance: the search
+ * kernels take a pointer to the base type as an argument, but the actual implementation types are
+ * passed by the host. The kernels only ever need two functions, `setup_workspace` and
+ * `compute_distance`; the choice of the implementation happens at runtime.
+ *
+ * However, for performance reasons, we don't use the C++ virtual dispatch mechanics here.
+ * The extra pointer-chasing and register usage overheads associated with virtual tables turn out to
+ * cause a significant slowdown in the performance-critical `compute_distance`.
+ * Instead, we manually dispatch the two polymorphic functions and store them as fields in the
+ * descriptor structure.
+ *
+ * [Note: initialization/dispatch]
+ * The host doesn't know the addresses of the device symbols. That means we either need to resolve
+ * the device functions and store them in the descriptor directly on the device, or use
+ * `cudaMemcpyFromSymbolAsync` to fetch them (note, classes have the same problem: if an object
+ * is created on the host, its pointer to the vtable would be invalid on the device).
+ * We take the first approach: there's an `***_init_kernel` for each descriptor instance that is
+ * called before the search kernel; all it does is call a (placement) new with an appropriate type
+ * and arguments in a single GPU thread.
+ *
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t {
+  using base_type  = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using LOAD_T     = device::LOAD_128BIT_T;
+  using DATA_T     = DataT;
+  using INDEX_T    = IndexT;
+  using DISTANCE_T = DistanceT;
+
+  /**
+   * @brief "polymorphic" `compute_distance` arguments.
+   *
+   * This is a tightly-packed POD struct holding the arguments of `compute_distance`.
+   * **Important:** this structure is passed by value to `compute_distance`, so it must remain
+   * small.
+   *
+   * [Note: arguments layout]
+   * The descriptor implementations require different sets of arguments (with a couple of them
+   * overlapping). At the same time, `compute_distance` is defined such that it accepts
+   * `args_t` by value. That means the layout of the struct must be identical for all descriptor
+   * implementations. We work around this requirement by defining generic fields in this struct
+   * and assigning meaning to them on the implementation side.
+   */
+  struct alignas(LOAD_T) args_t {
+    void* extra_ptr1;
+    void* extra_ptr2;
+    /** Pointer to the workspace in the shared memory (filled in every copy by a thread block). */
+    uint32_t smem_ws_ptr;
+    /** Dimensionality of the data/queries. */
+    uint32_t dim;
+    uint32_t extra_word1;
+    uint32_t extra_word2;
+
+    /**
+     * Load this struct from shared memory.
+     *
+     * NB: until `compute_distance` is called, the arguments struct is stored in the shared memory
+     * as a member of the descriptor struct. This helper function saves a few instructions by
+     * forcing the compiler to assume it is indeed in the shared memory address space.
+     */
+    RAFT_DEVICE_INLINE_FUNCTION auto load() const -> args_t
+    {
+      constexpr int kCount = sizeof(*this) / sizeof(LOAD_T);  // size of args_t in LOAD_T units
+      using blob_type      = LOAD_T[kCount];
+      args_t r;
+      auto& src = reinterpret_cast<const blob_type&>(*this);  // view *this as LOAD_T words
+      auto& dst = reinterpret_cast<blob_type&>(r);
+#pragma unroll
+      for (int i = 0; i < kCount; i++) {
+        device::lds(dst[i], src + i);  // copy word i; NOTE(review): presumably a shared-memory load
+      }
+      return r;
+    }
-
-    const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-    if (valid_i && lane_id == 0) {
-      if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
-        result_distances_ptr[i] = best_norm2_team_local;
-        result_indices_ptr[i]   = best_index_team_local;
-      } else {
-        result_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
-        result_indices_ptr[i]   = utils::get_max_value<INDEX_T>();
-      }
+  };
+
+  /** Shared memory usage and team_size packed into a single uint32_t to save on memory requests. */
+  struct smem_and_team_size_t {
+    uint32_t value;
+    RAFT_INLINE_FUNCTION constexpr smem_and_team_size_t(uint32_t smem_size_bytes,
+                                                        uint32_t team_size_bitshift)
+      : value{(team_size_bitshift << 24) | smem_size_bytes}
+    {
     }
-  }
-}
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          unsigned MAX_N_FRAGS,
-          class DATASET_DESCRIPTOR_T,
-          class DISTANCE_T,
-          class INDEX_T>
-_RAFT_DEVICE void compute_distance_to_child_nodes(
-  INDEX_T* const result_child_indices_ptr,
-  DISTANCE_T* const result_child_distances_ptr,
-  // query
-  const typename DATASET_DESCRIPTOR_T::QUERY_T* const query_buffer,
-  // [dataset_dim, dataset_size]
-  const DATASET_DESCRIPTOR_T& dataset_desc,
-  // [knn_k, dataset_size]
-  const INDEX_T* const knn_graph,
-  const std::uint32_t knn_k,
-  // hashmap
-  INDEX_T* const visited_hashmap_ptr,
-  const std::uint32_t hash_bitlen,
-  const INDEX_T* const parent_indices,
-  const INDEX_T* const internal_topk_list,
-  const std::uint32_t search_width,
-  const cuvs::distance::DistanceType metric)
-{
-  constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
-  const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
-
-  // Read child indices of parents from knn graph and check if the distance
-  // computaiton is necessary.
-  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
-    const INDEX_T smem_parent_id = parent_indices[i / knn_k];
-    INDEX_T child_id             = invalid_index;
-    if (smem_parent_id != invalid_index) {
-      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;
-      child_id             = knn_graph[(i % knn_k) + (static_cast<int64_t>(knn_k) * parent_id)];
+    /** Total dynamic shared memory required by the descriptor.  */
+    RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t
+    {
+      return value & 0xffffffu;
     }
-    if (child_id != invalid_index) {
-      if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
-        child_id = invalid_index;
-      }
+    RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t
+    {
+      return (value >> 24) & 0xffu;
+    }
+    /** How many threads are involved in computing a single distance. */
+    RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t
+    {
+      return 1u << team_size_bitshift();
     }
-    result_child_indices_ptr[i] = child_id;
+  };
+  static_assert(sizeof(smem_and_team_size_t) == sizeof(uint32_t));
+
+  using setup_workspace_type  = const base_type*(const base_type*, void*, const DATA_T*, uint32_t);
+  using compute_distance_type = DISTANCE_T(const args_t, const INDEX_T);
+
+  args_t args;
+
+  /** Copy the descriptor and the query into shared memory and do any other work, such as
+   * initializing the codebook. */
+  setup_workspace_type* setup_workspace_impl;
+  /** Compute the distance from the query vector (stored in the smem_workspace) and a dataset vector
+   * given by the dataset_index. */
+  compute_distance_type* compute_distance_impl;
+  /** A placeholder for an implementation-specific pointer. */
+  void* extra_ptr3;
+  smem_and_team_size_t smem_and_team_size;
+
+  /** Number of records in the database. */
+  INDEX_T size;
+
+  RAFT_INLINE_FUNCTION dataset_descriptor_base_t(setup_workspace_type* setup_workspace_impl,
+                                                 compute_distance_type* compute_distance_impl,
+                                                 INDEX_T size,
+                                                 uint32_t dim,
+                                                 uint32_t team_size_bitshift,
+                                                 uint32_t smem_ws_size_in_bytes)
+    : setup_workspace_impl(setup_workspace_impl),
+      compute_distance_impl(compute_distance_impl),
+      size(size),
+      smem_and_team_size(smem_ws_size_in_bytes, team_size_bitshift),
+      args{nullptr, nullptr, 0, dim, 0, 0}
+  {
   }
-  __syncthreads();
 
-  // Compute the distance to child nodes
-  std::uint32_t max_i = knn_k * search_width;
-  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
-  for (std::uint32_t tid = threadIdx.x; tid < max_i * TEAM_SIZE; tid += blockDim.x) {
-    const auto i       = tid / TEAM_SIZE;
-    const bool valid_i = (i < (knn_k * search_width));
-    INDEX_T child_id   = invalid_index;
-    if (valid_i) { child_id = result_child_indices_ptr[i]; }
+  /** Total dynamic shared memory required by the descriptor.  */
+  RAFT_INLINE_FUNCTION constexpr auto smem_ws_size_in_bytes() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.smem_ws_size_in_bytes();
+  }
+  RAFT_INLINE_FUNCTION constexpr auto team_size_bitshift() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.team_size_bitshift();
+  }
+  /** Like team_size_bitshift(), but loads the word via raft::lds (runtime-only, not constexpr). */
+  RAFT_DEVICE_INLINE_FUNCTION auto team_size_bitshift_from_smem() const noexcept -> uint32_t
+  {
+    uint32_t sts;  // raw copy of the packed `smem_and_team_size` word
+    raft::lds(sts, reinterpret_cast<const uint32_t*>(&smem_and_team_size));
+    return reinterpret_cast<smem_and_team_size_t&>(sts).team_size_bitshift();
+  }
 
-    DISTANCE_T norm2;
-    switch (metric) {
-      case cuvs::distance::DistanceType::L2Expanded:
-        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                         TEAM_SIZE,
-                                                         cuvs::distance::DistanceType::L2Expanded>(
-          query_buffer, child_id, child_id != invalid_index);
-        break;
-      case cuvs::distance::DistanceType::InnerProduct:
-        norm2 =
-          dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                   TEAM_SIZE,
-                                                   cuvs::distance::DistanceType::InnerProduct>(
-            query_buffer, child_id, child_id != invalid_index);
-        break;
-      default: break;
-    }
+  /** How many threads are involved in computing a single distance. */
+  RAFT_INLINE_FUNCTION constexpr auto team_size() const noexcept -> uint32_t
+  {
+    return smem_and_team_size.team_size();
+  }
 
-    // Store the distance
-    const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-    if (valid_i && lane_id == 0) {
-      if (child_id != invalid_index) {
-        result_child_distances_ptr[i] = norm2;
-      } else {
-        result_child_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
-      }
-    }
+  RAFT_DEVICE_INLINE_FUNCTION auto setup_workspace(void* smem_ptr,
+                                                   const DATA_T* queries_ptr,
+                                                   uint32_t query_id) const -> const base_type*
+  {
+    return setup_workspace_impl(this, smem_ptr, queries_ptr, query_id);
   }
-}
 
-}  // namespace device
+  RAFT_DEVICE_INLINE_FUNCTION auto compute_distance(INDEX_T dataset_index, bool valid) const
+    -> DISTANCE_T
+  {
+    auto per_thread_distances = valid ? compute_distance_impl(args.load(), dataset_index) : 0;
+    return device::team_sum(per_thread_distances, team_size_bitshift_from_smem());
+  }
+};
 
-template <class QUERY_T_, class DISTANCE_T_, class INDEX_T_>
-struct dataset_descriptor_base_t {
-  using INDEX_T    = INDEX_T_;
-  using QUERY_T    = QUERY_T_;
-  using DISTANCE_T = DISTANCE_T_;
+/**
+ * @brief Hosting a device descriptor.
+ *
+ * The dataset descriptor is initialized on the device side and stays there.
+ * The host struct manages the lifetime of the associated device pointer and a couple parameters
+ * affecting the search kernel launch config.
+ *
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct dataset_descriptor_host {
+  using dev_descriptor_t         = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  uint32_t smem_ws_size_in_bytes = 0;
+  uint32_t team_size             = 0;
+
+  template <typename DescriptorImpl>
+  dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream)
+    : dev_ptr_{[stream]() {
+                 dev_descriptor_t* p;
+                 RAFT_CUDA_TRY(cudaMallocAsync(&p, sizeof(DescriptorImpl), stream));
+                 return p;
+               }(),
+               [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }},
+      smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},
+      team_size{dd_host.team_size()}
+  {
+  }
 
-  const INDEX_T size;
-  const std::uint32_t dim;
+  [[nodiscard]] auto dev_ptr() const -> const dev_descriptor_t* { return dev_ptr_.get(); }
+  [[nodiscard]] auto dev_ptr() -> dev_descriptor_t* { return dev_ptr_.get(); }
 
-  dataset_descriptor_base_t(const INDEX_T size, const std::uint32_t dim) : size(size), dim(dim) {}
+ private:
+  std::unique_ptr<dev_descriptor_t, std::function<void(dev_descriptor_t*)>> dev_ptr_;
 };
 
-template <class DATA_T_, class INDEX_T, class DISTANCE_T = float>
-struct standard_dataset_descriptor_t
-  : public dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T> {
-  using LOAD_T  = device::LOAD_128BIT_T;
-  using DATA_T  = DATA_T_;
-  using QUERY_T = typename dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::QUERY_T;
-
-  const DATA_T* const ptr;
-  const std::size_t ld;
-  using dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::size;
-  using dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>::dim;
-
-  standard_dataset_descriptor_t(const DATA_T* const ptr,
-                                const std::size_t size,
-                                const std::uint32_t dim,
-                                const std::size_t ld)
-    : dataset_descriptor_base_t<float, DISTANCE_T, INDEX_T>(size, dim), ptr(ptr), ld(ld)
+/**
+ * @brief The signature for descriptor initialization.
+ *
+ * There is an init function associated with every descriptor implementation. It's responsible for
+ * initializing the device-side descriptor instance (calling the init kernel).
+ *
+ */
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+using init_desc_type =
+  dataset_descriptor_host<DataT, IndexT, DistanceT> (*)(const cagra::search_params&,
+                                                        const DatasetT&,
+                                                        cuvs::distance::DistanceType,
+                                                        rmm::cuda_stream_view);
+
+/**
+ * @brief Descriptor instance specification.
+ *
+ * This type provides a decentralized way for selecting a descriptor instance best suitable for the
+ * given dataset and distance metric.
+ * There is a spec for every descriptor (described in the interface files
+ * `compute_distance_***.hpp`).
+ *
+ * The `instance_spec` implementation must have the following static member template functions:
+ *   * constexpr bool accepts_dataset()
+ *     - tells whether the spec is compatible with the dataset type, executed at compile time.
+ *   * double priority(..)
+ *     - tells how to select a single spec out of possibly several compatible specs
+ *   * init_desc_type init
+ *     - (see `init_desc_type` above) the function to initialize the descriptor.
+ */
+template <typename DataT, typename IndexT, typename DistanceT>
+struct instance_spec {
+  using data_type     = DataT;
+  using index_type    = IndexT;
+  using distance_type = DistanceT;
+  using host_type     = dataset_descriptor_host<DataT, IndexT, DistanceT>;
+  /** Use this to constrain the input dataset type. */
+  template <typename DatasetT>
+  constexpr static inline bool accepts_dataset()
   {
+    return false;
   }
+};
 
-  static const std::uint32_t smem_buffer_size_in_byte = 0;
-  __device__ void set_smem_ptr(void* const){};
-
-  template <uint32_t DATASET_BLOCK_DIM>
-  __device__ void copy_query(const DATA_T* const dmem_query_ptr,
-                             QUERY_T* const smem_query_ptr,
-                             const std::uint32_t query_smem_buffer_length)
-  {
-    for (unsigned i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-      unsigned j = device::swizzling(i);
-      if (i < dim) {
-        smem_query_ptr[j] =
-          cuvs::spatial::knn::detail::utils::mapping<QUERY_T>{}(dmem_query_ptr[i]);
-      } else {
-        smem_query_ptr[j] = 0.0;
-      }
-    }
+/** Whether the descriptor is compatible with the dataset and arguments at the type level
+ * (compile-time check).
+ */
+template <typename InstanceSpec,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          typename DatasetT>
+constexpr bool spec_sound = std::is_same_v<DataT, typename InstanceSpec::data_type> &&
+                            std::is_same_v<IndexT, typename InstanceSpec::index_type> &&
+                            std::is_same_v<DistanceT, typename InstanceSpec::distance_type> &&
+                            InstanceSpec::template accepts_dataset<DatasetT>();
+
+/**
+ * @brief Get the init function and the priority of the descriptor given by the InstanceSpec.
+ *
+ * @return (init function, priority), or (nullptr, -1.0) when the spec is not `spec_sound`.
+ */
+template <typename InstanceSpec,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          typename DatasetT>
+constexpr auto spec_match(const cagra::search_params& params,
+                          const DatasetT& dataset,
+                          cuvs::distance::DistanceType metric)
+  -> std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>
+{
+  if constexpr (spec_sound<InstanceSpec, DataT, IndexT, DistanceT, DatasetT>) {
+    return std::make_tuple(InstanceSpec::template init<DatasetT>,
+                           InstanceSpec::priority(params, dataset, metric));
+  }
+  return std::make_tuple(nullptr, -1.0);
+}
 
-  template <typename T, cuvs::distance::DistanceType METRIC>
-  std::enable_if_t<METRIC == cuvs::distance::DistanceType::L2Expanded, T> __device__
-  dist_op(T a, T b) const
+/**
+ * @brief Select the best matching descriptor instance from the given type-level list.
+ *
+ * This is a helper struct that goes through the given list of specs (given as template arguments),
+ * filters it (partially at compile time and partially at runtime), and selects the descriptor with
+ * the highest priority.
+ *
+ * There is a single point in the codebase, where all specs are brought together; it's in the
+ * `neighbors/detail/cagra/compute_distance-ext.cuh`, which is generated by
+ * `neighbors/detail/cagra/compute_distance_00_generate.py`.
+ * Hence, `compute_distance_00_generate.py` is the only place you need to manually change to modify
+ * or extend the list of supported dataset descriptors.
+ * The logic of selecting the descriptor is fully defined in this file, whereas the priorities of
+ * specific implementations are defined next to the implementations.
+ */
+template <typename... Specs>
+struct instance_selector {
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params&, const DatasetT&, cuvs::distance::DistanceType)
+    -> std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>
   {
-    T diff = a - b;
-    return diff * diff;
+    return std::make_tuple(nullptr, -1.0);
   }
+};
 
-  template <typename T, cuvs::distance::DistanceType METRIC>
-  std::enable_if_t<METRIC == cuvs::distance::DistanceType::InnerProduct, T> __device__
-  dist_op(T a, T b) const
+template <typename Spec, typename... Specs>
+struct instance_selector<Spec, Specs...> {
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params& params,
+                     const DatasetT& dataset,
+                     cuvs::distance::DistanceType metric)
+    -> std::enable_if_t<spec_sound<Spec, DataT, IndexT, DistanceT, DatasetT>,
+                        std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>>
   {
-    return -a * b;
+    auto s0 = spec_match<Spec, DataT, IndexT, DistanceT, DatasetT>(params, dataset, metric);
+    auto ss = instance_selector<Specs...>::template select<DataT, IndexT, DistanceT, DatasetT>(
+      params, dataset, metric);
+    return std::get<1>(s0) >= std::get<1>(ss) ? s0 : ss;
   }
 
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, cuvs::distance::DistanceType METRIC>
-  __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
-                                           const INDEX_T dataset_i,
-                                           const bool valid) const
+  template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+  static auto select(const cagra::search_params& params,
+                     const DatasetT& dataset,
+                     cuvs::distance::DistanceType metric)
+    -> std::enable_if_t<!spec_sound<Spec, DataT, IndexT, DistanceT, DatasetT>,
+                        std::tuple<init_desc_type<DataT, IndexT, DistanceT, DatasetT>, double>>
   {
-    const auto dataset_ptr  = ptr + dataset_i * ld;
-    const unsigned lane_id  = threadIdx.x % TEAM_SIZE;
-    constexpr unsigned vlen = device::get_vlen<LOAD_T, DATA_T>();
-    // #include <raft/util/cuda_dev_essentials.cuh
-    constexpr unsigned reg_nelem = raft::ceildiv<unsigned>(DATASET_BLOCK_DIM, TEAM_SIZE * vlen);
-    raft::TxN_t<DATA_T, vlen> dl_buff[reg_nelem];
-
-    DISTANCE_T norm2 = 0;
-    if (valid) {
-      for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) {
-#pragma unroll
-        for (uint32_t e = 0; e < reg_nelem; e++) {
-          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset;
-          if (k >= dim) break;
-          dl_buff[e].load(dataset_ptr, k);
-        }
-#pragma unroll
-        for (uint32_t e = 0; e < reg_nelem; e++) {
-          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset;
-          if (k >= dim) break;
-#pragma unroll
-          for (uint32_t v = 0; v < vlen; v++) {
-            const uint32_t kv = k + v;
-            // Note this loop can go above the dataset_dim for padded arrays. This is not a problem
-            // because:
-            // - Above the last element (dataset_dim-1), the query array is filled with zeros.
-            // - The data buffer has to be also padded with zeros.
-            DISTANCE_T d = query_ptr[device::swizzling(kv)];
-            norm2 += dist_op<DISTANCE_T, METRIC>(
-              d, cuvs::spatial::knn::detail::utils::mapping<float>{}(dl_buff[e].val.data[v]));
-          }
-        }
-      }
-    }
-    for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
-      norm2 += __shfl_xor_sync(0xffffffff, norm2, offset);
-    }
-    return norm2;
+    return instance_selector<Specs...>::template select<DataT, IndexT, DistanceT, DatasetT>(
+      params, dataset, metric);
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
new file mode 100644
index 000000000..52a15e2a1
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import glob
+
+template = """/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+{includes}
+
+namespace cuvs::neighbors::cagra::detail {{
+
+using namespace cuvs::distance;
+{content}
+
+}}  // namespace cuvs::neighbors::cagra::detail
+"""
+
+mxdim_team = [(128, 8), (256, 16), (512, 32)]
+#mxdim_team = [(64, 8), (128, 16), (256, 32)]
+#mxdim_team = [(32, 8), (64, 16), (128, 32)]
+
+pq_bits = [8]
+pq_lens = [2, 4]
+
+# rblock = [(256, 4), (512, 2), (1024, 1)]
+# rcandidates = [32]
+# rsize = [256, 512]
+code_book_types = ["half"]
+
+search_types = dict(
+    float_uint32=("float", "uint32_t", "float"),  # data_t, idx_t, distance_t
+    half_uint32=("half", "uint32_t", "float"),
+    int8_uint32=("int8_t", "uint32_t", "float"),
+    uint8_uint32=("uint8_t", "uint32_t", "float"),
+    # float_uint64=("float", "uint64_t", "float"),
+    # half_uint64=("half", "uint64_t", "float"),
+)
+
+metric_prefix = 'DistanceType::'
+
+specs = []
+descs = []
+cmake_list = []
+
+
+
+
+# Cleanup first: drop the instance files generated by previous runs (current directory)
+stale = glob.glob("compute_distance_standard_*.cu")
+stale += glob.glob("compute_distance_vpq_*.cu")
+for f in stale:
+    os.remove(f)
+
+# Generate new files
+for type_path, (data_t, idx_t, distance_t) in search_types.items():
+    for (mxdim, team) in mxdim_team:
+        # CAGRA
+        for metric in ['L2Expanded', 'InnerProduct']:
+            path = f"compute_distance_standard_{metric}_{type_path}_dim{mxdim}_t{team}.cu"
+            includes = '#include "compute_distance_standard-impl.cuh"'
+            params = f"{metric_prefix}{metric}, {team}, {mxdim}, {data_t}, {idx_t}, {distance_t}"
+            spec = f"standard_descriptor_spec<{params}>"
+            content = f"""template struct {spec};"""
+            specs.append(spec)
+            with open(path, "w") as f:
+                f.write(template.format(includes=includes, content=content))
+                cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
+
+        # CAGRA-Q
+        for code_book_t in code_book_types:
+            for pq_len in pq_lens:
+                for pq_bit in pq_bits:
+                    for metric in ['L2Expanded']:
+                        path = f"compute_distance_vpq_{metric}_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{pq_len}subd_{code_book_t}.cu"
+                        includes = '#include "compute_distance_vpq-impl.cuh"'
+                        params = f"{metric_prefix}{metric}, {team}, {mxdim}, {pq_bit}, {pq_len}, {code_book_t}, {data_t}, {idx_t}, {distance_t}"
+                        spec = f"vpq_descriptor_spec<{params}>"
+                        content = f"""template struct {spec};"""
+                        specs.append(spec)
+                        with open(path, "w") as f:
+                            f.write(template.format(includes=includes, content=content))
+                            cmake_list.append(f"  src/neighbors/detail/cagra/{path}")
+
+with open("compute_distance-ext.cuh", "w") as f:
+    includes = '''
+#pragma once
+
+#include "compute_distance_standard.hpp"
+#include "compute_distance_vpq.hpp"
+'''
+    newline = "\n"
+    contents = f'''
+{newline.join(map(lambda s: "extern template struct " + s + ";", specs))}
+
+extern template struct
+  instance_selector<{("," + newline + "                    ").join(specs)}>;
+
+using descriptor_instances =
+  instance_selector<{("," + newline + "                    ").join(specs)}>;
+
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init(const cagra::search_params& params,
+                             const DatasetT& dataset,
+                             cuvs::distance::DistanceType metric,
+                             rmm::cuda_stream_view stream)
+  -> dataset_descriptor_host<DataT, IndexT, DistanceT>
+{{
+  auto [init, priority] = descriptor_instances::select<DataT, IndexT, DistanceT>(params, dataset, metric);
+  if (init == nullptr || priority < 0) {{
+    RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
+  }}
+  return init(params, dataset, metric, stream);
+}}
+'''
+    f.write(template.format(includes=includes, content=contents))
+
+
+with open("compute_distance.cu", "w") as f:
+    includes = '#include "compute_distance-ext.cuh"'
+    newline = "\n"
+    contents = f'''
+template struct instance_selector<{("," + newline + "                    ").join(specs)}>;
+'''
+    f.write(template.format(includes=includes, content=contents))
+
+cmake_list.sort()
+for path in cmake_list:
+    print(path)
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
new file mode 100644
index 000000000..b0205508a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_distance_standard.hpp"
+
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+namespace {
+template <typename T, cuvs::distance::DistanceType Metric>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b)
+  -> std::enable_if_t<Metric == cuvs::distance::DistanceType::L2Expanded, T>
+{
+  T diff = a - b;
+  return diff * diff;
+}
+
+template <typename T, cuvs::distance::DistanceType Metric>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto dist_op(T a, T b)
+  -> std::enable_if_t<Metric == cuvs::distance::DistanceType::InnerProduct, T>
+{
+  return -a * b;
+}
+}  // namespace
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct standard_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
+  using base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using QUERY_T   = float;
+  using base_type::args;
+  using base_type::smem_ws_size_in_bytes;
+  using typename base_type::args_t;
+  using typename base_type::compute_distance_type;
+  using typename base_type::DATA_T;
+  using typename base_type::DISTANCE_T;
+  using typename base_type::INDEX_T;
+  using typename base_type::LOAD_T;
+  using typename base_type::setup_workspace_type;
+  constexpr static inline auto kMetric          = Metric;
+  constexpr static inline auto kTeamSize        = TeamSize;
+  constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
+
+  static constexpr RAFT_INLINE_FUNCTION auto ptr(const args_t& args) noexcept
+    -> const DATA_T* const&
+  {
+    return (const DATA_T* const&)(args.extra_ptr1);
+  }
+  static constexpr RAFT_INLINE_FUNCTION auto ptr(args_t& args) noexcept -> const DATA_T*&
+  {
+    return (const DATA_T*&)(args.extra_ptr1);
+  }
+
+  static constexpr RAFT_INLINE_FUNCTION auto ld(const args_t& args) noexcept -> const uint32_t&
+  {
+    return args.extra_word1;
+  }
+  static constexpr RAFT_INLINE_FUNCTION auto ld(args_t& args) noexcept -> uint32_t&
+  {
+    return args.extra_word1;
+  }
+
+  _RAFT_HOST_DEVICE standard_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
+                                                  compute_distance_type* compute_distance_impl,
+                                                  const DATA_T* ptr,
+                                                  INDEX_T size,
+                                                  uint32_t dim,
+                                                  uint32_t ld)
+    : base_type(setup_workspace_impl,
+                compute_distance_impl,
+                size,
+                dim,
+                raft::Pow2<TeamSize>::Log2,
+                get_smem_ws_size_in_bytes(dim))
+  {
+    standard_dataset_descriptor_t::ptr(args) = ptr;
+    standard_dataset_descriptor_t::ld(args)  = ld;
+    static_assert(sizeof(*this) == sizeof(base_type));
+    static_assert(alignof(standard_dataset_descriptor_t) == alignof(base_type));
+  }
+
+ private:
+  RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t
+  {
+    return sizeof(standard_dataset_descriptor_t) +
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+  }
+};
+
+/** Copy the descriptor and the query `query_id` into the workspace at `smem_ptr`. */
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto setup_workspace_standard(
+  const DescriptorT* that,
+  void* smem_ptr,
+  const typename DescriptorT::DATA_T* queries_ptr,
+  uint32_t query_id) -> const DescriptorT*
+{
+  using DATA_T                    = typename DescriptorT::DATA_T;
+  using LOAD_T                    = typename DescriptorT::LOAD_T;
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  using word_type                 = uint32_t;
+  constexpr auto kTeamSize        = DescriptorT::kTeamSize;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  auto* r                         = reinterpret_cast<DescriptorT*>(smem_ptr);
+  auto* buf                       = reinterpret_cast<QUERY_T*>(r + 1);
+  if (r != that) {  // the workspace does not hold the descriptor yet: copy it word by word
+    constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type);
+    using blob_type           = word_type[kCount];
+    auto& src                 = reinterpret_cast<const blob_type&>(*that);
+    auto& dst                 = reinterpret_cast<blob_type&>(*r);
+    for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) {
+      dst[i] = src[i];
+    }
+    const auto smem_ptr_offset =
+      reinterpret_cast<uint8_t*>(&(r->args.smem_ws_ptr)) - reinterpret_cast<uint8_t*>(r);
+    if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) {
+      r->args.smem_ws_ptr = uint32_t(__cvta_generic_to_shared(buf));
+    }
+    __syncthreads();
+  }
+
+  uint32_t dim        = r->args.dim;
+  auto buf_len        = raft::round_up_safe<uint32_t>(dim, kDatasetBlockDim);
+  constexpr auto vlen = device::get_vlen<LOAD_T, DATA_T>();
+  queries_ptr += static_cast<std::uint64_t>(dim) * query_id;  // 64-bit offset: no uint32 wrap
+  for (unsigned i = threadIdx.x; i < buf_len; i += blockDim.x) {
+    unsigned j = device::swizzling<kDatasetBlockDim, vlen * kTeamSize>(i);
+    if (i < dim) {
+      buf[j] = cuvs::spatial::knn::detail::utils::mapping<QUERY_T>{}(queries_ptr[i]);
+    } else {
+      buf[j] = 0.0;  // zero-pad the tail of the buffer
+    }
+  }
+
+  return const_cast<const DescriptorT*>(r);
+}
+
+template <typename DescriptorT>
+RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_standard_worker(
+  const typename DescriptorT::DATA_T* __restrict__ dataset_ptr,
+  uint32_t dim,
+  uint32_t query_smem_ptr) -> typename DescriptorT::DISTANCE_T
+{
+  using DATA_T                    = typename DescriptorT::DATA_T;
+  using DISTANCE_T                = typename DescriptorT::DISTANCE_T;
+  using LOAD_T                    = typename DescriptorT::LOAD_T;
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  constexpr auto kTeamSize        = DescriptorT::kTeamSize;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto vlen             = device::get_vlen<LOAD_T, DATA_T>();
+  constexpr auto reg_nelem =
+    raft::div_rounding_up_unsafe<uint32_t>(kDatasetBlockDim, kTeamSize * vlen);
+
+  DISTANCE_T r = 0;
+  for (uint32_t elem_offset = (threadIdx.x % kTeamSize) * vlen; elem_offset < dim;
+       elem_offset += kDatasetBlockDim) {
+    DATA_T data[reg_nelem][vlen];
+#pragma unroll
+    for (uint32_t e = 0; e < reg_nelem; e++) {
+      const uint32_t k = e * (kTeamSize * vlen) + elem_offset;
+      if (k >= dim) break;
+      device::ldg_cg(reinterpret_cast<LOAD_T&>(data[e]),
+                     reinterpret_cast<const LOAD_T*>(dataset_ptr + k));
+    }
+#pragma unroll
+    for (uint32_t e = 0; e < reg_nelem; e++) {
+      const uint32_t k = e * (kTeamSize * vlen) + elem_offset;
+      if (k >= dim) break;
+#pragma unroll
+      for (uint32_t v = 0; v < vlen; v++) {
+        // Note this loop can go above the dataset_dim for padded arrays. This is not a problem
+        // because:
+        // - Above the last element (dataset_dim-1), the query array is filled with zeros.
+        // - The data buffer has to be also padded with zeros.
+        DISTANCE_T d;
+        device::lds(
+          d,
+          query_smem_ptr +
+            sizeof(QUERY_T) * device::swizzling<kDatasetBlockDim, vlen * kTeamSize>(k + v));
+        r += dist_op<DISTANCE_T, DescriptorT::kMetric>(
+          d, cuvs::spatial::knn::detail::utils::mapping<DISTANCE_T>{}(data[e][v]));
+      }
+    }
+  }
+  return r;
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto compute_distance_standard(
+  const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) ->
+  typename DescriptorT::DISTANCE_T
+{
+  return compute_distance_standard_worker<DescriptorT>(
+    DescriptorT::ptr(args) + (static_cast<std::uint64_t>(DescriptorT::ld(args)) * dataset_index),
+    args.dim,
+    args.smem_ws_ptr);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+RAFT_KERNEL __launch_bounds__(1, 1)
+  standard_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
+                                          const DataT* ptr,
+                                          IndexT size,
+                                          uint32_t dim,
+                                          uint32_t ld)
+{
+  using desc_type =
+    standard_dataset_descriptor_t<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>;
+  using base_type = typename desc_type::base_type;
+  new (out) desc_type(reinterpret_cast<typename base_type::setup_workspace_type*>(
+                        &setup_workspace_standard<desc_type>),
+                      reinterpret_cast<typename base_type::compute_distance_type*>(
+                        &compute_distance_standard<desc_type>),
+                      ptr,
+                      size,
+                      dim,
+                      ld);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+dataset_descriptor_host<DataT, IndexT, DistanceT>
+standard_descriptor_spec<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>::init_(
+  const cagra::search_params& params,
+  const DataT* ptr,
+  IndexT size,
+  uint32_t dim,
+  uint32_t ld,
+  rmm::cuda_stream_view stream)
+{
+  using desc_type =
+    standard_dataset_descriptor_t<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>;
+  // Host-side copy (null function pointers); the device instance is built by the kernel.
+  desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld};
+  host_type result{dd_host, stream};
+
+  standard_dataset_descriptor_init_kernel<Metric,
+                                          TeamSize,
+                                          DatasetBlockDim,
+                                          DataT,
+                                          IndexT,
+                                          DistanceT>
+    <<<1, 1, 0, stream>>>(result.dev_ptr(), ptr, size, dim, desc_type::ld(dd_host.args));
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  return result;
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
new file mode 100644
index 000000000..df1b77e86
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_distance.hpp"
+
+#include <cuvs/distance/distance.hpp>
+
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+/**
+ * Instance spec for the standard dataset descriptor, fixed at compile time to one combination of
+ * metric, team size, and dataset block dimension.
+ *
+ * @tparam Metric          distance metric this instance is compiled for
+ * @tparam TeamSize        team size this instance is compiled for; matched against
+ *                         `search_params::team_size` in priority()
+ * @tparam DatasetBlockDim block dimension; priority() favors datasets whose dim is closest to it
+ * @tparam DataT           element type of the dataset
+ * @tparam IndexT          type used for the dataset size
+ * @tparam DistanceT       distance type forwarded to `instance_spec`
+ */
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct standard_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
+  using base_type = instance_spec<DataT, IndexT, DistanceT>;
+  using typename base_type::data_type;
+  using typename base_type::distance_type;
+  using typename base_type::host_type;
+  using typename base_type::index_type;
+
+  /** Whether this spec can describe `DatasetT`: the value of `is_strided_dataset_v<DatasetT>`. */
+  template <typename DatasetT>
+  constexpr static inline bool accepts_dataset()
+  {
+    return is_strided_dataset_v<DatasetT>;
+  }
+
+  /**
+   * Build the descriptor for `dataset` by forwarding its data pointer, row count, dimensionality
+   * and stride to init_(). The `metric` argument is not consulted here.
+   */
+  template <typename DatasetT>
+  static auto init(const cagra::search_params& params,
+                   const DatasetT& dataset,
+                   cuvs::distance::DistanceType metric,
+                   rmm::cuda_stream_view stream) -> host_type
+  {
+    return init_(params,
+                 dataset.view().data_handle(),
+                 IndexT(dataset.n_rows()),
+                 dataset.dim(),
+                 dataset.stride(),
+                 stream);
+  }
+
+  /**
+   * Score how well this instance suits the given search parameters, dataset and metric.
+   *
+   * @return -1.0 if an explicit `params.team_size` or the runtime `metric` does not match the
+   *         template arguments; otherwise a positive value that grows as `dataset.dim()`
+   *         approaches `DatasetBlockDim` (maximum 10.0 on an exact match).
+   */
+  template <typename DatasetT>
+  static auto priority(const cagra::search_params& params,
+                       const DatasetT& dataset,
+                       cuvs::distance::DistanceType metric) -> double
+  {
+    // If explicit team_size is specified and doesn't match the instance, discard it
+    if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
+    if (Metric != metric) { return -1.0; }
+    // Otherwise, favor the closest dataset dimensionality.
+    return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim)));
+  }
+
+ private:
+  // Non-template back end of init(); only declared here, defined out of line.
+  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(const cagra::search_params& params,
+                                                                 const DataT* ptr,
+                                                                 IndexT size,
+                                                                 uint32_t dim,
+                                                                 uint32_t ld,
+                                                                 rmm::cuda_stream_view stream);
+};
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
index 1116eaaa4..af5e89a76 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
index 7e3ec363d..332eb6bf9 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
index af60c776a..3e5c11240 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         float,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
new file mode 100644
index 000000000..92ca114f7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct, 8, 128, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
similarity index 51%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
index 5dd79a79b..cfad79f3a 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu
@@ -15,22 +15,24 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_standard-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         half,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
new file mode 100644
index 000000000..8c208044b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         half,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..929df5bbe
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..3cc4a2c95
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..a87e866eb
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..650d9ecac
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         8,
+                                         128,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..6f7f4b97f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         16,
+                                         256,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..e7b96ab49
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::InnerProduct,
+                                         32,
+                                         512,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
new file mode 100644
index 000000000..b45cf3669
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
new file mode 100644
index 000000000..7d1206c37
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
new file mode 100644
index 000000000..251316b2c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, float, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
new file mode 100644
index 000000000..e3870df40
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
new file mode 100644
index 000000000..1253d7cd4
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 16, 256, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
new file mode 100644
index 000000000..792532c2c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 32, 512, half, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..c9c960cf9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded, 8, 128, int8_t, uint32_t, float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..d7a12804b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         16,
+                                         256,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..a4f06c283
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         32,
+                                         512,
+                                         int8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
new file mode 100644
index 000000000..199f05e49
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         8,
+                                         128,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
new file mode 100644
index 000000000..0962ecd82
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         16,
+                                         256,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
new file mode 100644
index 000000000..9c7e4ab03
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_standard-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct standard_descriptor_spec<DistanceType::L2Expanded,
+                                         32,
+                                         512,
+                                         uint8_t,
+                                         uint32_t,
+                                         float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
new file mode 100644
index 000000000..86c592502
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "compute_distance_vpq.hpp"
+
+#include <cuvs/distance/distance.hpp>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PQ_BITS,
+          uint32_t PQ_LEN,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
+  using base_type   = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using CODE_BOOK_T = CodebookT;
+  using QUERY_T     = half;  // element type of the query copy kept in the smem workspace
+  using base_type::args;
+  using base_type::extra_ptr3;
+  using typename base_type::args_t;
+  using typename base_type::compute_distance_type;
+  using typename base_type::DATA_T;
+  using typename base_type::DISTANCE_T;
+  using typename base_type::INDEX_T;
+  using typename base_type::LOAD_T;
+  using typename base_type::setup_workspace_type;
+  constexpr static inline auto kMetric          = Metric;
+  constexpr static inline auto kTeamSize        = TeamSize;
+  constexpr static inline auto kDatasetBlockDim = DatasetBlockDim;
+  constexpr static inline auto kPqBits          = PQ_BITS;
+  constexpr static inline auto kPqLen           = PQ_LEN;
+
+  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
+
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(args_t& args) noexcept
+    -> const uint8_t*&
+  {
+    return (const uint8_t*&)args.extra_ptr1;  // typed view of the generic extra_ptr1 slot
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(args_t& args) noexcept
+    -> const CODE_BOOK_T*&
+  {
+    return (const CODE_BOOK_T*&)args.extra_ptr2;  // typed view of the generic extra_ptr2 slot
+  }
+  RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() noexcept -> const CODE_BOOK_T*&
+  {
+    return (const CODE_BOOK_T*&)extra_ptr3;  // lives in the descriptor itself, not in args
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(args_t& args) noexcept -> uint32_t&
+  {
+    return args.extra_word1;  // used as the per-row stride in compute_distance_vpq
+  }
+
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_ptr(const args_t& args) noexcept
+    -> const uint8_t* const&
+  {
+    return (const uint8_t*&)args.extra_ptr1;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto vq_code_book_ptr(const args_t& args) noexcept
+    -> const CODE_BOOK_T* const&
+  {
+    return (const CODE_BOOK_T*&)args.extra_ptr2;
+  }
+  RAFT_INLINE_FUNCTION constexpr auto pq_code_book_ptr() const noexcept -> const CODE_BOOK_T* const&
+  {
+    return (const CODE_BOOK_T*&)extra_ptr3;
+  }
+  RAFT_INLINE_FUNCTION static constexpr auto encoded_dataset_dim(const args_t& args) noexcept
+    -> const uint32_t&
+  {
+    return args.extra_word1;
+  }
+
+  static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
+    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();  // 2^PQ_BITS entries of PQ_LEN elems
+
+  _RAFT_HOST_DEVICE cagra_q_dataset_descriptor_t(setup_workspace_type* setup_workspace_impl,
+                                                 compute_distance_type* compute_distance_impl,
+                                                 const std::uint8_t* encoded_dataset_ptr,
+                                                 std::uint32_t encoded_dataset_dim,
+                                                 const CODE_BOOK_T* vq_code_book_ptr,
+                                                 const CODE_BOOK_T* pq_code_book_ptr,
+                                                 IndexT size,
+                                                 std::uint32_t dim)
+    : base_type(setup_workspace_impl,
+                compute_distance_impl,
+                size,
+                dim,
+                raft::Pow2<TeamSize>::Log2,
+                get_smem_ws_size_in_bytes(dim))
+  {
+    cagra_q_dataset_descriptor_t::encoded_dataset_ptr(args) = encoded_dataset_ptr;
+    cagra_q_dataset_descriptor_t::vq_code_book_ptr(args)    = vq_code_book_ptr;
+    this->pq_code_book_ptr()                                = pq_code_book_ptr;
+    cagra_q_dataset_descriptor_t::encoded_dataset_dim(args) = encoded_dataset_dim;
+    static_assert(sizeof(*this) == sizeof(base_type));  // derived type must add no storage
+    static_assert(alignof(cagra_q_dataset_descriptor_t) == alignof(base_type));
+  }
+
+ private:
+  RAFT_INLINE_FUNCTION constexpr static auto get_smem_ws_size_in_bytes(uint32_t dim) -> uint32_t
+  {
+    /* SMEM workspace layout:
+      1. The descriptor itself
+      2. Codebook (kSMemCodeBookSizeInBytes bytes)
+      3. Queries (dim rounded up to DatasetBlockDim, QUERY_T elems)
+    */
+    return sizeof(cagra_q_dataset_descriptor_t) + kSMemCodeBookSizeInBytes +
+           raft::round_up_safe<uint32_t>(dim, DatasetBlockDim) * sizeof(QUERY_T);
+  }
+};
+
+template <auto Block, auto Stride, typename T>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto transpose(T x) -> T
+{  // Permutes index x inside its Block-sized chunk: (l, k) of an l-major layout becomes (k, l).
+  auto i = x % Block;   // offset inside the chunk
+  auto j = x / Block;   // chunk index, preserved in the result
+  auto k = i % Stride;  // minor coordinate, range [0, Stride)
+  auto l = i / Stride;  // major coordinate, range [0, Block / Stride)
+  return j * Block + k * (Block / Stride) + l;  // k becomes the major coordinate
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto setup_workspace_vpq(const DescriptorT* that,
+                                                   void* smem_ptr,
+                                                   const typename DescriptorT::DATA_T* queries_ptr,
+                                                   uint32_t query_id) -> const DescriptorT*
+{  // Stages descriptor + PQ codebook (if needed) and one query at smem_ptr; returns that copy.
+  using QUERY_T                   = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T               = typename DescriptorT::CODE_BOOK_T;
+  using word_type                 = uint32_t;
+  constexpr auto kDatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS          = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN           = DescriptorT::kPqLen;
+
+  auto* r = reinterpret_cast<DescriptorT*>(smem_ptr);
+
+  if (r != that) {  // `that` does not already live at smem_ptr: copy it in first
+    constexpr uint32_t kCount = sizeof(DescriptorT) / sizeof(word_type);
+    using blob_type           = word_type[kCount];
+    auto& src                 = reinterpret_cast<const blob_type&>(*that);
+    auto& dst                 = reinterpret_cast<blob_type&>(*r);
+    for (uint32_t i = threadIdx.x; i < kCount; i += blockDim.x) {  // word-wise, block-strided copy
+      dst[i] = src[i];
+    }
+
+    auto codebook_buf = uint32_t(__cvta_generic_to_shared(r + 1));  // smem address after descriptor
+    const auto smem_ptr_offset =
+      reinterpret_cast<uint8_t*>(&(r->args.smem_ws_ptr)) - reinterpret_cast<uint8_t*>(r);
+    // NOTE(review): presumably picks the thread that copied this very word above - confirm
+    if (threadIdx.x == uint32_t(smem_ptr_offset / sizeof(word_type))) {
+      r->args.smem_ws_ptr = codebook_buf;
+    }
+    __syncthreads();
+
+    // Copy PQ table
+    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
+      half2 buf2;
+      buf2.x = r->pq_code_book_ptr()[i];
+      buf2.y = r->pq_code_book_ptr()[i + 1];
+
+      // Change the order of PQ code book array to reduce the
+      // frequency of bank conflicts.
+      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
+      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+      const auto j                          = i / num_elements_per_bank;
+      const auto smem_index =
+        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+
+      device::sts(codebook_buf + smem_index * sizeof(half2), buf2);
+    }
+  }
+
+  uint32_t dim = r->args.dim;
+  queries_ptr += dim * query_id;  // advance to the requested query (rows of `dim` elements)
+
+  constexpr cuvs::spatial::knn::detail::utils::mapping<QUERY_T> mapping{};
+  auto smem_query_ptr =
+    reinterpret_cast<QUERY_T*>(reinterpret_cast<uint8_t*>(smem_ptr) + sizeof(DescriptorT) +
+                               DescriptorT::kSMemCodeBookSizeInBytes);
+  for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
+    half2 buf2{0, 0};
+    if (i < dim) { buf2.x = mapping(queries_ptr[i]); }  // always true given the loop condition
+    if (i + 1 < dim) { buf2.y = mapping(queries_ptr[i + 1]); }
+    if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
+      // Transpose the queries buffer to avoid bank conflicts in compute_distance.
+      constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+      constexpr auto kStride  = vlen * PQ_LEN / 2;
+      reinterpret_cast<half2*>(smem_query_ptr)[transpose<kDatasetBlockDim / 2, kStride>(i / 2)] =
+        buf2;
+    } else {
+      (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
+    }
+  }
+
+  return const_cast<const DescriptorT*>(r);
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE RAFT_DEVICE_INLINE_FUNCTION auto compute_distance_vpq_worker(
+  const uint8_t* __restrict__ dataset_ptr,
+  const typename DescriptorT::CODE_BOOK_T* __restrict__ vq_code_book_ptr,
+  uint32_t dim,
+  uint32_t pq_codebook_ptr) -> typename DescriptorT::DISTANCE_T
+{  // This lane's share of the squared distance between the smem query and one encoded row.
+  using DISTANCE_T               = typename DescriptorT::DISTANCE_T;
+  using LOAD_T                   = typename DescriptorT::LOAD_T;
+  using QUERY_T                  = typename DescriptorT::QUERY_T;
+  using CODE_BOOK_T              = typename DescriptorT::CODE_BOOK_T;
+  constexpr auto TeamSize        = DescriptorT::kTeamSize;
+  constexpr auto DatasetBlockDim = DescriptorT::kDatasetBlockDim;
+  constexpr auto PQ_BITS         = DescriptorT::kPqBits;
+  constexpr auto PQ_LEN          = DescriptorT::kPqLen;
+
+  const uint32_t query_ptr = pq_codebook_ptr + DescriptorT::kSMemCodeBookSizeInBytes;
+  static_assert(PQ_BITS == 8, "Only pq_bits == 8 is supported at the moment.");
+  constexpr uint32_t vlen = 4;  // **** DO NOT CHANGE ****
+  constexpr uint32_t nelem =
+    raft::div_rounding_up_unsafe<uint32_t>(DatasetBlockDim / PQ_LEN, TeamSize * vlen);
+
+  constexpr auto kTeamMask = DescriptorT::kTeamSize - 1;
+  constexpr auto kTeamVLen = TeamSize * vlen;
+
+  const auto n_subspace = raft::div_rounding_up_unsafe(dim, PQ_LEN);
+  const auto laneId     = threadIdx.x & kTeamMask;
+  DISTANCE_T norm       = 0;
+  for (uint32_t elem_offset = 0; elem_offset * PQ_LEN < dim;
+       elem_offset += DatasetBlockDim / PQ_LEN) {
+    // Loading PQ codes
+    uint32_t pq_codes[nelem];
+#pragma unroll
+    for (std::uint32_t e = 0; e < nelem; e++) {
+      const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+      if (k >= n_subspace) break;
+      // Load 4 x 8-bit PQ codes with one 32-bit load; +4 skips the row's leading vq_code
+      device::ldg_cg(pq_codes[e], reinterpret_cast<const std::uint32_t*>(dataset_ptr + 4 + k));
+    }
+    // Accumulate the squared differences for the codes loaded above
+    if constexpr (PQ_LEN % 2 == 0) {
+      // **** Use half2 for distance computation ****
+#pragma unroll
+      for (std::uint32_t e = 0; e < nelem; e++) {
+        const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+        if (k >= n_subspace) break;
+        // Loading VQ code-book
+        half2 vq_vals[PQ_LEN][vlen / 2];
+#pragma unroll
+        for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+          const uint32_t d = (vlen * m) + (PQ_LEN * k);
+          if (d >= dim) break;
+          device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
+        }
+        // Compute distance
+        std::uint32_t pq_code = pq_codes[e];
+#pragma unroll
+        for (std::uint32_t v = 0; v < vlen; v++) {
+          if (PQ_LEN * (v + k) >= dim) break;
+#pragma unroll
+          for (std::uint32_t m = 0; m < PQ_LEN / 2; m++) {
+            constexpr auto kQueryBlock = DatasetBlockDim / (vlen * PQ_LEN);
+            const std::uint32_t d1     = m + (PQ_LEN / 2) * v;
+            const std::uint32_t d =
+              d1 * kQueryBlock + elem_offset * (PQ_LEN / 2) + e * TeamSize + laneId;
+            half2 q2, c2;
+            // Loading query vector from smem
+            device::lds(q2, query_ptr + sizeof(half2) * d);
+            // Loading PQ code book from smem
+            device::lds(c2,
+                        pq_codebook_ptr +
+                          sizeof(CODE_BOOK_T) * ((1 << PQ_BITS) * 2 * m + (2 * (pq_code & 0xff))));
+            // L2 distance
+            auto dist = q2 - c2 - reinterpret_cast<half2(&)[PQ_LEN * vlen / 2]>(vq_vals)[d1];
+            dist      = dist * dist;
+            norm += static_cast<DISTANCE_T>(dist.x + dist.y);
+          }
+          pq_code >>= 8;  // move on to the next 8-bit code packed in the word
+        }
+      }
+    } else {
+      // **** Use float for distance computation ****
+#pragma unroll
+      for (std::uint32_t e = 0; e < nelem; e++) {
+        const std::uint32_t k = e * kTeamVLen + elem_offset + laneId * vlen;
+        if (k >= n_subspace) break;
+        // Loading VQ code-book
+        CODE_BOOK_T vq_vals[PQ_LEN][vlen];
+#pragma unroll
+        for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+          const std::uint32_t d = (vlen * m) + (PQ_LEN * k);
+          if (d >= dim) break;
+          // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device memory)
+          device::ldg_ca(vq_vals[m], vq_code_book_ptr + d);
+        }
+        // Compute distance
+        std::uint32_t pq_code = pq_codes[e];
+#pragma unroll
+        for (std::uint32_t v = 0; v < vlen; v++) {
+          if (PQ_LEN * (v + k) >= dim) break;
+          CODE_BOOK_T pq_vals[PQ_LEN];
+          device::lds(pq_vals, pq_codebook_ptr + sizeof(CODE_BOOK_T) * PQ_LEN * (pq_code & 0xff));
+#pragma unroll
+          for (std::uint32_t m = 0; m < PQ_LEN; m++) {
+            const std::uint32_t d1 = m + (PQ_LEN * v);
+            const std::uint32_t d  = d1 + (PQ_LEN * k);
+            // if (d >= dataset_dim) break;
+            DISTANCE_T diff;
+            device::lds(diff, query_ptr + sizeof(QUERY_T) * d);
+            diff -= static_cast<DISTANCE_T>(pq_vals[m]);
+            diff -=
+              static_cast<DISTANCE_T>(reinterpret_cast<CODE_BOOK_T(&)[PQ_LEN * vlen]>(vq_vals)[d1]);
+            norm += diff * diff;
+          }
+          pq_code >>= 8;  // move on to the next 8-bit code packed in the word
+        }
+      }
+    }
+  }
+  return norm;
+}
+
+template <typename DescriptorT>
+_RAFT_DEVICE __noinline__ auto compute_distance_vpq(
+  const typename DescriptorT::args_t args, const typename DescriptorT::INDEX_T dataset_index) ->
+  typename DescriptorT::DISTANCE_T
+{  // Locates the encoded row `dataset_index`, reads its VQ code and delegates to the worker.
+  const auto* dataset_ptr =
+    DescriptorT::encoded_dataset_ptr(args) +
+    (static_cast<std::uint64_t>(DescriptorT::encoded_dataset_dim(args)) * dataset_index);
+  uint32_t vq_code;  // the row starts with a 32-bit VQ code
+  device::ldg_cg(vq_code, reinterpret_cast<const std::uint32_t*>(dataset_ptr));
+  return compute_distance_vpq_worker<DescriptorT>(
+    dataset_ptr /* row start: the worker itself skips the 4-byte vq_code */,
+    DescriptorT::vq_code_book_ptr(args) + args.dim * vq_code,
+    args.dim,
+    args.smem_ws_ptr);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+RAFT_KERNEL __launch_bounds__(1, 1)
+  vpq_dataset_descriptor_init_kernel(dataset_descriptor_base_t<DataT, IndexT, DistanceT>* out,
+                                     const std::uint8_t* encoded_dataset_ptr,
+                                     uint32_t encoded_dataset_dim,
+                                     const CodebookT* vq_code_book_ptr,
+                                     const CodebookT* pq_code_book_ptr,
+                                     IndexT size,
+                                     uint32_t dim)
+{  // Constructs the VPQ descriptor in place at `out`; the launch in this file is <<<1, 1>>>.
+  using desc_type = cagra_q_dataset_descriptor_t<Metric,
+                                                 TeamSize,
+                                                 DatasetBlockDim,
+                                                 PqBits,
+                                                 PqLen,
+                                                 CodebookT,
+                                                 DataT,
+                                                 IndexT,
+                                                 DistanceT>;
+  using base_type = typename desc_type::base_type;
+  new (out) desc_type(  // NOTE(review): presumably in-kernel to get device function addresses
+    reinterpret_cast<typename base_type::setup_workspace_type*>(&setup_workspace_vpq<desc_type>),
+    reinterpret_cast<typename base_type::compute_distance_type*>(&compute_distance_vpq<desc_type>),
+    encoded_dataset_ptr,
+    encoded_dataset_dim,
+    vq_code_book_ptr,
+    pq_code_book_ptr,
+    size,
+    dim);
+}
+
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+dataset_descriptor_host<DataT, IndexT, DistanceT>
+vpq_descriptor_spec<Metric,
+                    TeamSize,
+                    DatasetBlockDim,
+                    PqBits,
+                    PqLen,
+                    CodebookT,
+                    DataT,
+                    IndexT,
+                    DistanceT>::init_(const cagra::search_params& params,
+                                      const std::uint8_t* encoded_dataset_ptr,
+                                      uint32_t encoded_dataset_dim,
+                                      const CodebookT* vq_code_book_ptr,
+                                      const CodebookT* pq_code_book_ptr,
+                                      IndexT size,
+                                      uint32_t dim,
+                                      rmm::cuda_stream_view stream)
+{  // Builds the host-side descriptor wrapper and launches the kernel that fills the device copy.
+  using desc_type = cagra_q_dataset_descriptor_t<Metric,
+                                                 TeamSize,
+                                                 DatasetBlockDim,
+                                                 PqBits,
+                                                 PqLen,
+                                                 CodebookT,
+                                                 DataT,
+                                                 IndexT,
+                                                 DistanceT>;
+  using base_type = typename desc_type::base_type;
+
+  desc_type dd_host{nullptr,  // both function pointers are left null in the host copy
+                    nullptr,
+                    encoded_dataset_ptr,
+                    encoded_dataset_dim,
+                    vq_code_book_ptr,
+                    pq_code_book_ptr,
+                    size,
+                    dim};
+  host_type result{dd_host, stream};
+  vpq_dataset_descriptor_init_kernel<Metric,
+                                     TeamSize,
+                                     DatasetBlockDim,
+                                     PqBits,
+                                     PqLen,
+                                     CodebookT,
+                                     DataT,
+                                     IndexT,
+                                     DistanceT><<<1, 1, 0, stream>>>(result.dev_ptr(),
+                                                                     encoded_dataset_ptr,
+                                                                     encoded_dataset_dim,
+                                                                     vq_code_book_ptr,
+                                                                     pq_code_book_ptr,
+                                                                     size,
+                                                                     dim);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // checks the error state right after the launch
+  return result;
+}
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
deleted file mode 100644
index 68973662f..000000000
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "compute_distance.hpp"
-
-#include <cuvs/distance/distance.hpp>
-#include <raft/util/integer_utils.hpp>
-
-namespace cuvs::neighbors::cagra::detail {
-template <class DATA_T_,
-          class CODE_BOOK_T_,
-          unsigned PQ_BITS,
-          unsigned PQ_LEN,
-          class DISTANCE_T,
-          class INDEX_T>
-struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T> {
-  using LOAD_T      = device::LOAD_128BIT_T;
-  using DATA_T      = DATA_T_;
-  using CODE_BOOK_T = CODE_BOOK_T_;
-  using QUERY_T     = typename dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::QUERY_T;
-
-  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
-
-  const std::uint8_t* encoded_dataset_ptr;
-  const std::uint32_t encoded_dataset_dim;
-  const std::uint32_t n_subspace;
-  const CODE_BOOK_T* vq_code_book_ptr;
-  const float vq_scale;
-  const CODE_BOOK_T* pq_code_book_ptr;
-  const float pq_scale;
-  using dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::size;
-  using dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::dim;
-
-  // Set on device
-  CODE_BOOK_T* smem_pq_code_book_ptr;
-  static const std::uint32_t smem_buffer_size_in_byte =
-    (1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
-
-  __device__ void set_smem_ptr(void* const smem_ptr)
-  {
-    smem_pq_code_book_ptr = reinterpret_cast<CODE_BOOK_T*>(smem_ptr);
-
-    // Copy PQ table
-    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-      half2 buf2;
-      buf2.x = pq_code_book_ptr[i];
-      buf2.y = pq_code_book_ptr[i + 1];
-
-      // Change the order of PQ code book array to reduce the
-      // frequency of bank conflicts.
-      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
-      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
-      const auto j                          = i / num_elements_per_bank;
-      const auto smem_index =
-        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
-      reinterpret_cast<half2*>(smem_pq_code_book_ptr)[smem_index] = buf2;
-    }
-  }
-
-  cagra_q_dataset_descriptor_t(const std::uint8_t* encoded_dataset_ptr,
-                               const std::uint32_t encoded_dataset_dim,
-                               const std::uint32_t n_subspace,
-                               const CODE_BOOK_T* const vq_code_book_ptr,
-                               const float vq_scale,
-                               const CODE_BOOK_T* const pq_code_book_ptr,
-                               const float pq_scale,
-                               const std::size_t size,
-                               const std::uint32_t dim)
-    : dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>(size, dim),
-      encoded_dataset_ptr(encoded_dataset_ptr),
-      encoded_dataset_dim(encoded_dataset_dim),
-      n_subspace(n_subspace),
-      vq_code_book_ptr(vq_code_book_ptr),
-      vq_scale(vq_scale),
-      pq_code_book_ptr(pq_code_book_ptr),
-      pq_scale(pq_scale)
-  {
-  }
-
-  template <uint32_t DATASET_BLOCK_DIM>
-  __device__ void copy_query(const DATA_T* const dmem_query_ptr,
-                             QUERY_T* const smem_query_ptr,
-                             const std::uint32_t query_smem_buffer_length)
-  {
-    constexpr cuvs::spatial::knn::detail::utils::mapping<half> mapping{};
-    for (unsigned i = threadIdx.x * 2; i < dim; i += blockDim.x * 2) {
-      half2 buf2{0, 0};
-      if (i < dim) { buf2.x = mapping(dmem_query_ptr[i]); }
-      if (i + 1 < dim) { buf2.y = mapping(dmem_query_ptr[i + 1]); }
-      if ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
-        // Use swizzling in the condition to reduce bank conflicts in shared
-        // memory, which are likely to occur when pq_code_book_dim is large.
-        ((half2*)smem_query_ptr)[device::swizzling<std::uint32_t, DATASET_BLOCK_DIM / 2>(i / 2)] =
-          buf2;
-      } else {
-        (reinterpret_cast<half2*>(smem_query_ptr + i))[0] = buf2;
-      }
-    }
-  }
-
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, cuvs::distance::DistanceType METRIC>
-  __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
-                                           const INDEX_T node_id,
-                                           const bool valid) const
-  {
-    float norm = 0;
-    if (valid) {
-      const unsigned lane_id = threadIdx.x % TEAM_SIZE;
-      const uint32_t vq_code = *(reinterpret_cast<const std::uint32_t*>(
-        encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id)));
-      if (PQ_BITS == 8) {
-        for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DATASET_BLOCK_DIM) {
-          constexpr unsigned vlen = 4;  // **** DO NOT CHANGE ****
-          constexpr unsigned nelem =
-            raft::div_rounding_up_unsafe<unsigned>(DATASET_BLOCK_DIM / PQ_LEN, TEAM_SIZE * vlen);
-          // Loading PQ codes
-          uint32_t pq_codes[nelem];
-#pragma unroll
-          for (std::uint32_t e = 0; e < nelem; e++) {
-            const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-            if (k >= n_subspace) break;
-            // Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory)
-            pq_codes[e] = *(reinterpret_cast<const std::uint32_t*>(
-              encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id) +
-              4 + k));
-          }
-          //
-          if constexpr (PQ_LEN % 2 == 0) {
-            // **** Use half2 for distance computation ****
-            half2 norm2{0, 0};
-#pragma unroll
-            for (std::uint32_t e = 0; e < nelem; e++) {
-              const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-              if (k >= n_subspace) break;
-              // Loading VQ code-book
-              raft::TxN_t<half2, vlen / 2> vq_vals[PQ_LEN];
-#pragma unroll
-              for (std::uint32_t m = 0; m < PQ_LEN; m += 1) {
-                const uint32_t d = (vlen * m) + (PQ_LEN * k);
-                if (d >= dim) break;
-                vq_vals[m].load(
-                  reinterpret_cast<const half2*>(vq_code_book_ptr + d + (dim * vq_code)), 0);
-              }
-              // Compute distance
-              std::uint32_t pq_code = pq_codes[e];
-#pragma unroll
-              for (std::uint32_t v = 0; v < vlen; v++) {
-                if (PQ_LEN * (v + k) >= dim) break;
-#pragma unroll
-                for (std::uint32_t m = 0; m < PQ_LEN; m += 2) {
-                  const std::uint32_t d1 = m + (PQ_LEN * v);
-                  const std::uint32_t d  = d1 + (PQ_LEN * k);
-                  // Loading query vector in smem
-                  half2 diff2 = (reinterpret_cast<const half2*>(
-                    query_ptr))[device::swizzling<std::uint32_t, DATASET_BLOCK_DIM / 2>(d / 2)];
-                  // Loading PQ code book in smem
-                  diff2 -= *(reinterpret_cast<half2*>(
-                    smem_pq_code_book_ptr + (1 << PQ_BITS) * 2 * (m / 2) + (2 * (pq_code & 0xff))));
-                  diff2 -= vq_vals[d1 / vlen].val.data[(d1 % vlen) / 2];
-                  norm2 += diff2 * diff2;
-                }
-                pq_code >>= 8;
-              }
-            }
-            norm += static_cast<float>(norm2.x + norm2.y);
-          } else {
-            // **** Use float for distance computation ****
-#pragma unroll
-            for (std::uint32_t e = 0; e < nelem; e++) {
-              const std::uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen + elem_offset / PQ_LEN;
-              if (k >= n_subspace) break;
-              // Loading VQ code-book
-              raft::TxN_t<CODE_BOOK_T, vlen> vq_vals[PQ_LEN];
-#pragma unroll
-              for (std::uint32_t m = 0; m < PQ_LEN; m++) {
-                const std::uint32_t d = (vlen * m) + (PQ_LEN * k);
-                if (d >= dim) break;
-                // Loading 4 x 8/16-bit VQ-values using 32/64-bit load ops (from L2$ or device
-                // memory)
-                vq_vals[m].load(
-                  reinterpret_cast<const half2*>(vq_code_book_ptr + d + (dim * vq_code)), 0);
-              }
-              // Compute distance
-              std::uint32_t pq_code = pq_codes[e];
-#pragma unroll
-              for (std::uint32_t v = 0; v < vlen; v++) {
-                if (PQ_LEN * (v + k) >= dim) break;
-                raft::TxN_t<CODE_BOOK_T, PQ_LEN> pq_vals;
-                pq_vals.load(
-                  reinterpret_cast<const half2*>(smem_pq_code_book_ptr + PQ_LEN * (pq_code & 0xff)),
-                  0);  // (from L1$ or smem)
-#pragma unroll
-                for (std::uint32_t m = 0; m < PQ_LEN; m++) {
-                  const std::uint32_t d1 = m + (PQ_LEN * v);
-                  const std::uint32_t d  = d1 + (PQ_LEN * k);
-                  // if (d >= dataset_dim) break;
-                  DISTANCE_T diff = query_ptr[d];  // (from smem)
-                  diff -= pq_scale * static_cast<float>(pq_vals.data[m]);
-                  diff -= vq_scale * static_cast<float>(vq_vals[d1 / vlen].val.data[d1 % vlen]);
-                  norm += diff * diff;
-                }
-                pq_code >>= 8;
-              }
-            }
-          }
-        }
-      }
-    }
-    for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
-      norm += __shfl_xor_sync(0xffffffff, norm, offset);
-    }
-    return norm;
-  }
-};
-
-}  // namespace cuvs::neighbors::cagra::detail
\ No newline at end of file
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
new file mode 100644
index 000000000..378d2943e
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "compute_distance.hpp"
+
+#include <cuvs/distance/distance.hpp>
+
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace cuvs::neighbors::cagra::detail {
+
+/**
+ * @brief Compile-time description of one VPQ dataset-descriptor instance for CAGRA search.
+ *
+ * Every explicit instantiation pins a distance metric, team size, dataset block dimension,
+ * PQ codebook shape (bits and length), codebook element type, and the data/index/distance
+ * types. The static members let a caller check whether an instance can handle a dataset
+ * (`accepts_dataset`), rank it against other instances (`priority`), and build the
+ * descriptor (`init`).
+ *
+ * @tparam Metric          distance metric the instance is compiled for
+ * @tparam TeamSize        team size the instance is compiled for
+ * @tparam DatasetBlockDim dataset block dimension the instance is compiled for
+ * @tparam PqBits          PQ bits; must equal `dataset.pq_bits()` for the instance to be used
+ * @tparam PqLen           PQ length; must equal `dataset.pq_len()` for the instance to be used
+ * @tparam CodebookT       element type of the VQ and PQ codebooks
+ * @tparam DataT           data type forwarded to `instance_spec`
+ * @tparam IndexT          index type forwarded to `instance_spec`
+ * @tparam DistanceT       distance type forwarded to `instance_spec`
+ */
+template <cuvs::distance::DistanceType Metric,
+          uint32_t TeamSize,
+          uint32_t DatasetBlockDim,
+          uint32_t PqBits,
+          uint32_t PqLen,
+          typename CodebookT,
+          typename DataT,
+          typename IndexT,
+          typename DistanceT>
+struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
+  using base_type = instance_spec<DataT, IndexT, DistanceT>;
+  using typename base_type::data_type;
+  using typename base_type::distance_type;
+  using typename base_type::host_type;
+  using typename base_type::index_type;
+
+  /** VPQ datasets are accepted only when their `math_type` is the codebook type. */
+  template <typename DatasetT>
+  constexpr static inline auto accepts_dataset()
+    -> std::enable_if_t<is_vpq_dataset_v<DatasetT>, bool>
+  {
+    return std::is_same_v<typename DatasetT::math_type, CodebookT>;
+  }
+
+  /** Any dataset type that is not a VPQ dataset is rejected. */
+  template <typename DatasetT>
+  constexpr static inline auto accepts_dataset()
+    -> std::enable_if_t<!is_vpq_dataset_v<DatasetT>, bool>
+  {
+    return false;
+  }
+
+  /**
+   * @brief Build the descriptor for `dataset` by forwarding its buffers and sizes to `init_`.
+   *
+   * `metric` is not consulted here. The row count is converted to `IndexT`.
+   */
+  template <typename DatasetT>
+  static auto init(const cagra::search_params& params,
+                   const DatasetT& dataset,
+                   cuvs::distance::DistanceType metric,
+                   rmm::cuda_stream_view stream) -> host_type
+  {
+    return init_(params,
+                 dataset.data.data_handle(),
+                 dataset.encoded_row_length(),
+                 dataset.vq_code_book.data_handle(),
+                 dataset.pq_code_book.data_handle(),
+                 IndexT(dataset.n_rows()),
+                 dataset.dim(),
+                 stream);
+  }
+
+  /**
+   * @brief Score how well this instance fits the given search parameters and dataset.
+   *
+   * @return -1.0 when the instance cannot be used (explicit team size mismatch, a metric
+   *         other than L2Expanded, or a PQ bits/length mismatch); otherwise a positive score
+   *         that grows as `dataset.dim()` approaches `DatasetBlockDim` (about 10 when equal).
+   */
+  template <typename DatasetT>
+  static auto priority(const cagra::search_params& params,
+                       const DatasetT& dataset,
+                       cuvs::distance::DistanceType metric) -> double
+  {
+    // If explicit team_size is specified and doesn't match the instance, discard it
+    if (params.team_size != 0 && TeamSize != params.team_size) { return -1.0; }
+    if (cuvs::distance::DistanceType::L2Expanded != metric) { return -1.0; }
+    // Match codebook params
+    if (dataset.pq_bits() != PqBits) { return -1.0; }
+    if (dataset.pq_len() != PqLen) { return -1.0; }
+    // Otherwise, favor the closest dataset dimensionality.
+    return 1.0 / (0.1 + std::abs(double(dataset.dim()) - double(DatasetBlockDim)));
+  }
+
+ private:
+  // Declared only in this header; the definition is presumably supplied by the implementation
+  // header that the per-instance translation units include (compute_distance_vpq-impl.cuh).
+  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(
+    const cagra::search_params& params,
+    const std::uint8_t* encoded_dataset_ptr,
+    uint32_t encoded_dataset_dim,
+    const CodebookT* vq_code_book_ptr,
+    const CodebookT* pq_code_book_ptr,
+    IndexT size,
+    uint32_t dim,
+    rmm::cuda_stream_view stream);
+};
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
index 9ec7ce3dd..a56a5a9df 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
index 292a1429a..f58a8c7df 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..bdc072e61
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..301c8c55b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..05ebeae2b
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    2,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..e343d938c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    32,
+                                    512,
+                                    8,
+                                    4,
+                                    half,
+                                    float,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
index 1a5ad50e3..5d950351f 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    2,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
similarity index 50%
rename from cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
rename to cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
index 0ab23d7eb..453e15df3 100644
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu
@@ -15,22 +15,27 @@
  */
 
 /*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
+ * NOTE: this file is generated by compute_distance_00_generate.py
  *
  * Make changes there and run in this directory:
  *
- * > python q_search_multi_cta_00_generate.py
+ * > python compute_distance_00_generate.py
  *
  */
 
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
+#include "compute_distance_vpq-impl.cuh"
 
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+namespace cuvs::neighbors::cagra::detail {
 
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    8,
+                                    128,
+                                    8,
+                                    4,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..c79cb74b6
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,
+                                    16,
+                                    256,
+                                    8,
+                                    2,
+                                    half,
+                                    half,
+                                    uint32_t,
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..dee326d54
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    16,  // matches "t16" in the file name
+                                    256,  // matches "dim256"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // presumably the trailing "half" of the file name
+                                    half,  // presumably the "half" right after "L2Expanded"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..a1ef9ba92
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // presumably the trailing "half" of the file name
+                                    half,  // presumably the "half" right after "L2Expanded"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..f2f01c8d4
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // presumably the trailing "half" of the file name
+                                    half,  // presumably the "half" right after "L2Expanded"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
new file mode 100644
index 000000000..1afccb8fd
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    8,  // matches "t8" in the file name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
new file mode 100644
index 000000000..28ea523ee
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    8,  // matches "t8" in the file name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..eca36cc36
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    16,  // matches "t16" in the file name
+                                    256,  // matches "dim256"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..89aed8afc
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    16,  // matches "t16" in the file name
+                                    256,  // matches "dim256"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..ff646b22c
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..633a805c7
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    int8_t,  // matches "int8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
new file mode 100644
index 000000000..3a09161ea
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    8,  // matches "t8" in the file name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
new file mode 100644
index 000000000..85331d243
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    8,  // matches "t8" in the file name
+                                    128,  // matches "dim128"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
new file mode 100644
index 000000000..a7719074a
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    16,  // matches "t16" in the file name
+                                    256,  // matches "dim256"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
new file mode 100644
index 000000000..7dd028b82
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    16,  // matches "t16" in the file name
+                                    256,  // matches "dim256"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
new file mode 100644
index 000000000..78f37b135
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    2,  // matches "2subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
new file mode 100644
index 000000000..d3eb20a05
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by compute_distance_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python compute_distance_00_generate.py
+ *
+ */
+
+#include "compute_distance_vpq-impl.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+using namespace cuvs::distance;
+template struct vpq_descriptor_spec<DistanceType::L2Expanded,  // explicit instantiation
+                                    32,  // matches "t32" in the file name
+                                    512,  // matches "dim512"
+                                    8,  // matches "8pq"
+                                    4,  // matches "4subd"
+                                    half,  // matches the trailing "half" of the file name
+                                    uint8_t,  // matches "uint8"
+                                    uint32_t,  // matches "uint32"
+                                    float>;
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index 192d81aa8..b7cb9c42d 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -15,10 +15,15 @@
  */
 #pragma once
 
+#include "hashmap.hpp"
 #include "utils.hpp"
 
+#include <cuvs/distance/distance.hpp>
+
 // TODO: This shouldn't be invoking anything in detail APIs outside of cuvs/neighbors
 #include <raft/core/detail/macros.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/warp_primitives.cuh>
 
 #include <cuda_fp16.h>
 
@@ -31,6 +36,16 @@ namespace device {
 // warpSize for compile time calculation
 constexpr unsigned warp_size = 32;
 
+// using LOAD_256BIT_T = ulonglong4;
+using LOAD_128BIT_T = uint4;  // CUDA built-in vector type: four 32-bit unsigned ints
+using LOAD_64BIT_T  = uint64_t;  // single 64-bit unsigned integer
+
+template <class LOAD_T, class DATA_T>  // both types must be given explicitly; nothing is deduced
+RAFT_DEVICE_INLINE_FUNCTION constexpr unsigned get_vlen()
+{
+  return utils::size_of<LOAD_T>() / utils::size_of<DATA_T>();  // presumably DATA_T elements per LOAD_T
+}
+
 /** Xorshift rondem number generator.
  *
  * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference.
@@ -43,18 +58,299 @@ _RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u)
   return u * 0x2545F4914F6CDD1DULL;
 }
 
-template <class T, unsigned X_MAX = 1024>
-_RAFT_DEVICE inline T swizzling(T x)
+template <uint32_t Dim = 1024, uint32_t Stride = 128, typename T>
+RAFT_DEVICE_INLINE_FUNCTION constexpr auto swizzling(T x) -> T
 {
   // Address swizzling reduces bank conflicts in shared memory, but increases
   // the amount of operation instead.
   // return x;
-  if constexpr (X_MAX <= 1024) {
-    return (x) ^ ((x) >> 5);
+  if constexpr (Stride <= 32) {
+    return x;
+  } else if constexpr (Dim <= 1024) {
+    return x ^ (x >> 5);
   } else {
-    return (x) ^ (((x) >> 5) & 0x1f);
+    return x ^ ((x >> 5) & 0x1f);
+  }
+}
+
+template <uint32_t TeamSize, typename T>  // TeamSize: lanes per team, known at compile time
+RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x) -> T  // returns x accumulated over XOR partners
+{
+#pragma unroll
+  for (uint32_t stride = TeamSize >> 1; stride > 0; stride >>= 1) {  // TeamSize/2, ..., 2, 1
+    x += raft::shfl_xor(x, stride, TeamSize);  // presumably the value of lane (id ^ stride)
+  }
+  return x;  // NOTE(review): team-wide total on every lane if TeamSize is a power of two - confirm
+}
+
+template <typename T>  // run-time variant: the team size is passed as a bit shift (log2)
+RAFT_DEVICE_INLINE_FUNCTION auto team_sum(T x, uint32_t team_size_bitshift) -> T
+{
+  switch (team_size_bitshift) {  // enter at the widest stride for this team, then run down to 1
+    case 5: x += raft::shfl_xor(x, 16);  // intentional fall-through
+    case 4: x += raft::shfl_xor(x, 8);   // intentional fall-through
+    case 3: x += raft::shfl_xor(x, 4);   // intentional fall-through
+    case 2: x += raft::shfl_xor(x, 2);   // intentional fall-through
+    case 1: x += raft::shfl_xor(x, 1);   // intentional fall-through
+    default: return x;  // 0, or any value above 5, reaches here without any shuffle
+  }
+}
+
+// For each of num_pickup result slots, one team of threads evaluates num_distilation candidate
+// nodes (from seed_ptr while ids last, otherwise pseudo-random) and keeps the closest one.
+template <typename IndexT, typename DistanceT, typename DATASET_DESCRIPTOR_T>
+RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
+  IndexT* __restrict__ result_indices_ptr,       // [num_pickup]
+  DistanceT* __restrict__ result_distances_ptr,  // [num_pickup]
+  const DATASET_DESCRIPTOR_T& dataset_desc,
+  const uint32_t num_pickup,
+  const uint32_t num_distilation,  // candidates tried per slot
+  const uint64_t rand_xor_mask,    // XOR-ed into the candidate id before xorshift64
+  const IndexT* __restrict__ seed_ptr,  // [num_seeds], may be null
+  const uint32_t num_seeds,
+  IndexT* __restrict__ visited_hash_ptr,
+  const uint32_t hash_bitlen,
+  const uint32_t block_id   = 0,
+  const uint32_t num_blocks = 1)
+{
+  const auto team_size_bits = dataset_desc.team_size_bitshift_from_smem();
+  const auto max_i = raft::round_up_safe<uint32_t>(num_pickup, warp_size >> team_size_bits);
+  constexpr IndexT invalid_index = raft::upper_bound<IndexT>();
+
+  for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
+    const bool valid_i = (i < num_pickup);  // false only in the rounded-up tail of the loop
+
+    IndexT best_index_team_local    = invalid_index;  // stays invalid if nothing is accepted
+    DistanceT best_norm2_team_local = raft::upper_bound<DistanceT>();
+    for (uint32_t j = 0; j < num_distilation; j++) {
+      // Select a node randomly and compute the distance to it
+      IndexT seed_index = 0;  // defined value for !valid_i lanes (passed along, flagged invalid)
+      if (valid_i) {
+        // Candidate id: block_id varies fastest, then the slot i, then the attempt j.
+        uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j)));
+        if (seed_ptr && (gid < num_seeds)) {
+          seed_index = seed_ptr[gid];
+        } else {
+          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_desc.size;
+        }
+      }
+
+      const auto norm2 = dataset_desc.compute_distance(seed_index, valid_i);
+
+      if (valid_i && (norm2 < best_norm2_team_local)) {
+        best_norm2_team_local = norm2;
+        best_index_team_local = seed_index;
+      }
+    }
+    const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u);
+    if (valid_i && lane_id == 0) {
+      if (best_index_team_local != invalid_index &&
+          hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
+        result_distances_ptr[i] = best_norm2_team_local;
+        result_indices_ptr[i]   = best_index_team_local;
+      } else {
+        result_distances_ptr[i] = raft::upper_bound<DistanceT>();
+        result_indices_ptr[i]   = raft::upper_bound<IndexT>();
+      }
+    }
   }
 }
 
+template <typename IndexT, typename DistanceT, typename DATASET_DESCRIPTOR_T>
+RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_child_nodes(
+  IndexT* __restrict__ result_child_indices_ptr,       // out: knn_k * search_width entries
+  DistanceT* __restrict__ result_child_distances_ptr,  // out: indexed like the indices above
+  // [dataset_dim, dataset_size]
+  const DATASET_DESCRIPTOR_T& dataset_desc,
+  // [knn_k, dataset_size]
+  const IndexT* __restrict__ knn_graph,
+  const uint32_t knn_k,
+  // hashmap
+  IndexT* __restrict__ visited_hashmap_ptr,
+  const uint32_t hash_bitlen,
+  const IndexT* __restrict__ parent_indices,      // read at [0, search_width); may hold invalid
+  const IndexT* __restrict__ internal_topk_list,  // indexed by the values of parent_indices
+  const uint32_t search_width)
+{
+  constexpr IndexT index_msb_1_mask = utils::gen_index_msb_1_mask<IndexT>::value;
+  constexpr IndexT invalid_index    = raft::upper_bound<IndexT>();
+
+  // Read child indices of parents from knn graph and check if the distance
+  // computation is necessary.
+  for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += blockDim.x) {
+    const IndexT smem_parent_id = parent_indices[i / knn_k];  // knn_k consecutive i share a parent
+    IndexT child_id             = invalid_index;
+    if (smem_parent_id != invalid_index) {
+      const auto parent_id = internal_topk_list[smem_parent_id] & ~index_msb_1_mask;  // drop flag
+      child_id             = knn_graph[(i % knn_k) + (static_cast<int64_t>(knn_k) * parent_id)];
+    }
+    if (child_id != invalid_index) {
+      if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {  // 0: not inserted
+        child_id = invalid_index;  // presumably already visited, so no distance is needed
+      }
+    }
+    result_child_indices_ptr[i] = child_id;
+  }
+  __syncthreads();  // the loop below reads indices written by other threads of the block
+
+  // Compute the distance to child nodes
+  const auto team_size_bits   = dataset_desc.team_size_bitshift_from_smem();
+  const auto num_k            = knn_k * search_width;
+  const auto max_i            = raft::round_up_safe(num_k, warp_size >> team_size_bits);
+  const auto compute_distance = dataset_desc.compute_distance_impl;
+  const auto args             = dataset_desc.args.load();
+  const bool lead_lane        = (threadIdx.x & ((1u << team_size_bits) - 1u)) == 0;  // team lane 0
+  for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += blockDim.x >> team_size_bits) {
+    const bool valid_i  = i < num_k;  // false only in the rounded-up tail
+    const auto child_id = valid_i ? result_child_indices_ptr[i] : invalid_index;
+
+    // We should be calling `dataset_desc.compute_distance(..)` here as follows:
+    // > const auto child_dist = dataset_desc.compute_distance(child_id, child_id != invalid_index);
+    // Instead, we manually inline this function for performance reasons.
+    // This allows us to move the fetching of the arguments from shared memory out of the loop.
+    const DistanceT child_dist = device::team_sum(
+      (child_id != invalid_index) ? compute_distance(args, child_id)
+                                  : (lead_lane ? raft::upper_bound<DistanceT>() : 0),  // sum = max
+      team_size_bits);
+
+    // Store the distance
+    if (valid_i && lead_lane) { result_child_distances_ptr[i] = child_dist; }
+  }
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(float& x, uint32_t addr)  // x <- f32 at shared address addr
+{
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half& x, uint32_t addr)  // one half, moved as raw 16 bits
+{
+  asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(reinterpret_cast<uint16_t&>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half2& x, uint32_t addr)  // both halves as one 32-bit word
+{
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(reinterpret_cast<uint32_t&>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[1], uint32_t addr)  // array form of the half load
+{
+  asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast<uint16_t*>(x)) : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[2], uint32_t addr)  // two halves in one v2 load
+{
+  asm volatile("ld.shared.v2.u16 {%0, %1}, [%2];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)), "=h"(*reinterpret_cast<uint16_t*>(x + 1))
+               : "r"(addr));
+}
+RAFT_DEVICE_INLINE_FUNCTION void lds(half (&x)[4], uint32_t addr)  // four halves in one v4 load
+{
+  asm volatile("ld.shared.v4.u16 {%0, %1, %2, %3}, [%4];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 1)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 2)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 3))
+               : "r"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, uint32_t addr)  // x <- u32 at shared address addr
+{
+  asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "r"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint32_t& x, const uint32_t* addr)  // generic-pointer overload
+{
+  lds(x, uint32_t(__cvta_generic_to_shared(addr)));  // convert to a shared-space address first
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, uint32_t addr)  // 4 x u32 in one v4 load
+{
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "r"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void lds(uint4& x, const uint4* addr)  // generic-pointer overload
+{
+  lds(x, uint32_t(__cvta_generic_to_shared(addr)));  // convert to a shared-space address first
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void sts(uint32_t addr, const half2& x)  // shared[addr] <- x.x, x.y
+{
+  asm volatile("st.shared.v2.u16 [%0], {%1, %2};"
+               :
+               : "r"(addr),
+                 "h"(reinterpret_cast<const uint16_t&>(x.x)),  // halves are passed as raw 16 bits
+                 "h"(reinterpret_cast<const uint16_t&>(x.y)));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint4& x, const uint4* addr)  // global load, .cg operator
+{
+  asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint4& x, const uint4* addr)  // global load, .ca operator
+{
+  asm volatile("ld.global.ca.v4.u32 {%0, %1, %2, %3}, [%4];"
+               : "=r"(x.x), "=r"(x.y), "=r"(x.z), "=r"(x.w)
+               : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(uint32_t& x, const uint32_t* addr)  // single u32, .ca
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_cg(uint32_t& x, const uint32_t* addr)  // single u32, .cg
+{
+  asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(x) : "l"(addr));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half& x, const half* addr)  // one half as raw 16 bits
+{
+  asm volatile("ld.global.ca.u16 {%0}, [%1];"
+               : "=h"(reinterpret_cast<uint16_t&>(x))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[1], const half* addr)  // array form of the above
+{
+  asm volatile("ld.global.ca.u16 {%0}, [%1];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[2], const half* addr)  // two halves, one v2 load
+{
+  asm volatile("ld.global.ca.v2.u16 {%0, %1}, [%2];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)), "=h"(*reinterpret_cast<uint16_t*>(x + 1))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half (&x)[4], const half* addr)  // four halves, one v4 load
+{
+  asm volatile("ld.global.ca.v4.u16 {%0, %1, %2, %3}, [%4];"
+               : "=h"(*reinterpret_cast<uint16_t*>(x)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 1)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 2)),
+                 "=h"(*reinterpret_cast<uint16_t*>(x + 3))
+               : "l"(reinterpret_cast<const uint16_t*>(addr)));
+}
+
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2& x, const half* addr)  // reads 32 bits from a half*
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];"
+               : "=r"(reinterpret_cast<uint32_t&>(x))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[1], const half* addr)  // array form of the above
+{
+  asm volatile("ld.global.ca.u32 %0, [%1];"
+               : "=r"(*reinterpret_cast<uint32_t*>(x))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+RAFT_DEVICE_INLINE_FUNCTION void ldg_ca(half2 (&x)[2], const half* addr)  // two half2, one v2 load
+{
+  asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];"
+               : "=r"(*reinterpret_cast<uint32_t*>(x)), "=r"(*reinterpret_cast<uint32_t*>(x + 1))
+               : "l"(reinterpret_cast<const uint32_t*>(addr)));
+}
+
 }  // namespace device
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh
index 183d6051f..1c99f72f7 100644
--- a/cpp/src/neighbors/detail/cagra/factory.cuh
+++ b/cpp/src/neighbors/detail/cagra/factory.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "compute_distance-ext.cuh"
 #include "search_multi_cta.cuh"
 #include "search_multi_kernel.cuh"
 #include "search_plan.cuh"
@@ -25,71 +26,153 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-template <typename DATASET_DESCRIPTOR_T,
+template <typename DataT,
+          typename IndexT,
+          typename DistanceT,
           typename CagraSampleFilterT = cuvs::neighbors::filtering::none_cagra_sample_filter>
 class factory {
-  using T         = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using IdxT      = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DistanceT = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
  public:
   /**
    * Create a search structure for dataset with dim features.
    */
-  static std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>> create(
+  static std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT>> create(
     raft::resources const& res,
     search_params const& params,
+    const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
     int64_t dim,
     int64_t graph_degree,
-    uint32_t topk,
-    const cuvs::distance::DistanceType metric)
+    uint32_t topk)
   {
-    search_plan_impl_base plan(params, dim, graph_degree, topk, metric);
-    switch (plan.dataset_block_dim) {
-      case 128:
-        switch (plan.team_size) {
-          case 8: return dispatch_kernel<128, 8>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      case 256:
-        switch (plan.team_size) {
-          case 16: return dispatch_kernel<256, 16>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      case 512:
-        switch (plan.team_size) {
-          case 32: return dispatch_kernel<512, 32>(res, plan); break;
-          default: THROW("Incorrect team size %lu", plan.team_size);
-        }
-        break;
-      default: THROW("Incorrect dataset_block_dim (%lu)\n", plan.dataset_block_dim);
-    }
-    return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>();
+    search_plan_impl_base plan(params, dim, graph_degree, topk);
+    return dispatch_kernel(res, plan, dataset_desc);
   }
 
  private:
-  template <unsigned DATASET_BLOCK_DIM, unsigned TEAM_SIZE>
-  static std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>
-  dispatch_kernel(raft::resources const& res, search_plan_impl_base& plan)
+  static std::unique_ptr<search_plan_impl<DataT, IndexT, DistanceT, CagraSampleFilterT>>
+  dispatch_kernel(raft::resources const& res,
+                  search_plan_impl_base& plan,
+                  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc)
   {
     if (plan.algo == search_algo::SINGLE_CTA) {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new single_cta_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        single_cta_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     } else if (plan.algo == search_algo::MULTI_CTA) {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new multi_cta_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        multi_cta_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     } else {
-      return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
-        new multi_kernel_search::
-          search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
+      return std::make_unique<
+        multi_kernel_search::search<DataT, IndexT, DistanceT, CagraSampleFilterT>>(
+        res, plan, dataset_desc, plan.dim, plan.graph_degree, plan.topk);
     }
   }
 };
+
+/*
+Caching of dataset/distance descriptor initialization
+  (see `dataset_descriptor_init_with_cache` below).
+ */
+namespace descriptor_cache {
+
+/**
+ * Cache key: a minimal set of fields that together identify one descriptor.
+ * The fields are named after the descriptor fields they stand in for; the cache only compares
+ * and hashes them and never interprets their contents.
+ */
+struct key {
+  uint64_t data_ptr;   // dataset address, stored as an integer
+  uint64_t n_rows;
+  uint32_t dim;
+  uint32_t extra_val;  // this one has different meanings for different descriptor types
+  uint32_t team_size;  // taken from search_params::team_size
+  uint32_t metric;     // DistanceType value, stored as an integer
+};
+// Key for strided datasets: extra_val carries dataset.stride().
+template <typename DatasetT>
+auto make_key(const cagra::search_params& params,
+              const DatasetT& dataset,
+              cuvs::distance::DistanceType metric)
+  -> std::enable_if_t<is_strided_dataset_v<DatasetT>, key>
+{
+  return key{reinterpret_cast<uint64_t>(dataset.view().data_handle()),
+             uint64_t(dataset.n_rows()),
+             dataset.dim(),
+             dataset.stride(),
+             uint32_t(params.team_size),
+             uint32_t(metric)};
+}
+// Key for VPQ datasets: extra_val carries bits of the PQ code book address.
+template <typename DatasetT>
+auto make_key(const cagra::search_params& params,
+              const DatasetT& dataset,
+              cuvs::distance::DistanceType metric)
+  -> std::enable_if_t<is_vpq_dataset_v<DatasetT>, key>
+{
+  return key{reinterpret_cast<uint64_t>(dataset.data.data_handle()),
+             uint64_t(dataset.n_rows()),
+             dataset.dim(),
+             uint32_t(reinterpret_cast<uint64_t>(dataset.pq_code_book.data_handle()) >> 6),  // drop 6 low bits, keep next 32
+             uint32_t(params.team_size),
+             uint32_t(metric)};
+}
+// Field-by-field equality; the cache reaches it through std::equal_to<>.
+inline auto operator==(const key& a, const key& b) -> bool
+{
+  return a.data_ptr == b.data_ptr && a.n_rows == b.n_rows && a.dim == b.dim &&
+         a.extra_val == b.extra_val && a.team_size == b.team_size && a.metric == b.metric;
+}
+// Hash functor for key; every field takes part in the result.
+struct key_hash {
+  inline auto operator()(const key& x) const noexcept -> std::size_t
+  {
+    return size_t{x.data_ptr} + size_t{x.n_rows} * size_t{x.dim} * size_t{x.extra_val} +
+           (size_t{x.team_size} ^ size_t{x.metric});
+  }
+};
+// Holder for one raft::cache::lru of shared descriptors per (DataT, IndexT, DistanceT).
+template <typename DataT, typename IndexT, typename DistanceT>
+struct store {
+  /** Number of descriptors to cache. */
+  static constexpr size_t kDefaultSize = 100;
+  raft::cache::lru<key,
+                   key_hash,
+                   std::equal_to<>,
+                   std::shared_ptr<dataset_descriptor_host<DataT, IndexT, DistanceT>>>
+    value{kDefaultSize};
+};
+
+}  // namespace descriptor_cache
+
+/**
+ * Call `dataset_descriptor_init` with memoization.
+ * (NB: `dataset_descriptor_init` is a function in a generated header file
+ * `neighbors/detail/cagra/compute_distance-ext.cuh`).
+ *
+ * `dataset_descriptor_init` involves calling a CUDA kernel to resolve device symbols before the
+ * main search kernel runs (extra latency); caching the descriptor hides it for repeated searches.
+ * NOTE(review): the returned reference is kept alive by the cache entry only; confirm that the
+ * entry cannot be evicted while a caller still uses the reference.
+ */
+template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
+auto dataset_descriptor_init_with_cache(const raft::resources& res,
+                                        const cagra::search_params& params,
+                                        const DatasetT& dataset,
+                                        cuvs::distance::DistanceType metric)
+  -> const dataset_descriptor_host<DataT, IndexT, DistanceT>&
+{
+  using desc_t = dataset_descriptor_host<DataT, IndexT, DistanceT>;
+  auto key     = descriptor_cache::make_key(params, dataset, metric);
+  auto& cache =  // the cache is stored in `res` as a custom resource, one per type triple
+    raft::resource::get_custom_resource<descriptor_cache::store<DataT, IndexT, DistanceT>>(res)
+      ->value;
+  std::shared_ptr<desc_t> desc{nullptr};
+  if (!cache.get(key, &desc)) {  // miss: build the descriptor on the stream of `res` and store it
+    desc = std::make_shared<desc_t>(std::move(dataset_descriptor_init<DataT, IndexT, DistanceT>(
+      params, dataset, metric, raft::resource::get_cuda_stream(res))));
+    cache.set(key, desc);
+  }
+  return *desc;  // the local shared_ptr goes away here; the cache keeps its own copy
+}
+
 };  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 515be75df..9edbbf5c1 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -73,12 +73,12 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a
 }
 
 template <class DATA_T, class IdxT, int numElementsPerThread>
-RAFT_KERNEL kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, dataset_dim]
-                      const IdxT dataset_size,
-                      const uint32_t dataset_dim,
-                      IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                      const uint32_t graph_size,
-                      const uint32_t graph_degree)
+__global__ void kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, dataset_dim]
+                          const IdxT dataset_size,
+                          const uint32_t dataset_dim,
+                          IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
+                          const uint32_t graph_size,
+                          const uint32_t graph_degree)
 {
   const IdxT srcNode = (blockDim.x * blockIdx.x + threadIdx.x) / raft::WarpSize;
   if (srcNode >= graph_size) { return; }
@@ -129,15 +129,15 @@ RAFT_KERNEL kern_sort(const DATA_T* const dataset,  // [dataset_chunk_size, data
 }
 
 template <int MAX_DEGREE, class IdxT>
-RAFT_KERNEL kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
-                       const uint32_t graph_size,
-                       const uint32_t graph_degree,
-                       const uint32_t degree,
-                       const uint32_t batch_size,
-                       const uint32_t batch_id,
-                       uint8_t* const detour_count,          // [graph_chunk_size, graph_degree]
-                       uint32_t* const num_no_detour_edges,  // [graph_size]
-                       uint64_t* const stats)
+__global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph_degree]
+                           const uint32_t graph_size,
+                           const uint32_t graph_degree,
+                           const uint32_t degree,
+                           const uint32_t batch_size,
+                           const uint32_t batch_id,
+                           uint8_t* const detour_count,          // [graph_chunk_size, graph_degree]
+                           uint32_t* const num_no_detour_edges,  // [graph_size]
+                           uint64_t* const stats)
 {
   __shared__ uint32_t smem_num_detour[MAX_DEGREE];
   uint64_t* const num_retain = stats;
@@ -192,11 +192,11 @@ RAFT_KERNEL kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, graph
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
-                                IdxT* const rev_graph,            // [size, degree]
-                                uint32_t* const rev_graph_count,  // [graph_size]
-                                const uint32_t graph_size,
-                                const uint32_t degree)
+__global__ void kern_make_rev_graph(const IdxT* const dest_nodes,     // [graph_size]
+                                    IdxT* const rev_graph,            // [size, degree]
+                                    uint32_t* const rev_graph_count,  // [graph_size]
+                                    const uint32_t graph_size,
+                                    const uint32_t degree)
 {
   const uint32_t tid  = threadIdx.x + (blockDim.x * blockIdx.x);
   const uint32_t tnum = blockDim.x * gridDim.x;
@@ -221,16 +221,16 @@ __device__ __host__ LabelT get_root_label(IdxT i, const LabelT* label)
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph,                 // [graph_size, graph_degree]
-                                      const IdxT* candidate_edges,     // [graph_size]
-                                      IdxT* outgoing_num_edges,        // [graph_size]
-                                      IdxT* incoming_num_edges,        // [graph_size]
-                                      const IdxT* outgoing_max_edges,  // [graph_size]
-                                      const IdxT* incoming_max_edges,  // [graph_size]
-                                      const IdxT* label,               // [graph_size]
-                                      const uint32_t graph_size,
-                                      const uint32_t graph_degree,
-                                      uint64_t* stats)
+__global__ void kern_mst_opt_update_graph(IdxT* mst_graph,  // [graph_size, graph_degree]
+                                          const IdxT* candidate_edges,     // [graph_size]
+                                          IdxT* outgoing_num_edges,        // [graph_size]
+                                          IdxT* incoming_num_edges,        // [graph_size]
+                                          const IdxT* outgoing_max_edges,  // [graph_size]
+                                          const IdxT* incoming_max_edges,  // [graph_size]
+                                          const IdxT* label,               // [graph_size]
+                                          const uint32_t graph_size,
+                                          const uint32_t graph_degree,
+                                          uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -310,11 +310,11 @@ RAFT_KERNEL kern_mst_opt_update_graph(IdxT* mst_graph,                 // [graph
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_labeling(IdxT* label,            // [graph_size]
-                                  const IdxT* mst_graph,  // [graph_size, graph_degree]
-                                  const uint32_t graph_size,
-                                  const uint32_t graph_degree,
-                                  uint64_t* stats)
+__global__ void kern_mst_opt_labeling(IdxT* label,            // [graph_size]
+                                      const IdxT* mst_graph,  // [graph_size, graph_degree]
+                                      const uint32_t graph_size,
+                                      const uint32_t graph_degree,
+                                      uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -348,10 +348,10 @@ RAFT_KERNEL kern_mst_opt_labeling(IdxT* label,            // [graph_size]
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
-                                      const IdxT* label,   // [graph_size]
-                                      const uint32_t graph_size,
-                                      uint64_t* stats)
+__global__ void kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
+                                          const IdxT* label,   // [graph_size]
+                                          const uint32_t graph_size,
+                                          uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
@@ -375,14 +375,14 @@ RAFT_KERNEL kern_mst_opt_cluster_size(IdxT* cluster_size,  // [graph_size]
 }
 
 template <class IdxT>
-RAFT_KERNEL kern_mst_opt_postprocessing(IdxT* outgoing_num_edges,  // [graph_size]
-                                        IdxT* incoming_num_edges,  // [graph_size]
-                                        IdxT* outgoing_max_edges,  // [graph_size]
-                                        IdxT* incoming_max_edges,  // [graph_size]
-                                        const IdxT* cluster_size,  // [graph_size]
-                                        const uint32_t graph_size,
-                                        const uint32_t graph_degree,
-                                        uint64_t* stats)
+__global__ void kern_mst_opt_postprocessing(IdxT* outgoing_num_edges,  // [graph_size]
+                                            IdxT* incoming_num_edges,  // [graph_size]
+                                            IdxT* outgoing_max_edges,  // [graph_size]
+                                            IdxT* incoming_max_edges,  // [graph_size]
+                                            const IdxT* cluster_size,  // [graph_size]
+                                            const uint32_t graph_size,
+                                            const uint32_t graph_degree,
+                                            uint64_t* stats)
 {
   const uint64_t i = threadIdx.x + (blockDim.x * blockIdx.x);
   if (i >= graph_size) return;
diff --git a/cpp/src/neighbors/detail/cagra/hashmap.hpp b/cpp/src/neighbors/detail/cagra/hashmap.hpp
index dd6c6c844..2c62dda90 100644
--- a/cpp/src/neighbors/detail/cagra/hashmap.hpp
+++ b/cpp/src/neighbors/detail/cagra/hashmap.hpp
@@ -29,10 +29,12 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace hashmap {
 
-_RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
+RAFT_INLINE_FUNCTION uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; }
 
 template <class IdxT>
-_RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned FIRST_TID = 0)
+RAFT_DEVICE_INLINE_FUNCTION void init(IdxT* const table,
+                                      const unsigned bitlen,
+                                      unsigned FIRST_TID = 0)
 {
   if (threadIdx.x < FIRST_TID) return;
   for (unsigned i = threadIdx.x - FIRST_TID; i < get_size(bitlen); i += blockDim.x - FIRST_TID) {
@@ -41,7 +43,9 @@ _RAFT_DEVICE inline void init(IdxT* const table, const unsigned bitlen, unsigned
 }
 
 template <class IdxT>
-_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table,
+                                            const uint32_t bitlen,
+                                            const IdxT key)
 {
   // Open addressing is used for collision resolution
   const uint32_t size     = get_size(bitlen);
@@ -68,7 +72,9 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co
 }
 
 template <unsigned TEAM_SIZE, class IdxT>
-_RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, const IdxT key)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t insert(IdxT* const table,
+                                            const uint32_t bitlen,
+                                            const IdxT key)
 {
   IdxT ret = 0;
   if (threadIdx.x % TEAM_SIZE == 0) { ret = insert(table, bitlen, key); }
@@ -78,5 +84,17 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co
   return ret;
 }
 
+template <class IdxT>  // counterpart of insert<TEAM_SIZE> with the team size given at run time
+RAFT_DEVICE_INLINE_FUNCTION uint32_t
+insert(unsigned team_size, IdxT* const table, const uint32_t bitlen, const IdxT key)
+{
+  IdxT ret = 0;
+  if (threadIdx.x % team_size == 0) { ret = insert(table, bitlen, key); }  // one lane per team
+  for (unsigned offset = 1; offset < team_size; offset *= 2) {
+    ret |= __shfl_xor_sync(0xffffffff, ret, offset);  // OR-combine; the mask names all 32 lanes
+  }
+  return ret;  // NOTE(review): same value on every team lane if team_size is a power of two
+}
+
 }  // namespace hashmap
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py
deleted file mode 100644
index 63171373f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_00_generate.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-header = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-#include "compute_distance_vpq.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-"""
-
-trailer = """
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
-"""
-
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
-pq_bits = [8]
-subspace_dims = [2, 4]
-# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
-# mxelem = [64, 128, 256]
-load_types = ["uint4"]
-code_book_types = ["half"]
-search_types = dict(
-    float_uint32=(
-        "float",
-        "uint32_t",
-        "float",
-    ),  # data_t, vec_idx_t, distance_t
-    half_uint32=("half", "uint32_t", "float"),
-    int8_uint32=("int8_t", "uint32_t", "float"),
-    uint8_uint32=("uint8_t", "uint32_t", "float"),
-    float_uint64=("float", "uint64_t", "float"),
-    half_uint64=("half", "uint64_t", "float"),
-)
-# knn
-for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        for code_book_t in code_book_types:
-            for subspace_dim in subspace_dims:
-                for pq_bit in pq_bits:
-                    path = f"q_search_multi_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu"
-                    with open(path, "w") as f:
-                        f.write(header)
-                        f.write(
-                                f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-                        )
-                        f.write(trailer)
-                        # For pasting into CMakeLists.txt
-                    print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 5d94a501a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 56534dc05..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 7ff962058..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 3387a32a3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 2d3f2cb1d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 73dd8cd4b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index b5e33602d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 32fe0d628..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index e2726ea26..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index b4ebd49c4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 72f198c92..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index dfb667a7f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index c583569f6..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index fedfb5146..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 2b6e8e3da..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 4a97fb752..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 675cd3c93..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index b42b3289c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 0db4296f1..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 4a2610dc7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index b1c15662e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 201f68fb5..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 26744ed76..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 1bce71bef..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 694304f3c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index e6a563731..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 5c554af3f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 965b43c07..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 97a4f8092..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index bdd1719b3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e39bc1e2d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 599cf327a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 621c5a249..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index cbed3ef8a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 7428bfd9e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 70efefdb0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 4039b8582..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 022eb0e05..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e48b2ed71..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 64f08530f..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_multi_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_multi_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_multi_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py
deleted file mode 100644
index bc5f506ac..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_00_generate.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-header = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-#include "compute_distance_vpq.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-"""
-
-trailer = """
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
-"""
-
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
-# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
-# itopk_candidates = [64, 128, 256]
-# itopk_size = [64, 128, 256, 512]
-# mxelem = [64, 128, 256]
-
-pq_bits = [8]
-subspace_dims = [2, 4]
-
-# rblock = [(256, 4), (512, 2), (1024, 1)]
-# rcandidates = [32]
-# rsize = [256, 512]
-code_book_types = ["half"]
-
-search_types = dict(
-    float_uint32=("float", "uint32_t", "float"),  # data_t, idx_t, distance_t
-    half_uint32=("half", "uint32_t", "float"),
-    int8_uint32=("int8_t", "uint32_t", "float"),
-    uint8_uint32=("uint8_t", "uint32_t", "float"),
-    float_uint64=("float", "uint64_t", "float"),
-    half_uint64=("half", "uint64_t", "float"),
-)
-
-# knn
-for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        for code_book_t in code_book_types:
-            for subspace_dim in subspace_dims:
-                for pq_bit in pq_bits:
-                    path = f"q_search_single_cta_{type_path}_dim{mxdim}_t{team}_{pq_bit}pq_{subspace_dim}subd_{code_book_t}.cu"
-                    with open(path, "w") as f:
-                        f.write(header)
-                        f.write(
-                                f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<{data_t} COMMA {code_book_t} COMMA {pq_bit} COMMA {subspace_dim} COMMA {distance_t} COMMA {idx_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-                        )
-
-                        f.write(trailer)
-                        # For pasting into CMakeLists.txt
-                        print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index b40322741..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 36273d0d4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index ef483437a..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index d9ebb1b85..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index e86524ee0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 9f2b7fbc7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 1ce4f5520..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 2d6f93ef0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 5f3267410..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 631ac7938..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index ea8faee1c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 061b1a04e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 15610d853..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index f984b46f0..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 45299f272..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index fcb91be8c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_float_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               float COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index b594fedab..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index a82be6b55..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index d80fef52c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index e2c3ef4f7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 98889811d..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index f5e9d12c9..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 4f14910b4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 67d52f8d5..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 1420918a1..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index eb0a72da3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 7a98b59a9..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 7e07033c7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 857f32712..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 3c00c5223..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index e5c4c7b69..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 22359d71b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_half_uint64_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               half COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint64_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 37c783f19..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index 0a4049d79..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index 773f567c4..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index dfc176abd..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 680c32655..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index e57881e82..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 525004f2e..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 7af2ef124..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_int8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               int8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
deleted file mode 100644
index 0fd36c31b..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
deleted file mode 100644
index d4cc5f449..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim1024_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             1024,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
deleted file mode 100644
index aa58ac2b7..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
deleted file mode 100644
index 189c3ed9c..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim128_t8_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(8,
-                             128,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
deleted file mode 100644
index 9dc9aaae3..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
deleted file mode 100644
index 100110313..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim256_t16_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(16,
-                             256,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
deleted file mode 100644
index 8d4e0aeee..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_2subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 2 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu b/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
deleted file mode 100644
index 4c7318735..000000000
--- a/cpp/src/neighbors/detail/cagra/q_search_single_cta_uint8_uint32_dim512_t32_8pq_4subd_half.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by q_search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python q_search_single_cta_00_generate.py
- *
- */
-
-#include "compute_distance_vpq.cuh"
-#include "search_single_cta_inst.cuh"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(32,
-                             512,
-                             cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<
-                               uint8_t COMMA half COMMA 8 COMMA 4 COMMA float COMMA uint32_t>,
-                             cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index efbf9b56d..9bcccd9f9 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -16,12 +16,12 @@
 #pragma once
 
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_multi_cta_kernel.cuh"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
 #include "utils.hpp"
 
 #include <raft/core/detail/macros.hpp>
@@ -51,48 +51,46 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace multi_cta_search {
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-
-struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : public search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type ::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
+
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   uint32_t num_cta_per_query;
   rmm::device_uvector<INDEX_T> intermediate_indices;
@@ -102,12 +100,11 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric),
+         uint32_t topk)
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk),
       intermediate_indices(0, raft::resource::get_cuda_stream(res)),
       intermediate_distances(0, raft::resource::get_cuda_stream(res)),
       topk_workspace(0, raft::resource::get_cuda_stream(res))
@@ -129,13 +126,9 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     // constexpr unsigned max_result_buffer_size = 256;
     RAFT_EXPECTS(result_buffer_size_32 <= 256, "Result buffer size cannot exceed 256");
 
-    const auto query_smem_buffer_length =
-      raft::ceildiv<uint32_t>(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-
-    smem_size = sizeof(float) * query_smem_buffer_length +
+    smem_size = dataset_desc.smem_ws_size_in_bytes +
                 (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
-                sizeof(uint32_t) * search_width + sizeof(uint32_t) +
-                DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
+                sizeof(uint32_t) * search_width + sizeof(uint32_t);
     RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
 
     //
@@ -204,44 +197,37 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   ~search() {}
 
-  void operator()(
-    raft::resources const& res,
-    // raft::device_matrix_view<const DATA_T, int64_t, layout_stride> dataset,
-    DATASET_DESCRIPTOR_T dataset_desc,
-    raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-      graph,
-    typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-    typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-    const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-    const uint32_t num_queries,
-    const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-    uint32_t* const num_executed_iterations,                     // [num_queries,]
-    uint32_t topk,
-    SAMPLE_FILTER_T sample_filter)
+  void operator()(raft::resources const& res,
+                  raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
+                  const uint32_t num_queries,
+                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
+                  uint32_t* const num_executed_iterations,  // [num_queries,]
+                  uint32_t topk,
+                  SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-
-    select_and_run<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-      dataset_desc,
-      graph,
-      intermediate_indices.data(),
-      intermediate_distances.data(),
-      queries_ptr,
-      num_queries,
-      dev_seed_ptr,
-      num_executed_iterations,
-      *this,
-      topk,
-      thread_block_size,
-      result_buffer_size,
-      smem_size,
-      hash_bitlen,
-      hashmap.data(),
-      num_cta_per_query,
-      num_seeds,
-      sample_filter,
-      this->metric,
-      stream);
+    select_and_run(dataset_desc.dev_ptr(),
+                   graph,
+                   intermediate_indices.data(),
+                   intermediate_distances.data(),
+                   queries_ptr,
+                   num_queries,
+                   dev_seed_ptr,
+                   num_executed_iterations,
+                   *this,
+                   topk,
+                   thread_block_size,
+                   result_buffer_size,
+                   smem_size,
+                   hash_bitlen,
+                   hashmap.data(),
+                   num_cta_per_query,
+                   num_seeds,
+                   sample_filter,
+                   stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
 
     // Select the top-k results from the intermediate results
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
index cb63c0e03..3153a3a9f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
@@ -39,8 +39,6 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 """
 
@@ -48,7 +46,6 @@
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
 """
 
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
 # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
 # mxelem = [64, 128, 256]
 load_types = ["uint4"]
@@ -66,13 +63,12 @@
 )
 # knn
 for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        path = f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu"
-        with open(path, "w") as f:
-            f.write(header)
-            f.write(
-                    f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-            )
-            f.write(trailer)
-            # For pasting into CMakeLists.txt
-        print(f"src/neighbors/detail/cagra/{path}")
+    path = f"search_multi_cta_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(
+                f"instantiate_kernel_selection(\n  {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
+        )
+        f.write(trailer)
+    # Print the generated path so it can be pasted into CMakeLists.txt
+    print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
index 2a14699f4..fae5a9387 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
deleted file mode 100644
index 0bf4a192f..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
deleted file mode 100644
index a77859b7d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
deleted file mode 100644
index ab49fa9f2..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
index 157942dc5..88167b843 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
deleted file mode 100644
index c38eeb009..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
deleted file mode 100644
index 3094ddaeb..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
deleted file mode 100644
index 91725d185..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
deleted file mode 100644
index 0f452a6fa..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
index ea38b60c0..9606d510f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu
deleted file mode 100644
index cfe7a7aef..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
deleted file mode 100644
index 292859382..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
rename to cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
index ee2400037..dafb89cc3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_multi_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
deleted file mode 100644
index 13044f12d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
deleted file mode 100644
index 2ce6f292d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
deleted file mode 100644
index 2d607eb8d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
index b1cfaf870..036a4e414 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
@@ -21,30 +21,26 @@
 
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T>(      \
-    DATASET_DESC_T dataset_desc,                                                                  \
-    raft::device_matrix_view<const typename DATASET_DESC_T::INDEX_T, int64_t, raft::row_major>    \
-      graph,                                                                                      \
-    typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr,                                     \
-    typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr,                                \
-    const typename DATASET_DESC_T::DATA_T* const queries_ptr,                                     \
-    const uint32_t num_queries,                                                                   \
-    const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr,                                         \
-    uint32_t* const num_executed_iterations,                                                      \
-    const search_params& ps,                                                                      \
-    uint32_t topk,                                                                                \
-    uint32_t block_size,                                                                          \
-    uint32_t result_buffer_size,                                                                  \
-    uint32_t smem_size,                                                                           \
-    int64_t hash_bitlen,                                                                          \
-    typename DATASET_DESC_T::INDEX_T* hashmap_ptr,                                                \
-    uint32_t num_cta_per_query,                                                                   \
-    uint32_t num_seeds,                                                                           \
-    SAMPLE_FILTER_T sample_filter,                                                                \
-    cuvs::distance::DistanceType metric,                                                          \
+#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
+  template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
+    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
+    IndexT* topk_indices_ptr,                                                 \
+    DistanceT* topk_distances_ptr,                                            \
+    const DataT* queries_ptr,                                                 \
+    uint32_t num_queries,                                                     \
+    const IndexT* dev_seed_ptr,                                               \
+    uint32_t* num_executed_iterations,                                        \
+    const search_params& ps,                                                  \
+    uint32_t topk,                                                            \
+    uint32_t block_size,                                                      \
+    uint32_t result_buffer_size,                                              \
+    uint32_t smem_size,                                                       \
+    int64_t hash_bitlen,                                                      \
+    IndexT* hashmap_ptr,                                                      \
+    uint32_t num_cta_per_query,                                               \
+    uint32_t num_seeds,                                                       \
+    SampleFilterT sample_filter,                                              \
     cudaStream_t stream);
 
-#define COMMA ,
-
 }  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
new file mode 100644
index 000000000..a3322c435
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_multi_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_multi_cta_00_generate.py
+ *
+ */
+
+#include "search_multi_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+instantiate_kernel_selection(int8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
deleted file mode 100644
index c28adbf80..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
deleted file mode 100644
index af5f13397..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
deleted file mode 100644
index bcc7b9b8c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
deleted file mode 100644
index 916196c35..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
deleted file mode 100644
index e907568f5..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "compute_distance_vpq.cuh"
-#include <cuvs/neighbors/common.hpp>    // none_cagra_sample_filter
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-#include <cuda_fp16.h>
-
-namespace cuvs::neighbors::cagra::detail {
-namespace multi_cta_search {
-
-#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  // multi_cta_search (params struct)
-  uint32_t block_size,  //
-  uint32_t result_buffer_size,
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  uint32_t num_cta_per_query,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream) RAFT_EXPLICIT;
-#endif  // CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-#define instantiate_kernel_selection(                                                           \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                     \
-  extern template void select_and_run<                                                          \
-    TEAM_SIZE,                                                                                  \
-    MAX_DATASET_DIM,                                                                            \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>, \
-    SAMPLE_FILTER_T>(                                                                           \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>  \
-      dataset_desc,                                                                             \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_kernel_selection(
-  32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_kernel_selection(
-  32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_kernel_selection
-
-#define instantiate_q_kernel_selection(TEAM_SIZE,                                               \
-                                       MAX_DATASET_DIM,                                         \
-                                       CODE_BOOK_T,                                             \
-                                       PQ_BITS,                                                 \
-                                       PQ_CODE_BOOK_DIM,                                        \
-                                       DATA_T,                                                  \
-                                       INDEX_T,                                                 \
-                                       DISTANCE_T,                                              \
-                                       SAMPLE_FILTER_T)                                         \
-  extern template void                                                                          \
-  select_and_run<TEAM_SIZE,                                                                     \
-                 MAX_DATASET_DIM,                                                               \
-                 cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,           \
-                                                                              CODE_BOOK_T,      \
-                                                                              PQ_BITS,          \
-                                                                              PQ_CODE_BOOK_DIM, \
-                                                                              DISTANCE_T,       \
-                                                                              INDEX_T>,         \
-                 SAMPLE_FILTER_T>(                                                              \
-    cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,                        \
-                                                                 CODE_BOOK_T,                   \
-                                                                 PQ_BITS,                       \
-                                                                 PQ_CODE_BOOK_DIM,              \
-                                                                 DISTANCE_T,                    \
-                                                                 INDEX_T> dataset_desc,         \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t result_buffer_size,                                                                \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    uint32_t num_cta_per_query,                                                                 \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               half,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               half,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(
-  8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               float,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               uint8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               2,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(8,
-                               128,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(16,
-                               256,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               512,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_kernel_selection(32,
-                               1024,
-                               half,
-                               8,
-                               4,
-                               int8_t,
-                               uint32_t,
-                               float,
-                               cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_q_kernel_selection
-}  // namespace multi_cta_search
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 4d2030c6c..dd74ba44b 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -15,12 +15,14 @@
  */
 #pragma once
 
+#include "search_multi_cta_kernel.cuh"
+
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk if possible
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -53,11 +55,12 @@ namespace multi_cta_search {
 // #define _CLK_BREAKDOWN
 
 template <class INDEX_T>
-__device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [search_width]
-                                    const uint32_t search_width,
-                                    INDEX_T* const itopk_indices,  // [num_itopk]
-                                    const size_t num_itopk,
-                                    uint32_t* const terminate_flag)
+RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(
+  INDEX_T* const next_parent_indices,  // [search_width]
+  const uint32_t search_width,
+  INDEX_T* const itopk_indices,  // [num_itopk]
+  const size_t num_itopk,
+  uint32_t* const terminate_flag)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   const unsigned warp_id             = threadIdx.x / 32;
@@ -93,10 +96,11 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [sea
 }
 
 template <unsigned MAX_ELEMENTS, class INDEX_T>
-__device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
-                                            INDEX_T* indices,  // [num_elements]
-                                            const uint32_t num_elements,
-                                            const uint32_t num_itopk  // num_itopk <= num_elements
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
+  float* distances,  // [num_elements]
+  INDEX_T* indices,  // [num_elements]
+  const uint32_t num_elements,
+  const uint32_t num_itopk  // num_itopk <= num_elements
 )
 {
   const unsigned warp_id = threadIdx.x / 32;
@@ -130,17 +134,13 @@ __device__ inline void topk_by_bitonic_sort(float* distances,  // [num_elements]
 //
 // multiple CTAs per single query
 //
-template <int32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          std::uint32_t MAX_ELEMENTS,
-          class DATASET_DESCRIPTOR_T,
-          class SAMPLE_FILTER_T>
-__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
+template <std::uint32_t MAX_ELEMENTS, class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const
     result_indices_ptr,  // [num_queries, num_cta_per_query, itopk_size]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
     result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
-  DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
   const uint32_t graph_degree,
@@ -156,13 +156,11 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const uint32_t min_iteration,
   const uint32_t max_iteration,
   uint32_t* const num_executed_iterations, /* stats */
-  SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using QUERY_T    = typename DATASET_DESCRIPTOR_T::QUERY_T;
 
   const auto num_queries       = gridDim.y;
   const auto query_id          = blockIdx.y;
@@ -184,7 +182,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
   _CLK_START();
 
-  extern __shared__ uint32_t smem[];
+  extern __shared__ uint8_t smem[];
 
   // Layout of result_buffer
   // +----------------+------------------------------+---------+
@@ -192,26 +190,21 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // | <itopk_size>   | <search_width * graph_degree> | upto 32 |
   // +----------------+------------------------------+---------+
   // |<---          result_buffer_size           --->|
-  uint32_t result_buffer_size    = itopk_size + (search_width * graph_degree);
-  uint32_t result_buffer_size_32 = result_buffer_size;
-  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
+  const auto result_buffer_size    = itopk_size + (search_width * graph_degree);
+  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
   assert(result_buffer_size_32 <= MAX_ELEMENTS);
 
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  auto query_buffer          = reinterpret_cast<QUERY_T*>(smem);
-  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + query_smem_buffer_length);
-  auto result_distances_buffer =
+  // Set smem working buffer for the distance calculation
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+
+  auto* __restrict__ result_indices_buffer =
+    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
+  auto* __restrict__ result_distances_buffer =
     reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
-  auto parent_indices_buffer =
+  auto* __restrict__ parent_indices_buffer =
     reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
-  auto distance_work_buffer_ptr =
-    reinterpret_cast<std::uint8_t*>(parent_indices_buffer + search_width);
-  auto terminate_flag = reinterpret_cast<uint32_t*>(distance_work_buffer_ptr +
-                                                    DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte);
-
-  // Set smem working buffer for the distance calculation
-  dataset_desc.set_smem_ptr(distance_work_buffer_ptr);
+  auto* __restrict__ terminate_flag =
+    reinterpret_cast<uint32_t*>(parent_indices_buffer + search_width);
 
 #if 0
     /* debug */
@@ -220,9 +213,6 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
         result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
     }
 #endif
-  const DATA_T* const query_ptr = queries_ptr + (dataset_desc.dim * query_id);
-  dataset_desc.template copy_query<DATASET_BLOCK_DIM>(
-    query_ptr, query_buffer, query_smem_buffer_length);
 
   if (threadIdx.x == 0) { terminate_flag[0] = 0; }
   INDEX_T* const local_visited_hashmap_ptr =
@@ -236,20 +226,18 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   uint32_t block_id                   = cta_id + (num_cta_per_query * query_id);
   uint32_t num_blocks                 = num_cta_per_query * num_queries;
 
-  device::compute_distance_to_random_nodes<TEAM_SIZE, DATASET_BLOCK_DIM>(result_indices_buffer,
-                                                                         result_distances_buffer,
-                                                                         query_buffer,
-                                                                         dataset_desc,
-                                                                         result_buffer_size,
-                                                                         num_distilation,
-                                                                         rand_xor_mask,
-                                                                         local_seed_ptr,
-                                                                         num_seeds,
-                                                                         local_visited_hashmap_ptr,
-                                                                         hash_bitlen,
-                                                                         metric,
-                                                                         block_id,
-                                                                         num_blocks);
+  device::compute_distance_to_random_nodes(result_indices_buffer,
+                                           result_distances_buffer,
+                                           *dataset_desc,
+                                           result_buffer_size,
+                                           num_distilation,
+                                           rand_xor_mask,
+                                           local_seed_ptr,
+                                           num_seeds,
+                                           local_visited_hashmap_ptr,
+                                           hash_bitlen,
+                                           block_id,
+                                           num_blocks);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -279,21 +267,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 
     // compute the norms between child nodes and query node
     _CLK_START();
-    // constexpr unsigned max_n_frags = 16;
-    constexpr unsigned max_n_frags = 0;
-    device::compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, max_n_frags>(
-      result_indices_buffer + itopk_size,
-      result_distances_buffer + itopk_size,
-      query_buffer,
-      dataset_desc,
-      knn_graph,
-      graph_degree,
-      local_visited_hashmap_ptr,
-      hash_bitlen,
-      parent_indices_buffer,
-      result_indices_buffer,
-      search_width,
-      metric);
+    device::compute_distance_to_child_nodes(result_indices_buffer + itopk_size,
+                                            result_distances_buffer + itopk_size,
+                                            *dataset_desc,
+                                            knn_graph,
+                                            graph_degree,
+                                            local_visited_hashmap_ptr,
+                                            hash_bitlen,
+                                            parent_indices_buffer,
+                                            result_indices_buffer,
+                                            search_width);
     _CLK_REC(clk_compute_distance);
     __syncthreads();
 
@@ -409,84 +392,58 @@ void set_value_batch(T* const dev_ptr,
     <<<grid_size, block_size, 0, cuda_stream>>>(dev_ptr, ld, val, count, batch_size);
 }
 
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
+template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
 struct search_kernel_config {
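-  // Search kernel function type. Note that the actual values for the template value
-  // parameters do not matter, because they are not part of the function signature. The
-  // second to fourth value parameters will be selected by the choose_* functions below.
+  // Search kernel function type. The value of the MAX_ELEMENTS template parameter used
+  // here does not matter, because it is not part of the function signature. The actual
+  // MAX_ELEMENTS value is selected by choose_buffer_size() below.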
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           DATASET_BLOCK_DIM,
-                                           128,
-                                           DATASET_DESCRIPTOR_T,
-                                           SAMPLE_FILTER_T>);
+  using kernel_t = decltype(&search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
 
   static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t
   {
     if (result_buffer_size <= 64) {
-      return search_kernel<TEAM_SIZE, DATASET_BLOCK_DIM, 64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+      return search_kernel<64, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     } else if (result_buffer_size <= 128) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           128,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return search_kernel<128, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     } else if (result_buffer_size <= 256) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           256,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return search_kernel<256, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
     }
     THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  // multi_cta_search (params struct)
-  uint32_t block_size,  //
-  uint32_t result_buffer_size,
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  uint32_t num_cta_per_query,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream)
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    // multi_cta_search (params struct)
+                    uint32_t block_size,  //
+                    uint32_t result_buffer_size,
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    uint32_t num_cta_per_query,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream)
 {
   auto kernel =
-    search_kernel_config<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::
-      choose_buffer_size(result_buffer_size, block_size);
+    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
+                         SampleFilterT>::choose_buffer_size(result_buffer_size, block_size);
 
-  RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel,
-                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                     smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte));
+  RAFT_CUDA_TRY(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   // Initialize hash table
   const uint32_t hash_size = hashmap::get_size(hash_bitlen);
-  set_value_batch(hashmap_ptr,
-                  hash_size,
-                  utils::get_max_value<typename DATASET_DESCRIPTOR_T::INDEX_T>(),
-                  hash_size,
-                  num_queries,
-                  stream);
+  set_value_batch(
+    hashmap_ptr, hash_size, utils::get_max_value<IndexT>(), hash_size, num_queries, stream);
 
   dim3 block_dims(block_size, 1, 1);
   dim3 grid_dims(num_cta_per_query, num_queries, 1);
@@ -513,8 +470,7 @@ void select_and_run(
                                                        ps.min_iterations,
                                                        ps.max_iterations,
                                                        num_executed_iterations,
-                                                       sample_filter,
-                                                       metric);
+                                                       sample_filter);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
index 673fc5473..1ef35f947 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,32 @@
  */
 #pragma once
 
-#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-#include "search_multi_cta_kernel-inl.cuh"
-#endif
+#include "compute_distance-ext.cuh"
 
-#ifdef RAFT_COMPILED
-#include "search_multi_cta_kernel-ext.cuh"
-#endif
+#include <cuvs/neighbors/cagra.hpp>
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+// Declaration only; the definition lives in search_multi_cta_kernel-inl.cuh.
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    // multi_cta_search (params struct)
+                    uint32_t block_size,  //
+                    uint32_t result_buffer_size,
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    uint32_t num_cta_per_query,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
new file mode 100644
index 000000000..51fc6526f
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_multi_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_multi_cta_00_generate.py
+ *
+ */
+
+#include "search_multi_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::multi_cta_search {
+instantiate_kernel_selection(uint8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
deleted file mode 100644
index 3fa12d933..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
deleted file mode 100644
index e2f25a1c2..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
deleted file mode 100644
index 4cd206d8c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
deleted file mode 100644
index 56989a1d5..000000000
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_multi_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_multi_cta_00_generate.py
- *
- */
-
-#include "search_multi_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::multi_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::multi_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index bc1266fb4..7b3ecabf3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -15,12 +15,11 @@
  */
 #pragma once
 
-#include "compute_distance.hpp"
-#include "compute_distance_vpq.cuh"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
-#include "topk_for_cagra/topk_core.cuh"  //todo replace with raft kernel
+#include "topk_for_cagra/topk.h"  // TODO: replace with raft kernel
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -93,9 +92,9 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre
 }
 
 // MAX_DATASET_DIM : must equal to or greater than dataset_dim
-template <unsigned TEAM_SIZE, unsigned DATASET_BLOCK_DIM, class DATASET_DESCRIPTOR_T>
+template <class DATASET_DESCRIPTOR_T>
 RAFT_KERNEL random_pickup_kernel(
-  const DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const std::size_t num_pickup,
   const unsigned num_distilation,
@@ -106,30 +105,19 @@ RAFT_KERNEL random_pickup_kernel(
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
   const std::uint32_t ldr,                                                // (*) ldr >= num_pickup
   typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric)
+  const std::uint32_t hash_bitlen)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
+  const auto team_size_bits    = dataset_desc->team_size_bitshift();
   const auto ldb               = hashmap::get_size(hash_bitlen);
-  const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / TEAM_SIZE;
+  const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) >> team_size_bits;
   const uint32_t query_id      = blockIdx.y;
   if (global_team_index >= num_pickup) { return; }
-  // Load a query
-  extern __shared__ float query_buffer[];
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-    unsigned j = device::swizzling(i);
-    if (i < dataset_desc.dim) {
-      query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping<float>{}(
-        (queries_ptr + query_id * dataset_desc.dim)[i]);
-    } else {
-      query_buffer[j] = 0.0;
-    }
-  }
+  extern __shared__ uint8_t smem[];
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
   __syncthreads();
 
   INDEX_T best_index_team_local;
@@ -141,27 +129,10 @@ RAFT_KERNEL random_pickup_kernel(
     } else {
       // Chose a seed node randomly
       seed_index =
-        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc.size;
-    }
-
-    DISTANCE_T norm2;
-    switch (metric) {
-      case cuvs::distance::DistanceType::L2Expanded:
-        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                         TEAM_SIZE,
-                                                         cuvs::distance::DistanceType::L2Expanded>(
-          query_buffer, seed_index, true);
-        break;
-      case cuvs::distance::DistanceType::InnerProduct:
-        norm2 =
-          dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                   TEAM_SIZE,
-                                                   cuvs::distance::DistanceType::InnerProduct>(
-            query_buffer, seed_index, true);
-        break;
-      default: break;
+        device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc->size;
     }
 
+    DISTANCE_T norm2 = dataset_desc->compute_distance(seed_index, true);
     if (norm2 < best_norm2_team_local) {
       best_norm2_team_local = norm2;
       best_index_team_local = seed_index;
@@ -169,7 +140,7 @@ RAFT_KERNEL random_pickup_kernel(
   }
 
   const auto store_gmem_index = global_team_index + (ldr * query_id);
-  if (threadIdx.x % TEAM_SIZE == 0) {
+  if ((threadIdx.x & ((1u << team_size_bits) - 1u)) == 0) {
     if (hashmap::insert(
           visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) {
       result_distances_ptr[store_gmem_index] = best_norm2_team_local;
@@ -182,47 +153,40 @@ RAFT_KERNEL random_pickup_kernel(
 }
 
 // MAX_DATASET_DIM : must be equal to or greater than dataset_dim
-template <unsigned TEAM_SIZE, unsigned DATASET_BLOCK_DIM, class DATASET_DESCRIPTOR_T>
-void random_pickup(
-  const DATASET_DESCRIPTOR_T dataset_desc,
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const std::size_t num_queries,
-  const std::size_t num_pickup,
-  const unsigned num_distilation,
-  const uint64_t rand_xor_mask,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
-  const uint32_t num_seeds,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldr]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
-  const std::size_t ldr,                                                  // (*) ldr >= num_pickup
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen,
-  const cuvs::distance::DistanceType metric,
-  cudaStream_t const cuda_stream = 0)
+template <typename DataT, typename IndexT, typename DistanceT>
+void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+                   const DataT* queries_ptr,  // [num_queries, dataset_dim]
+                   std::size_t num_queries,
+                   std::size_t num_pickup,  // one team per pickup; determines grid_size.x
+                   unsigned num_distilation,
+                   uint64_t rand_xor_mask,
+                   const IndexT* seed_ptr,  // [num_queries, num_seeds]
+                   uint32_t num_seeds,
+                   IndexT* result_indices_ptr,       // [num_queries, ldr]
+                   DistanceT* result_distances_ptr,  // [num_queries, ldr]
+                   std::size_t ldr,                  // (*) ldr >= num_pickup
+                   IndexT* visited_hashmap_ptr,      // [num_queries, 1 << bitlen]
+                   std::uint32_t hash_bitlen,
+                   cudaStream_t cuda_stream)
 {
   const auto block_size                = 256u;
-  const auto num_teams_per_threadblock = block_size / TEAM_SIZE;
+  const auto num_teams_per_threadblock = block_size / dataset_desc.team_size;
   const dim3 grid_size((num_pickup + num_teams_per_threadblock - 1) / num_teams_per_threadblock,
                        num_queries);
 
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  const auto smem_size = query_smem_buffer_length * sizeof(float);
-
-  random_pickup_kernel<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T>
-    <<<grid_size, block_size, smem_size, cuda_stream>>>(dataset_desc,
-                                                        queries_ptr,
-                                                        num_pickup,
-                                                        num_distilation,
-                                                        rand_xor_mask,
-                                                        seed_ptr,
-                                                        num_seeds,
-                                                        result_indices_ptr,
-                                                        result_distances_ptr,
-                                                        ldr,
-                                                        visited_hashmap_ptr,
-                                                        hash_bitlen,
-                                                        metric);
+  random_pickup_kernel<<<grid_size, block_size, dataset_desc.smem_ws_size_in_bytes, cuda_stream>>>(
+    dataset_desc.dev_ptr(),  // descriptor pointer that the kernel dereferences
+    queries_ptr,
+    num_pickup,
+    num_distilation,
+    rand_xor_mask,
+    seed_ptr,
+    num_seeds,
+    result_indices_ptr,
+    result_distances_ptr,
+    ldr,
+    visited_hashmap_ptr,
+    hash_bitlen);
 }
 
 template <class INDEX_T>
@@ -325,9 +289,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr,  // [num_queries,
                                                   terminate_flag);
 }
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class DATASET_DESCRIPTOR_T,
+template <class DATASET_DESCRIPTOR_T,
           class SAMPLE_FILTER_T>
 RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const
@@ -338,7 +300,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
     parent_distance_ptr,  // [num_queries, search_width]
   const std::size_t lds,
   const std::uint32_t search_width,
-  const DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const
     neighbor_graph_ptr,  // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
@@ -349,29 +311,22 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
   const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
-  SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
+  const auto team_size_bits = dataset_desc->team_size_bitshift();
+  const auto team_size      = 1u << team_size_bits;
   const uint32_t ldb        = hashmap::get_size(hash_bitlen);
   const auto tid            = threadIdx.x + blockDim.x * blockIdx.x;
-  const auto global_team_id = tid / TEAM_SIZE;
+  const auto global_team_id = tid >> team_size_bits;
   const auto query_id       = blockIdx.y;
 
-  extern __shared__ float query_buffer[];
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  for (uint32_t i = threadIdx.x; i < query_smem_buffer_length; i += blockDim.x) {
-    unsigned j = device::swizzling(i);
-    if (i < dataset_desc.dim) {
-      query_buffer[j] = cuvs::spatial::knn::detail::utils::mapping<float>{}(
-        (query_ptr + query_id * dataset_desc.dim)[i]);
-    } else {
-      query_buffer[j] = 0.0;
-    }
-  }
+  extern __shared__ uint8_t smem[];
+  // Set up the shared-memory workspace for this query
+  dataset_desc = dataset_desc->setup_workspace(smem, query_ptr, query_id);
+
   __syncthreads();
   if (global_team_id >= search_width * graph_degree) { return; }
 
@@ -393,33 +348,18 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
 
   const std::size_t child_id = neighbor_list_head_ptr[global_team_id % graph_degree];
 
-  const auto compute_distance_flag = hashmap::insert<TEAM_SIZE, INDEX_T>(
-    visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id);
-
-  DISTANCE_T norm2;
-  switch (metric) {
-    case cuvs::distance::DistanceType::L2Expanded:
-      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                       TEAM_SIZE,
-                                                       cuvs::distance::DistanceType::L2Expanded>(
-        query_buffer, child_id, compute_distance_flag);
-      break;
-    case cuvs::distance::DistanceType::InnerProduct:
-      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
-                                                       TEAM_SIZE,
-                                                       cuvs::distance::DistanceType::InnerProduct>(
-        query_buffer, child_id, compute_distance_flag);
-      break;
-    default: break;
-  }
+  const auto compute_distance_flag = hashmap::insert<INDEX_T>(
+    team_size, visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id);
+
+  DISTANCE_T norm2 = dataset_desc->compute_distance(child_id, compute_distance_flag);
 
   if (compute_distance_flag) {
-    if (threadIdx.x % TEAM_SIZE == 0) {
+    if ((threadIdx.x & (team_size - 1)) == 0) {
       result_indices_ptr[ldd * blockIdx.y + global_team_id]   = child_id;
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = norm2;
     }
   } else {
-    if (threadIdx.x % TEAM_SIZE == 0) {
+    if ((threadIdx.x & (team_size - 1)) == 0) {
       result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value<DISTANCE_T>();
     }
   }
@@ -434,66 +374,52 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   }
 }
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class SAMPLE_FILTER_T,
-          class DATASET_DESCRIPTOR_T>
+template <typename DataT,
+          typename IndexT,
+          typename DistanceT,
+          class SAMPLE_FILTER_T>
 void compute_distance_to_child_nodes(
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    parent_node_list,  // [num_queries, search_width]
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    parent_candidates_ptr,  // [num_queries, search_width]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const
-    parent_distance_ptr,  // [num_queries, search_width]
-  const std::size_t lds,
-  const uint32_t search_width,
-  const DATASET_DESCRIPTOR_T dataset_desc,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    neighbor_graph_ptr,  // [dataset_size, graph_degree]
-  const std::uint32_t graph_degree,
-  const typename DATASET_DESCRIPTOR_T::DATA_T* query_ptr,  // [num_queries, data_dim]
-  const std::uint32_t num_queries,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const
-    visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
-  const std::uint32_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
-  const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
+  const IndexT* parent_node_list,        // [num_queries, search_width]
+  IndexT* const parent_candidates_ptr,   // [num_queries, search_width]
+  DistanceT* const parent_distance_ptr,  // [num_queries, search_width]
+  std::size_t lds,
+  uint32_t search_width,
+  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
+  const IndexT* neighbor_graph_ptr,  // [dataset_size, graph_degree]
+  std::uint32_t graph_degree,
+  const DataT* query_ptr,  // [num_queries, data_dim]
+  std::uint32_t num_queries,
+  IndexT* visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  std::uint32_t hash_bitlen,
+  IndexT* result_indices_ptr,       // [num_queries, ldd]
+  DistanceT* result_distances_ptr,  // [num_queries, ldd]
+  std::uint32_t ldd,                // (*) ldd >= search_width * graph_degree
   SAMPLE_FILTER_T sample_filter,
-  const cuvs::distance::DistanceType metric,
-  cudaStream_t cuda_stream = 0)
+  cudaStream_t cuda_stream)
 {
-  const auto block_size = 128;
-  const dim3 grid_size(
-    (search_width * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE),
-    num_queries);
-
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-
-  const auto smem_size =
-    query_smem_buffer_length * sizeof(float) + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
-
-  compute_distance_to_child_nodes_kernel<TEAM_SIZE,
-                                         DATASET_BLOCK_DIM,
-                                         DATASET_DESCRIPTOR_T,
-                                         SAMPLE_FILTER_T>
-    <<<grid_size, block_size, smem_size, cuda_stream>>>(parent_node_list,
-                                                        parent_candidates_ptr,
-                                                        parent_distance_ptr,
-                                                        lds,
-                                                        search_width,
-                                                        dataset_desc,
-                                                        neighbor_graph_ptr,
-                                                        graph_degree,
-                                                        query_ptr,
-                                                        visited_hashmap_ptr,
-                                                        hash_bitlen,
-                                                        result_indices_ptr,
-                                                        result_distances_ptr,
-                                                        ldd,
-                                                        sample_filter,
-                                                        metric);
+  const auto block_size      = 128;
+  const auto teams_per_block = block_size / dataset_desc.team_size;  // one team per child node
+  const dim3 grid_size((search_width * graph_degree + teams_per_block - 1) / teams_per_block,
+                       num_queries);  // grid_size.y: one grid row per query
+
+  compute_distance_to_child_nodes_kernel<<<grid_size,
+                                           block_size,
+                                           dataset_desc.smem_ws_size_in_bytes,
+                                           cuda_stream>>>(parent_node_list,
+                                                          parent_candidates_ptr,
+                                                          parent_distance_ptr,
+                                                          lds,
+                                                          search_width,
+                                                          dataset_desc.dev_ptr(),
+                                                          neighbor_graph_ptr,
+                                                          graph_degree,
+                                                          query_ptr,
+                                                          visited_hashmap_ptr,
+                                                          hash_bitlen,
+                                                          result_indices_ptr,
+                                                          result_distances_ptr,
+                                                          ldd,
+                                                          sample_filter);
 }
 
 template <class INDEX_T>
@@ -639,49 +565,48 @@ void set_value_batch(T* const dev_ptr,
 // |<---                 result_buffer_allocation_size                 --->|
 // |<---                       result_buffer_size  --->|                     // Double buffer (A)
 //                      |<---  result_buffer_size                      --->| // Double buffer (B)
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
 
   static_assert(std::is_same_v<DISTANCE_T, float>, "Only float is supported as resulting distance");
 
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   size_t result_buffer_allocation_size;
   rmm::device_uvector<INDEX_T> result_indices;       // results_indices_buffer
@@ -699,12 +624,11 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric),
+         uint32_t topk)
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk),
       result_indices(0, raft::resource::get_cuda_stream(res)),
       result_distances(0, raft::resource::get_cuda_stream(res)),
       parent_node_list(0, raft::resource::get_cuda_stream(res)),
@@ -837,7 +761,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 
   void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
                   raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                   INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
                   DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
@@ -865,21 +788,20 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     }
 
     // Choose initial entry point candidates at random
-    random_pickup<TEAM_SIZE, DATASET_BLOCK_DIM>(dataset_desc,
-                                                queries_ptr,
-                                                num_queries,
-                                                result_buffer_size,
-                                                num_random_samplings,
-                                                rand_xor_mask,
-                                                dev_seed_ptr,
-                                                num_seeds,
-                                                result_indices.data(),
-                                                result_distances.data(),
-                                                result_buffer_allocation_size,
-                                                hashmap.data(),
-                                                hash_bitlen,
-                                                this->metric,
-                                                stream);
+    random_pickup<DataT, IndexT, DistanceT>(dataset_desc,
+                                            queries_ptr,
+                                            num_queries,
+                                            result_buffer_size,
+                                            num_random_samplings,
+                                            rand_xor_mask,
+                                            dev_seed_ptr,
+                                            num_seeds,
+                                            result_indices.data(),
+                                            result_distances.data(),
+                                            result_buffer_allocation_size,
+                                            hashmap.data(),
+                                            hash_bitlen,
+                                            stream);
 
     unsigned iter = 0;
     while (1) {
@@ -931,7 +853,7 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
       }
 
       // Compute distance to child nodes that are adjacent to the parent node
-      compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, SAMPLE_FILTER_T>(
+      compute_distance_to_child_nodes(
         parent_node_list.data(),
         result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
         result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size,
@@ -948,7 +870,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
         result_distances.data() + itopk_size,
         result_buffer_allocation_size,
         sample_filter,
-        this->metric,
         stream);
 
       iter++;
@@ -1025,70 +946,5 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          class CODE_BOOK_T,
-          unsigned PQ_BITS,
-          unsigned PQ_CODE_BOOK_DIM,
-          class DATA_T,
-          class DISTANCE_T_,
-          class INDEX_T_,
-          typename SAMPLE_FILTER_T>
-struct search<TEAM_SIZE,
-              DATASET_BLOCK_DIM,
-              cagra_q_dataset_descriptor_t<DATA_T,
-                                           CODE_BOOK_T,
-                                           PQ_BITS,
-                                           PQ_CODE_BOOK_DIM,
-                                           DISTANCE_T_,
-                                           INDEX_T_>,
-              SAMPLE_FILTER_T>
-  : public search_plan_impl<cagra_q_dataset_descriptor_t<DATA_T,
-                                                         CODE_BOOK_T,
-                                                         PQ_BITS,
-                                                         PQ_CODE_BOOK_DIM,
-
-                                                         DISTANCE_T_,
-                                                         INDEX_T_>,
-                            SAMPLE_FILTER_T> {
-  using DATASET_DESCRIPTOR_T = cagra_q_dataset_descriptor_t<DATA_T,
-                                                            CODE_BOOK_T,
-                                                            PQ_BITS,
-                                                            PQ_CODE_BOOK_DIM,
-
-                                                            DISTANCE_T_,
-                                                            INDEX_T_>;
-  using INDEX_T              = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T           = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  search(raft::resources const& res,
-         search_params params,
-         int64_t dim,
-         int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric)
-  {
-    THROW("The multi-kernel mode does not support VPQ");
-  }
-
-  void set_params(raft::resources const& res) {}
-
-  void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
-                  raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
-                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
-                  const uint32_t num_queries,
-                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
-                  uint32_t* const num_executed_iterations,  // [num_queries,]
-                  uint32_t topk,
-                  SAMPLE_FILTER_T sample_filter)
-  {
-  }
-};
-
 }  // namespace multi_kernel_search
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh
index 0543224b3..16864ed19 100644
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh
@@ -18,10 +18,11 @@
 
 #include "hashmap.hpp"
 
+#include "compute_distance-ext.cuh"
 #include <cuvs/neighbors/common.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 // #include "search_single_cta_inst.cuh"
-// #include "topk_for_cagra/topk_core.cuh"
+// #include "topk_for_cagra/topk.h"
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
@@ -34,19 +35,12 @@
 namespace cuvs::neighbors::cagra::detail {
 
 struct search_plan_impl_base : public search_params {
-  int64_t dataset_block_dim;
   int64_t dim;
   int64_t graph_degree;
   uint32_t topk;
-  cuvs::distance::DistanceType metric;
-  search_plan_impl_base(search_params params,
-                        int64_t dim,
-                        int64_t graph_degree,
-                        uint32_t topk,
-                        cuvs::distance::DistanceType metric)
-    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric)
+  search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk)
+    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk)
   {
-    set_dataset_block_and_team_size(dim);
     if (algo == search_algo::AUTO) {
       const size_t num_sm = raft::getMultiProcessorCount();
       if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) {
@@ -61,29 +55,13 @@ struct search_plan_impl_base : public search_params {
       }
     }
   }
-
-  void set_dataset_block_and_team_size(int64_t dim)
-  {
-    constexpr int64_t max_dataset_block_dim = 512;
-    dataset_block_dim                       = 128;
-    while (dataset_block_dim < dim && dataset_block_dim < max_dataset_block_dim) {
-      dataset_block_dim *= 2;
-    }
-    // To keep binary size in check we limit only one team size specialization for each max_dim.
-    // TODO(tfeher): revise this decision.
-    switch (dataset_block_dim) {
-      case 128: team_size = 8; break;
-      case 256: team_size = 16; break;
-      default: team_size = 32; break;
-    }
-  }
 };
 
-template <class DATASET_DESCRIPTOR_T, class SAMPLE_FILTER_T>
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
 struct search_plan_impl : public search_plan_impl_base {
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
+  using DATA_T     = DataT;
+  using INDEX_T    = IndexT;
+  using DISTANCE_T = DistanceT;
 
   int64_t hash_bitlen;
 
@@ -100,23 +78,24 @@ struct search_plan_impl : public search_plan_impl_base {
   rmm::device_uvector<INDEX_T> hashmap;
   rmm::device_uvector<uint32_t> num_executed_iterations;  // device or managed?
   rmm::device_uvector<INDEX_T> dev_seed;
+  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc;  // non-owning reference
 
   search_plan_impl(raft::resources const& res,
                    search_params params,
+                   const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                    int64_t dim,
                    int64_t graph_degree,
-                   uint32_t topk,
-                   cuvs::distance::DistanceType metric)
-    : search_plan_impl_base(params, dim, graph_degree, topk, metric),
+                   uint32_t topk)
+    : search_plan_impl_base(params, dim, graph_degree, topk),
       hashmap(0, raft::resource::get_cuda_stream(res)),
       num_executed_iterations(0, raft::resource::get_cuda_stream(res)),
       dev_seed(0, raft::resource::get_cuda_stream(res)),
-      num_seeds(0)
+      num_seeds(0),
+      dataset_desc(dataset_desc)
   {
     adjust_search_params();
     check_params();
     calc_hashmap_params(res);
-    set_dataset_block_and_team_size(dim);
     num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res));
     RAFT_LOG_DEBUG("# algo = %d", static_cast<int>(algo));
   }
@@ -124,7 +103,6 @@ struct search_plan_impl : public search_plan_impl_base {
   virtual ~search_plan_impl() {}
 
   virtual void operator()(raft::resources const& res,
-                          DATASET_DESCRIPTOR_T dataset_desc,
                           raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                           INDEX_T* const result_indices_ptr,       // [num_queries, topk]
                           DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
@@ -160,6 +138,7 @@ struct search_plan_impl : public search_plan_impl_base {
                      itopk32);
       itopk_size = itopk32;
     }
+    team_size = dataset_desc.team_size;  // unconditionally taken from the dataset descriptor
   }
 
   // defines hash_bitlen, small_hash_bitlen, small_hash_reset interval, hash_size
@@ -292,10 +271,6 @@ struct search_plan_impl : public search_plan_impl_base {
         algo != search_algo::MULTI_KERNEL) {
       error_message += "An invalid kernel mode has been given: " + std::to_string((int)algo) + "";
     }
-    if (team_size != 0 && team_size != 4 && team_size != 8 && team_size != 16 && team_size != 32) {
-      error_message +=
-        "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given.";
-    }
     if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 &&
         thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) {
       error_message += "`thread_block_size` must be 0, 64, 128, 256 or 512. " +
@@ -330,20 +305,4 @@ struct search_plan_impl : public search_plan_impl_base {
   }
 };
 
-// template <class DATA_T, class DISTANCE_T, class INDEX_T>
-// struct search_plan {
-//   search_plan(raft::resources const& res,
-//               search_params param,
-//               int64_t dim,
-//               int64_t graph_degree)
-//     : plan(res, param, dim, graph_degree)
-//   {
-//   }
-//   void check(uint32_t topk) { plan.check(topk); }
-
-//   // private:
-//   detail::search_plan_impl<DATA_T, DISTANCE_T, INDEX_T> plan;
-// };
-/** @} */  // end group cagra
-
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index 0a101cbfe..4abed6760 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -16,13 +16,13 @@
 #pragma once
 
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
 #include "search_single_cta_kernel.cuh"
 #include "topk_by_radix.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
@@ -49,58 +49,56 @@
 namespace cuvs::neighbors::cagra::detail {
 namespace single_cta_search {
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
-  using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
-  using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
-  using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_queries;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::itopk_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::algo;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::team_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::search_width;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::min_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::max_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::thread_block_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_mode;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_min_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_max_fill_rate;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_random_samplings;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::rand_xor_mask;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dim;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::graph_degree;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::topk;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hash_bitlen;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_bitlen;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::small_hash_reset_interval;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dataset_size;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::result_buffer_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::smem_size;
-
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::hashmap;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_executed_iterations;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::dev_seed;
-  using search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::num_seeds;
+template <typename DataT, typename IndexT, typename DistanceT, typename SAMPLE_FILTER_T>
+struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
+  using base_type  = search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T>;
+  using DATA_T     = typename base_type::DATA_T;
+  using INDEX_T    = typename base_type::INDEX_T;
+  using DISTANCE_T = typename base_type::DISTANCE_T;
+
+  using base_type::algo;
+  using base_type::hashmap_max_fill_rate;
+  using base_type::hashmap_min_bitlen;
+  using base_type::hashmap_mode;
+  using base_type::itopk_size;
+  using base_type::max_iterations;
+  using base_type::max_queries;
+  using base_type::min_iterations;
+  using base_type::num_random_samplings;
+  using base_type::rand_xor_mask;
+  using base_type::search_width;
+  using base_type::team_size;
+  using base_type::thread_block_size;
+
+  using base_type::dim;
+  using base_type::graph_degree;
+  using base_type::topk;
+
+  using base_type::hash_bitlen;
+
+  using base_type::dataset_size;
+  using base_type::hashmap_size;
+  using base_type::result_buffer_size;
+  using base_type::small_hash_bitlen;
+  using base_type::small_hash_reset_interval;
+
+  using base_type::smem_size;
+
+  using base_type::dataset_desc;
+  using base_type::dev_seed;
+  using base_type::hashmap;
+  using base_type::num_executed_iterations;
+  using base_type::num_seeds;
 
   uint32_t num_itopk_candidates;
 
   search(raft::resources const& res,
          search_params params,
+         const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk,
-         cuvs::distance::DistanceType metric)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
-        res, params, dim, graph_degree, topk, metric)
+         uint32_t topk)
+    : base_type(res, params, dataset_desc, dim, graph_degree, topk)
   {
     set_params(res);
   }
@@ -128,14 +126,11 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
     constexpr unsigned max_block_size       = 1024;
     //
     const std::uint32_t topk_ws_size = 3;
-    const auto query_smem_buffer_length =
-      raft::ceildiv<uint32_t>(dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
     const std::uint32_t base_smem_size =
-      sizeof(float) * query_smem_buffer_length +
+      dataset_desc.smem_ws_size_in_bytes +
       (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
       sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width +
-      sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t) +
-      DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte;
+      sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t);
     smem_size = base_smem_size;
     if (num_itopk_candidates > 256) {
       // Tentatively calculate the required share memory size when radix
@@ -212,7 +207,6 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
   }
 
   void operator()(raft::resources const& res,
-                  DATASET_DESCRIPTOR_T dataset_desc,
                   raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,
                   INDEX_T* const result_indices_ptr,       // [num_queries, topk]
                   DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
@@ -224,28 +218,26 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
                   SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-    select_and_run<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T>(
-      dataset_desc,
-      graph,
-      result_indices_ptr,
-      result_distances_ptr,
-      queries_ptr,
-      num_queries,
-      dev_seed_ptr,
-      num_executed_iterations,
-      *this,
-      topk,
-      num_itopk_candidates,
-      static_cast<uint32_t>(thread_block_size),
-      smem_size,
-      hash_bitlen,
-      hashmap.data(),
-      small_hash_bitlen,
-      small_hash_reset_interval,
-      num_seeds,
-      sample_filter,
-      this->metric,
-      stream);
+    select_and_run(dataset_desc.dev_ptr(),
+                   graph,
+                   result_indices_ptr,
+                   result_distances_ptr,
+                   queries_ptr,
+                   num_queries,
+                   dev_seed_ptr,
+                   num_executed_iterations,
+                   *this,
+                   topk,
+                   num_itopk_candidates,
+                   static_cast<uint32_t>(thread_block_size),
+                   smem_size,
+                   hash_bitlen,
+                   hashmap.data(),
+                   small_hash_bitlen,
+                   small_hash_reset_interval,
+                   num_seeds,
+                   sample_filter,
+                   stream);
   }
 };
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
index a361269a6..e37ceb1fa 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py
@@ -39,8 +39,6 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
 """
 
@@ -48,7 +46,6 @@
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
 """
 
-mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]
 # block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
 # itopk_candidates = [64, 128, 256]
 # itopk_size = [64, 128, 256, 512]
@@ -69,14 +66,13 @@
 
 # knn
 for type_path, (data_t, idx_t, distance_t) in search_types.items():
-    for (mxdim, team) in mxdim_team:
-        path = f"search_single_cta_{type_path}_dim{mxdim}_t{team}.cu"
-        with open(path, "w") as f:
-            f.write(header)
-            f.write(
-                    f"instantiate_kernel_selection(\n  {team}, {mxdim}, cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<{data_t} COMMA {idx_t} COMMA  {distance_t}>, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
-            )
+    path = f"search_single_cta_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(
+                f"instantiate_kernel_selection(\n  {data_t}, {idx_t}, {distance_t}, cuvs::neighbors::filtering::none_cagra_sample_filter);\n"
+        )
 
-            f.write(trailer)
-            # For pasting into CMakeLists.txt
-            print(f"src/neighbors/detail/cagra/{path}")
+        f.write(trailer)
+        # For pasting into CMakeLists.txt
+        print(f"src/neighbors/detail/cagra/{path}")
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
index c2cfb13c4..f8495bc01 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
deleted file mode 100644
index 4cf4a26f7..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
deleted file mode 100644
index 692710476..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
deleted file mode 100644
index ed3a900ff..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
index 1e2b83492..0ef5c366f 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(float,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
deleted file mode 100644
index 2c4da00db..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
deleted file mode 100644
index 8b26a595f..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
deleted file mode 100644
index a93f893d4..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
deleted file mode 100644
index 4a7502e3e..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
index 7d3e86f38..c21e6d1f4 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<float COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu
deleted file mode 100644
index 6c13df91a..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
deleted file mode 100644
index 12aa72a24..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
similarity index 80%
rename from cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
rename to cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
index cfae9e367..b96ed0b22 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64.cu
@@ -25,13 +25,10 @@
 
 #include "search_single_cta_inst.cuh"
 
-#include "compute_distance.hpp"
-
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
+instantiate_kernel_selection(half,
+                             uint64_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
 
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
deleted file mode 100644
index 84a173d6d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
deleted file mode 100644
index d9c5198eb..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
deleted file mode 100644
index 3ba8f4e4d..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<half COMMA uint64_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
index a4581d15e..26ca7b672 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
@@ -21,31 +21,27 @@
 
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
 
-#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T) \
-  template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATASET_DESC_T, SAMPLE_FILTER_T>(      \
-    DATASET_DESC_T dataset_desc,                                                                  \
-    raft::device_matrix_view<const typename DATASET_DESC_T::INDEX_T, int64_t, raft::row_major>    \
-      graph,                                                                                      \
-    typename DATASET_DESC_T::INDEX_T* const topk_indices_ptr,                                     \
-    typename DATASET_DESC_T::DISTANCE_T* const topk_distances_ptr,                                \
-    const typename DATASET_DESC_T::DATA_T* const queries_ptr,                                     \
-    const uint32_t num_queries,                                                                   \
-    const typename DATASET_DESC_T::INDEX_T* dev_seed_ptr,                                         \
-    uint32_t* const num_executed_iterations,                                                      \
-    const search_params& ps,                                                                      \
-    uint32_t topk,                                                                                \
-    uint32_t num_itopk_candidates,                                                                \
-    uint32_t block_size,                                                                          \
-    uint32_t smem_size,                                                                           \
-    int64_t hash_bitlen,                                                                          \
-    typename DATASET_DESC_T::INDEX_T* hashmap_ptr,                                                \
-    size_t small_hash_bitlen,                                                                     \
-    size_t small_hash_reset_interval,                                                             \
-    uint32_t num_seeds,                                                                           \
-    SAMPLE_FILTER_T sample_filter,                                                                \
-    cuvs::distance::DistanceType metric,                                                          \
+#define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
+  template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
+    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
+    IndexT* topk_indices_ptr,                                                 \
+    DistanceT* topk_distances_ptr,                                            \
+    const DataT* queries_ptr,                                                 \
+    uint32_t num_queries,                                                     \
+    const IndexT* dev_seed_ptr,                                               \
+    uint32_t* num_executed_iterations,                                        \
+    const search_params& ps,                                                  \
+    uint32_t topk,                                                            \
+    uint32_t num_itopk_candidates,                                            \
+    uint32_t block_size,                                                      \
+    uint32_t smem_size,                                                       \
+    int64_t hash_bitlen,                                                      \
+    IndexT* hashmap_ptr,                                                      \
+    size_t small_hash_bitlen,                                                 \
+    size_t small_hash_reset_interval,                                         \
+    uint32_t num_seeds,                                                       \
+    SampleFilterT sample_filter,                                              \
     cudaStream_t stream);
 
-#define COMMA ,
-
 }  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
new file mode 100644
index 000000000..56a0d8ba9
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_single_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_single_cta_00_generate.py
+ *
+ */
+
+#include "search_single_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+instantiate_kernel_selection(int8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
deleted file mode 100644
index ad2ca16fc..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
deleted file mode 100644
index 6130a84bc..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
deleted file mode 100644
index 1e7bee57c..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
deleted file mode 100644
index 7f789e3d0..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<int8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
deleted file mode 100644
index 10dda0389..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
+++ /dev/null
@@ -1,588 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuvs/neighbors/common.hpp>
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-
-#include <cuda_fp16.h>
-
-namespace cuvs::neighbors::cagra::detail {
-namespace single_cta_search {
-
-#ifdef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  uint32_t num_itopk_candidates,
-  uint32_t block_size,  //
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  size_t small_hash_bitlen,
-  size_t small_hash_reset_interval,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream) RAFT_EXPLICIT;
-
-#endif  // CUVS_EXPLICIT_INSTANTIATE_ONLY
-
-#define instantiate_single_cta_select_and_run(                                                  \
-  TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T)                     \
-  extern template void select_and_run<                                                          \
-    TEAM_SIZE,                                                                                  \
-    MAX_DATASET_DIM,                                                                            \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>, \
-    SAMPLE_FILTER_T>(                                                                           \
-    cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<DATA_T, INDEX_T, DISTANCE_T>  \
-      dataset,                                                                                  \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_single_cta_select_and_run(
-  32, 1024, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, int8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 1024, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  8, 128, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  16, 256, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_single_cta_select_and_run(
-  32, 512, uint8_t, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_single_cta_select_and_run
-
-#define instantiate_q_single_cta_select_and_run(TEAM_SIZE,                                      \
-                                                MAX_DATASET_DIM,                                \
-                                                CODE_BOOK_T,                                    \
-                                                PQ_BITS,                                        \
-                                                PQ_CODE_BOOK_DIM,                               \
-                                                DATA_T,                                         \
-                                                INDEX_T,                                        \
-                                                DISTANCE_T,                                     \
-                                                SAMPLE_FILTER_T)                                \
-  extern template void                                                                          \
-  select_and_run<TEAM_SIZE,                                                                     \
-                 MAX_DATASET_DIM,                                                               \
-                 cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,           \
-                                                                              CODE_BOOK_T,      \
-                                                                              PQ_BITS,          \
-                                                                              PQ_CODE_BOOK_DIM, \
-                                                                              DISTANCE_T,       \
-                                                                              INDEX_T>,         \
-                 SAMPLE_FILTER_T>(                                                              \
-    cuvs::neighbors::cagra::detail::cagra_q_dataset_descriptor_t<DATA_T,                        \
-                                                                 CODE_BOOK_T,                   \
-                                                                 PQ_BITS,                       \
-                                                                 PQ_CODE_BOOK_DIM,              \
-                                                                 DISTANCE_T,                    \
-                                                                 INDEX_T> dataset,              \
-    raft::device_matrix_view<const INDEX_T, int64_t, raft::row_major> graph,                    \
-    INDEX_T* const topk_indices_ptr,                                                            \
-    DISTANCE_T* const topk_distances_ptr,                                                       \
-    const DATA_T* const queries_ptr,                                                            \
-    const uint32_t num_queries,                                                                 \
-    const INDEX_T* dev_seed_ptr,                                                                \
-    uint32_t* const num_executed_iterations,                                                    \
-    const search_params& ps,                                                                    \
-    uint32_t topk,                                                                              \
-    uint32_t num_itopk_candidates,                                                              \
-    uint32_t block_size,                                                                        \
-    uint32_t smem_size,                                                                         \
-    int64_t hash_bitlen,                                                                        \
-    INDEX_T* hashmap_ptr,                                                                       \
-    size_t small_hash_bitlen,                                                                   \
-    size_t small_hash_reset_interval,                                                           \
-    uint32_t num_seeds,                                                                         \
-    SAMPLE_FILTER_T sample_filter,                                                              \
-    cuvs::distance::DistanceType metric,                                                        \
-    cudaStream_t stream);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        half,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, half, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        half,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, float, uint32_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 1024, half, 8, 2, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 1024, half, 8, 4, half, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 2, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        float,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  16, 256, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  32, 512, half, 8, 4, float, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        float,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        uint32_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(8,
-                                        128,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        uint8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 2, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        2,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(
-  8, 128, half, 8, 4, int8_t, int64_t, float, cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(16,
-                                        256,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        512,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-instantiate_q_single_cta_select_and_run(32,
-                                        1024,
-                                        half,
-                                        8,
-                                        4,
-                                        int8_t,
-                                        int64_t,
-                                        float,
-                                        cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-#undef instantiate_q_single_cta_select_and_run
-
-}  // namespace single_cta_search
-}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index a101cdc1f..d10313c5b 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -15,13 +15,15 @@
  */
 #pragma once
 
+#include "search_single_cta_kernel.cuh"
+
 #include "bitonic.hpp"
-#include "compute_distance.hpp"
+#include "compute_distance-ext.cuh"
 #include "device_common.hpp"
 #include "hashmap.hpp"
 #include "search_plan.cuh"
 #include "topk_by_radix.cuh"
-#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk
+#include "topk_for_cagra/topk.h"  // TODO replace with raft topk
 #include "utils.hpp"
 
 #include <cuvs/distance/distance.hpp>
@@ -56,12 +58,11 @@ namespace single_cta_search {
 // #define _CLK_BREAKDOWN
 
 template <unsigned TOPK_BY_BITONIC_SORT, class INDEX_T>
-__device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
-                                    INDEX_T* const next_parent_indices,
-                                    INDEX_T* const internal_topk_indices,
-                                    const std::size_t internal_topk_size,
-                                    const std::size_t dataset_size,
-                                    const std::uint32_t search_width)
+RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const terminate_flag,
+                                                     INDEX_T* const next_parent_indices,
+                                                     INDEX_T* const internal_topk_indices,
+                                                     const std::size_t internal_topk_size,
+                                                     const std::uint32_t search_width)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   // if (threadIdx.x >= 32) return;
@@ -99,11 +100,12 @@ __device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
 }
 
 template <unsigned MAX_CANDIDATES, class IdxT = void>
-__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  // [num_candidates]
-                                                IdxT* candidate_indices,     // [num_candidates]
-                                                const std::uint32_t num_candidates,
-                                                const std::uint32_t num_itopk,
-                                                unsigned MULTI_WARPS = 0)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st(
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  const std::uint32_t num_itopk,
+  unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -202,15 +204,16 @@ __device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances,  //
 }
 
 template <unsigned MAX_ITOPK, class IdxT = void>
-__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num_itopk]
-                                                IdxT* itopk_indices,     // [num_itopk]
-                                                const std::uint32_t num_itopk,
-                                                float* candidate_distances,  // [num_candidates]
-                                                IdxT* candidate_indices,     // [num_candidates]
-                                                const std::uint32_t num_candidates,
-                                                std::uint32_t* work_buf,
-                                                const bool first,
-                                                unsigned MULTI_WARPS = 0)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd(
+  float* itopk_distances,  // [num_itopk]
+  IdxT* itopk_indices,     // [num_itopk]
+  const std::uint32_t num_itopk,
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  std::uint32_t* work_buf,
+  const bool first,
+  unsigned MULTI_WARPS = 0)
 {
   const unsigned lane_id = threadIdx.x % 32;
   const unsigned warp_id = threadIdx.x / 32;
@@ -410,16 +413,17 @@ __device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances,  // [num
 template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           class IdxT>
-__device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
-                                     IdxT* itopk_indices,     // [num_itopk]
-                                     const std::uint32_t num_itopk,
-                                     float* candidate_distances,  // [num_candidates]
-                                     IdxT* candidate_indices,     // [num_candidates]
-                                     const std::uint32_t num_candidates,
-                                     std::uint32_t* work_buf,
-                                     const bool first,
-                                     const unsigned MULTI_WARPS_1,
-                                     const unsigned MULTI_WARPS_2)
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
+  float* itopk_distances,  // [num_itopk]
+  IdxT* itopk_indices,     // [num_itopk]
+  const std::uint32_t num_itopk,
+  float* candidate_distances,  // [num_candidates]
+  IdxT* candidate_indices,     // [num_candidates]
+  const std::uint32_t num_candidates,
+  std::uint32_t* work_buf,
+  const bool first,
+  const unsigned MULTI_WARPS_1,
+  const unsigned MULTI_WARPS_2)
 {
   // The results in candidate_distances/indices are sorted by bitonic sort.
   topk_by_bitonic_sort_1st<MAX_CANDIDATES, IdxT>(
@@ -439,11 +443,11 @@ __device__ void topk_by_bitonic_sort(float* itopk_distances,  // [num_itopk]
 }
 
 template <class INDEX_T>
-__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
-                                       const size_t hashmap_bitlen,
-                                       const INDEX_T* itopk_indices,
-                                       const uint32_t itopk_size,
-                                       const uint32_t first_tid = 0)
+RAFT_DEVICE_INLINE_FUNCTION void hashmap_restore(INDEX_T* const hashmap_ptr,
+                                                 const size_t hashmap_bitlen,
+                                                 const INDEX_T* itopk_indices,
+                                                 const uint32_t itopk_size,
+                                                 const uint32_t first_tid = 0)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   if (threadIdx.x < first_tid) return;
@@ -454,18 +458,16 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr,
 }
 
 // One query one thread block
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          unsigned MAX_ITOPK,
+template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           unsigned TOPK_BY_BITONIC_SORT,
           class DATASET_DESCRIPTOR_T,
           class SAMPLE_FILTER_T>
-__launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
   const std::uint32_t top_k,
-  DATASET_DESCRIPTOR_T dataset_desc,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
   const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
   const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
   const std::uint32_t graph_degree,
@@ -483,15 +485,13 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const std::uint32_t hash_bitlen,
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric)
+  SAMPLE_FILTER_T sample_filter)
 {
   using LOAD_T = device::LOAD_128BIT_T;
 
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
-  using QUERY_T    = typename DATASET_DESCRIPTOR_T::QUERY_T;
 
   const auto query_id = blockIdx.y;
 
@@ -512,7 +512,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
   _CLK_START();
 
-  extern __shared__ std::uint32_t smem[];
+  extern __shared__ uint8_t smem[];
 
   // Layout of result_buffer
   // +----------------------+------------------------------+---------+
@@ -520,37 +520,28 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // | <internal_topk_size> | <search_width * graph_degree> | upto 32 |
   // +----------------------+------------------------------+---------+
   // |<---             result_buffer_size              --->|
-  std::uint32_t result_buffer_size    = internal_topk + (search_width * graph_degree);
-  std::uint32_t result_buffer_size_32 = result_buffer_size;
-  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
-  const auto small_hash_size = hashmap::get_size(small_hash_bitlen);
-
-  const auto query_smem_buffer_length =
-    raft::ceildiv<uint32_t>(dataset_desc.dim, DATASET_BLOCK_DIM) * DATASET_BLOCK_DIM;
-  auto query_buffer          = reinterpret_cast<QUERY_T*>(smem);
-  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + query_smem_buffer_length);
-  auto result_distances_buffer =
-    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
-  auto visited_hash_buffer =
-    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
-  auto parent_list_buffer = reinterpret_cast<INDEX_T*>(visited_hash_buffer + small_hash_size);
-  auto distance_work_buffer_ptr =
-    reinterpret_cast<std::uint8_t*>(parent_list_buffer + search_width);
-  auto topk_ws        = reinterpret_cast<std::uint32_t*>(distance_work_buffer_ptr +
-                                                  DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte);
-  auto terminate_flag = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
-  auto smem_work_ptr  = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
+  const auto result_buffer_size    = internal_topk + (search_width * graph_degree);
+  const auto result_buffer_size_32 = raft::round_up_safe<uint32_t>(result_buffer_size, 32);
+  const auto small_hash_size       = hashmap::get_size(small_hash_bitlen);
 
   // Set smem working buffer for the distance calculation
-  dataset_desc.set_smem_ptr(distance_work_buffer_ptr);
+  dataset_desc = dataset_desc->setup_workspace(smem, queries_ptr, query_id);
+
+  auto* __restrict__ result_indices_buffer =
+    reinterpret_cast<INDEX_T*>(smem + dataset_desc->smem_ws_size_in_bytes());
+  auto* __restrict__ result_distances_buffer =
+    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
+  auto* __restrict__ visited_hash_buffer =
+    reinterpret_cast<INDEX_T*>(result_distances_buffer + result_buffer_size_32);
+  auto* __restrict__ parent_list_buffer =
+    reinterpret_cast<INDEX_T*>(visited_hash_buffer + small_hash_size);
+  auto* __restrict__ topk_ws = reinterpret_cast<std::uint32_t*>(parent_list_buffer + search_width);
+  auto* terminate_flag       = reinterpret_cast<std::uint32_t*>(topk_ws + 3);
+  auto* __restrict__ smem_work_ptr = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);
 
   // A flag for filtering.
   auto filter_flag = terminate_flag;
 
-  const DATA_T* const query_ptr = queries_ptr + query_id * dataset_desc.dim;
-  dataset_desc.template copy_query<DATASET_BLOCK_DIM>(
-    query_ptr, query_buffer, query_smem_buffer_length);
-
   if (threadIdx.x == 0) {
     terminate_flag[0] = 0;
     topk_ws[0]        = ~0u;
@@ -570,18 +561,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   // compute distance to randomly selecting nodes
   _CLK_START();
   const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
-  device::compute_distance_to_random_nodes<TEAM_SIZE, DATASET_BLOCK_DIM>(result_indices_buffer,
-                                                                         result_distances_buffer,
-                                                                         query_buffer,
-                                                                         dataset_desc,
-                                                                         result_buffer_size,
-                                                                         num_distilation,
-                                                                         rand_xor_mask,
-                                                                         local_seed_ptr,
-                                                                         num_seeds,
-                                                                         local_visited_hashmap_ptr,
-                                                                         hash_bitlen,
-                                                                         metric);
+  device::compute_distance_to_random_nodes(result_indices_buffer,
+                                           result_distances_buffer,
+                                           *dataset_desc,
+                                           result_buffer_size,
+                                           num_distilation,
+                                           rand_xor_mask,
+                                           local_seed_ptr,
+                                           num_seeds,
+                                           local_visited_hashmap_ptr,
+                                           hash_bitlen);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -666,7 +655,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
         nullptr,
         topk_ws,
         true,
-        reinterpret_cast<std::uint32_t*>(smem_work_ptr));
+        smem_work_ptr);
       _CLK_REC(clk_topk);
 
       // reset small-hash table
@@ -683,12 +672,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
     // pick up next parents
     if (threadIdx.x < 32) {
       _CLK_START();
-      pickup_next_parents<TOPK_BY_BITONIC_SORT, INDEX_T>(terminate_flag,
-                                                         parent_list_buffer,
-                                                         result_indices_buffer,
-                                                         internal_topk,
-                                                         dataset_desc.size,
-                                                         search_width);
+      pickup_next_parents<TOPK_BY_BITONIC_SORT, INDEX_T>(
+        terminate_flag, parent_list_buffer, result_indices_buffer, internal_topk, search_width);
       _CLK_REC(clk_pickup_parents);
     }
 
@@ -706,20 +691,16 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 
     // compute the norms between child nodes and query node
     _CLK_START();
-    constexpr unsigned max_n_frags = 8;
-    device::compute_distance_to_child_nodes<TEAM_SIZE, DATASET_BLOCK_DIM, max_n_frags>(
-      result_indices_buffer + internal_topk,
-      result_distances_buffer + internal_topk,
-      query_buffer,
-      dataset_desc,
-      knn_graph,
-      graph_degree,
-      local_visited_hashmap_ptr,
-      hash_bitlen,
-      parent_list_buffer,
-      result_indices_buffer,
-      search_width,
-      metric);
+    device::compute_distance_to_child_nodes(result_indices_buffer + internal_topk,
+                                            result_distances_buffer + internal_topk,
+                                            *dataset_desc,
+                                            knn_graph,
+                                            graph_degree,
+                                            local_visited_hashmap_ptr,
+                                            hash_bitlen,
+                                            parent_list_buffer,
+                                            result_indices_buffer,
+                                            search_width);
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
@@ -815,50 +796,33 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
 #endif
 }
 
-template <uint32_t TEAM_SIZE,
-          uint32_t DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
+template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
 struct search_kernel_config {
-  using kernel_t = decltype(&search_kernel<TEAM_SIZE,
-                                           DATASET_BLOCK_DIM,
-                                           64,
-                                           64,
-                                           0,
-                                           DATASET_DESCRIPTOR_T,
-                                           SAMPLE_FILTER_T>);
+  using kernel_t = decltype(&search_kernel<64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
 
   template <unsigned MAX_CANDIDATES, unsigned USE_BITONIC_SORT>
   static auto choose_search_kernel(unsigned itopk_size) -> kernel_t
   {
     if (itopk_size <= 64) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           64,
+      return search_kernel<64,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 128) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           128,
+      return search_kernel<128,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 256) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           256,
+      return search_kernel<256,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
                            SAMPLE_FILTER_T>;
     } else if (itopk_size <= 512) {
-      return search_kernel<TEAM_SIZE,
-                           DATASET_BLOCK_DIM,
-                           512,
+      return search_kernel<512,
                            MAX_CANDIDATES,
                            USE_BITONIC_SORT,
                            DATASET_DESCRIPTOR_T,
@@ -882,21 +846,9 @@ struct search_kernel_config {
       // Radix-based topk is used
       constexpr unsigned max_candidates = 32;  // to avoid build failure
       if (itopk_size <= 256) {
-        return search_kernel<TEAM_SIZE,
-                             DATASET_BLOCK_DIM,
-                             256,
-                             max_candidates,
-                             0,
-                             DATASET_DESCRIPTOR_T,
-                             SAMPLE_FILTER_T>;
+        return search_kernel<256, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
       } else if (itopk_size <= 512) {
-        return search_kernel<TEAM_SIZE,
-                             DATASET_BLOCK_DIM,
-                             512,
-                             max_candidates,
-                             0,
-                             DATASET_DESCRIPTOR_T,
-                             SAMPLE_FILTER_T>;
+        return search_kernel<512, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
       }
     }
     THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u",
@@ -905,40 +857,35 @@ struct search_kernel_config {
   }
 };
 
-template <unsigned TEAM_SIZE,
-          unsigned DATASET_BLOCK_DIM,
-          typename DATASET_DESCRIPTOR_T,
-          typename SAMPLE_FILTER_T>
-void select_and_run(
-  DATASET_DESCRIPTOR_T dataset_desc,
-  raft::device_matrix_view<const typename DATASET_DESCRIPTOR_T::INDEX_T, int64_t, raft::row_major>
-    graph,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
-  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
-  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
-  const uint32_t num_queries,
-  const typename DATASET_DESCRIPTOR_T::INDEX_T* dev_seed_ptr,  // [num_queries, num_seeds]
-  uint32_t* const num_executed_iterations,                     // [num_queries,]
-  const search_params& ps,
-  uint32_t topk,
-  uint32_t num_itopk_candidates,
-  uint32_t block_size,  //
-  uint32_t smem_size,
-  int64_t hash_bitlen,
-  typename DATASET_DESCRIPTOR_T::INDEX_T* hashmap_ptr,
-  size_t small_hash_bitlen,
-  size_t small_hash_reset_interval,
-  uint32_t num_seeds,
-  SAMPLE_FILTER_T sample_filter,
-  cuvs::distance::DistanceType metric,
-  cudaStream_t stream)
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    uint32_t num_itopk_candidates,
+                    uint32_t block_size,  //
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    size_t small_hash_bitlen,
+                    size_t small_hash_reset_interval,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream)
 {
   auto kernel =
-    search_kernel_config<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>::
-      choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
-  RAFT_CUDA_TRY(cudaFuncSetAttribute(kernel,
-                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                     smem_size + DATASET_DESCRIPTOR_T::smem_buffer_size_in_byte));
+    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
+                         SampleFilterT>::choose_itopk_and_mx_candidates(ps.itopk_size,
+                                                                        num_itopk_candidates,
+                                                                        block_size);
+  RAFT_CUDA_TRY(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   dim3 thread_dims(block_size, 1, 1);
   dim3 block_dims(1, num_queries, 1);
   RAFT_LOG_DEBUG(
@@ -963,9 +910,9 @@ void select_and_run(
                                                          hash_bitlen,
                                                          small_hash_bitlen,
                                                          small_hash_reset_interval,
-                                                         sample_filter,
-                                                         metric);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+                                                         sample_filter);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // catches launch errors the sync below may miss
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 }
 }  // namespace single_cta_search
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
index 1ccec9219..7b7f44db7 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,32 @@
  */
 #pragma once
 
-#ifndef _CUVS_EXPLICIT_INSTANTIATE_ONLY
-#include "search_single_cta_kernel-inl.cuh"
-#endif
+#include "compute_distance-ext.cuh"
 
-#ifdef RAFT_COMPILED
-#include "search_single_cta_kernel-ext.cuh"
-#endif
+#include <cuvs/neighbors/cagra.hpp>
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+                    raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
+                    IndexT* topk_indices_ptr,       // [num_queries, topk]
+                    DistanceT* topk_distances_ptr,  // [num_queries, topk]
+                    const DataT* queries_ptr,       // [num_queries, dataset_dim]
+                    uint32_t num_queries,
+                    const IndexT* dev_seed_ptr,         // [num_queries, num_seeds]
+                    uint32_t* num_executed_iterations,  // [num_queries,]
+                    const search_params& ps,
+                    uint32_t topk,
+                    uint32_t num_itopk_candidates,
+                    uint32_t block_size,  //
+                    uint32_t smem_size,
+                    int64_t hash_bitlen,
+                    IndexT* hashmap_ptr,
+                    size_t small_hash_bitlen,
+                    size_t small_hash_reset_interval,
+                    uint32_t num_seeds,
+                    SampleFilterT sample_filter,
+                    cudaStream_t stream);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
new file mode 100644
index 000000000..ee6427170
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by search_single_cta_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python search_single_cta_00_generate.py
+ *
+ */
+
+#include "search_single_cta_inst.cuh"
+
+namespace cuvs::neighbors::cagra::detail::single_cta_search {
+instantiate_kernel_selection(uint8_t,
+                             uint32_t,
+                             float,
+                             cuvs::neighbors::filtering::none_cagra_sample_filter);
+
+}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
deleted file mode 100644
index 35e04ea6a..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  1024,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
deleted file mode 100644
index 614e6ca01..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  8,
-  128,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
deleted file mode 100644
index 005afb566..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  16,
-  256,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
deleted file mode 100644
index af30b2e24..000000000
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by search_single_cta_00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python search_single_cta_00_generate.py
- *
- */
-
-#include "search_single_cta_inst.cuh"
-
-#include "compute_distance.hpp"
-
-namespace cuvs::neighbors::cagra::detail::single_cta_search {
-instantiate_kernel_selection(
-  32,
-  512,
-  cuvs::neighbors::cagra::detail::standard_dataset_descriptor_t<uint8_t COMMA uint32_t COMMA float>,
-  cuvs::neighbors::filtering::none_cagra_sample_filter);
-
-}  // namespace cuvs::neighbors::cagra::detail::single_cta_search
diff --git a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
index 67173026b..b6f97cb26 100644
--- a/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
+++ b/cpp/src/neighbors/detail/cagra/topk_by_radix.cuh
@@ -32,17 +32,17 @@ struct topk_by_radix_sort : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {};
 template <unsigned MAX_INTERNAL_TOPK, class IdxT>
 struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERNAL_TOPK <= 64))>>
   : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
-  __device__ void operator()(uint32_t topk,
-                             uint32_t batch_size,
-                             uint32_t len_x,
-                             const uint32_t* _x,
-                             const IdxT* _in_vals,
-                             uint32_t* _y,
-                             IdxT* _out_vals,
-                             uint32_t* work,
-                             uint32_t* _hints,
-                             bool sort,
-                             uint32_t* _smem)
+  RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk,
+                                              uint32_t batch_size,
+                                              uint32_t len_x,
+                                              const uint32_t* _x,
+                                              const IdxT* _in_vals,
+                                              uint32_t* _y,
+                                              IdxT* _out_vals,
+                                              uint32_t* work,
+                                              uint32_t* _hints,
+                                              bool sort,
+                                              uint32_t* _smem)
   {
     std::uint8_t* const state = reinterpret_cast<std::uint8_t*>(work);
     topk_cta_11_core<topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght,
@@ -60,17 +60,17 @@ struct topk_by_radix_sort<MAX_INTERNAL_TOPK, IdxT, std::enable_if_t<((MAX_INTERN
     IdxT,                                                                            \
     std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>>     \
     : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {                                   \
-    __device__ void operator()(uint32_t topk,                                        \
-                               uint32_t batch_size,                                  \
-                               uint32_t len_x,                                       \
-                               const uint32_t* _x,                                   \
-                               const IdxT* _in_vals,                                 \
-                               uint32_t* _y,                                         \
-                               IdxT* _out_vals,                                      \
-                               uint32_t* work,                                       \
-                               uint32_t* _hints,                                     \
-                               bool sort,                                            \
-                               uint32_t* _smem)                                      \
+    RAFT_DEVICE_INLINE_FUNCTION void operator()(uint32_t topk,                       \
+                                                uint32_t batch_size,                 \
+                                                uint32_t len_x,                      \
+                                                const uint32_t* _x,                  \
+                                                const IdxT* _in_vals,                \
+                                                uint32_t* _y,                        \
+                                                IdxT* _out_vals,                     \
+                                                uint32_t* work,                      \
+                                                uint32_t* _hints,                    \
+                                                bool sort,                           \
+                                                uint32_t* _smem)                     \
     {                                                                                \
       assert(blockDim.x >= V / 4);                                                   \
       std::uint8_t* state = (std::uint8_t*)work;                                     \
diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu
new file mode 100644
index 000000000..72ff2cb85
--- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk.cu
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "topk_core.cuh"
+
+namespace cuvs::neighbors::cagra::detail {
+
+// Computes the size of the state workspace for a batched top-k call (topK/sampleDtype unused).
+size_t _cuann_find_topk_bufferSize(uint32_t topK,
+                                   uint32_t sizeBatch,
+                                   uint32_t numElements,
+                                   cudaDataType_t sampleDtype)
+{
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  assert(stateBitLen == 0 || stateBitLen == 8);
+
+  size_t workspaceSize = 1;  // returned as-is when stateBitLen != 8
+  // state: sizeBatch buffers of get_state_size(numElements) bytes each, via _cuann_aligned
+  if (stateBitLen == 8) {
+    workspaceSize = _cuann_aligned(
+      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
+  }
+
+  return workspaceSize;
+}
+// Batched top-k: picks a kern_topk_cta_11 instance from ldIK and topK, launches it on stream.
+template <class ValT>
+void _cuann_find_topk(uint32_t topK,
+                      uint32_t sizeBatch,
+                      uint32_t numElements,
+                      const float* inputKeys,  // [sizeBatch, ldIK,]
+                      uint32_t ldIK,           // (*) ldIK >= numElements
+                      const ValT* inputVals,   // [sizeBatch, ldIV,]
+                      uint32_t ldIV,           // (*) ldIV >= numElements
+                      float* outputKeys,       // [sizeBatch, ldOK,]
+                      uint32_t ldOK,           // (*) ldOK >= topK
+                      ValT* outputVals,        // [sizeBatch, ldOV,]
+                      uint32_t ldOV,           // (*) ldOV >= topK
+                      void* workspace,
+                      bool sort,
+                      uint32_t* hints,
+                      cudaStream_t stream)
+{
+  assert(ldIK >= numElements);
+  assert(ldIV >= numElements);
+  assert(ldOK >= topK);
+  assert(ldOV >= topK);
+
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  assert(stateBitLen == 0 || stateBitLen == 8);
+
+  uint8_t* state = NULL;
+  if (stateBitLen == 8) { state = (uint8_t*)workspace; }
+
+  dim3 threads(numThreads, 1, 1);
+  dim3 blocks(sizeBatch, 1, 1);  // one block per batch entry
+
+  void (*cta_kernel)(uint32_t,
+                     uint32_t,
+                     uint32_t,
+                     const uint32_t*,
+                     uint32_t,
+                     const ValT*,
+                     uint32_t,
+                     uint32_t*,
+                     uint32_t,
+                     ValT*,
+                     uint32_t,
+                     uint8_t*,
+                     uint32_t*,
+                     bool) = nullptr;  // assigned by SET_KERNEL_V below
+
+  // Binds cta_kernel to one instantiation. V:vecLen, K:maxTopk, T:numSortThreads
+#define SET_KERNEL_VKT(V, K, T, ValT)                          \
+  do {                                                         \
+    assert(numThreads >= T);                                   \
+    assert((K % T) == 0);                                      \
+    assert((K / T) <= 4);                                      \
+    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
+  } while (0)
+
+  // V: vecLen. Chooses the smallest listed K >= topK; topK > 1024 hits RAFT_FAIL.
+#define SET_KERNEL_V(V, ValT)                                \
+  do {                                                       \
+    if (topK <= 32) {                                        \
+      SET_KERNEL_VKT(V, 32, 32, ValT);                       \
+    } else if (topK <= 64) {                                 \
+      SET_KERNEL_VKT(V, 64, 32, ValT);                       \
+    } else if (topK <= 96) {                                 \
+      SET_KERNEL_VKT(V, 96, 32, ValT);                       \
+    } else if (topK <= 128) {                                \
+      SET_KERNEL_VKT(V, 128, 32, ValT);                      \
+    } else if (topK <= 192) {                                \
+      SET_KERNEL_VKT(V, 192, 64, ValT);                      \
+    } else if (topK <= 256) {                                \
+      SET_KERNEL_VKT(V, 256, 64, ValT);                      \
+    } else if (topK <= 384) {                                \
+      SET_KERNEL_VKT(V, 384, 128, ValT);                     \
+    } else if (topK <= 512) {                                \
+      SET_KERNEL_VKT(V, 512, 128, ValT);                     \
+    } else if (topK <= 768) {                                \
+      SET_KERNEL_VKT(V, 768, 256, ValT);                     \
+    } else if (topK <= 1024) {                               \
+      SET_KERNEL_VKT(V, 1024, 256, ValT);                    \
+    } \
+        /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \
+        /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \
+        /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \
+        /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \
+        else {                                                      \
+      RAFT_FAIL("topk must be lower than or equal to 1024"); \
+    }                                                        \
+  } while (0)
+
+  int _vecLen = _get_vecLen(ldIK, 2);  // NOTE(review): cta_kernel stays null unless 1 or 2 - confirm
+  if (_vecLen == 2) {
+    SET_KERNEL_V(2, ValT);
+  } else if (_vecLen == 1) {
+    SET_KERNEL_V(1, ValT);
+  }
+
+  cta_kernel<<<blocks, threads, 0, stream>>>(topK,
+                                             sizeBatch,
+                                             numElements,
+                                             (const uint32_t*)inputKeys,  // float keys as uint32_t words
+                                             ldIK,
+                                             inputVals,
+                                             ldIV,
+                                             (uint32_t*)outputKeys,
+                                             ldOK,
+                                             outputVals,
+                                             ldOV,
+                                             state,
+                                             hints,
+                                             sort);
+
+  return;
+}
+
+template void _cuann_find_topk<uint32_t>(uint32_t topK,
+                                         uint32_t sizeBatch,
+                                         uint32_t numElements,
+                                         const float* inputKeys,     // [sizeBatch, ldIK,]
+                                         uint32_t ldIK,              // (*) ldIK >= numElements
+                                         const uint32_t* inputVals,  // [sizeBatch, ldIV,]
+                                         uint32_t ldIV,              // (*) ldIV >= numElements
+                                         float* outputKeys,          // [sizeBatch, ldOK,]
+                                         uint32_t ldOK,              // (*) ldOK >= topK
+                                         uint32_t* outputVals,       // [sizeBatch, ldOV,]
+                                         uint32_t ldOV,              // (*) ldOV >= topK
+                                         void* workspace,
+                                         bool sort,
+                                         uint32_t* hint,
+                                         cudaStream_t stream);
+
+}  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
index cbf99a556..65f9cfade 100644
--- a/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
@@ -14,10 +14,15 @@
  * limitations under the License.
  */
 #pragma once
+
+#include "../utils.hpp"
 #include "topk.h"
 
 #include <cub/cub.cuh>
 
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/error.hpp>
+
 #include <assert.h>
 #include <float.h>
 #include <stdint.h>
@@ -25,7 +30,7 @@
 
 namespace cuvs::neighbors::cagra::detail {
 //
-__device__ inline uint32_t convert(uint32_t x)
+RAFT_DEVICE_INLINE_FUNCTION constexpr uint32_t convert(uint32_t x)
 {
   if (x & 0x80000000) {
     return x ^ 0xffffffff;
@@ -35,7 +40,7 @@ __device__ inline uint32_t convert(uint32_t x)
 }
 
 //
-__device__ inline uint16_t convert(uint16_t x)
+RAFT_DEVICE_INLINE_FUNCTION constexpr uint16_t convert(uint16_t x)
 {
   if (x & 0x8000) {
     return x ^ 0xffff;
@@ -62,7 +67,7 @@ struct u16_vector {
 
 //
 template <int vecLen>
-__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i)
+RAFT_DEVICE_INLINE_FUNCTION void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i)
 {
   if (vecLen == 1) {
     vec.x1 = ((uint1*)(x + i))[0];
@@ -77,7 +82,7 @@ __device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x
 
 //
 template <int vecLen>
-__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i)
+RAFT_DEVICE_INLINE_FUNCTION void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i)
 {
   if (vecLen == 1) {
     vec.x1 = ((ushort1*)(x + i))[0];
@@ -92,7 +97,7 @@ __device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x
 
 //
 template <int vecLen>
-__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i)
+RAFT_DEVICE_INLINE_FUNCTION uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i)
 {
   uint32_t xi;
   if (vecLen == 1) {
@@ -134,7 +139,7 @@ __device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, i
 
 //
 template <int vecLen>
-__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i)
+RAFT_DEVICE_INLINE_FUNCTION uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i)
 {
   uint16_t xi;
   if (vecLen == 1) {
@@ -175,7 +180,7 @@ __device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, i
 }
 
 template <typename T>
-__device__ inline void block_scan(const T input, T& output)
+RAFT_DEVICE_INLINE_FUNCTION void block_scan(const T input, T& output)
 {
   switch (blockDim.x) {
     case 32: {
@@ -214,19 +219,19 @@ __device__ inline void block_scan(const T input, T& output)
 
 //
 template <typename T, int stateBitLen, int vecLen>
-__device__ inline void update_histogram(int itr,
-                                        uint32_t thread_id,
-                                        uint32_t num_threads,
-                                        uint32_t hint,
-                                        uint32_t threshold,
-                                        uint32_t& num_bins,
-                                        uint32_t& shift,
-                                        const T* x,  // [nx,]
-                                        uint32_t nx,
-                                        uint32_t* hist,  // [num_bins]
-                                        uint8_t* state,
-                                        uint32_t* output,  // [topk]
-                                        uint32_t* output_count)
+RAFT_DEVICE_INLINE_FUNCTION void update_histogram(int itr,
+                                                  uint32_t thread_id,
+                                                  uint32_t num_threads,
+                                                  uint32_t hint,
+                                                  uint32_t threshold,
+                                                  uint32_t& num_bins,
+                                                  uint32_t& shift,
+                                                  const T* x,  // [nx,]
+                                                  uint32_t nx,
+                                                  uint32_t* hist,  // [num_bins]
+                                                  uint8_t* state,
+                                                  uint32_t* output,  // [topk]
+                                                  uint32_t* output_count)
 {
   if (sizeof(T) == 4) {
     // 32-bit (uint32_t)
@@ -324,15 +329,16 @@ __device__ inline void update_histogram(int itr,
 }
 
 template <unsigned blockDim_x>
-__device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_index,
-                                                                 uint32_t& my_csum,
-                                                                 const unsigned num_bins,
-                                                                 const uint32_t* const hist,
-                                                                 const uint32_t nx_below_threshold,
-                                                                 const uint32_t max_threshold,
-                                                                 const uint32_t threshold,
-                                                                 const uint32_t shift,
-                                                                 const uint32_t topk)
+RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold_core(
+  uint32_t& my_index,
+  uint32_t& my_csum,
+  const unsigned num_bins,
+  const uint32_t* const hist,
+  const uint32_t nx_below_threshold,
+  const uint32_t max_threshold,
+  const uint32_t threshold,
+  const uint32_t shift,
+  const uint32_t topk)
 {
   typedef cub::BlockScan<uint32_t, blockDim_x> BlockScanT;
   __shared__ typename BlockScanT::TempStorage temp_storage;
@@ -370,7 +376,7 @@ __device__ inline void select_best_index_for_next_threshold_core(uint32_t& my_in
 }
 
 //
-__device__ inline void select_best_index_for_next_threshold(
+RAFT_DEVICE_INLINE_FUNCTION void select_best_index_for_next_threshold(
   const uint32_t topk,
   const uint32_t threshold,
   const uint32_t max_threshold,
@@ -469,17 +475,17 @@ __device__ inline void select_best_index_for_next_threshold(
 
 //
 template <typename T, int stateBitLen, int vecLen>
-__device__ inline void output_index_below_threshold(const uint32_t topk,
-                                                    const uint32_t thread_id,
-                                                    const uint32_t num_threads,
-                                                    const uint32_t threshold,
-                                                    const uint32_t nx_below_threshold,
-                                                    const T* const x,  // [nx,]
-                                                    const uint32_t nx,
-                                                    const uint8_t* state,
-                                                    uint32_t* const output,  // [topk]
-                                                    uint32_t* const output_count,
-                                                    uint32_t* const output_count_eq)
+RAFT_DEVICE_INLINE_FUNCTION void output_index_below_threshold(const uint32_t topk,
+                                                              const uint32_t thread_id,
+                                                              const uint32_t num_threads,
+                                                              const uint32_t threshold,
+                                                              const uint32_t nx_below_threshold,
+                                                              const T* const x,  // [nx,]
+                                                              const uint32_t nx,
+                                                              const uint8_t* state,
+                                                              uint32_t* const output,  // [topk]
+                                                              uint32_t* const output_count,
+                                                              uint32_t* const output_count_eq)
 {
   int ii = 0;
   for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) {
@@ -530,7 +536,7 @@ __device__ inline void output_index_below_threshold(const uint32_t topk,
 
 //
 template <typename T>
-__device__ inline void swap(T& val1, T& val2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr void swap(T& val1, T& val2)
 {
   const T val0 = val1;
   val1         = val2;
@@ -539,7 +545,7 @@ __device__ inline void swap(T& val1, T& val2)
 
 //
 template <typename K>
-__device__ inline bool swap_if_needed(K& key1, K& key2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2)
 {
   if (key1 > key2) {
     swap<K>(key1, key2);
@@ -550,7 +556,7 @@ __device__ inline bool swap_if_needed(K& key1, K& key2)
 
 //
 template <typename K, typename V>
-__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
 {
   if (key1 > key2) {
     swap<K>(key1, key2);
@@ -562,7 +568,8 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
 
 //
 template <typename K, typename V>
-__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending)
+RAFT_DEVICE_INLINE_FUNCTION constexpr bool swap_if_needed(
+  K& key1, K& key2, V& val1, V& val2, bool ascending)
 {
   if (key1 == key2) { return false; }
   if ((key1 > key2) == ascending) {
@@ -575,20 +582,20 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a
 
 //
 template <typename T>
-__device__ inline T max_value_of();
+RAFT_DEVICE_INLINE_FUNCTION T max_value_of();
 template <>
-__device__ inline float max_value_of<float>()
+RAFT_DEVICE_INLINE_FUNCTION float max_value_of<float>()
 {
   return FLT_MAX;
 }
 template <>
-__device__ inline uint32_t max_value_of<uint32_t>()
+RAFT_DEVICE_INLINE_FUNCTION uint32_t max_value_of<uint32_t>()
 {
   return ~0u;
 }
 
 template <int stateBitLen, unsigned BLOCK_SIZE = 0>
-__device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
+RAFT_INLINE_FUNCTION constexpr uint32_t get_state_size(uint32_t len_x)
 {
 #ifdef __CUDA_ARCH__
   const uint32_t num_threads = blockDim.x;
@@ -605,16 +612,16 @@ __device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
 
 //
 template <int stateBitLen, int vecLen, int maxTopk, int numSortThreads, class ValT>
-__device__ inline void topk_cta_11_core(uint32_t topk,
-                                        uint32_t len_x,
-                                        const uint32_t* _x,    // [size_batch, ld_x,]
-                                        const ValT* _in_vals,  // [size_batch, ld_iv,]
-                                        uint32_t* _y,          // [size_batch, ld_y,]
-                                        ValT* _out_vals,       // [size_batch, ld_ov,]
-                                        uint8_t* _state,       // [size_batch, ...,]
-                                        uint32_t* _hint,
-                                        bool sort,
-                                        uint32_t* _smem)
+RAFT_DEVICE_INLINE_FUNCTION void topk_cta_11_core(uint32_t topk,
+                                                  uint32_t len_x,
+                                                  const uint32_t* _x,    // [size_batch, ld_x,]
+                                                  const ValT* _in_vals,  // [size_batch, ld_iv,]
+                                                  uint32_t* _y,          // [size_batch, ld_y,]
+                                                  ValT* _out_vals,       // [size_batch, ld_ov,]
+                                                  uint8_t* _state,       // [size_batch, ...,]
+                                                  uint32_t* _hint,
+                                                  bool sort,
+                                                  uint32_t* _smem)
 {
   uint32_t* const smem_out_vals = _smem;
   uint32_t* const hist          = &(_smem[2 * maxTopk]);
@@ -904,137 +911,4 @@ __launch_bounds__(1024, 1) RAFT_KERNEL
     _smem);
 }
 
-//
-size_t inline _cuann_find_topk_bufferSize(uint32_t topK,
-                                          uint32_t sizeBatch,
-                                          uint32_t numElements,
-                                          cudaDataType_t sampleDtype)
-{
-  constexpr int numThreads  = NUM_THREADS;
-  constexpr int stateBitLen = STATE_BIT_LENGTH;
-  assert(stateBitLen == 0 || stateBitLen == 8);
-
-  size_t workspaceSize = 1;
-  // state
-  if (stateBitLen == 8) {
-    workspaceSize = _cuann_aligned(
-      sizeof(uint8_t) * get_state_size<stateBitLen, numThreads>(numElements) * sizeBatch);
-  }
-
-  return workspaceSize;
-}
-
-template <class ValT>
-inline void _cuann_find_topk(uint32_t topK,
-                             uint32_t sizeBatch,
-                             uint32_t numElements,
-                             const float* inputKeys,  // [sizeBatch, ldIK,]
-                             uint32_t ldIK,           // (*) ldIK >= numElements
-                             const ValT* inputVals,   // [sizeBatch, ldIV,]
-                             uint32_t ldIV,           // (*) ldIV >= numElements
-                             float* outputKeys,       // [sizeBatch, ldOK,]
-                             uint32_t ldOK,           // (*) ldOK >= topK
-                             ValT* outputVals,        // [sizeBatch, ldOV,]
-                             uint32_t ldOV,           // (*) ldOV >= topK
-                             void* workspace,
-                             bool sort,
-                             uint32_t* hints,
-                             cudaStream_t stream)
-{
-  assert(ldIK >= numElements);
-  assert(ldIV >= numElements);
-  assert(ldOK >= topK);
-  assert(ldOV >= topK);
-
-  constexpr int numThreads  = NUM_THREADS;
-  constexpr int stateBitLen = STATE_BIT_LENGTH;
-  assert(stateBitLen == 0 || stateBitLen == 8);
-
-  uint8_t* state = NULL;
-  if (stateBitLen == 8) { state = (uint8_t*)workspace; }
-
-  dim3 threads(numThreads, 1, 1);
-  dim3 blocks(sizeBatch, 1, 1);
-
-  void (*cta_kernel)(uint32_t,
-                     uint32_t,
-                     uint32_t,
-                     const uint32_t*,
-                     uint32_t,
-                     const ValT*,
-                     uint32_t,
-                     uint32_t*,
-                     uint32_t,
-                     ValT*,
-                     uint32_t,
-                     uint8_t*,
-                     uint32_t*,
-                     bool) = nullptr;
-
-  // V:vecLen, K:maxTopk, T:numSortThreads
-#define SET_KERNEL_VKT(V, K, T, ValT)                          \
-  do {                                                         \
-    assert(numThreads >= T);                                   \
-    assert((K % T) == 0);                                      \
-    assert((K / T) <= 4);                                      \
-    cta_kernel = kern_topk_cta_11<stateBitLen, V, K, T, ValT>; \
-  } while (0)
-
-  // V: vecLen
-#define SET_KERNEL_V(V, ValT)                                \
-  do {                                                       \
-    if (topK <= 32) {                                        \
-      SET_KERNEL_VKT(V, 32, 32, ValT);                       \
-    } else if (topK <= 64) {                                 \
-      SET_KERNEL_VKT(V, 64, 32, ValT);                       \
-    } else if (topK <= 96) {                                 \
-      SET_KERNEL_VKT(V, 96, 32, ValT);                       \
-    } else if (topK <= 128) {                                \
-      SET_KERNEL_VKT(V, 128, 32, ValT);                      \
-    } else if (topK <= 192) {                                \
-      SET_KERNEL_VKT(V, 192, 64, ValT);                      \
-    } else if (topK <= 256) {                                \
-      SET_KERNEL_VKT(V, 256, 64, ValT);                      \
-    } else if (topK <= 384) {                                \
-      SET_KERNEL_VKT(V, 384, 128, ValT);                     \
-    } else if (topK <= 512) {                                \
-      SET_KERNEL_VKT(V, 512, 128, ValT);                     \
-    } else if (topK <= 768) {                                \
-      SET_KERNEL_VKT(V, 768, 256, ValT);                     \
-    } else if (topK <= 1024) {                               \
-      SET_KERNEL_VKT(V, 1024, 256, ValT);                    \
-    } \
-        /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \
-        /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \
-        /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \
-        /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \
-        else {                                                      \
-      RAFT_FAIL("topk must be lower than or equal to 1024"); \
-    }                                                        \
-  } while (0)
-
-  int _vecLen = _get_vecLen(ldIK, 2);
-  if (_vecLen == 2) {
-    SET_KERNEL_V(2, ValT);
-  } else if (_vecLen == 1) {
-    SET_KERNEL_V(1, ValT);
-  }
-
-  cta_kernel<<<blocks, threads, 0, stream>>>(topK,
-                                             sizeBatch,
-                                             numElements,
-                                             (const uint32_t*)inputKeys,
-                                             ldIK,
-                                             inputVals,
-                                             ldIV,
-                                             (uint32_t*)outputKeys,
-                                             ldOK,
-                                             outputVals,
-                                             ldOV,
-                                             state,
-                                             hints,
-                                             sort);
-
-  return;
-}
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 8ce20ec5c..0f8309328 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -125,24 +125,24 @@ union fp_conv {
   FP_T fp;
 };
 template <class T>
-_RAFT_HOST_DEVICE inline T get_max_value();
+_RAFT_HOST_DEVICE constexpr inline T get_max_value();
 template <>
-_RAFT_HOST_DEVICE inline float get_max_value<float>()
+_RAFT_HOST_DEVICE constexpr inline float get_max_value<float>()
 {
   return FLT_MAX;
 };
 template <>
-_RAFT_HOST_DEVICE inline half get_max_value<half>()
+_RAFT_HOST_DEVICE constexpr inline half get_max_value<half>()
 {
   return fp_conv<std::uint16_t, half>{.bs = 0x7aff}.fp;
 };
 template <>
-_RAFT_HOST_DEVICE inline std::uint32_t get_max_value<std::uint32_t>()
+_RAFT_HOST_DEVICE constexpr inline std::uint32_t get_max_value<std::uint32_t>()
 {
   return 0xffffffffu;
 };
 template <>
-_RAFT_HOST_DEVICE inline std::uint64_t get_max_value<std::uint64_t>()
+_RAFT_HOST_DEVICE constexpr inline std::uint64_t get_max_value<std::uint64_t>()
 {
   return 0xfffffffffffffffflu;
 };
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 9d2f9c175..4ce0849fd 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -706,7 +706,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {graph_build_algo::IVF_PQ, graph_build_algo::NN_DESCENT},
     {search_algo::AUTO},
     {10},
-    {0, 4, 8, 16, 32},  // team_size
+    {0, 8, 16, 32},  // team_size
     {64},
     {1},
     {cuvs::distance::DistanceType::L2Expanded},

From 8e0215021454c51f9c03b1310f8c8ea39e56b041 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 25 Sep 2024 13:27:15 -0500
Subject: [PATCH 3/6] remove NCCL pins in build and test environments (#341)

Contributes to https://github.com/rapidsai/build-planning/issues/102

Some RAPIDS libraries are using `ncclCommSplit()`, which was introduced in `nccl==2.18.1.1`. This is part of a series of PRs across RAPIDS updating libraries' pins to `nccl>=2.18.1.1` to ensure they get a new-enough version that supports that.

`cuvs` doesn't have any *direct* uses of NCCL... it only uses it via raft. This PR proposes removing `cuvs`'s dependency pinnings on NCCL, in favor of just using whatever it gets transitively via raft.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/341
---
 conda/environments/all_cuda-118_arch-aarch64.yaml       | 1 -
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 1 -
 conda/environments/all_cuda-125_arch-aarch64.yaml       | 1 -
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 1 -
 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 1 -
 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml  | 1 -
 conda/environments/bench_ann_cuda-125_arch-aarch64.yaml | 1 -
 conda/environments/bench_ann_cuda-125_arch-x86_64.yaml  | 1 -
 conda/recipes/libcuvs/conda_build_config.yaml           | 3 ---
 dependencies.yaml                                       | 1 -
 10 files changed, 12 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index cfcb56225..5c599fcc2 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -37,7 +37,6 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - make
-- nccl>=2.9.9
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index dc519d1b5..ce9a7f058 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,7 +37,6 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - make
-- nccl>=2.9.9
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index b32650e44..116e80ac2 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -34,7 +34,6 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - make
-- nccl>=2.9.9
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index d40fc3b99..7f7ad045d 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -34,7 +34,6 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - make
-- nccl>=2.9.9
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index c6e8b05a2..7e1014f25 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -35,7 +35,6 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-aarch64=11.8
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index d6c023ae9..120b7afca 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -35,7 +35,6 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-64=11.8
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 4d0ca9496..ac0ea97e6 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -32,7 +32,6 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 7dd67ab5e..e593c240d 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -32,7 +32,6 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
diff --git a/conda/recipes/libcuvs/conda_build_config.yaml b/conda/recipes/libcuvs/conda_build_config.yaml
index 0e1416342..e165f7ed9 100644
--- a/conda/recipes/libcuvs/conda_build_config.yaml
+++ b/conda/recipes/libcuvs/conda_build_config.yaml
@@ -19,9 +19,6 @@ c_stdlib_version:
 cmake_version:
   - ">=3.26.4,!=3.30.0"
 
-nccl_version:
-  - ">=2.9.9"
-
 h5py_version:
   - ">=3.8.0"
 
diff --git a/dependencies.yaml b/dependencies.yaml
index 9fcbeaae2..c63cecbbe 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -151,7 +151,6 @@ dependencies:
         packages:
           - c-compiler
           - cxx-compiler
-          - nccl>=2.9.9
     specific:
       - output_types: conda
         matrices:

From edba189053c1b830ca2b02c63d7a9a1c2f10010f Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Thu, 26 Sep 2024 15:50:21 -0700
Subject: [PATCH 4/6] [Feat] Relative change with `bitset` API feature #2439 in
 raft (#350)

Authors:
  - rhdong (https://github.com/rhdong)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/350
---
 cpp/src/neighbors/detail/knn_brute_force.cuh  | 2 +-
 cpp/test/neighbors/brute_force_prefiltered.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh
index 88986af7d..cf27bcde7 100644
--- a/cpp/src/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/src/neighbors/detail/knn_brute_force.cuh
@@ -595,7 +595,7 @@ void brute_force_search_filtered(
   auto filter_view =
     raft::make_device_vector_view<const BitmapT, IdxT>(filter.data(), filter.n_elements());
   IdxT size_h    = n_queries * n_dataset;
-  auto size_view = raft::make_host_scalar_view<IdxT>(&size_h);
+  auto size_view = raft::make_host_scalar_view<const IdxT, IdxT>(&size_h);
 
   raft::popc(res, filter_view, size_view, nnz_view);
   raft::copy(&nnz_h, nnz.data(), 1, stream);
diff --git a/cpp/test/neighbors/brute_force_prefiltered.cu b/cpp/test/neighbors/brute_force_prefiltered.cu
index 9304ee045..ae9111ea1 100644
--- a/cpp/test/neighbors/brute_force_prefiltered.cu
+++ b/cpp/test/neighbors/brute_force_prefiltered.cu
@@ -203,7 +203,7 @@ class PrefilteredBruteForceTest
       auto filter_view =
         raft::make_device_vector_view<const uint32_t, index_t>(filter_d.data(), filter_d.size());
       index_t size_h = m * n;
-      auto size_view = raft::make_host_scalar_view<index_t>(&size_h);
+      auto size_view = raft::make_host_scalar_view<const index_t, index_t>(&size_h);
 
       set_bitmap(src, dst, bitmap, n_edges, n, stream);
 

From b93b8f639b2e4caa8c542b643d615035a6dee754 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Fri, 27 Sep 2024 03:04:34 +0200
Subject: [PATCH 5/6] Persistent CAGRA kernel (#215)

An experimental version of the single-cta CAGRA kernel that runs persistently while allowing many CPU threads to submit queries in small batches very efficiently.

<p align="center">
  <img alt="CAGRA throughput @ Recall = 0.94, n_queries = 1" src="https://github.com/user-attachments/assets/68884aef-2d5f-49f6-b75c-2cb86d17c0fb" width="49%">
  <img alt="CAGRA throughput @ Recall = 0.94, n_queries = 10" src="https://github.com/user-attachments/assets/e6fbfb73-7c29-4887-8e97-31a30a346d13" width="49%">
</p>

## API

In the current implementation, the public API does not change. An extra parameter `persistent` is added to the `ann::cagra::search_params` (only valid when `algo == SINGLE_CTA`).
The persistent kernel is managed by a global runner object in a `shared_ptr`; the first CPU thread to call the kernel spawns the runner, subsequent calls/threads only update a global "heartbeat" atomic variable (`runner_base_t::last_touch`). When there's no heartbeat in the last few seconds (`kLiveInterval`), the runner shuts down the kernel and cleans up the associated resources.

An alternative solution would be to control the kernel explicitly, in a client-server style. This would be more controllable, but would require significant re-thinking of the RAFT/cuVS API.

### Synchronization behavior and CUDA streams

The kernel is managed in a dedicated thread & a non-blocking stream; it's independent of any other (i.e. calling) threads.

Although we pass a CUDA stream to the search function to preserve the api, this **CUDA stream is never used**; in fact, there are no CUDA API calls happening in the calling thread.
All communication between the host calling thread and GPU workers happens via atomic variables.

**The search function blocks the CPU thread**, i.e. it waits till the results are back before returning.

### Exceptions and safety

The kernel runner object is stored in a shared pointer. Hence, it provides all the same safety guarantees as smart pointers. For example, if a C++ exception is raised in the runner thread, the kernel is stopped during the destruction of the runner/last shared pointer.

It's hard to detect if something happens to the kernel or CUDA context. If the kernel does not return the results to the calling thread within the configured kernel lifetime (`persistent_lifetime`), the calling thread abandons the request and throws an exception.
The designed behavior here is that all components can shut down gracefully and independently within the configured kernel lifetime.

## Integration notes

### lightweight_uvector

RMM memory resources and device buffers are not zero-cost, even when the allocation size is zero (a common pattern for conditionally-used buffers). They make at least a couple of `cudaGetDevice` calls during initialization. Normally, the overhead of this is negligible. However, when the number of concurrent threads is large (hundreds of threads), any CUDA call can become a bottleneck due to a single mutex guarding a critical section somewhere in the driver.

To work around this, I introduce a `lightweight_uvector` in `/detail/cagra/search_plan.cuh` for several buffers used in CAGRA kernels. This is a stripped-down "multi-device-unsafe" version of `rmm::uvector`: it does not check during resize/destruction whether the current device has changed since construction.
We may consider putting this in a common folder to use across other RAFT/cuVS algorithms.

### Shared resource queues / ring buffers

`resource_queue_t` is an atomic counter-based ring buffer used to distribute the worker resources (CTAs) and pre-allocated job descriptors across CPU I/O threads.
We may consider putting this in a common public namespace in raft if we envision more uses for it.

### Persistent runner structs

`launcher_t` and `persistent_runner_base_t` look like they could be abstracted from the cagra kernel and re-used in other algos. The code in its current state, however, is not ready for this.

### Adjusted benchmarks

 1. I introduced a global temporary buffer for keeping the intermediate results (e.g. neighbor candidates before refinement). This is needed to avoid unnecessary allocations alongside the persistent kernel (but also positively affects performance of the original non-persistent implementation)
 2. I adjusted cuvs common benchmark utils to avoid extra d2h copies and syncs during refinement.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/215
---
 cpp/bench/ann/src/common/ann_types.hpp        |    3 +
 cpp/bench/ann/src/common/dataset.hpp          |   56 +-
 cpp/bench/ann/src/common/util.hpp             |  133 +-
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |   10 +
 cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h |   55 +-
 cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h   |   88 +-
 cpp/bench/ann/src/cuvs/cuvs_wrapper.h         |    1 +
 cpp/include/cuvs/neighbors/cagra.hpp          |   24 +
 .../detail/cagra/compute_distance-ext.cuh     |    5 +-
 .../detail/cagra/compute_distance.hpp         |   62 +-
 .../cagra/compute_distance_00_generate.py     |    5 +-
 .../cagra/compute_distance_standard-impl.cuh  |   30 +-
 .../cagra/compute_distance_standard.hpp       |   14 +-
 .../cagra/compute_distance_vpq-impl.cuh       |   42 +-
 .../detail/cagra/compute_distance_vpq.hpp     |    9 +-
 cpp/src/neighbors/detail/cagra/factory.cuh    |    4 +-
 .../detail/cagra/search_multi_cta.cuh         |    2 +-
 .../detail/cagra/search_multi_cta_inst.cuh    |    2 +-
 .../cagra/search_multi_cta_kernel-inl.cuh     |    4 +-
 .../detail/cagra/search_multi_cta_kernel.cuh  |    2 +-
 .../detail/cagra/search_multi_kernel.cuh      |    4 +-
 .../neighbors/detail/cagra/search_plan.cuh    |   89 +-
 .../detail/cagra/search_single_cta.cuh        |    8 +-
 .../detail/cagra/search_single_cta_inst.cuh   |    2 +-
 .../cagra/search_single_cta_kernel-inl.cuh    | 1220 ++++++++++++++++-
 .../detail/cagra/search_single_cta_kernel.cuh |    2 +-
 examples/cpp/CMakeLists.txt                   |    5 +
 examples/cpp/src/cagra_persistent_example.cu  |  258 ++++
 28 files changed, 1896 insertions(+), 243 deletions(-)
 create mode 100644 examples/cpp/src/cagra_persistent_example.cu

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index 4b17885c0..c2f85e539 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -35,6 +35,7 @@ enum class Mode {
 enum class MemoryType {
   kHost,
   kHostMmap,
+  kHostPinned,
   kDevice,
 };
 
@@ -60,6 +61,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
     return MemoryType::kHost;
   } else if (memory_type == "mmap") {
     return MemoryType::kHostMmap;
+  } else if (memory_type == "pinned") {
+    return MemoryType::kHostPinned;
   } else if (memory_type == "device") {
     return MemoryType::kDevice;
   } else {
diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp
index 95f1a82a2..49020fe36 100644
--- a/cpp/bench/ann/src/common/dataset.hpp
+++ b/cpp/bench/ann/src/common/dataset.hpp
@@ -286,7 +286,28 @@ class dataset {
   {
     switch (memory_type) {
       case MemoryType::kDevice: return query_set_on_gpu();
-      default: return query_set();
+      case MemoryType::kHost: {
+        auto r = query_set();
+#ifndef BUILD_CPU_ONLY
+        if (query_set_pinned_) {
+          cudaHostUnregister(const_cast<T*>(r));
+          query_set_pinned_ = false;
+        }
+#endif
+        return r;
+      }
+      case MemoryType::kHostPinned: {
+        auto r = query_set();
+#ifndef BUILD_CPU_ONLY
+        if (!query_set_pinned_) {
+          cudaHostRegister(
+            const_cast<T*>(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
+          query_set_pinned_ = true;
+        }
+#endif
+        return r;
+      }
+      default: return nullptr;
     }
   }
 
@@ -294,7 +315,27 @@ class dataset {
   {
     switch (memory_type) {
       case MemoryType::kDevice: return base_set_on_gpu();
-      case MemoryType::kHost: return base_set();
+      case MemoryType::kHost: {
+        auto r = base_set();
+#ifndef BUILD_CPU_ONLY
+        if (base_set_pinned_) {
+          cudaHostUnregister(const_cast<T*>(r));
+          base_set_pinned_ = false;
+        }
+#endif
+        return r;
+      }
+      case MemoryType::kHostPinned: {
+        auto r = base_set();
+#ifndef BUILD_CPU_ONLY
+        if (!base_set_pinned_) {
+          cudaHostRegister(
+            const_cast<T*>(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
+          base_set_pinned_ = true;
+        }
+#endif
+        return r;
+      }
       case MemoryType::kHostMmap: return mapped_base_set();
       default: return nullptr;
     }
@@ -315,18 +356,23 @@ class dataset {
   mutable T* d_query_set_     = nullptr;
   mutable T* mapped_base_set_ = nullptr;
   mutable int32_t* gt_set_    = nullptr;
+
+  mutable bool base_set_pinned_  = false;
+  mutable bool query_set_pinned_ = false;
 };
 
 template <typename T>
 dataset<T>::~dataset()
 {
-  delete[] base_set_;
-  delete[] query_set_;
-  delete[] gt_set_;
 #ifndef BUILD_CPU_ONLY
   if (d_base_set_) { cudaFree(d_base_set_); }
   if (d_query_set_) { cudaFree(d_query_set_); }
+  if (base_set_pinned_) { cudaHostUnregister(base_set_); }
+  if (query_set_pinned_) { cudaHostUnregister(query_set_); }
 #endif
+  delete[] base_set_;
+  delete[] query_set_;
+  delete[] gt_set_;
 }
 
 template <typename T>
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index e01e3847b..c3db2bb4b 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -198,42 +198,71 @@ inline auto get_stream_from_global_pool() -> cudaStream_t
 #endif
 }
 
-struct result_buffer {
-  explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream}
+/** The workspace buffer for use thread-locally. */
+struct ws_buffer {
+  explicit ws_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} {}
+  ws_buffer()                                    = delete;
+  ws_buffer(ws_buffer&&)                         = delete;
+  auto operator=(ws_buffer&&) -> ws_buffer&      = delete;
+  ws_buffer(const ws_buffer&)                    = delete;
+  auto operator=(const ws_buffer&) -> ws_buffer& = delete;
+  ~ws_buffer() noexcept
   {
-    if (size_ == 0) { return; }
-    data_host_ = malloc(size_);
 #ifndef BUILD_CPU_ONLY
-    cudaMallocAsync(&data_device_, size_, stream_);
-    cudaStreamSynchronize(stream_);
-#endif
-  }
-  result_buffer()                                        = delete;
-  result_buffer(result_buffer&&)                         = delete;
-  auto operator=(result_buffer&&) -> result_buffer&      = delete;
-  result_buffer(const result_buffer&)                    = delete;
-  auto operator=(const result_buffer&) -> result_buffer& = delete;
-  ~result_buffer() noexcept
-  {
-    if (size_ == 0) { return; }
-#ifndef BUILD_CPU_ONLY
-    cudaFreeAsync(data_device_, stream_);
-    cudaStreamSynchronize(stream_);
+    if (data_device_ != nullptr) {
+      cudaFreeAsync(data_device_, stream_);
+      cudaStreamSynchronize(stream_);
+    }
+    if (data_host_ != nullptr) { cudaFreeHost(data_host_); }
+#else
+    if (data_host_ != nullptr) { free(data_host_); }
 #endif
-    free(data_host_);
   }
 
   [[nodiscard]] auto size() const noexcept { return size_; }
-  [[nodiscard]] auto data(MemoryType loc) const noexcept
+  [[nodiscard]] auto data(MemoryType loc) const noexcept -> void*
   {
+    if (size_ == 0) { return nullptr; }
     switch (loc) {
-      case MemoryType::kDevice: return data_device_;
-      default: return data_host_;
+#ifndef BUILD_CPU_ONLY
+      case MemoryType::kDevice: {
+        if (data_device_ == nullptr) {
+          cudaMallocAsync(&data_device_, size_, stream_);
+          cudaStreamSynchronize(stream_);
+          needs_cleanup_device_ = false;
+        } else if (needs_cleanup_device_) {
+          cudaMemsetAsync(data_device_, 0, size_, stream_);
+          cudaStreamSynchronize(stream_);
+          needs_cleanup_device_ = false;
+        }
+        return data_device_;
+      }
+#endif
+      default: {
+        if (data_host_ == nullptr) {
+#ifndef BUILD_CPU_ONLY
+          cudaMallocHost(&data_host_, size_);
+#else
+          data_host_ = malloc(size_);
+#endif
+          needs_cleanup_host_ = false;
+        } else if (needs_cleanup_host_) {
+          memset(data_host_, 0, size_);
+          needs_cleanup_host_ = false;
+        }
+        return data_host_;
+      }
     }
   }
 
   void transfer_data(MemoryType dst, MemoryType src)
   {
+    // The destination is overwritten and thus does not need cleanup
+    if (dst == MemoryType::kDevice) {
+      needs_cleanup_device_ = false;
+    } else {
+      needs_cleanup_host_ = false;
+    }
     auto dst_ptr = data(dst);
     auto src_ptr = data(src);
     if (dst_ptr == src_ptr) { return; }
@@ -243,15 +272,25 @@ struct result_buffer {
 #endif
   }
 
+  /** Mark the buffer for reuse - it needs to be cleared to make sure the previous results are not
+   * leaked to the new iteration. */
+  void reuse()
+  {
+    needs_cleanup_host_   = true;
+    needs_cleanup_device_ = true;
+  }
+
  private:
   size_t size_{0};
-  cudaStream_t stream_ = nullptr;
-  void* data_host_     = nullptr;
-  void* data_device_   = nullptr;
+  cudaStream_t stream_               = nullptr;
+  mutable void* data_host_           = nullptr;
+  mutable void* data_device_         = nullptr;
+  mutable bool needs_cleanup_host_   = false;
+  mutable bool needs_cleanup_device_ = false;
 };
 
 namespace detail {
-inline std::vector<std::unique_ptr<result_buffer>> global_result_buffer_pool(0);
+inline std::vector<std::unique_ptr<ws_buffer>> global_result_buffer_pool(0);
 inline std::mutex grp_mutex;
 }  // namespace detail
 
@@ -262,24 +301,47 @@ inline std::mutex grp_mutex;
  * This reduces the setup overhead and number of times the context is being blocked
  * (this is relevant if there is a persistent kernel running across multiples benchmark cases).
  */
-inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer&
+inline auto get_result_buffer_from_global_pool(size_t size) -> ws_buffer&
 {
   auto stream = get_stream_from_global_pool();
-  auto& rb    = [stream, size]() -> result_buffer& {
+  auto& rb    = [stream, size]() -> ws_buffer& {
     std::lock_guard guard(detail::grp_mutex);
     if (static_cast<int>(detail::global_result_buffer_pool.size()) < benchmark_n_threads) {
       detail::global_result_buffer_pool.resize(benchmark_n_threads);
     }
     auto& rb = detail::global_result_buffer_pool[benchmark_thread_id];
-    if (!rb || rb->size() < size) { rb = std::make_unique<result_buffer>(size, stream); }
+    if (!rb || rb->size() < size) {
+      rb = std::make_unique<ws_buffer>(size, stream);
+    } else {
+      rb->reuse();
+    }
     return *rb;
   }();
+  return rb;
+}
 
-  memset(rb.data(MemoryType::kHost), 0, size);
-#ifndef BUILD_CPU_ONLY
-  cudaMemsetAsync(rb.data(MemoryType::kDevice), 0, size, stream);
-  cudaStreamSynchronize(stream);
-#endif
+namespace detail {
+inline std::vector<std::unique_ptr<ws_buffer>> global_tmp_buffer_pool(0);
+inline std::mutex gtp_mutex;
+}  // namespace detail
+
+/**
+ * Global temporary buffer pool for use by algorithms.
+ * In contrast to `get_result_buffer_from_global_pool`, the content of these buffers is never
+ * initialized.
+ */
+inline auto get_tmp_buffer_from_global_pool(size_t size) -> ws_buffer&
+{
+  auto stream = get_stream_from_global_pool();
+  auto& rb    = [stream, size]() -> ws_buffer& {
+    std::lock_guard guard(detail::gtp_mutex);
+    if (static_cast<int>(detail::global_tmp_buffer_pool.size()) < benchmark_n_threads) {
+      detail::global_tmp_buffer_pool.resize(benchmark_n_threads);
+    }
+    auto& rb = detail::global_tmp_buffer_pool[benchmark_thread_id];
+    if (!rb || rb->size() < size) { rb = std::make_unique<ws_buffer>(size, stream); }
+    return *rb;
+  }();
   return rb;
 }
 
@@ -293,6 +355,7 @@ inline void reset_global_device_resources()
 {
 #ifndef BUILD_CPU_ONLY
   std::lock_guard guard(detail::gsp_mutex);
+  detail::global_tmp_buffer_pool.resize(0);
   detail::global_result_buffer_pool.resize(0);
   detail::global_stream_pool.resize(0);
 #endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 67f8ed39d..22f0cab6f 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -247,6 +247,16 @@ void parse_search_param(const nlohmann::json& conf,
   if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
   if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
   if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
+  if (conf.contains("persistent")) { param.p.persistent = conf.at("persistent"); }
+  if (conf.contains("persistent_lifetime")) {
+    param.p.persistent_lifetime = conf.at("persistent_lifetime");
+  }
+  if (conf.contains("persistent_device_usage")) {
+    param.p.persistent_device_usage = conf.at("persistent_device_usage");
+  }
+  if (conf.contains("thread_block_size")) {
+    param.p.thread_block_size = conf.at("thread_block_size");
+  }
   if (conf.contains("algo")) {
     if (conf.at("algo") == "single_cta") {
       param.p.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA;
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h
index b92785943..92274e263 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h
@@ -218,27 +218,46 @@ void refine_helper(const raft::resources& res,
   } else {
     auto dataset_host = raft::make_host_matrix_view<const data_type, extents_type>(
       dataset.data_handle(), dataset.extent(0), dataset.extent(1));
-    auto queries_host    = raft::make_host_matrix<data_type, extents_type>(batch_size, dim);
-    auto candidates_host = raft::make_host_matrix<index_type, extents_type>(batch_size, k0);
-    auto neighbors_host  = raft::make_host_matrix<index_type, extents_type>(batch_size, k);
-    auto distances_host  = raft::make_host_matrix<float, extents_type>(batch_size, k);
+    if (raft::get_device_for_address(queries.data_handle()) >= 0) {
+      // Queries & results are on the device
 
-    auto stream = raft::resource::get_cuda_stream(res);
-    raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream);
-    raft::copy(
-      candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream);
+      auto queries_host    = raft::make_host_matrix<data_type, extents_type>(batch_size, dim);
+      auto candidates_host = raft::make_host_matrix<index_type, extents_type>(batch_size, k0);
+      auto neighbors_host  = raft::make_host_matrix<index_type, extents_type>(batch_size, k);
+      auto distances_host  = raft::make_host_matrix<float, extents_type>(batch_size, k);
 
-    raft::resource::sync_stream(res);  // wait for the queries and candidates
-    cuvs::neighbors::refine(res,
-                            dataset_host,
-                            queries_host.view(),
-                            candidates_host.view(),
-                            neighbors_host.view(),
-                            distances_host.view(),
-                            metric);
+      auto stream = raft::resource::get_cuda_stream(res);
+      raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream);
+      raft::copy(
+        candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream);
+
+      raft::resource::sync_stream(res);  // wait for the queries and candidates
+      cuvs::neighbors::refine(res,
+                              dataset_host,
+                              queries_host.view(),
+                              candidates_host.view(),
+                              neighbors_host.view(),
+                              distances_host.view(),
+                              metric);
+
+      raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream);
+      raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
+
+    } else {
+      // Queries & results are on the host - no device sync / copy needed
+
+      auto queries_host = raft::make_host_matrix_view<const data_type, extents_type>(
+        queries.data_handle(), batch_size, dim);
+      auto candidates_host = raft::make_host_matrix_view<const index_type, extents_type>(
+        candidates.data_handle(), batch_size, k0);
+      auto neighbors_host =
+        raft::make_host_matrix_view<index_type, extents_type>(neighbors, batch_size, k);
+      auto distances_host =
+        raft::make_host_matrix_view<float, extents_type>(distances, batch_size, k);
 
-    raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream);
-    raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
+      cuvs::neighbors::refine(
+        res, dataset_host, queries_host, candidates_host, neighbors_host, distances_host, metric);
+    }
   }
 }
 
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
index 53db717a6..9ca41cab0 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
@@ -107,13 +107,23 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
                    int batch_size,
                    int k,
                    algo_base::index_type* neighbors,
-                   float* distances) const;
+                   float* distances,
+                   IdxT* neighbors_idx_t) const;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
     return handle_.get_sync_stream();
   }
 
+  [[nodiscard]] auto uses_stream() const noexcept -> bool override
+  {
+    // If the algorithm uses persistent kernel, the CPU has to synchronize by the end of computing
+    // the result. Hence it guarantees the benchmark CUDA stream is empty by the end of the
+    // execution. Hence we inform the benchmark to not waste the time on recording & synchronizing
+    // the event.
+    return !search_params_.persistent;
+  }
+
   // to enable dataset access from GPU memory
   [[nodiscard]] auto get_preference() const -> algo_property override
   {
@@ -269,7 +279,11 @@ void cuvs_cagra<T, IdxT>::set_search_dataset(const T* dataset, size_t nrow)
 template <typename T, typename IdxT>
 void cuvs_cagra<T, IdxT>::save(const std::string& file) const
 {
-  cuvs::neighbors::cagra::serialize(handle_, file, *index_);
+  using ds_idx_type = decltype(index_->data().n_rows());
+  bool is_vpq =
+    dynamic_cast<const cuvs::neighbors::vpq_dataset<half, ds_idx_type>*>(&index_->data()) ||
+    dynamic_cast<const cuvs::neighbors::vpq_dataset<float, ds_idx_type>*>(&index_->data());
+  cuvs::neighbors::cagra::serialize(handle_, file, *index_, is_vpq);
 }
 
 template <typename T, typename IdxT>
@@ -292,19 +306,18 @@ std::unique_ptr<algo<T>> cuvs_cagra<T, IdxT>::copy()
 }
 
 template <typename T, typename IdxT>
-void cuvs_cagra<T, IdxT>::search_base(
-  const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
+void cuvs_cagra<T, IdxT>::search_base(const T* queries,
+                                      int batch_size,
+                                      int k,
+                                      algo_base::index_type* neighbors,
+                                      float* distances,
+                                      IdxT* neighbors_idx_t) const
 {
   static_assert(std::is_integral_v<algo_base::index_type>);
   static_assert(std::is_integral_v<IdxT>);
 
-  IdxT* neighbors_idx_t;
-  std::optional<rmm::device_uvector<IdxT>> neighbors_storage{std::nullopt};
   if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) {
     neighbors_idx_t = reinterpret_cast<IdxT*>(neighbors);
-  } else {
-    neighbors_storage.emplace(batch_size * k, raft::resource::get_cuda_stream(handle_));
-    neighbors_idx_t = neighbors_storage->data();
   }
 
   auto queries_view =
@@ -317,11 +330,23 @@ void cuvs_cagra<T, IdxT>::search_base(
     handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
 
   if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
-    raft::linalg::unaryOp(neighbors,
-                          neighbors_idx_t,
-                          batch_size * k,
-                          raft::cast_op<algo_base::index_type>(),
-                          raft::resource::get_cuda_stream(handle_));
+    if (raft::get_device_for_address(neighbors) < 0 &&
+        raft::get_device_for_address(neighbors_idx_t) < 0) {
+      // Both pointers on the host, let's use host-side mapping
+      if (uses_stream()) {
+        // Need to wait for GPU to finish filling source
+        raft::resource::sync_stream(handle_);
+      }
+      for (int i = 0; i < batch_size * k; i++) {
+        neighbors[i] = algo_base::index_type(neighbors_idx_t[i]);
+      }
+    } else {
+      raft::linalg::unaryOp(neighbors,
+                            neighbors_idx_t,
+                            batch_size * k,
+                            raft::cast_op<algo_base::index_type>(),
+                            raft::resource::get_cuda_stream(handle_));
+    }
   }
 }
 
@@ -329,21 +354,42 @@ template <typename T, typename IdxT>
 void cuvs_cagra<T, IdxT>::search(
   const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
 {
+  static_assert(std::is_integral_v<algo_base::index_type>);
+  static_assert(std::is_integral_v<IdxT>);
+  constexpr bool kNeedsIoMapping = sizeof(IdxT) != sizeof(algo_base::index_type);
+
   auto k0                       = static_cast<size_t>(refine_ratio_ * k);
   const bool disable_refinement = k0 <= static_cast<size_t>(k);
   const raft::resources& res    = handle_;
+  auto mem_type =
+    raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned;
+  auto& tmp_buf = get_tmp_buffer_from_global_pool(
+    ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
+     (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
+    batch_size * k0);
+  auto* candidates_ptr = reinterpret_cast<algo_base::index_type*>(tmp_buf.data(mem_type));
+  auto* candidate_dists_ptr =
+    reinterpret_cast<float*>(candidates_ptr + (disable_refinement ? 0 : batch_size * k0));
+  auto* neighbors_idx_t =
+    reinterpret_cast<IdxT*>(candidate_dists_ptr + (disable_refinement ? 0 : batch_size * k0));
 
   if (disable_refinement) {
-    search_base(queries, batch_size, k, neighbors, distances);
+    search_base(queries, batch_size, k, neighbors, distances, neighbors_idx_t);
   } else {
+    search_base(queries, batch_size, k0, candidates_ptr, candidate_dists_ptr, neighbors_idx_t);
+
+    if (mem_type == MemoryType::kHostPinned && uses_stream()) {
+      // If the algorithm uses a stream to synchronize (non-persistent kernel), but the data is in
+      // the pinned host memory, we need to synchronize before the refinement operation to wait for
+      // the data being available for the host.
+      raft::resource::sync_stream(res);
+    }
+
+    auto candidate_ixs =
+      raft::make_device_matrix_view<const algo_base::index_type, algo_base::index_type>(
+        candidates_ptr, batch_size, k0);
     auto queries_v = raft::make_device_matrix_view<const T, algo_base::index_type>(
       queries, batch_size, dimension_);
-    auto candidate_ixs =
-      raft::make_device_matrix<algo_base::index_type, algo_base::index_type>(res, batch_size, k0);
-    auto candidate_dists =
-      raft::make_device_matrix<float, algo_base::index_type>(res, batch_size, k0);
-    search_base(
-      queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle());
     refine_helper(
       res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric());
   }
diff --git a/cpp/bench/ann/src/cuvs/cuvs_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h
index 0954e6051..ea052533d 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_wrapper.h
@@ -26,6 +26,7 @@
 #include <fstream>
 #include <memory>
 #include <stdexcept>
+#include <stdint.h>
 #include <string>
 #include <type_traits>
 
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 5f77eb8a3..fec95b563 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -205,6 +205,30 @@ struct search_params : cuvs::neighbors::search_params {
   uint32_t num_random_samplings = 1;
   /** Bit mask used for initial random seed node selection. */
   uint64_t rand_xor_mask = 0x128394;
+
+  /** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
+  bool persistent = false;
+  /** Persistent kernel: time in seconds before the kernel stops if no requests received. */
+  float persistent_lifetime = 2;
+  /**
+   * Set the fraction of maximum grid size used by persistent kernel.
+   * Value 1.0 means the kernel grid size is maximum possible for the selected device.
+   * The value must be greater than 0.0 and not greater than 1.0.
+   *
+   * One may need to run other kernels alongside this persistent kernel. This parameter can
+   * be used to reduce the grid size of the persistent kernel to leave a few SMs idle.
+   * Note: running any other work on GPU alongside with the persistent kernel makes the setup
+   * fragile.
+   *   - Running another kernel in another thread usually works, but no progress guaranteed
+   *   - Any CUDA allocations block the context (this issue may be obscured by using pools)
+   *   - Memory copies to not-pinned host memory may block the context
+   *
+   * Even when we know there are no other kernels working at the same time, setting
+   * persistent_device_usage to 1.0 surprisingly sometimes hurts performance. Proceed with care.
+   * If you suspect this is an issue, you can reduce this number to ~0.9 without a significant
+   * impact on the throughput.
+   */
+  float persistent_device_usage = 1.0;
 };
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
index 8407ef055..df447d196 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance-ext.cuh
@@ -496,8 +496,7 @@ using descriptor_instances = instance_selector<
 template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
 auto dataset_descriptor_init(const cagra::search_params& params,
                              const DatasetT& dataset,
-                             cuvs::distance::DistanceType metric,
-                             rmm::cuda_stream_view stream)
+                             cuvs::distance::DistanceType metric)
   -> dataset_descriptor_host<DataT, IndexT, DistanceT>
 {
   auto [init, priority] =
@@ -505,7 +504,7 @@ auto dataset_descriptor_init(const cagra::search_params& params,
   if (init == nullptr || priority < 0) {
     RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
   }
-  return init(params, dataset, metric, stream);
+  return init(params, dataset, metric);
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 4bed275ab..297eb1f55 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -34,6 +34,7 @@
 #include <functional>
 #include <memory>
 #include <type_traits>
+#include <variant>
 
 namespace cuvs::neighbors::cagra::detail {
 
@@ -222,31 +223,61 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t {
  * The host struct manages the lifetime of the associated device pointer and a couple parameters
  * affecting the search kernel launch config.
  *
+ * [Note: lazy initialization]
+ * Initialization of the descriptor involves allocating device memory and calling a kernel.
+ * This can interfere with other workloads (such as the persistent kernel) and generally adds
+ * overhead. To mitigate this, we don't call any CUDA api at the construction of the descriptor
+ * host. Instead, we postpone the initialization till the device pointer is requested.
+ *
  */
 template <typename DataT, typename IndexT, typename DistanceT>
 struct dataset_descriptor_host {
-  using dev_descriptor_t         = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using dev_descriptor_t = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using dd_ptr_t         = std::shared_ptr<dev_descriptor_t>;
+  using init_f =
+    std::tuple<std::function<void(dev_descriptor_t*, rmm::cuda_stream_view stream)>, size_t>;
   uint32_t smem_ws_size_in_bytes = 0;
   uint32_t team_size             = 0;
 
-  template <typename DescriptorImpl>
-  dataset_descriptor_host(const DescriptorImpl& dd_host, rmm::cuda_stream_view stream)
-    : dev_ptr_{[stream]() {
-                 dev_descriptor_t* p;
-                 RAFT_CUDA_TRY(cudaMallocAsync(&p, sizeof(DescriptorImpl), stream));
-                 return p;
-               }(),
-               [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }},
+  template <typename DescriptorImpl, typename InitF>
+  dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init)
+    : value_{std::make_tuple(init, sizeof(DescriptorImpl))},
       smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},
       team_size{dd_host.team_size()}
   {
   }
 
-  [[nodiscard]] auto dev_ptr() const -> const dev_descriptor_t* { return dev_ptr_.get(); }
-  [[nodiscard]] auto dev_ptr() -> dev_descriptor_t* { return dev_ptr_.get(); }
+  /**
+   * Return the device pointer, possibly evaluating it in the given thread.
+   */
+  [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t*
+  {
+    if (std::holds_alternative<init_f>(value_)) { value_ = eval(std::get<init_f>(value_), stream); }
+    return std::get<dd_ptr_t>(value_).get();
+  }
+  [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t*
+  {
+    if (std::holds_alternative<init_f>(value_)) { value_ = eval(std::get<init_f>(value_), stream); }
+    return std::get<dd_ptr_t>(value_).get();
+  }
 
  private:
-  std::unique_ptr<dev_descriptor_t, std::function<void(dev_descriptor_t*)>> dev_ptr_;
+  mutable std::variant<dd_ptr_t, init_f> value_;
+
+  static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t
+  {
+    using raft::RAFT_NAME;
+    auto& [fun, size] = init;
+    dd_ptr_t dev_ptr{
+      [stream, s = size]() {
+        dev_descriptor_t* p;
+        RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream));
+        return p;
+      }(),
+      [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }};
+    fun(dev_ptr.get(), stream);
+    return dev_ptr;
+  }
 };
 
 /**
@@ -257,11 +288,8 @@ struct dataset_descriptor_host {
  *
  */
 template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
-using init_desc_type =
-  dataset_descriptor_host<DataT, IndexT, DistanceT> (*)(const cagra::search_params&,
-                                                        const DatasetT&,
-                                                        cuvs::distance::DistanceType,
-                                                        rmm::cuda_stream_view);
+using init_desc_type = dataset_descriptor_host<DataT, IndexT, DistanceT> (*)(
+  const cagra::search_params&, const DatasetT&, cuvs::distance::DistanceType);
 
 /**
  * @brief Descriptor instance specification.
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
index 52a15e2a1..f8584c62e 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_00_generate.py
@@ -135,15 +135,14 @@
 template <typename DataT, typename IndexT, typename DistanceT, typename DatasetT>
 auto dataset_descriptor_init(const cagra::search_params& params,
                              const DatasetT& dataset,
-                             cuvs::distance::DistanceType metric,
-                             rmm::cuda_stream_view stream)
+                             cuvs::distance::DistanceType metric)
   -> dataset_descriptor_host<DataT, IndexT, DistanceT>
 {{
   auto [init, priority] = descriptor_instances::select<DataT, IndexT, DistanceT>(params, dataset, metric);
   if (init == nullptr || priority < 0) {{
     RAFT_FAIL("No dataset descriptor instance compiled for this parameter combination.");
   }}
-  return init(params, dataset, metric, stream);
+  return init(params, dataset, metric);
 }}
 '''
     f.write(template.format(includes=includes, content=contents))
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
index b0205508a..877d83fff 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard-impl.cuh
@@ -252,28 +252,24 @@ template <cuvs::distance::DistanceType Metric,
           typename DistanceT>
 dataset_descriptor_host<DataT, IndexT, DistanceT>
 standard_descriptor_spec<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>::init_(
-  const cagra::search_params& params,
-  const DataT* ptr,
-  IndexT size,
-  uint32_t dim,
-  uint32_t ld,
-  rmm::cuda_stream_view stream)
+  const cagra::search_params& params, const DataT* ptr, IndexT size, uint32_t dim, uint32_t ld)
 {
   using desc_type =
     standard_dataset_descriptor_t<Metric, TeamSize, DatasetBlockDim, DataT, IndexT, DistanceT>;
   using base_type = typename desc_type::base_type;
   desc_type dd_host{nullptr, nullptr, ptr, size, dim, ld};
-  host_type result{dd_host, stream};
-
-  standard_dataset_descriptor_init_kernel<Metric,
-                                          TeamSize,
-                                          DatasetBlockDim,
-                                          DataT,
-                                          IndexT,
-                                          DistanceT>
-    <<<1, 1, 0, stream>>>(result.dev_ptr(), ptr, size, dim, desc_type::ld(dd_host.args));
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-  return result;
+  return host_type{dd_host,
+                   [=](dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dev_ptr,
+                       rmm::cuda_stream_view stream) {
+                     standard_dataset_descriptor_init_kernel<Metric,
+                                                             TeamSize,
+                                                             DatasetBlockDim,
+                                                             DataT,
+                                                             IndexT,
+                                                             DistanceT>
+                       <<<1, 1, 0, stream>>>(dev_ptr, ptr, size, dim, ld);
+                     RAFT_CUDA_TRY(cudaPeekAtLastError());
+                   }};
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
index df1b77e86..fec14d713 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_standard.hpp
@@ -45,15 +45,13 @@ struct standard_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT>
   template <typename DatasetT>
   static auto init(const cagra::search_params& params,
                    const DatasetT& dataset,
-                   cuvs::distance::DistanceType metric,
-                   rmm::cuda_stream_view stream) -> host_type
+                   cuvs::distance::DistanceType metric) -> host_type
   {
     return init_(params,
                  dataset.view().data_handle(),
                  IndexT(dataset.n_rows()),
                  dataset.dim(),
-                 dataset.stride(),
-                 stream);
+                 dataset.stride());
   }
 
   template <typename DatasetT>
@@ -69,12 +67,8 @@ struct standard_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT>
   }
 
  private:
-  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(const cagra::search_params& params,
-                                                                 const DataT* ptr,
-                                                                 IndexT size,
-                                                                 uint32_t dim,
-                                                                 uint32_t ld,
-                                                                 rmm::cuda_stream_view stream);
+  static dataset_descriptor_host<DataT, IndexT, DistanceT> init_(
+    const cagra::search_params& params, const DataT* ptr, IndexT size, uint32_t dim, uint32_t ld);
 };
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
index 86c592502..6caa173f2 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq-impl.cuh
@@ -421,8 +421,7 @@ vpq_descriptor_spec<Metric,
                                       const CodebookT* vq_code_book_ptr,
                                       const CodebookT* pq_code_book_ptr,
                                       IndexT size,
-                                      uint32_t dim,
-                                      rmm::cuda_stream_view stream)
+                                      uint32_t dim)
 {
   using desc_type = cagra_q_dataset_descriptor_t<Metric,
                                                  TeamSize,
@@ -443,24 +442,27 @@ vpq_descriptor_spec<Metric,
                     pq_code_book_ptr,
                     size,
                     dim};
-  host_type result{dd_host, stream};
-  vpq_dataset_descriptor_init_kernel<Metric,
-                                     TeamSize,
-                                     DatasetBlockDim,
-                                     PqBits,
-                                     PqLen,
-                                     CodebookT,
-                                     DataT,
-                                     IndexT,
-                                     DistanceT><<<1, 1, 0, stream>>>(result.dev_ptr(),
-                                                                     encoded_dataset_ptr,
-                                                                     encoded_dataset_dim,
-                                                                     vq_code_book_ptr,
-                                                                     pq_code_book_ptr,
-                                                                     size,
-                                                                     dim);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-  return result;
+  return host_type{dd_host,
+                   [=](dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dev_ptr,
+                       rmm::cuda_stream_view stream) {
+                     vpq_dataset_descriptor_init_kernel<Metric,
+                                                        TeamSize,
+                                                        DatasetBlockDim,
+                                                        PqBits,
+                                                        PqLen,
+                                                        CodebookT,
+                                                        DataT,
+                                                        IndexT,
+                                                        DistanceT>
+                       <<<1, 1, 0, stream>>>(dev_ptr,
+                                             encoded_dataset_ptr,
+                                             encoded_dataset_dim,
+                                             vq_code_book_ptr,
+                                             pq_code_book_ptr,
+                                             size,
+                                             dim);
+                     RAFT_CUDA_TRY(cudaPeekAtLastError());
+                   }};
 }
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
index 378d2943e..4f7d24f17 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance_vpq.hpp
@@ -57,8 +57,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
   template <typename DatasetT>
   static auto init(const cagra::search_params& params,
                    const DatasetT& dataset,
-                   cuvs::distance::DistanceType metric,
-                   rmm::cuda_stream_view stream) -> host_type
+                   cuvs::distance::DistanceType metric) -> host_type
   {
     return init_(params,
                  dataset.data.data_handle(),
@@ -66,8 +65,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
                  dataset.vq_code_book.data_handle(),
                  dataset.pq_code_book.data_handle(),
                  IndexT(dataset.n_rows()),
-                 dataset.dim(),
-                 stream);
+                 dataset.dim());
   }
 
   template <typename DatasetT>
@@ -93,8 +91,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
     const CodebookT* vq_code_book_ptr,
     const CodebookT* pq_code_book_ptr,
     IndexT size,
-    uint32_t dim,
-    rmm::cuda_stream_view stream);
+    uint32_t dim);
 };
 
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh
index 1c99f72f7..2f201de3b 100644
--- a/cpp/src/neighbors/detail/cagra/factory.cuh
+++ b/cpp/src/neighbors/detail/cagra/factory.cuh
@@ -168,8 +168,8 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res,
       ->value;
   std::shared_ptr<desc_t> desc{nullptr};
   if (!cache.get(key, &desc)) {
-    desc = std::make_shared<desc_t>(std::move(dataset_descriptor_init<DataT, IndexT, DistanceT>(
-      params, dataset, metric, raft::resource::get_cuda_stream(res))));
+    desc = std::make_shared<desc_t>(
+      std::move(dataset_descriptor_init<DataT, IndexT, DistanceT>(params, dataset, metric)));
     cache.set(key, desc);
   }
   return *desc;
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index 9bcccd9f9..0003f2495 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -209,7 +209,7 @@ struct search : public search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_
                   SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-    select_and_run(dataset_desc.dev_ptr(),
+    select_and_run(dataset_desc,
                    graph,
                    intermediate_indices.data(),
                    intermediate_distances.data(),
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
index 036a4e414..8d34ab0d6 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_inst.cuh
@@ -23,7 +23,7 @@ namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 
 #define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
   template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
-    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,    \
     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
     IndexT* topk_indices_ptr,                                                 \
     DistanceT* topk_distances_ptr,                                            \
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index dd74ba44b..4dfc46256 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -413,7 +413,7 @@ struct search_kernel_config {
 };
 
 template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
-void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
                     IndexT* topk_indices_ptr,       // [num_queries, topk]
                     DistanceT* topk_distances_ptr,  // [num_queries, topk]
@@ -455,7 +455,7 @@ void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* d
 
   kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
                                                        topk_distances_ptr,
-                                                       dataset_desc,
+                                                       dataset_desc.dev_ptr(stream),
                                                        queries_ptr,
                                                        graph.data_handle(),
                                                        graph.extent(1),
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
index 1ef35f947..1a1dcd579 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel.cuh
@@ -22,7 +22,7 @@
 namespace cuvs::neighbors::cagra::detail::multi_cta_search {
 
 template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
-void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
                     IndexT* topk_indices_ptr,       // [num_queries, topk]
                     DistanceT* topk_distances_ptr,  // [num_queries, topk]
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index 7b3ecabf3..0daae17b3 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -175,7 +175,7 @@ void random_pickup(const dataset_descriptor_host<DataT, IndexT, DistanceT>& data
                        num_queries);
 
   random_pickup_kernel<<<grid_size, block_size, dataset_desc.smem_ws_size_in_bytes, cuda_stream>>>(
-    dataset_desc.dev_ptr(),
+    dataset_desc.dev_ptr(cuda_stream),
     queries_ptr,
     num_pickup,
     num_distilation,
@@ -410,7 +410,7 @@ void compute_distance_to_child_nodes(
                                                           parent_distance_ptr,
                                                           lds,
                                                           search_width,
-                                                          dataset_desc.dev_ptr(),
+                                                          dataset_desc.dev_ptr(cuda_stream),
                                                           neighbor_graph_ptr,
                                                           graph_degree,
                                                           query_ptr,
diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh
index 16864ed19..6ecbbc2e8 100644
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh
@@ -32,8 +32,81 @@
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/util/pow2_utils.cuh>
 
+#include <optional>
+#include <tuple>
+#include <variant>
+
 namespace cuvs::neighbors::cagra::detail {
 
+/**
+ * A lightweight version of rmm::device_uvector.
+ * This version avoids calling cudaSetDevice / cudaGetDevice, and therefore it is required that
+ * the current cuda device does not change during the lifetime of this object. This is expected
+ * to be useful in multi-threaded scenarios where we want to minimize overhead due to
+ * thread synchronization during cuda API calls.
+ * If the size stays at zero, this struct never calls any CUDA driver / RAFT resource functions.
+ */
+template <typename T>
+struct lightweight_uvector {
+ private:
+  using raft_res_type = const raft::resources*;
+  using rmm_res_type  = std::tuple<rmm::device_async_resource_ref, rmm::cuda_stream_view>;
+  static constexpr size_t kAlign = 256;
+
+  std::variant<raft_res_type, rmm_res_type> res_;
+  T* ptr_;
+  size_t size_;
+
+ public:
+  explicit lightweight_uvector(const raft::resources& res) : res_(&res), ptr_{nullptr}, size_{0} {}
+  // The buffer is owned exclusively: a copy would deallocate the same pointer a second time.
+  lightweight_uvector(const lightweight_uvector&)                    = delete;
+  auto operator=(const lightweight_uvector&) -> lightweight_uvector& = delete;
+
+  [[nodiscard]] auto data() noexcept -> T* { return ptr_; }
+  [[nodiscard]] auto data() const noexcept -> const T* { return ptr_; }
+  [[nodiscard]] auto size() const noexcept -> size_t { return size_; }
+
+  /** Resize to `new_size` elements; the leading min(size(), new_size) elements are copied over. */
+  void resize(size_t new_size)
+  {
+    if (new_size == size_) { return; }
+    if (auto* h = std::get_if<raft_res_type>(&res_)) {  // first use: resolve raft handle lazily
+      res_ = rmm_res_type{raft::resource::get_workspace_resource(**h),
+                          raft::resource::get_cuda_stream(**h)};
+    }
+    auto& [r, s] = std::get<rmm_res_type>(res_);
+    T* new_ptr   = new_size > 0
+                     ? reinterpret_cast<T*>(r.allocate_async(new_size * sizeof(T), kAlign, s))
+                     : nullptr;
+    if (auto copy_size = std::min(size_, new_size); copy_size > 0) {
+      cudaMemcpyAsync(new_ptr, ptr_, copy_size * sizeof(T), cudaMemcpyDefault, s);
+    }
+    if (size_ > 0) { r.deallocate_async(ptr_, size_ * sizeof(T), kAlign, s); }
+    ptr_  = new_ptr;
+    size_ = new_size;
+  }
+
+  void resize(size_t new_size, rmm::cuda_stream_view stream)
+  {
+    if (new_size == size_) { return; }
+    if (auto* h = std::get_if<raft_res_type>(&res_)) {
+      res_ = rmm_res_type{raft::resource::get_workspace_resource(**h), stream};
+    } else {
+      std::get<rmm::cuda_stream_view>(std::get<rmm_res_type>(res_)) = stream;
+    }
+    resize(new_size);
+  }
+
+  ~lightweight_uvector() noexcept
+  {
+    if (size_ > 0) {
+      auto& [r, s] = std::get<rmm_res_type>(res_);
+      r.deallocate_async(ptr_, size_ * sizeof(T), kAlign, s);
+    }
+  }
+};
+
 struct search_plan_impl_base : public search_params {
   int64_t dim;
   int64_t graph_degree;
@@ -75,9 +148,9 @@ struct search_plan_impl : public search_plan_impl_base {
   uint32_t topk;
   uint32_t num_seeds;
 
-  rmm::device_uvector<INDEX_T> hashmap;
-  rmm::device_uvector<uint32_t> num_executed_iterations;  // device or managed?
-  rmm::device_uvector<INDEX_T> dev_seed;
+  lightweight_uvector<INDEX_T> hashmap;
+  lightweight_uvector<uint32_t> num_executed_iterations;  // device or managed?
+  lightweight_uvector<INDEX_T> dev_seed;
   const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc;
 
   search_plan_impl(raft::resources const& res,
@@ -87,16 +160,18 @@ struct search_plan_impl : public search_plan_impl_base {
                    int64_t graph_degree,
                    uint32_t topk)
     : search_plan_impl_base(params, dim, graph_degree, topk),
-      hashmap(0, raft::resource::get_cuda_stream(res)),
-      num_executed_iterations(0, raft::resource::get_cuda_stream(res)),
-      dev_seed(0, raft::resource::get_cuda_stream(res)),
+      hashmap(res),
+      num_executed_iterations(res),
+      dev_seed(res),
       num_seeds(0),
       dataset_desc(dataset_desc)
   {
     adjust_search_params();
     check_params();
     calc_hashmap_params(res);
-    num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res));
+    if (!persistent) {  // Persistent kernel does not provide this functionality
+      num_executed_iterations.resize(max_queries, raft::resource::get_cuda_stream(res));
+    }
     RAFT_LOG_DEBUG("# algo = %d", static_cast<int>(algo));
   }
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index 4abed6760..2bed19009 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -37,8 +37,6 @@
 #include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
 
-#include <rmm/device_uvector.hpp>
-
 #include <algorithm>
 #include <cassert>
 #include <iostream>
@@ -199,8 +197,8 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
     }
     RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
     hashmap_size = 0;
-    if (small_hash_bitlen == 0) {
-      hashmap_size = sizeof(INDEX_T) * max_queries * hashmap::get_size(hash_bitlen);
+    if (small_hash_bitlen == 0 && !this->persistent) {
+      hashmap_size = max_queries * hashmap::get_size(hash_bitlen);
       hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res));
     }
     RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size);
@@ -218,7 +216,7 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
                   SAMPLE_FILTER_T sample_filter)
   {
     cudaStream_t stream = raft::resource::get_cuda_stream(res);
-    select_and_run(dataset_desc.dev_ptr(),
+    select_and_run(dataset_desc,
                    graph,
                    result_indices_ptr,
                    result_distances_ptr,
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
index 26ca7b672..f734b0582 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh
@@ -23,7 +23,7 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search {
 
 #define instantiate_kernel_selection(DataT, IndexT, DistanceT, SampleFilterT) \
   template void select_and_run<DataT, IndexT, DistanceT, SampleFilterT>(      \
-    const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,  \
+    const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,    \
     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,   \
     IndexT* topk_indices_ptr,                                                 \
     DistanceT* topk_distances_ptr,                                            \
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index d10313c5b..21a0f6bb2 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -39,21 +39,32 @@
 #include "../ann_utils.cuh"
 
 #include <raft/util/cuda_rt_essentials.hpp>
-#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
+#include <raft/util/integer_utils.hpp>
 
+#include <rmm/cuda_stream.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+
+#include <cuda/atomic>
+#include <cuda/std/atomic>
 
 #include <algorithm>
+#include <array>
 #include <cassert>
-#include <cstdint>
+#include <chrono>
 #include <cstdio>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <numeric>
+#include <stdint.h>
+#include <thread>
 #include <vector>
 
 namespace cuvs::neighbors::cagra::detail {
 namespace single_cta_search {
+using raft::RAFT_NAME;  // TODO: this is required for RAFT_LOG_XXX messages.
 
 // #define _CLK_BREAKDOWN
 
@@ -463,7 +474,7 @@ template <unsigned MAX_ITOPK,
           unsigned TOPK_BY_BITONIC_SORT,
           class DATASET_DESCRIPTOR_T,
           class SAMPLE_FILTER_T>
-RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
+__device__ void search_core(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
   const std::uint32_t top_k,
@@ -485,6 +496,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   const std::uint32_t hash_bitlen,
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
+  const std::uint32_t query_id,
   SAMPLE_FILTER_T sample_filter)
 {
   using LOAD_T = device::LOAD_128BIT_T;
@@ -493,8 +505,6 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
 
-  const auto query_id = blockIdx.y;
-
 #ifdef _CLK_BREAKDOWN
   std::uint64_t clk_init                 = 0;
   std::uint64_t clk_compute_1st_distance = 0;
@@ -552,7 +562,7 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
   if (small_hash_bitlen) {
     local_visited_hashmap_ptr = visited_hash_buffer;
   } else {
-    local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id);
+    local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * blockIdx.y);
   }
   hashmap::init(local_visited_hashmap_ptr, hash_bitlen, 0);
   __syncthreads();
@@ -796,37 +806,292 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
 #endif
 }
 
-template <typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
+// Regular (non-persistent) kernel entry point: forwards to `search_core` with blockIdx.y as query.
+template <unsigned MAX_ITOPK,
+          unsigned MAX_CANDIDATES,
+          unsigned TOPK_BY_BITONIC_SORT,
+          class DATASET_DESCRIPTOR_T,
+          class SAMPLE_FILTER_T>
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel(
+  typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
+  typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k]
+  const std::uint32_t top_k,
+  const DATASET_DESCRIPTOR_T* dataset_desc,
+  const typename DATASET_DESCRIPTOR_T::DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
+  const std::uint32_t graph_degree,
+  const unsigned num_distilation,
+  const uint64_t rand_xor_mask,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+  const uint32_t num_seeds,
+  typename DATASET_DESCRIPTOR_T::INDEX_T* const
+    visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const std::uint32_t internal_topk,
+  const std::uint32_t search_width,
+  const std::uint32_t min_iteration,
+  const std::uint32_t max_iteration,
+  std::uint32_t* const num_executed_iterations,  // [num_queries]
+  const std::uint32_t hash_bitlen,
+  const std::uint32_t small_hash_bitlen,
+  const std::uint32_t small_hash_reset_interval,
+  SAMPLE_FILTER_T sample_filter)
+{
+  search_core<MAX_ITOPK,
+              MAX_CANDIDATES,
+              TOPK_BY_BITONIC_SORT,
+              DATASET_DESCRIPTOR_T,
+              SAMPLE_FILTER_T>(result_indices_ptr,
+                               result_distances_ptr,
+                               top_k,
+                               dataset_desc,
+                               queries_ptr,
+                               knn_graph,
+                               graph_degree,
+                               num_distilation,
+                               rand_xor_mask,
+                               seed_ptr,
+                               num_seeds,
+                               visited_hashmap_ptr,
+                               internal_topk,
+                               search_width,
+                               min_iteration,
+                               max_iteration,
+                               num_executed_iterations,
+                               hash_bitlen,
+                               small_hash_bitlen,
+                               small_hash_reset_interval,
+                               blockIdx.y,
+                               sample_filter);
+}
+
+// To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the
+// maximum of the two.
+// This makes sync atomic significantly faster.
+constexpr size_t kCacheLineBytes = 64;
+
+constexpr uint32_t kMaxJobsNum              = 8192;
+constexpr uint32_t kMaxWorkersNum           = 4096;
+constexpr uint32_t kMaxWorkersPerThread     = 256;
+constexpr uint32_t kSoftMaxWorkersPerThread = 16;
+
+template <typename DATASET_DESCRIPTOR_T>
+struct alignas(kCacheLineBytes) job_desc_t {
+  using index_type    = typename DATASET_DESCRIPTOR_T::INDEX_T;
+  using distance_type = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
+  using data_type     = typename DATASET_DESCRIPTOR_T::DATA_T;
+  // The per-job search inputs: result and query buffers plus their dimensions
+  struct value_t {
+    index_type* result_indices_ptr;       // [num_queries, top_k]
+    distance_type* result_distances_ptr;  // [num_queries, top_k]
+    const data_type* queries_ptr;         // [num_queries, dataset_dim]
+    uint32_t top_k;
+    uint32_t n_queries;
+  };
+  using blob_elem_type = uint4;
+  constexpr static inline size_t kBlobSize =
+    raft::div_rounding_up_safe(sizeof(value_t), sizeof(blob_elem_type));
+  // The union lets a warp copy the input as `kBlobSize` `uint4` elements, one element per thread
+  union input_t {
+    blob_elem_type blob[kBlobSize];  // NOLINT
+    value_t value;
+  } input;
+  // Set by the kernel thread that completes the last of the `n_queries` queries of this job.
+  cuda::atomic<bool, cuda::thread_scope_system> completion_flag;
+};
+
+struct alignas(kCacheLineBytes) worker_handle_t {
+  using handle_t = uint64_t;
+  struct value_t {
+    uint32_t desc_id;
+    uint32_t query_id;
+  };
+  union data_t {
+    handle_t handle;
+    value_t value;
+  };
+  cuda::atomic<data_t, cuda::thread_scope_system> data;
+};
+static_assert(sizeof(worker_handle_t::value_t) == sizeof(worker_handle_t::handle_t));
+static_assert(
+  cuda::atomic<worker_handle_t::data_t, cuda::thread_scope_system>::is_always_lock_free);
+
+constexpr worker_handle_t::handle_t kWaitForWork = std::numeric_limits<uint64_t>::max();
+constexpr worker_handle_t::handle_t kNoMoreWork  = kWaitForWork - 1;
+
+constexpr auto is_worker_busy(worker_handle_t::handle_t h) -> bool
+{
+  return (h != kWaitForWork) && (h != kNoMoreWork);
+}
+
+template <unsigned MAX_ITOPK,
+          unsigned MAX_CANDIDATES,
+          unsigned TOPK_BY_BITONIC_SORT,
+          class DATASET_DESCRIPTOR_T,
+          class SAMPLE_FILTER_T>
+RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p(
+  const DATASET_DESCRIPTOR_T* dataset_desc,
+  worker_handle_t* worker_handles,  // one handle per thread block, indexed by blockIdx.y
+  job_desc_t<DATASET_DESCRIPTOR_T>* job_descriptors,  // indexed by the job id read from the handle
+  uint32_t* completion_counters,  // per-job counter of completed queries
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph,  // [dataset_size, graph_degree]
+  const std::uint32_t graph_degree,
+  const unsigned num_distilation,
+  const uint64_t rand_xor_mask,
+  const typename DATASET_DESCRIPTOR_T::INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+  const uint32_t num_seeds,
+  typename DATASET_DESCRIPTOR_T::INDEX_T* const
+    visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const std::uint32_t internal_topk,
+  const std::uint32_t search_width,
+  const std::uint32_t min_iteration,
+  const std::uint32_t max_iteration,
+  std::uint32_t* const num_executed_iterations,  // [num_queries]
+  const std::uint32_t hash_bitlen,
+  const std::uint32_t small_hash_bitlen,
+  const std::uint32_t small_hash_reset_interval,
+  SAMPLE_FILTER_T sample_filter)
+{
+  using job_desc_type = job_desc_t<DATASET_DESCRIPTOR_T>;
+  __shared__ typename job_desc_type::input_t job_descriptor;
+  __shared__ worker_handle_t::data_t worker_data;
+
+  auto& worker_handle = worker_handles[blockIdx.y].data;
+  uint32_t job_ix;
+
+  while (true) {
+    // writing phase: thread 0 spins until the handle holds a job (or the kNoMoreWork signal)
+    if (threadIdx.x == 0) {
+      worker_handle_t::data_t worker_data_local;
+      do {
+        worker_data_local = worker_handle.load(cuda::memory_order_relaxed);
+      } while (worker_data_local.handle == kWaitForWork);
+      if (worker_data_local.handle != kNoMoreWork) {
+        worker_handle.store({kWaitForWork}, cuda::memory_order_relaxed);
+      }
+      job_ix = worker_data_local.value.desc_id;
+      cuda::atomic_thread_fence(cuda::memory_order_acquire, cuda::thread_scope_system);
+      worker_data = worker_data_local;
+    }
+    if (threadIdx.x < raft::WarpSize) {
+      // Share job_ix of thread 0 within the first warp and copy the descriptor to shared memory
+      static_assert(job_desc_type::kBlobSize <= raft::WarpSize);
+      job_ix = raft::shfl(job_ix, 0);
+      if (threadIdx.x < job_desc_type::kBlobSize && job_ix < kMaxJobsNum) {
+        job_descriptor.blob[threadIdx.x] = job_descriptors[job_ix].input.blob[threadIdx.x];
+      }
+    }
+    __syncthreads();
+    if (worker_data.handle == kNoMoreWork) { break; }
+
+    // reading phase: unpack the job inputs from the shared-memory copies
+    auto* result_indices_ptr   = job_descriptor.value.result_indices_ptr;
+    auto* result_distances_ptr = job_descriptor.value.result_distances_ptr;
+    auto* queries_ptr          = job_descriptor.value.queries_ptr;
+    auto top_k                 = job_descriptor.value.top_k;
+    auto n_queries             = job_descriptor.value.n_queries;
+    auto query_id              = worker_data.value.query_id;
+
+    // work phase: run the search for the assigned query
+    search_core<MAX_ITOPK,
+                MAX_CANDIDATES,
+                TOPK_BY_BITONIC_SORT,
+                DATASET_DESCRIPTOR_T,
+                SAMPLE_FILTER_T>(result_indices_ptr,
+                                 result_distances_ptr,
+                                 top_k,
+                                 dataset_desc,
+                                 queries_ptr,
+                                 knn_graph,
+                                 graph_degree,
+                                 num_distilation,
+                                 rand_xor_mask,
+                                 seed_ptr,
+                                 num_seeds,
+                                 visited_hashmap_ptr,
+                                 internal_topk,
+                                 search_width,
+                                 min_iteration,
+                                 max_iteration,
+                                 num_executed_iterations,
+                                 hash_bitlen,
+                                 small_hash_bitlen,
+                                 small_hash_reset_interval,
+                                 query_id,
+                                 sample_filter);
+
+    // make sure all writes are visible even for the host
+    //     (e.g. when result buffers are in pinned memory)
+    cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
+
+    // arrive to mark the end of the work phase; thread 0 then counts this query as completed
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      auto completed_count = atomicInc(completion_counters + job_ix, n_queries - 1) + 1;
+      if (completed_count >= n_queries) {
+        job_descriptors[job_ix].completion_flag.store(true, cuda::memory_order_relaxed);
+      }
+    }
+  }
+}
+
+template <bool Persistent,
+          unsigned MAX_ITOPK,
+          unsigned MAX_CANDIDATES,
+          unsigned TOPK_BY_BITONIC_SORT,
+          class DATASET_DESCRIPTOR_T,
+          class SAMPLE_FILTER_T>
+auto dispatch_kernel = []() {
+  if constexpr (Persistent) {
+    return search_kernel_p<MAX_ITOPK,
+                           MAX_CANDIDATES,
+                           TOPK_BY_BITONIC_SORT,
+                           DATASET_DESCRIPTOR_T,
+                           SAMPLE_FILTER_T>;
+  } else {
+    return search_kernel<MAX_ITOPK,
+                         MAX_CANDIDATES,
+                         TOPK_BY_BITONIC_SORT,
+                         DATASET_DESCRIPTOR_T,
+                         SAMPLE_FILTER_T>;
+  }
+}();
+
+template <bool Persistent, typename DATASET_DESCRIPTOR_T, typename SAMPLE_FILTER_T>
 struct search_kernel_config {
-  using kernel_t = decltype(&search_kernel<64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
+  using kernel_t =
+    decltype(dispatch_kernel<Persistent, 64, 64, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>);
 
   template <unsigned MAX_CANDIDATES, unsigned USE_BITONIC_SORT>
   static auto choose_search_kernel(unsigned itopk_size) -> kernel_t
   {
     if (itopk_size <= 64) {
-      return search_kernel<64,
-                           MAX_CANDIDATES,
-                           USE_BITONIC_SORT,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return dispatch_kernel<Persistent,
+                             64,
+                             MAX_CANDIDATES,
+                             USE_BITONIC_SORT,
+                             DATASET_DESCRIPTOR_T,
+                             SAMPLE_FILTER_T>;
     } else if (itopk_size <= 128) {
-      return search_kernel<128,
-                           MAX_CANDIDATES,
-                           USE_BITONIC_SORT,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return dispatch_kernel<Persistent,
+                             128,
+                             MAX_CANDIDATES,
+                             USE_BITONIC_SORT,
+                             DATASET_DESCRIPTOR_T,
+                             SAMPLE_FILTER_T>;
     } else if (itopk_size <= 256) {
-      return search_kernel<256,
-                           MAX_CANDIDATES,
-                           USE_BITONIC_SORT,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return dispatch_kernel<Persistent,
+                             256,
+                             MAX_CANDIDATES,
+                             USE_BITONIC_SORT,
+                             DATASET_DESCRIPTOR_T,
+                             SAMPLE_FILTER_T>;
     } else if (itopk_size <= 512) {
-      return search_kernel<512,
-                           MAX_CANDIDATES,
-                           USE_BITONIC_SORT,
-                           DATASET_DESCRIPTOR_T,
-                           SAMPLE_FILTER_T>;
+      return dispatch_kernel<Persistent,
+                             512,
+                             MAX_CANDIDATES,
+                             USE_BITONIC_SORT,
+                             DATASET_DESCRIPTOR_T,
+                             SAMPLE_FILTER_T>;
     }
     THROW("No kernel for parametels itopk_size %u, max_candidates %u", itopk_size, MAX_CANDIDATES);
   }
@@ -846,9 +1111,19 @@ struct search_kernel_config {
       // Radix-based topk is used
       constexpr unsigned max_candidates = 32;  // to avoid build failure
       if (itopk_size <= 256) {
-        return search_kernel<256, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+        return dispatch_kernel<Persistent,
+                               256,
+                               max_candidates,
+                               0,
+                               DATASET_DESCRIPTOR_T,
+                               SAMPLE_FILTER_T>;
       } else if (itopk_size <= 512) {
-        return search_kernel<512, max_candidates, 0, DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>;
+        return dispatch_kernel<Persistent,
+                               512,
+                               max_candidates,
+                               0,
+                               DATASET_DESCRIPTOR_T,
+                               SAMPLE_FILTER_T>;
       }
     }
     THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u",
@@ -857,8 +1132,797 @@ struct search_kernel_config {
   }
 };
 
+/**
+ * @brief Resource queue
+ *
+ * @tparam T the element type
+ * @tparam Size the maximum capacity of the queue (power-of-two)
+ * @tparam Empty a special element value designating an empty queue slot. NB: storing `Empty` is UB.
+ *
+ * A shared atomic ring buffer based queue optimized for throughput when bottlenecked on `pop`
+ * operation.
+ *
+ * @code{.cpp}
+ *   // allocate the queue
+ *   resource_queue_t<int32_t, 256> resource_ids;
+ *
+ *   // store couple values
+ *   resource_ids.push(42);
+ *   resource_ids.push(7);
+ *
+ *   // wait to get the value from the queue
+ *   auto id_x = resource_ids.pop().wait();
+ *
+ *   // stand in line to get the value from the queue, but don't wait
+ *   auto ticket_y = resource_ids.pop();
+ *   // do other stuff and check if the value is available
+ *   int32_t id_y;
+ *   while (!ticket_y.test(id_y)) {
+ *     do_some_important_business(...);
+ *     std::this_thread::sleep_for(std::chrono::microseconds(10));
+ *   }
+ *   // `id_y` is set by now and `ticket_y.wait()` won't block anymore
+ *   assert(ticket_y.wait() == id_y);
+ * @endcode
+ */
+template <typename T, uint32_t Size, T Empty = std::numeric_limits<T>::max()>
+struct alignas(kCacheLineBytes) resource_queue_t {
+  using value_type                   = T;
+  static constexpr uint32_t kSize    = Size;
+  static constexpr value_type kEmpty = Empty;
+  static_assert(cuda::std::atomic<value_type>::is_always_lock_free,
+                "The value type must be lock-free.");
+  static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency.");
+  static constexpr uint32_t kElemsPerCacheLine =
+    raft::div_rounding_up_safe<uint32_t>(kCacheLineBytes, sizeof(value_type));
+  /* [Note: cache-friendly indexing]
+     To avoid false sharing, the queue pushes and pops values not sequentially, but with an
+     increment that is larger than the cache line size.
+     Hence we introduce the `kCounterIncrement > kElemsPerCacheLine` (both counted in elements).
+     However, to make sure all indices are used, we choose the increment to be coprime with the
+     buffer size. We also require that the buffer size is a power-of-two for two reasons:
+       1) Fast modulus operation - reduces to binary `and` (with `kCounterLocMask`).
+       2) Easy to ensure GCD(kCounterIncrement, kSize) == 1 by construction
+          (see the definition below).
+   */
+  static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1;
+  static constexpr uint32_t kCounterLocMask   = kSize - 1;
+  // These props hold by design, but we add them here as a documentation and a sanity check.
+  static_assert(
+    kCounterIncrement * sizeof(value_type) >= kCacheLineBytes,
+    "The counter increment should be larger than the cache line size to avoid false sharing.");
+  static_assert(
+    std::gcd(kCounterIncrement, kSize) == 1,
+    "The counter increment and the size must be coprime to allow using all of the queue slots.");
+
+  static constexpr auto kMemOrder = cuda::std::memory_order_relaxed;
+
+  explicit resource_queue_t(uint32_t capacity = Size) noexcept : capacity_{capacity}
+  {
+    head_.store(0, kMemOrder);
+    tail_.store(0, kMemOrder);
+    for (uint32_t i = 0; i < kSize; i++) {
+      buf_[i].store(kEmpty, kMemOrder);
+    }
+  }
+
+  /** Nominal capacity of the queue. */
+  [[nodiscard]] auto capacity() const { return capacity_; }
+
+  /** This does not affect the queue behavior, but merely declares a nominal capacity. */
+  void set_capacity(uint32_t capacity) { capacity_ = capacity; }
+
+  /**
+   * A slot in the queue to take the value from.
+   * Once it's obtained, the corresponding value in the queue is lost for other users.
+   */
+  struct promise_t {
+    explicit promise_t(cuda::std::atomic<value_type>& loc) : loc_{loc}, val_{Empty} {}
+    ~promise_t() noexcept { wait(); }
+
+    auto test() noexcept -> bool
+    {
+      if (val_ != Empty) { return true; }
+      val_ = loc_.exchange(kEmpty, kMemOrder);
+      return val_ != Empty;
+    }
+
+    auto test(value_type& e) noexcept -> bool
+    {
+      if (test()) {
+        e = val_;
+        return true;
+      }
+      return false;
+    }
+
+    auto wait() noexcept -> value_type
+    {
+      if (val_ == Empty) {
+        // [HOT SPOT]
+        // Optimize for the case of contention: expect the loc is empty.
+        do {
+          loc_.wait(kEmpty, kMemOrder);
+          val_ = loc_.exchange(kEmpty, kMemOrder);
+        } while (val_ == kEmpty);
+      }
+      return val_;
+    }
+
+   private:
+    cuda::std::atomic<value_type>& loc_;
+    value_type val_;
+  };
+
+  void push(value_type x) noexcept
+  {
+    auto& loc = buf_[head_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask];
+    /* [NOT A HOT SPOT]
+     We expect there's always enough room in the queue to push the item,
+     but also we expect a few pop waiters - notify them the data is available.
+     */
+    value_type e = kEmpty;
+    while (!loc.compare_exchange_weak(e, x, kMemOrder, kMemOrder)) {
+      e = kEmpty;
+    }
+    loc.notify_one();
+  }
+
+  auto pop() noexcept -> promise_t
+  {
+    auto& loc = buf_[tail_.fetch_add(kCounterIncrement, kMemOrder) & kCounterLocMask];
+    return promise_t{loc};
+  }
+
+ private:
+  alignas(kCacheLineBytes) cuda::std::atomic<uint32_t> head_{};
+  alignas(kCacheLineBytes) cuda::std::atomic<uint32_t> tail_{};
+  alignas(kCacheLineBytes) std::array<cuda::std::atomic<value_type>, kSize> buf_{};
+  alignas(kCacheLineBytes) uint32_t capacity_;
+};
+
+/** A minimal fixed-capacity ring-buffer deque; intended for use by a single thread only. */
+template <typename T>
+struct local_deque_t {
+  explicit local_deque_t(uint32_t size) : slots_(size) {}
+
+  [[nodiscard]] auto capacity() const -> uint32_t { return slots_.size(); }
+  [[nodiscard]] auto size() const -> uint32_t { return tail_ - head_; }
+
+  void push_back(T x) { slots_[tail_++ % capacity()] = x; }
+
+  void push_front(T x)
+  {
+    const uint32_t lap = (head_ == 0) ? capacity() : 0;  // shift by one lap to avoid underflow
+    head_ += lap;
+    tail_ += lap;
+    --head_;
+    slots_[head_ % capacity()] = x;
+  }
+
+  // NB: unchecked operations - none of push_*/pop_* verifies that the deque is full/empty.
+  auto pop_back() -> T { return slots_[--tail_ % capacity()]; }
+  auto pop_front() -> T { return slots_[head_++ % capacity()]; }
+
+  auto try_push_back(T x) -> bool
+  {
+    const bool has_room = size() < capacity();
+    if (has_room) { push_back(x); }
+    return has_room;
+  }
+
+  auto try_push_front(T x) -> bool
+  {
+    const bool has_room = size() < capacity();
+    if (has_room) { push_front(x); }
+    return has_room;
+  }
+
+  auto try_pop_back(T& x) -> bool
+  {
+    const bool has_item = head_ < tail_;
+    if (has_item) { x = pop_back(); }
+    return has_item;
+  }
+
+  auto try_pop_front(T& x) -> bool
+  {
+    const bool has_item = head_ < tail_;
+    if (has_item) { x = pop_front(); }
+    return has_item;
+  }
+
+ private:
+  std::vector<T> slots_;
+  uint32_t head_{0};
+  uint32_t tail_{0};
+};
+
+struct persistent_runner_base_t {
+  using job_queue_type    = resource_queue_t<uint32_t, kMaxJobsNum>;
+  using worker_queue_type = resource_queue_t<uint32_t, kMaxWorkersNum>;
+  rmm::mr::pinned_host_memory_resource worker_handles_mr;
+  rmm::mr::pinned_host_memory_resource job_descriptor_mr;
+  rmm::mr::cuda_memory_resource device_mr;
+  cudaStream_t stream{};
+  job_queue_type job_queue{};
+  worker_queue_type worker_queue{};
+  // This should be large enough to make the runner live through restarts of the benchmark cases.
+  // Otherwise, the benchmarks slow down significantly.
+  std::chrono::milliseconds lifetime;
+  // Creates the non-blocking stream; `persistent_lifetime` is in seconds. Throws on CUDA failure.
+  persistent_runner_base_t(float persistent_lifetime)
+    : job_queue(), worker_queue(), lifetime(size_t(persistent_lifetime * 1000))
+  {
+    RAFT_CUDA_TRY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  }
+  virtual ~persistent_runner_base_t() noexcept { cudaStreamDestroy(stream); }
+};
+
+struct alignas(kCacheLineBytes) launcher_t {
+  using job_queue_type           = persistent_runner_base_t::job_queue_type;
+  using worker_queue_type        = persistent_runner_base_t::worker_queue_type;
+  using pending_reads_queue_type = local_deque_t<uint32_t>;
+  using completion_flag_type     = cuda::atomic<bool, cuda::thread_scope_system>;
+
+  pending_reads_queue_type pending_reads;
+  job_queue_type& job_ids;
+  worker_queue_type& idle_worker_ids;
+  worker_handle_t* worker_handles;
+  uint32_t job_id;
+  completion_flag_type* completion_flag;
+  bool all_done = false;
+
+  /* [Note: sleeping]
+  When the number of threads is greater than the number of cores, the threads start to fight for
+  the CPU time, which reduces the throughput.
+  To ease the competition, we track the expected GPU latency and let a thread sleep for some
+  time, and only start to spin when it is about time to get the result.
+  */
+  static inline constexpr auto kDefaultLatency = std::chrono::nanoseconds(50000);
+  /* This is the base for computing maximum time a thread is allowed to sleep. */
+  static inline constexpr auto kMaxExpectedLatency =
+    kDefaultLatency * std::max<std::uint32_t>(10, kMaxJobsNum / 128);
+  static inline thread_local auto expected_latency = kDefaultLatency;
+  const std::chrono::time_point<std::chrono::system_clock> start;
+  std::chrono::time_point<std::chrono::system_clock> now;
+  const int64_t pause_factor;
+  int pause_count = 0;
+  /**
+   * Beyond this threshold, the launcher (calling thread) does not wait for the results anymore and
+   * throws an exception.
+   */
+  std::chrono::time_point<std::chrono::system_clock> deadline;
+
+  template <typename RecordWork>  // RecordWork: job_id -> pointer to that job's completion flag
+  launcher_t(job_queue_type& job_ids,
+             worker_queue_type& idle_worker_ids,
+             worker_handle_t* worker_handles,
+             uint32_t n_queries,
+             std::chrono::milliseconds max_wait_time,
+             RecordWork record_work)
+    : pending_reads{std::min(n_queries, kMaxWorkersPerThread)},
+      job_ids{job_ids},
+      idle_worker_ids{idle_worker_ids},
+      worker_handles{worker_handles},
+      job_id{job_ids.pop().wait()},          // take a job descriptor id from the shared queue
+      completion_flag{record_work(job_id)},  // the caller fills in the job description here
+      start{std::chrono::system_clock::now()},
+      pause_factor{calc_pause_factor(n_queries)},
+      now{start},
+      deadline{start + max_wait_time + expected_latency}
+  {  // NOTE(review): pause() may throw in here; ~launcher_t() is not run for a throwing ctor.
+    // Wait for the first worker and submit the query immediately.
+    submit_query(idle_worker_ids.pop().wait(), 0);
+    // Submit the rest of the queries in the batch, one worker per query
+    for (uint32_t i = 1; i < n_queries; i++) {
+      auto promised_worker = idle_worker_ids.pop();
+      uint32_t worker_id;
+      while (!promised_worker.test(worker_id)) {  // meanwhile, try to recycle our own workers
+        if (pending_reads.try_pop_front(worker_id)) {
+          bool returned_some = false;
+          for (bool keep_returning = true; keep_returning;) {
+            if (try_return_worker(worker_id)) {
+              keep_returning = pending_reads.try_pop_front(worker_id);
+              returned_some  = true;
+            } else {
+              pending_reads.push_front(worker_id);  // still busy: put it back where it was
+              keep_returning = false;
+            }
+          }
+          if (!returned_some) { pause(); }  // nothing could be returned: back off a little
+        } else {
+          // Calmly wait for the promised worker instead of spinning.
+          worker_id = promised_worker.wait();
+          break;
+        }
+      }
+      pause_count = 0;  // reset the pause behavior
+      submit_query(worker_id, i);
+      // Try to not hold too many workers in one thread: return the front one if it's done already
+      if (i >= kSoftMaxWorkersPerThread && pending_reads.try_pop_front(worker_id)) {
+        if (!try_return_worker(worker_id)) { pending_reads.push_front(worker_id); }
+      }
+    }
+  }
+
+  inline ~launcher_t() noexcept  // NOLINT
+  {
+    // Bookkeeping: fold this launch's duration (now - start) into the thread-local moving average
+    constexpr size_t kWindow = 100;  // moving average memory
+    expected_latency         = std::min<std::chrono::nanoseconds>(
+      ((kWindow - 1) * expected_latency + now - start) / kWindow, kMaxExpectedLatency);
+
+    // Cleanup after an exception: a successful wait() resets job_id to kEmpty and drains
+    // pending_reads, so anything still held here is given back to the shared queues.
+    if (job_id != job_queue_type::kEmpty) { job_ids.push(job_id); }
+    uint32_t worker_id;
+    while (pending_reads.try_pop_front(worker_id)) {
+      idle_worker_ids.push(worker_id);  // NB: returned without checking if it is still busy
+    }
+  }
+
+  inline void submit_query(uint32_t worker_id, uint32_t query_id)
+  {  // Hand the (job_id, query_id) pair to the worker and track the worker in `pending_reads`.
+    worker_handles[worker_id].data.store(worker_handle_t::data_t{.value = {job_id, query_id}},
+                                         cuda::memory_order_relaxed);
+
+    while (!pending_reads.try_push_back(worker_id)) {
+      // The only reason pending_reads cannot push is that the queue is full.
+      // It's local, so we must pop and wait for the returned worker to finish its work.
+      auto pending_worker_id = pending_reads.pop_front();
+      while (!try_return_worker(pending_worker_id)) {
+        pause();  // may throw if the deadline is missed
+      }
+    }
+    pause_count = 0;  // reset the pause behavior
+  }
+
+  /** Give the worker back to the shared pool if it is idle; tells whether that happened. */
+  inline auto try_return_worker(uint32_t worker_id) -> bool
+  {
+    // The cached `all_done` is consulted first, so the atomic load of the worker handle is skipped
+    // once the whole job is known to be finished (the usual case when called from `wait()`).
+    if (!all_done &&
+        is_worker_busy(worker_handles[worker_id].data.load(cuda::memory_order_relaxed).handle)) {
+      return false;
+    }
+    idle_worker_ids.push(worker_id);
+    return true;
+  }
+
+  /** Tell whether the job's completion flag is set; a positive answer is cached in `all_done`. */
+  inline auto is_all_done()
+  {
+    // The flag is re-read only while the cached value is still negative; once it has been seen
+    // as set, no further atomic loads are issued.
+    if (!all_done) { all_done = completion_flag->load(cuda::memory_order_relaxed); }
+    return all_done;
+  }
+
+  /** The launcher shouldn't attempt to sleep past the returned time (see pause() and wait()). */
+  [[nodiscard]] inline auto sleep_limit() const
+  {
+    constexpr auto kMinWakeTime  = std::chrono::nanoseconds(10000);  // margin subtracted below
+    constexpr double kSleepLimit = 0.6;  // fraction of the expected latency
+    return start + expected_latency * kSleepLimit - kMinWakeTime;
+  }
+
+  /**
+   * When the latency is much larger than expected, it's a sign of thread contention.
+   * Then `pause()` switches to sleeping instead of yielding to give cpu cycles to other threads.
+   */
+  [[nodiscard]] inline auto overtime_threshold() const
+  {
+    constexpr auto kOvertimeFactor = 3;  // "much larger" means 3x the expected latency
+    return start + expected_latency * kOvertimeFactor;
+  }
+
+  /**
+   * Divisor of the sleep time of one `pause()` call: 10x ceil(n_queries / worker queue capacity).
+   */
+  [[nodiscard]] inline auto calc_pause_factor(uint32_t n_queries) const -> uint32_t
+  {
+    constexpr uint32_t kScale = 10;
+    const auto n_rounds       = raft::div_rounding_up_safe(n_queries, idle_worker_ids.capacity());
+    return kScale * n_rounds;
+  }
+
+  /** Wait a little bit (called in a loop); throws raft::exception once `deadline` has passed. */
+  inline void pause()
+  {
+    // The first kSpinLimit calls since the last reset of pause_count only yield, hoping for a
+    constexpr auto kSpinLimit = 3;  // smoother run
+    // It doesn't make much sense to sleep less than this
+    constexpr auto kPauseTimeMin = std::chrono::nanoseconds(1000);
+    // Upper bound of a single sleep
+    constexpr auto kPauseTimeMax = std::chrono::nanoseconds(50000);
+    if (pause_count++ < kSpinLimit) {
+      std::this_thread::yield();
+      return;
+    }
+    now                  = std::chrono::system_clock::now();
+    auto pause_time_base = std::max(now - start, expected_latency);  // elapsed, but >= expected
+    auto pause_time      = std::clamp(pause_time_base / pause_factor, kPauseTimeMin, kPauseTimeMax);
+    if (now + pause_time < sleep_limit()) {
+      // It's too early: sleep for a bit
+      std::this_thread::sleep_for(pause_time);
+    } else if (now <= overtime_threshold()) {
+      // It's about time to check the results, don't sleep
+      std::this_thread::yield();
+    } else if (now <= deadline) {
+      // Too late; perhaps the system is too busy - sleep again
+      std::this_thread::sleep_for(pause_time);
+    } else {
+      // Missed the deadline: throw an exception
+      throw raft::exception(
+        "The calling thread didn't receive the results from the persistent CAGRA kernel within the "
+        "expected kernel lifetime. Here are possible reasons of this failure:\n"
+        "  (1) `persistent_lifetime` search parameter is too small - increase it;\n"
+        "  (2) there is other work being executed on the same device and the kernel failed to "
+        "progress - decreasing `persistent_device_usage` may help (but not guaranteed);\n"
+        "  (3) there is a bug in the implementation - please report it to cuVS team.");
+    }
+  }
+
+  /** Wait for all work to finish and don't forget to return the workers to the shared pool. */
+  inline void wait()
+  {
+    uint32_t worker_id;  // Phase 1: drain pending_reads, returning each worker once it's free
+    while (pending_reads.try_pop_front(worker_id)) {
+      while (!try_return_worker(worker_id)) {
+        if (!is_all_done()) { pause(); }  // once all_done is cached, try_return_worker succeeds
+      }
+    }
+    pause_count = 0;  // reset the pause behavior
+    // Phase 2 (terminal state): engaged only after `pending_reads` is empty, i.e. all queries have
+    // been submitted; wait for the job's completion flag, sleeping at most until sleep_limit()
+    now = std::chrono::system_clock::now();
+    while (!is_all_done()) {
+      auto till_time = sleep_limit();
+      if (now < till_time) {
+        std::this_thread::sleep_until(till_time);
+        now = std::chrono::system_clock::now();
+      } else {
+        pause();  // past the sleep limit: yield/sleep in small steps; may throw past the deadline
+      }
+    }
+
+    // Return the job descriptor
+    job_ids.push(job_id);
+    job_id = job_queue_type::kEmpty;  // tells ~launcher_t() the job id has been returned already
+  }
+};
+
+template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
+struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_base_t {
+  using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+  using index_type           = IndexT;
+  using distance_type        = DistanceT;
+  using data_type            = DataT;
+  using kernel_config_type   = search_kernel_config<true, descriptor_base_type, SampleFilterT>;
+  using kernel_type          = typename kernel_config_type::kernel_t;
+  using job_desc_type        = job_desc_t<descriptor_base_type>;
+  kernel_type kernel;   // persistent kernel instance selected in the constructor
+  uint32_t block_size;  // threads per block used for the launch
+  dataset_descriptor_host<DataT, IndexT, DistanceT> dd_host;  // own copy of the descriptor
+  rmm::device_uvector<worker_handle_t> worker_handles;        // one handle per thread block
+  rmm::device_uvector<job_desc_type> job_descriptors;         // kMaxJobsNum job slots
+  rmm::device_uvector<uint32_t> completion_counters;          // kMaxJobsNum counters
+  rmm::device_uvector<index_type> hashmap;  // allocated only if small_hash_bitlen == 0
+  std::atomic<std::chrono::time_point<std::chrono::system_clock>> last_touch;  // keep-alive stamp
+  uint64_t param_hash;  // see calculate_parameter_hash()
+
+  /**
+   * XOR-combine the search parameters into a hash to detect if they've changed across the calls.
+   * NB: takes the constructor's arguments; small_hash_bitlen and sample_filter are not hashed.
+   */
+  static inline auto calculate_parameter_hash(
+    std::reference_wrapper<const dataset_descriptor_host<DataT, IndexT, DistanceT>> dataset_desc,
+    raft::device_matrix_view<const index_type, int64_t, raft::row_major> graph,
+    uint32_t num_itopk_candidates,
+    uint32_t block_size,  //
+    uint32_t smem_size,
+    int64_t hash_bitlen,
+    size_t small_hash_bitlen,
+    size_t small_hash_reset_interval,
+    uint32_t num_random_samplings,
+    uint64_t rand_xor_mask,
+    uint32_t num_seeds,
+    size_t itopk_size,
+    size_t search_width,
+    size_t min_iterations,
+    size_t max_iterations,
+    SampleFilterT sample_filter,
+    float persistent_lifetime,
+    float persistent_device_usage) -> uint64_t
+  {
+    return uint64_t(graph.data_handle()) ^ dataset_desc.get().team_size ^ num_itopk_candidates ^
+           block_size ^ smem_size ^ hash_bitlen ^ small_hash_reset_interval ^ num_random_samplings ^
+           rand_xor_mask ^ num_seeds ^ itopk_size ^ search_width ^ min_iterations ^ max_iterations ^
+           uint64_t(persistent_lifetime * 1000) ^ uint64_t(persistent_device_usage * 1000);
+  }
+
+  persistent_runner_t(
+    std::reference_wrapper<const dataset_descriptor_host<DataT, IndexT, DistanceT>> dataset_desc,
+    raft::device_matrix_view<const index_type, int64_t, raft::row_major> graph,
+    uint32_t num_itopk_candidates,
+    uint32_t block_size,  //
+    uint32_t smem_size,
+    int64_t hash_bitlen,
+    size_t small_hash_bitlen,
+    size_t small_hash_reset_interval,
+    uint32_t num_random_samplings,
+    uint64_t rand_xor_mask,
+    uint32_t num_seeds,
+    size_t itopk_size,
+    size_t search_width,
+    size_t min_iterations,
+    size_t max_iterations,
+    SampleFilterT sample_filter,
+    float persistent_lifetime,
+    float persistent_device_usage)
+    : persistent_runner_base_t{persistent_lifetime},
+      kernel{kernel_config_type::choose_itopk_and_mx_candidates(
+        itopk_size, num_itopk_candidates, block_size)},
+      block_size{block_size},
+      dd_host{dataset_desc.get()},  // copy; listed in declaration order (before the buffers)
+      worker_handles(0, stream, worker_handles_mr),
+      job_descriptors(kMaxJobsNum, stream, job_descriptor_mr),
+      completion_counters(kMaxJobsNum, stream, device_mr),
+      hashmap(0, stream, device_mr),
+      param_hash(calculate_parameter_hash(dd_host,
+                                          graph,
+                                          num_itopk_candidates,
+                                          block_size,
+                                          smem_size,
+                                          hash_bitlen,
+                                          small_hash_bitlen,
+                                          small_hash_reset_interval,
+                                          num_random_samplings,
+                                          rand_xor_mask,
+                                          num_seeds,
+                                          itopk_size,
+                                          search_width,
+                                          min_iterations,
+                                          max_iterations,
+                                          sample_filter,
+                                          persistent_lifetime,
+                                          persistent_device_usage))
+  {  // Prepare the queues and buffers, then launch the persistent kernel in `stream`.
+    // initialize the dataset/distance descriptor
+    auto* dd_dev_ptr = dd_host.dev_ptr(stream);
+
+    // set kernel attributes same as in normal kernel
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+
+    // set kernel launch parameters: one worker per thread block, blocks laid out along y
+    dim3 gs = calc_coop_grid_size(block_size, smem_size, persistent_device_usage);
+    dim3 bs(block_size, 1, 1);
+    RAFT_LOG_DEBUG(
+      "Launching persistent kernel with %u threads, %u block %u smem", bs.x, gs.y, smem_size);
+
+    // initialize the job queue (written from the host: assumes host-accessible memory - confirm)
+    auto* completion_counters_ptr = completion_counters.data();
+    auto* job_descriptors_ptr     = job_descriptors.data();
+    for (uint32_t i = 0; i < kMaxJobsNum; i++) {
+      auto& jd                = job_descriptors_ptr[i].input.value;
+      jd.result_indices_ptr   = nullptr;
+      jd.result_distances_ptr = nullptr;
+      jd.queries_ptr          = nullptr;
+      jd.top_k                = 0;
+      jd.n_queries            = 0;
+      job_descriptors_ptr[i].completion_flag.store(false);
+      job_queue.push(i);
+    }
+
+    // initialize the worker queue: every worker starts idle, waiting for work
+    worker_queue.set_capacity(gs.y);
+    worker_handles.resize(gs.y, stream);
+    auto* worker_handles_ptr = worker_handles.data();
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));  // the resize above is done in `stream`
+    for (uint32_t i = 0; i < gs.y; i++) {
+      worker_handles_ptr[i].data.store({kWaitForWork});
+      worker_queue.push(i);
+    }
+
+    index_type* hashmap_ptr = nullptr;  // stays null when the small hash is used
+    if (small_hash_bitlen == 0) {
+      hashmap.resize(gs.y * hashmap::get_size(hash_bitlen), stream);
+      hashmap_ptr = hashmap.data();
+    }
+
+    // launch the kernel
+    auto* graph_ptr                   = graph.data_handle();
+    uint32_t graph_degree             = graph.extent(1);
+    uint32_t* num_executed_iterations = nullptr;  // optional arg [num_queries]
+    const index_type* dev_seed_ptr    = nullptr;  // optional arg [num_queries, num_seeds]
+
+    void* args[] =  // NOLINT
+      {&dd_dev_ptr,
+       &worker_handles_ptr,
+       &job_descriptors_ptr,
+       &completion_counters_ptr,
+       &graph_ptr,  // [dataset_size, graph_degree]
+       &graph_degree,
+       &num_random_samplings,
+       &rand_xor_mask,
+       &dev_seed_ptr,
+       &num_seeds,
+       &hashmap_ptr,  // visited_hashmap_ptr: [num_queries, 1 << hash_bitlen]
+       &itopk_size,
+       &search_width,
+       &min_iterations,
+       &max_iterations,
+       &num_executed_iterations,
+       &hash_bitlen,
+       &small_hash_bitlen,
+       &small_hash_reset_interval,
+       &sample_filter};
+    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
+    RAFT_CUDA_TRY(cudaLaunchCooperativeKernel<std::remove_pointer_t<kernel_type>>(
+      kernel, gs, bs, args, smem_size, stream));
+    RAFT_LOG_INFO(
+      "Initialized the kernel %p in stream %zd; job_queue size = %u; worker_queue size = %u",
+      reinterpret_cast<void*>(kernel),
+      int64_t((cudaStream_t)stream),
+      job_queue.capacity(),
+      worker_queue.capacity());
+    last_touch.store(std::chrono::system_clock::now(), std::memory_order_relaxed);
+  }
+
+  ~persistent_runner_t() noexcept override
+  {  // Tell every worker there is no more work (as each becomes idle), then sync the stream.
+    auto whs = worker_handles.data();
+    for (auto i = worker_handles.size(); i > 0; i--) {
+      whs[worker_queue.pop().wait()].data.store({kNoMoreWork}, cuda::memory_order_relaxed);
+    }
+    RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream));
+    RAFT_LOG_INFO("Destroyed the persistent runner.");
+  }
+
+  void launch(index_type* result_indices_ptr,       // [num_queries, top_k]
+              distance_type* result_distances_ptr,  // [num_queries, top_k]
+              const data_type* queries_ptr,         // [num_queries, dataset_dim]
+              uint32_t num_queries,
+              uint32_t top_k)
+  {
+    // submit all queries (done by the launcher's constructor); the lambda describes the job
+    launcher_t launcher{job_queue,
+                        worker_queue,
+                        worker_handles.data(),
+                        num_queries,
+                        this->lifetime,
+                        [=](uint32_t job_ix) {
+                          auto& jd                = job_descriptors.data()[job_ix].input.value;
+                          auto* cflag             = &job_descriptors.data()[job_ix].completion_flag;
+                          jd.result_indices_ptr   = result_indices_ptr;
+                          jd.result_distances_ptr = result_distances_ptr;
+                          jd.queries_ptr          = queries_ptr;
+                          jd.top_k                = top_k;
+                          jd.n_queries            = num_queries;
+                          cflag->store(false, cuda::memory_order_relaxed);
+                          cuda::atomic_thread_fence(cuda::memory_order_release,
+                                                    cuda::thread_scope_system);
+                          return cflag;
+                        }};
+
+    // Update the state of the keep-alive atomic in the meanwhile
+    auto prev_touch = last_touch.load(std::memory_order_relaxed);
+    if (prev_touch + lifetime / 10 < launcher.now) {
+      // to avoid congestion at this atomic, we only update it if a significant fraction of the live
+      // interval has passed.
+      last_touch.store(launcher.now, std::memory_order_relaxed);
+    }
+    // wait for the results to arrive (throws raft::exception if the deadline is missed)
+    launcher.wait();
+  }
+
+  auto calc_coop_grid_size(uint32_t block_size, uint32_t smem_size, float persistent_device_usage)
+    -> dim3
+  {
+    // determine the grid size: a fraction of (blocks per SM * number of SMs), capped by the queue
+    int ctas_per_sm = 1;
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor<kernel_type>(
+      &ctas_per_sm, kernel, block_size, smem_size));
+    int num_sm    = raft::getMultiProcessorCount();
+    auto n_blocks = static_cast<uint32_t>(persistent_device_usage * (ctas_per_sm * num_sm));
+    if (n_blocks > kMaxWorkersNum) {
+      RAFT_LOG_WARN("Limiting the grid size limit due to the size of the queue: %u -> %u",
+                    n_blocks,
+                    kMaxWorkersNum);
+      n_blocks = kMaxWorkersNum;
+    }
+
+    return {1, n_blocks, 1};
+  }
+};
+
+struct alignas(kCacheLineBytes) persistent_state {
+  std::shared_ptr<persistent_runner_base_t> runner{nullptr};  // the current runner, if any
+  std::mutex lock;  // held by create_runner() and the keep-alive thread when changing `runner`
+};
+
+inline persistent_state persistent{};  // the single shared instance (inline variable)
+
+template <typename RunnerT, typename... Args>
+auto create_runner(Args... args) -> std::shared_ptr<RunnerT>  // it's ok.. pass everything by values
+{
+  std::lock_guard<std::mutex> guard(persistent.lock);
+  // Reuse the shared runner if it has the requested type and the same parameter hash
+  std::shared_ptr<RunnerT> runner_outer = std::dynamic_pointer_cast<RunnerT>(persistent.runner);
+  if (runner_outer) {
+    if (runner_outer->param_hash == RunnerT::calculate_parameter_hash(args...)) {
+      return runner_outer;
+    } else {
+      runner_outer.reset();
+    }
+  }
+  // Runner has not yet been created (or it's incompatible):
+  //   create it in another thread and only then release the lock.
+  // Free the resources (if any) in advance, i.e. before the new runner is constructed
+  persistent.runner.reset();
+
+  cuda::std::atomic_flag ready{};
+  ready.clear(cuda::std::memory_order_relaxed);
+  std::thread(
+    [&runner_outer, &ready](Args... thread_args) {  // pass everything by values
+      // Create the runner (parent holds the lock). NOTE(review): a throw here -> std::terminate.
+      runner_outer      = std::make_shared<RunnerT>(thread_args...);
+      auto lifetime     = runner_outer->lifetime;
+      persistent.runner = std::static_pointer_cast<persistent_runner_base_t>(runner_outer);
+      std::weak_ptr<RunnerT> runner_weak = runner_outer;
+      ready.test_and_set(cuda::std::memory_order_release);
+      ready.notify_one();  // NOTE(review): the parent may have destroyed `ready` by now - confirm
+      // NB: runner_outer is passed by reference and may be dead by this time.
+
+      while (true) {  // keep-alive loop: wake up once per `lifetime` and check the last use
+        std::this_thread::sleep_for(lifetime);
+        auto runner = runner_weak.lock();  // runner_weak is local - thread-safe
+        if (!runner) {
+          return;  // dead already
+        }
+        if (runner->last_touch.load(std::memory_order_relaxed) + lifetime <
+            std::chrono::system_clock::now()) {
+          std::lock_guard<std::mutex> guard(persistent.lock);
+          if (runner == persistent.runner) { persistent.runner.reset(); }
+          return;
+        }
+      }
+    },
+    args...)
+    .detach();
+  ready.wait(false, cuda::std::memory_order_acquire);
+  return runner_outer;
+}
+
+template <typename RunnerT, typename... Args>
+auto get_runner(Args... args) -> std::shared_ptr<RunnerT>
+{
+  // Per-thread cache of the last runner used. Being thread-local, the weak pointer itself needs no
+  // locks/atomics; the control block of weak/shared pointers is thread-safe.
+  static thread_local std::weak_ptr<RunnerT> weak;
+  if (auto cached = weak.lock(); cached != nullptr) {
+    // Fast path: the cached runner is alive and was created with the same parameters.
+    if (cached->param_hash == RunnerT::calculate_parameter_hash(args...)) { return cached; }
+    // The parameters differ: forget the cached runner. This thread's strong reference (`cached`)
+    // is released at the end of this block, i.e. before a new runner is requested.
+    weak.reset();
+  }
+  // Thread-local variable expected_latency makes sense only for a current RunnerT configuration.
+  // Reaching this point means the cached runner is gone or incompatible, which hints that the
+  // configuration has changed, so the estimate of the expected launch latency is reset.
+  launcher_t::expected_latency = launcher_t::kDefaultLatency;
+  // Obtain the shared runner (an existing compatible one or a newly created one) and remember
+  // it for the subsequent calls from this thread.
+  auto fresh = create_runner<RunnerT>(args...);
+  weak       = fresh;
+  return fresh;
+}
+
 template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
-void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
                     IndexT* topk_indices_ptr,       // [num_queries, topk]
                     DistanceT* topk_distances_ptr,  // [num_queries, topk]
@@ -879,40 +1943,66 @@ void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* d
                     SampleFilterT sample_filter,
                     cudaStream_t stream)
 {
-  auto kernel =
-    search_kernel_config<dataset_descriptor_base_t<DataT, IndexT, DistanceT>,
-                         SampleFilterT>::choose_itopk_and_mx_candidates(ps.itopk_size,
-                                                                        num_itopk_candidates,
-                                                                        block_size);
-  RAFT_CUDA_TRY(
-    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-  dim3 thread_dims(block_size, 1, 1);
-  dim3 block_dims(1, num_queries, 1);
-  RAFT_LOG_DEBUG(
-    "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
-  kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
-                                                         topk_distances_ptr,
-                                                         topk,
-                                                         dataset_desc,
-                                                         queries_ptr,
-                                                         graph.data_handle(),
-                                                         graph.extent(1),
-                                                         ps.num_random_samplings,
-                                                         ps.rand_xor_mask,
-                                                         dev_seed_ptr,
-                                                         num_seeds,
-                                                         hashmap_ptr,
-                                                         ps.itopk_size,
-                                                         ps.search_width,
-                                                         ps.min_iterations,
-                                                         ps.max_iterations,
-                                                         num_executed_iterations,
-                                                         hash_bitlen,
-                                                         small_hash_bitlen,
-                                                         small_hash_reset_interval,
-                                                         sample_filter);
-  // RAFT_CUDA_TRY(cudaPeekAtLastError());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  if (ps.persistent) {
+    using runner_type = persistent_runner_t<DataT, IndexT, DistanceT, SampleFilterT>;
+
+    get_runner<runner_type>(/*
+Note, we're passing the descriptor by reference here, and this reference is going to be passed to a
+new spawned thread, which is dangerous. However, the descriptor is copied in that thread before the
+control is returned in this thread (in persistent_runner_t constructor), so we're safe.
+*/
+                            std::cref(dataset_desc),
+                            graph,
+                            num_itopk_candidates,
+                            block_size,
+                            smem_size,
+                            hash_bitlen,
+                            small_hash_bitlen,
+                            small_hash_reset_interval,
+                            ps.num_random_samplings,
+                            ps.rand_xor_mask,
+                            num_seeds,
+                            ps.itopk_size,
+                            ps.search_width,
+                            ps.min_iterations,
+                            ps.max_iterations,
+                            sample_filter,
+                            ps.persistent_lifetime,
+                            ps.persistent_device_usage)
+      ->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk);
+  } else {
+    using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
+    auto kernel                = search_kernel_config<false, descriptor_base_type, SampleFilterT>::
+      choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    dim3 thread_dims(block_size, 1, 1);
+    dim3 block_dims(1, num_queries, 1);
+    RAFT_LOG_DEBUG(
+      "Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
+    kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
+                                                           topk_distances_ptr,
+                                                           topk,
+                                                           dataset_desc.dev_ptr(stream),
+                                                           queries_ptr,
+                                                           graph.data_handle(),
+                                                           graph.extent(1),
+                                                           ps.num_random_samplings,
+                                                           ps.rand_xor_mask,
+                                                           dev_seed_ptr,
+                                                           num_seeds,
+                                                           hashmap_ptr,
+                                                           ps.itopk_size,
+                                                           ps.search_width,
+                                                           ps.min_iterations,
+                                                           ps.max_iterations,
+                                                           num_executed_iterations,
+                                                           hash_bitlen,
+                                                           small_hash_bitlen,
+                                                           small_hash_reset_interval,
+                                                           sample_filter);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
 }
 }  // namespace single_cta_search
 }  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
index 7b7f44db7..4d8b72b41 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh
@@ -22,7 +22,7 @@
 namespace cuvs::neighbors::cagra::detail::single_cta_search {
 
 template <typename DataT, typename IndexT, typename DistanceT, typename SampleFilterT>
-void select_and_run(const dataset_descriptor_base_t<DataT, IndexT, DistanceT>* dataset_desc,
+void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc,
                     raft::device_matrix_view<const IndexT, int64_t, raft::row_major> graph,
                     IndexT* topk_indices_ptr,       // [num_queries, topk]
                     DistanceT* topk_distances_ptr,  // [num_queries, topk]
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index d744a8178..15f494d3d 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -27,6 +27,7 @@ include(rapids-find)
 rapids_cuda_init_architectures(test_cuvs)
 
 project(test_cuvs LANGUAGES CXX CUDA)
+find_package(Threads REQUIRED)  # CAGRA_PERSISTENT_EXAMPLE links against Threads::Threads
 
 # ------------- configure cuvs -----------------#
 
@@ -36,11 +37,15 @@ include(../cmake/thirdparty/get_cuvs.cmake)
 
 # -------------- compile tasks ----------------- #
 add_executable(CAGRA_EXAMPLE src/cagra_example.cu)
+add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu)
 add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
 add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu)
 
 # `$<TARGET_NAME_IF_EXISTS:conda_env>` is a generator expression that ensures that targets are
 # installed in a conda environment, if one exists
 target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
+target_link_libraries(
+  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+)
 target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/examples/cpp/src/cagra_persistent_example.cu b/examples/cpp/src/cagra_persistent_example.cu
new file mode 100644
index 000000000..9258a7311
--- /dev/null
+++ b/examples/cpp/src/cagra_persistent_example.cu
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.cuh"
+
+#include <cuvs/neighbors/cagra.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <future>
+
+// Return a view (no copy) of `count_rows` rows of `source` starting at row `offset_rows`; the pointer math assumes dense row-major storage.
+template <typename DeviceMatrixOrView>
+auto slice_matrix(DeviceMatrixOrView source,
+                  typename DeviceMatrixOrView::index_type offset_rows,
+                  typename DeviceMatrixOrView::index_type count_rows) {
+  auto n_cols = source.extent(1);
+  return raft::make_device_matrix_view<
+      typename DeviceMatrixOrView::element_type,
+      typename DeviceMatrixOrView::index_type>(
+      source.data_handle() + offset_rows * n_cols, count_rows, n_cols);
+}
+
+// Call f(xs...) once and print its wall time in ms; uses the monotonic steady_clock so clock adjustments cannot skew the result.
+template <typename F, typename... Args>
+void time_it(std::string label, F f, Args &&...xs) {
+  auto start = std::chrono::steady_clock::now();
+  f(std::forward<Args>(xs)...);
+  auto end = std::chrono::steady_clock::now();
+  auto t = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  auto t_ms = double(t.count()) / 1000.0;
+  std::cout << "[" << label << "] execution time: " << t_ms << " ms"
+            << std::endl;
+}
+
+void cagra_build_search_variants(
+    raft::device_resources const &res,
+    raft::device_matrix_view<const float, int64_t> dataset,
+    raft::device_matrix_view<const float, int64_t> queries) {
+  using namespace cuvs::neighbors;
+
+  // Number of neighbors to search
+  int64_t topk = 100;
+  // We split the query set into two subsets (A and B) for our experiment;
+  // every search variant below is timed on each of them.
+  int64_t n_queries_a = queries.extent(0) / 2;
+  int64_t n_queries_b = queries.extent(0) - n_queries_a;
+
+  auto queries_a = slice_matrix(queries, 0, n_queries_a);
+  auto queries_b = slice_matrix(queries, n_queries_a, n_queries_b);
+
+  // create output arrays
+  auto neighbors =
+      raft::make_device_matrix<uint32_t>(res, queries.extent(0), topk);
+  auto distances =
+      raft::make_device_matrix<float>(res, queries.extent(0), topk);
+  // slice them same as queries
+  auto neighbors_a = slice_matrix(neighbors, 0, n_queries_a);
+  auto distances_a = slice_matrix(distances, 0, n_queries_a);
+  auto neighbors_b = slice_matrix(neighbors, n_queries_a, n_queries_b);
+  auto distances_b = slice_matrix(distances, n_queries_a, n_queries_b);
+
+  // use default index parameters
+  cagra::index_params index_params;
+
+  std::cout << "Building CAGRA index (search graph)" << std::endl;
+  auto index = cagra::build(res, index_params, dataset);
+
+  std::cout << "CAGRA index has " << index.size() << " vectors" << std::endl;
+  std::cout << "CAGRA graph has degree " << index.graph_degree()
+            << ", graph size [" << index.graph().extent(0) << ", "
+            << index.graph().extent(1) << "]" << std::endl;
+
+  // use default search parameters
+  cagra::search_params search_params;
+  // get a decent recall by increasing the internal topk list
+  search_params.itopk_size = 512;
+
+  // Another copy of search parameters to enable persistent kernel
+  cagra::search_params search_params_persistent = search_params;
+  search_params_persistent.persistent = true;
+  // The persistent kernel only supports the single-CTA search algorithm for now.
+  search_params_persistent.algo = cagra::search_algo::SINGLE_CTA;
+  // Slightly reduce the kernel grid size to make this example program run
+  // smoothly on workstations, which use the same GPU for other tasks (e.g.
+  // rendering GUI).
+  search_params_persistent.persistent_device_usage = 0.95;
+
+  /*
+  Define the big-batch setting as a baseline for measuring the throughput.
+
+  Note, this lambda can be used by the standard and the persistent
+  implementation interchangeably: the index stays the same, only search
+  parameters need some adjustment.
+  */
+  auto search_batch =
+      [&res, &index](bool needs_sync, const cagra::search_params &ps,
+                     raft::device_matrix_view<const float, int64_t> queries,
+                     raft::device_matrix_view<uint32_t, int64_t> neighbors,
+                     raft::device_matrix_view<float, int64_t> distances) {
+        cagra::search(res, ps, index, queries, neighbors, distances);
+        /*
+        To make a fair comparison, standard implementation needs to synchronize
+        with the device to make sure the kernel has finished the work.
+        Persistent kernel does not make any use of CUDA streams and blocks till
+        the results are available. Hence, synchronizing with the stream is a
+        waste of time in this case.
+         */
+        if (needs_sync) {
+          raft::resource::sync_stream(res);
+        }
+      };
+
+  /*
+  Define the asynchronous small-batch search setting.
+  The same lambda is used for both the standard and the persistent
+  implementations.
+
+  There are a few things to remember about this example though:
+    1. The standard kernel is launched in the given stream (behind the `res`);
+       The persistent kernel is launched implicitly; the public api call does
+       not touch the stream and blocks till the results are returned. (Hence the
+       optional sync at the end of the lambda.)
+    2. When launched asynchronously, the standard kernel should actually have a
+       separate raft::resource per-thread to achieve best performance. However,
+       this requires extra management of the resource/stream pools, we don't
+       include that for simplicity.
+       The persistent implementation does not require any special care; you can
+       safely pass a single raft::resources to all threads.
+    3. This example relies on the compiler implementation to launch the async
+       jobs in separate threads. This is not guaranteed, however.
+       In the real world, we'd advise to use a custom thread pool for managing
+       the requests.
+    4. Although the API defines the arguments as device-side mdspans, we advise
+       to use the host-side buffers accessible from the device, such as
+       allocated by cudaHostAlloc/cudaHostRegister (or any host memory if
+       HMM/ATS is enabled).
+       This way, you can save some GPU resources by not manually copying the
+       data in cuda streams.
+  */
+  auto search_async =
+      [&res, &index](bool needs_sync, const cagra::search_params &ps,
+                     raft::device_matrix_view<const float, int64_t> queries,
+                     raft::device_matrix_view<uint32_t, int64_t> neighbors,
+                     raft::device_matrix_view<float, int64_t> distances) {
+        auto work_size = queries.extent(0);
+        using index_type = decltype(work_size);
+        // Limit the maximum number of concurrent jobs
+        constexpr index_type kMaxJobs = 1000;
+        std::array<std::future<void>, kMaxJobs> futures;
+        for (index_type i = 0; i < work_size + kMaxJobs; i++) {
+          // wait for previous job in the same slot to finish (also drains the tail)
+          if (i >= kMaxJobs) {
+            futures[i % kMaxJobs].wait();
+          }
+          // submit a new job; `i` is captured by value because the loop keeps changing it
+          if (i < work_size) {
+            futures[i % kMaxJobs] = std::async(std::launch::async, [&, i]() {
+              cagra::search(res, ps, index, slice_matrix(queries, i, 1),
+                            slice_matrix(neighbors, i, 1),
+                            slice_matrix(distances, i, 1));
+            });
+          }
+        }
+        /* See the remark for search_batch */
+        if (needs_sync) {
+          raft::resource::sync_stream(res);
+        }
+      };
+
+  // Launch the baseline search: check the big-batch performance
+  time_it("standard/batch A", search_batch, true, search_params, queries_a,
+          neighbors_a, distances_a);
+  time_it("standard/batch B", search_batch, true, search_params, queries_b,
+          neighbors_b, distances_b);
+
+  // Try to handle the same amount of work in the async setting using the
+  // standard implementation.
+  // (Warning: suboptimal - it uses a single stream for all async jobs)
+  time_it("standard/async A", search_async, true, search_params, queries_a,
+          neighbors_a, distances_a);
+  time_it("standard/async B", search_async, true, search_params, queries_b,
+          neighbors_b, distances_b);
+
+  // Do the same using persistent kernel.
+  time_it("persistent/async A", search_async, false, search_params_persistent,
+          queries_a, neighbors_a, distances_a);
+  time_it("persistent/async B", search_async, false, search_params_persistent,
+          queries_b, neighbors_b, distances_b);
+  /*
+Here's an example output, which shows the wall time of processing the same
+amount of data in a single batch vs in async mode (1 query per job):
+```
+CAGRA index has 1000000 vectors
+CAGRA graph has degree 64, graph size [1000000, 64]
+[standard/batch A] execution time: 854.645 ms
+[standard/batch B] execution time: 698.58 ms
+[standard/async A] execution time: 19190.6 ms
+[standard/async B] execution time: 18292 ms
+[I] [15:56:49.756754] Initialized the kernel 0x7ea4e55a5350 in stream
+              139227270582864; job_queue size = 8192; worker_queue size = 155
+[persistent/async A] execution time: 1285.65 ms
+[persistent/async B] execution time: 1316.97 ms
+[I] [15:56:55.756952] Destroyed the persistent runner.
+```
+Note, while the persistent kernel provides minimal latency for each search
+request, the wall time to process all the queries in async mode (1 query per
+job) is up to 2x slower than the standard kernel with the huge batch
+size (50K queries per batch). One reason for this is the non-optimal CTA size: CAGRA
+kernels are automatically tuned for latency and so use large CTA sizes when the
+batch size is small. Try explicitly setting the search parameter
+`thread_block_size` to a small value, such as `64` or `128` if this is an issue
+for you. This increases the latency of individual jobs though.
+  */
+}
+
+int main() {
+  raft::device_resources res;
+
+  // Set pool memory resource with 1 GiB initial pool size. All allocations use
+  // the same pool.
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr(
+      rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(&pool_mr);
+
+  // Create input arrays: 1M dataset vectors and 100K queries, 128 dims each.
+  int64_t n_samples = 1000000;
+  int64_t n_dim = 128;
+  int64_t n_queries = 100000;
+  auto dataset =
+      raft::make_device_matrix<float, int64_t>(res, n_samples, n_dim);
+  auto queries =
+      raft::make_device_matrix<float, int64_t>(res, n_queries, n_dim);
+  generate_dataset(res, dataset.view(), queries.view());
+
+  // Build the CAGRA index and time the batch / async / persistent searches.
+  cagra_build_search_variants(res, raft::make_const_mdspan(dataset.view()),
+                              raft::make_const_mdspan(queries.view()));
+}

From d9eec694b31ac78b0c13e2a02dee4e6b7d01c954 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Fri, 27 Sep 2024 11:19:27 -0500
Subject: [PATCH 6/6] Add cuvs_bench.run python code and build (#279)

Builds on top of #275

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Divye Gala (https://github.com/divyegala)
  - rhdong (https://github.com/rhdong)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/279
---
 .gitignore                                    |   3 +
 build.sh                                      |   9 +-
 .../bench_ann_cuda-118_arch-aarch64.yaml      |   3 +
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   3 +
 .../bench_ann_cuda-125_arch-aarch64.yaml      |   3 +
 .../bench_ann_cuda-125_arch-x86_64.yaml       |   3 +
 conda/recipes/libcuvs/build_libcuvs_tests.sh  |   2 +-
 cpp/CMakeLists.txt                            |   8 +-
 cpp/bench/ann/CMakeLists.txt                  |  52 +-
 dependencies.yaml                             |  15 +-
 python/cuvs/CMakeLists.txt                    |  22 +-
 .../cuvs/neighbors/filters/CMakeLists.txt     |   3 +-
 .../cuvs_bench/cuvs_bench/config/__init__.py  |  17 +
 .../cuvs_bench/config/algorithms.yaml         |  42 ++
 .../cuvs_bench/config/algos/__init__.py       |   0
 .../constraints/__init__.py}                  |   2 +-
 .../cuvs_bench/config/algos/cuvs_cagra.yaml   |   6 +-
 .../config/algos/cuvs_cagra_hnswlib.yaml      |   2 +-
 .../cuvs_bench/config/algos/cuvs_ivf_pq.yaml  |   4 +-
 .../config/algos/faiss_gpu_ivf_pq.yaml        |   4 +-
 .../cuvs_bench/config/algos/hnswlib.yaml      |   2 +-
 .../config/{ => datasets}/bigann-100M.yaml    |   0
 .../config/{ => datasets}/datasets.yaml       |   0
 .../config/{ => datasets}/deep-100M.yaml      |   0
 .../config/{ => datasets}/deep-1B.yaml        |   0
 .../{ => datasets}/deep-image-96-inner.yaml   |   0
 .../fashion-mnist-784-euclidean.yaml          |   0
 .../{ => datasets}/gist-960-euclidean.yaml    |   0
 .../{ => datasets}/glove-100-angular.yaml     |   0
 .../{ => datasets}/glove-100-inner.yaml       |   0
 .../{ => datasets}/glove-50-angular.yaml      |   0
 .../config/{ => datasets}/glove-50-inner.yaml |   0
 .../{ => datasets}/lastfm-65-angular.yaml     |   0
 .../{ => datasets}/mnist-784-euclidean.yaml   |   0
 .../{ => datasets}/nytimes-256-angular.yaml   |   0
 .../{ => datasets}/nytimes-256-inner.yaml     |   0
 .../{ => datasets}/sift-128-euclidean.yaml    |   0
 .../config/{ => datasets}/wiki_all_10M.yaml   |   0
 .../config/{ => datasets}/wiki_all_1M.yaml    |   0
 .../config/{ => datasets}/wiki_all_88M.yaml   |   0
 python/cuvs_bench/cuvs_bench/run/__init__.py  |  17 +
 python/cuvs_bench/cuvs_bench/run/__main__.py  | 216 ++++++
 python/cuvs_bench/cuvs_bench/run/run.py       | 685 ++++++++++++++++++
 python/cuvs_bench/cuvs_bench/run/runners.py   | 273 +++++++
 .../cuvs_bench/cuvs_bench/tests/test_run.py   | 227 ++++++
 python/cuvs_bench/pyproject.toml              |  14 +-
 46 files changed, 1562 insertions(+), 75 deletions(-)
 create mode 100644 python/cuvs_bench/cuvs_bench/config/__init__.py
 create mode 100644 python/cuvs_bench/cuvs_bench/config/algorithms.yaml
 create mode 100644 python/cuvs_bench/cuvs_bench/config/algos/__init__.py
 rename python/cuvs_bench/cuvs_bench/config/{constraints.py => algos/constraints/__init__.py} (98%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/bigann-100M.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/datasets.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/deep-100M.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/deep-1B.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/deep-image-96-inner.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/fashion-mnist-784-euclidean.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/gist-960-euclidean.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/glove-100-angular.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/glove-100-inner.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/glove-50-angular.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/glove-50-inner.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/lastfm-65-angular.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/mnist-784-euclidean.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/nytimes-256-angular.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/nytimes-256-inner.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/sift-128-euclidean.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/wiki_all_10M.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/wiki_all_1M.yaml (100%)
 rename python/cuvs_bench/cuvs_bench/config/{ => datasets}/wiki_all_88M.yaml (100%)
 create mode 100644 python/cuvs_bench/cuvs_bench/run/__init__.py
 create mode 100644 python/cuvs_bench/cuvs_bench/run/__main__.py
 create mode 100644 python/cuvs_bench/cuvs_bench/run/run.py
 create mode 100644 python/cuvs_bench/cuvs_bench/run/runners.py
 create mode 100644 python/cuvs_bench/cuvs_bench/tests/test_run.py

diff --git a/.gitignore b/.gitignore
index fcbe0fa3a..17258e3de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,6 @@ cagra_index
 ivf_flat_index
 ivf_pq_index
 
+# cuvs_bench
+datasets/
+/*.json
\ No newline at end of file
diff --git a/build.sh b/build.sh
index a283bcd07..b463f0f0d 100755
--- a/build.sh
+++ b/build.sh
@@ -275,7 +275,7 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then
 fi
 
 if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
-    BUILD_ANN_BENCH=ON
+    BUILD_CUVS_BENCH=ON
     CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}"
 fi
 
@@ -351,7 +351,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
           -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \
           -DBUILD_TESTS=${BUILD_TESTS} \
           -DBUILD_C_TESTS=${BUILD_TESTS} \
-          -DBUILD_ANN_BENCH=${BUILD_ANN_BENCH} \
+          -DBUILD_CUVS_BENCH=${BUILD_CUVS_BENCH} \
           -DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \
           -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \
           ${CACHE_ARGS} \
@@ -419,6 +419,11 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then
         python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
 fi
 
+# Build and (optionally) install the cuvs_bench Python package
+if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then
+    python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench
+fi
+
 # Build the cuvs Rust bindings
 if (( ${NUMARGS} == 0 )) || hasArg rust; then
     cd ${REPODIR}/rust
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 7e1014f25..73c42ca71 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -42,5 +42,8 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- setuptools
 - sysroot_linux-aarch64==2.17
+- wheel
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 120b7afca..473e50bc6 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -42,5 +42,8 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- setuptools
 - sysroot_linux-64==2.17
+- wheel
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index ac0ea97e6..8a877c4c0 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -38,5 +38,8 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- setuptools
 - sysroot_linux-aarch64==2.17
+- wheel
 name: bench_ann_cuda-125_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index e593c240d..54859a77f 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -38,5 +38,8 @@ dependencies:
 - pandas
 - pylibraft==24.10.*,>=0.0.0a0
 - pyyaml
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- setuptools
 - sysroot_linux-64==2.17
+- wheel
 name: bench_ann_cuda-125_arch-x86_64
diff --git a/conda/recipes/libcuvs/build_libcuvs_tests.sh b/conda/recipes/libcuvs/build_libcuvs_tests.sh
index 5d77ae2d1..b077dbe60 100644
--- a/conda/recipes/libcuvs/build_libcuvs_tests.sh
+++ b/conda/recipes/libcuvs/build_libcuvs_tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
-./build.sh tests bench-ann --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
+./build.sh tests --allgpuarch --no-nvtx --build-metrics=tests --incl-cache-stats
 cmake --install cpp/build --component testing
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d8d554648..b72d7f165 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -55,7 +55,7 @@ option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
 option(BUILD_TESTS "Build cuvs unit-tests" ON)
 option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
 option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
-option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF)
+option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
 option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO
@@ -96,7 +96,7 @@ include(CMakeDependentOption)
 
 message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
 message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
-message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_ANN_BENCH}")
+message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}")
 message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -188,7 +188,7 @@ endif()
 
 include(cmake/thirdparty/get_cutlass.cmake)
 
-if(BUILD_ANN_BENCH)
+if(BUILD_CUVS_BENCH)
   include(${rapids-cmake-dir}/cpm/gbench.cmake)
   rapids_cpm_gbench(BUILD_STATIC)
 endif()
@@ -651,6 +651,6 @@ endif()
 # ##################################################################################################
 # * build ann benchmark executable -----------------------------------------------
 
-if(BUILD_ANN_BENCH)
+if(BUILD_CUVS_BENCH)
   add_subdirectory(bench/ann/)
 endif()
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 6fe23483e..3224587e4 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -199,30 +199,19 @@ if(NOT TARGET CUVS_ANN_BENCH_ALL)
 endif()
 
 if(CUVS_ANN_BENCH_USE_HNSWLIB)
-  ConfigureAnnBench(
-    NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
-  )
+  ConfigureAnnBench(NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib)
 
 endif()
 
 if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ)
   ConfigureAnnBench(
-    NAME CUVS_IVF_PQ
-    PATH
-    src/cuvs/cuvs_benchmark.cu
-    src/cuvs/cuvs_ivf_pq.cu
-    LINKS cuvs
+    NAME CUVS_IVF_PQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_pq.cu LINKS cuvs
   )
 endif()
 
 if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
   ConfigureAnnBench(
-    NAME CUVS_IVF_FLAT
-    PATH
-    src/cuvs/cuvs_benchmark.cu
-    src/cuvs/cuvs_ivf_flat.cu
-    LINKS
-    cuvs
+    NAME CUVS_IVF_FLAT PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_flat.cu LINKS cuvs
   )
 endif()
 
@@ -232,12 +221,8 @@ endif()
 
 if(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE)
   ConfigureAnnBench(
-    NAME
-    CUVS_KNN_BRUTE_FORCE
-    PATH
-    $<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu>
-    LINKS
-    cuvs
+    NAME CUVS_KNN_BRUTE_FORCE PATH
+    $<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu> LINKS cuvs
   )
 endif()
 
@@ -258,8 +243,7 @@ endif()
 
 if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
   ConfigureAnnBench(
-    NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs
-    hnswlib::hnswlib
+    NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib
   )
 endif()
 
@@ -267,36 +251,31 @@ message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
 message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
 if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)
   ConfigureAnnBench(
-    NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
-    ${CUVS_FAISS_TARGETS}
+    NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
   )
 endif()
 
 if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
   ConfigureAnnBench(
-    NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
-    ${CUVS_FAISS_TARGETS}
+    NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
   )
 endif()
 
 if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
   ConfigureAnnBench(
-    NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
-    ${CUVS_FAISS_TARGETS}
+    NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
   )
 endif()
 
 if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND CUVS_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
-    NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS
-    ${CUVS_FAISS_TARGETS}
+    NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
   )
 endif()
 
 if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND CUVS_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
-    NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS
-    ${CUVS_FAISS_TARGETS}
+    NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
   )
 endif()
 
@@ -322,13 +301,8 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
 
   target_link_libraries(
     ANN_BENCH
-    PRIVATE raft::raft
-            nlohmann_json::nlohmann_json
-            benchmark::benchmark
-            dl
-            fmt::fmt-header-only
-            spdlog::spdlog_header_only
-            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+    PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
+            spdlog::spdlog_header_only $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
   )
   set_target_properties(
     ANN_BENCH
diff --git a/dependencies.yaml b/dependencies.yaml
index c63cecbbe..c18f53305 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -38,6 +38,7 @@ files:
       - develop
       - bench
       - bench_python
+      - rapids_build_setuptools
   test_cpp:
     output: none
     includes:
@@ -115,6 +116,13 @@ files:
       table: build-system
     includes:
       - rapids_build_setuptools
+  py_rapids_build_py_cuvs_bench:
+    output: pyproject
+    pyproject_dir: python/cuvs_bench
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes: []
   py_run_cuvs_bench:
     output: pyproject
     pyproject_dir: python/cuvs_bench
@@ -187,7 +195,7 @@ dependencies:
 
   rapids_build_setuptools:
     common:
-      - output_types: [requirements, pyproject]
+      - output_types: [conda, requirements, pyproject]
         packages:
           - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0
           - setuptools
@@ -469,13 +477,12 @@ dependencies:
           - openblas
   bench_python:
     common:
-      - output_types: [conda]
+      - output_types: [conda, pyproject, requirements]
         packages:
+          - click
           - matplotlib
           - pandas
           - pyyaml
-          - pandas
-          - click
   depends_on_librmm:
     common:
       - output_types: conda
diff --git a/python/cuvs/CMakeLists.txt b/python/cuvs/CMakeLists.txt
index 7d2f8dcf9..feb3bd58c 100644
--- a/python/cuvs/CMakeLists.txt
+++ b/python/cuvs/CMakeLists.txt
@@ -83,14 +83,22 @@ if(NOT cuvs_FOUND)
 
   if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
     set(rpaths
-      "$ORIGIN/../nvidia/cublas/lib"
-      "$ORIGIN/../nvidia/curand/lib"
-      "$ORIGIN/../nvidia/cusolver/lib"
-      "$ORIGIN/../nvidia/cusparse/lib"
-      "$ORIGIN/../nvidia/nvjitlink/lib"
+        "$ORIGIN/../nvidia/cublas/lib"
+        "$ORIGIN/../nvidia/curand/lib"
+        "$ORIGIN/../nvidia/cusolver/lib"
+        "$ORIGIN/../nvidia/cusparse/lib"
+        "$ORIGIN/../nvidia/nvjitlink/lib"
+    )
+    set_property(
+      TARGET cuvs
+      PROPERTY INSTALL_RPATH ${rpaths}
+      APPEND
+    )
+    set_property(
+      TARGET cuvs_c
+      PROPERTY INSTALL_RPATH ${rpaths}
+      APPEND
     )
-    set_property(TARGET cuvs PROPERTY INSTALL_RPATH ${rpaths} APPEND)
-    set_property(TARGET cuvs_c PROPERTY INSTALL_RPATH ${rpaths} APPEND)
   endif()
 
   set(cython_lib_dir cuvs)
diff --git a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
index 8f281d1c8..c90615feb 100644
--- a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
@@ -20,6 +20,5 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX
-                   neighbors_prefilter_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_prefilter_
 )
diff --git a/python/cuvs_bench/cuvs_bench/config/__init__.py b/python/cuvs_bench/cuvs_bench/config/__init__.py
new file mode 100644
index 000000000..7c04e3fd8
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/config/__init__.py
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .algos.constraints import *
diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
new file mode 100644
index 000000000..dc1127fbc
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml
@@ -0,0 +1,42 @@
+faiss_gpu_flat:
+  executable: FAISS_GPU_FLAT_ANN_BENCH
+  requires_gpu: true
+faiss_gpu_ivf_flat:
+  executable: FAISS_GPU_IVF_FLAT_ANN_BENCH
+  requires_gpu: true
+faiss_gpu_ivf_pq:
+  executable: FAISS_GPU_IVF_PQ_ANN_BENCH
+  requires_gpu: true
+faiss_gpu_ivf_sq:
+  executable: FAISS_GPU_IVF_PQ_ANN_BENCH
+  requires_gpu: true
+faiss_cpu_flat:
+  executable: FAISS_CPU_FLAT_ANN_BENCH
+  requires_gpu: false
+faiss_cpu_ivf_flat:
+  executable: FAISS_CPU_IVF_FLAT_ANN_BENCH
+  requires_gpu: false
+faiss_cpu_ivf_pq:
+  executable: FAISS_CPU_IVF_PQ_ANN_BENCH
+  requires_gpu: false
+cuvs_ivf_flat:
+  executable: CUVS_IVF_FLAT_ANN_BENCH
+  requires_gpu: true
+cuvs_ivf_pq:
+  executable: CUVS_IVF_PQ_ANN_BENCH
+  requires_gpu: true
+cuvs_cagra:
+  executable: CUVS_CAGRA_ANN_BENCH
+  requires_gpu: true
+cuvs_brute_force:
+  executable: CUVS_BRUTE_FORCE_ANN_BENCH
+  requires_gpu: true
+ggnn:
+  executable: GGNN_ANN_BENCH
+  requires_gpu: true
+hnswlib:
+  executable: HNSWLIB_ANN_BENCH
+  requires_gpu: false
+cuvs_cagra_hnswlib:
+  executable: CUVS_CAGRA_HNSWLIB_ANN_BENCH
+  requires_gpu: true
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/cuvs_bench/cuvs_bench/config/constraints.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
similarity index 98%
rename from python/cuvs_bench/cuvs_bench/config/constraints.py
rename to python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
index ff451c056..de05bd752 100644
--- a/python/cuvs_bench/cuvs_bench/config/constraints.py
+++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml
index e7b049d0c..edacb25b5 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra.yaml
@@ -1,11 +1,11 @@
 name: cuvs_cagra
 constraints:
-  build: cuvs_bench.constraints.raft_cagra_build_constraints
-  search: cuvs_bench.constraints.raft_cagra_search_constraints
+  build: cuvs_bench.config.algos.constraints.cuvs_cagra_build
+  search: cuvs_bench.config.algos.constraints.cuvs_cagra_search
 groups:
   base:
     build:
-      graph_degree: [32, 64, 128, 256]
+      graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
     search:
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index 70e344dfd..f1a7f272c 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -1,6 +1,6 @@
 name: cuvs_cagra_hnswlib
 constraints:
-  search: cuvs_bench.constraints.hnswlib_search
+  search: cuvs_bench.config.algos.constraints.hnswlib_search
 groups:
   base:
     build:
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml
index aa95d6716..d68e7973a 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_pq.yaml
@@ -1,7 +1,7 @@
 name: cuvs_ivf_pq
 constraints:
-  build: cuvs_bench.constraints.cuvs_ivf_pq_build
-  search: cuvs_bench.constraints.cuvs_ivf_pq_search
+  build: cuvs_bench.config.algos.constraints.cuvs_ivf_pq_build
+  search: cuvs_bench.config.algos.constraints.cuvs_ivf_pq_search
 groups:
   base:
     build:
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml
index 1bd78b736..782f3aed1 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml
@@ -1,7 +1,7 @@
 name: faiss_gpu_ivf_pq
 constraints:
-  build: cuvs_bench.constraints.faiss_gpu_ivf_pq_build
-  search: cuvs_bench.constraints.faiss_gpu_ivf_pq_search
+  build: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_pq_build
+  search: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_pq_search
 groups:
   base:
     build:
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml
index dbd73155d..93d8cff2d 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/hnswlib.yaml
@@ -1,6 +1,6 @@
 name: hnswlib
 constraints:
-  search: cuvs_bench.constraints.hnswlib_search
+  search: cuvs_bench.config.algos.constraints.hnswlib_search
 groups:
   base:
     build:
diff --git a/python/cuvs_bench/cuvs_bench/config/bigann-100M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/bigann-100M.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/bigann-100M.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/bigann-100M.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/datasets.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/datasets.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/datasets.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/datasets.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/deep-100M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-100M.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/deep-100M.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-100M.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/deep-1B.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-1B.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/deep-1B.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-1B.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/deep-image-96-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/deep-image-96-inner.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/deep-image-96-inner.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/deep-image-96-inner.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/fashion-mnist-784-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/fashion-mnist-784-euclidean.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/fashion-mnist-784-euclidean.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/fashion-mnist-784-euclidean.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/gist-960-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/gist-960-euclidean.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/gist-960-euclidean.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/gist-960-euclidean.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/glove-100-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-100-angular.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/glove-100-angular.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-100-angular.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/glove-100-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-100-inner.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/glove-100-inner.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-100-inner.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/glove-50-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-50-angular.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/glove-50-angular.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-50-angular.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/glove-50-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/glove-50-inner.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/glove-50-inner.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/glove-50-inner.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/lastfm-65-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/lastfm-65-angular.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/lastfm-65-angular.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/lastfm-65-angular.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/mnist-784-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/mnist-784-euclidean.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/mnist-784-euclidean.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/mnist-784-euclidean.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/nytimes-256-angular.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-angular.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/nytimes-256-angular.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-angular.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/nytimes-256-inner.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-inner.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/nytimes-256-inner.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/nytimes-256-inner.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/sift-128-euclidean.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/sift-128-euclidean.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/sift-128-euclidean.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/sift-128-euclidean.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/wiki_all_10M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_10M.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/wiki_all_10M.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_10M.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/wiki_all_1M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_1M.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/wiki_all_1M.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_1M.yaml
diff --git a/python/cuvs_bench/cuvs_bench/config/wiki_all_88M.yaml b/python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_88M.yaml
similarity index 100%
rename from python/cuvs_bench/cuvs_bench/config/wiki_all_88M.yaml
rename to python/cuvs_bench/cuvs_bench/config/datasets/wiki_all_88M.yaml
diff --git a/python/cuvs_bench/cuvs_bench/run/__init__.py b/python/cuvs_bench/cuvs_bench/run/__init__.py
new file mode 100644
index 000000000..7cb04e6f8
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/run/__init__.py
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .run import run_benchmark
diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py
new file mode 100644
index 000000000..b5d99a4bf
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/run/__main__.py
@@ -0,0 +1,216 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+from pathlib import Path
+from typing import Optional
+
+import click
+from run import run_benchmark
+
+
+@click.command()
+@click.option(
+    "--subset-size",
+    type=click.IntRange(min=1),
+    help="The number of subset rows of the dataset to build the index",
+)
+@click.option(
+    "-k",
+    "--count",
+    default=10,
+    show_default=True,
+    type=click.IntRange(min=1),
+    prompt="Enter the number of neighbors to search for",
+    help="The number of nearest neighbors to search for",
+)
+@click.option(
+    "-bs",
+    "--batch-size",
+    default=10000,
+    show_default=True,
+    type=click.IntRange(min=1),
+    prompt="Enter the batch size",
+    help="Number of query vectors to use in each query trial",
+)
+@click.option(
+    "--dataset-configuration",
+    default=None,
+    show_default=True,
+    help="Path to YAML configuration file for datasets",
+)
+@click.option(
+    "--configuration",
+    help="Path to YAML configuration file or directory for algorithms. "
+    "Any run groups found in the specified file/directory will "
+    "automatically override groups of the same name present in the "
+    "default configurations, including `base`.",
+)
+@click.option(
+    "--dataset",
+    default="glove-100-inner",
+    show_default=True,
+    prompt="Enter the name of dataset",
+    help="Name of dataset",
+)
+@click.option(
+    "--dataset-path",
+    default=lambda: os.environ.get(
+        "RAPIDS_DATASET_ROOT_DIR",
+        os.path.join(Path(__file__).parent, "datasets/"),
+    ),
+    show_default=True,
+    prompt="Enter the path to dataset folder",
+    help="Path to dataset folder, by default will look in "
+    "RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets "
+    "subdirectory next to this script.",
+)
+@click.option("--build", is_flag=True, help="Build the index")
+@click.option("--search", is_flag=True, help="Perform the search")
+@click.option(
+    "--algorithms",
+    default="cuvs_cagra",
+    show_default=True,
+    prompt="Enter the comma separated list of named algorithms to run",
+    help="Run only comma separated list of named algorithms. If parameters "
+    "`groups` and `algo-groups` are both undefined, then group `base` "
+    "is run by default.",
+)
+@click.option(
+    "--groups",
+    default="base",
+    show_default=True,
+    prompt="Enter the comma separated groups of parameters",
+    help="Run only comma separated groups of parameters",
+)
+@click.option(
+    "--algo-groups",
+    help="Add comma separated <algorithm>.<group> to run. Example usage: "
+    ' "--algo-groups=cuvs_cagra.large,hnswlib.large".',
+)
+@click.option(
+    "-f",
+    "--force",
+    is_flag=True,
+    help="Re-run algorithms even if their results already exist",
+)
+@click.option(
+    "-m",
+    "--search-mode",
+    default="latency",
+    show_default=True,
+    prompt='Enter the search mode ("latency" or "throughput")',
+    help="Run search in 'latency' (measure individual batches) or "
+    "'throughput' (pipeline batches and measure end-to-end) mode.",
+)
+@click.option(
+    "-t",
+    "--search-threads",
+    default=None,
+    show_default=True,
+    help="Specify the number of threads to use for throughput benchmark. "
+    "Single value or a pair of min and max separated by ':'. "
+    "Example: --search-threads=1:4. Power of 2 values between 'min' "
+    "and 'max' will be used. If only 'min' is specified, then a single "
+    "test is run with 'min' threads. By default min=1, "
+    "max=<num hyper threads>.",
+)
+@click.option(
+    "-r",
+    "--dry-run",
+    is_flag=True,
+    help="Dry-run mode will convert the yaml config for the specified "
+    "algorithms and datasets to the json format that’s consumed "
+    "by the lower-level c++ binaries and then print the command to "
+    "execute the benchmarks but will not actually execute "
+    "the command.",
+)
+@click.option(
+    "--raft-log-level",
+    default="info",
+    show_default=True,
+    prompt="Enter the log level",
+    help="Log level, possible values are [off, error, warn, info, debug, "
+    "trace]. Default: 'info'. Note that 'debug' or more detailed "
+    "logging level requires that the library is compiled with "
+    "-DRAFT_ACTIVE_LEVEL=<L> where <L> >= <requested log level>.",
+)
+def main(
+    subset_size: Optional[int],
+    count: int,
+    batch_size: int,
+    dataset_configuration: Optional[str],
+    configuration: Optional[str],
+    dataset: str,
+    dataset_path: str,
+    build: bool,
+    search: bool,
+    algorithms: Optional[str],
+    groups: str,
+    algo_groups: Optional[str],
+    force: bool,
+    search_mode: str,
+    search_threads: Optional[str],
+    dry_run: bool,
+    raft_log_level: str,
+) -> None:
+    """
+    Command-line entry point: forward every option to `run_benchmark`.
+
+    Parameters
+    ----------
+    subset_size : Optional[int]
+        The number of subset rows of the dataset to build the index.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        Number of query vectors to use in each query trial.
+    dataset_configuration : Optional[str]
+        Path to YAML configuration file for datasets.
+    configuration : Optional[str]
+        Path to YAML configuration file or directory for algorithms.
+    dataset : str
+        Name of the dataset to use.
+    dataset_path : str
+        Path to the dataset folder.
+    build : bool
+        Whether to build the indices.
+    search : bool
+        Whether to perform the search.
+    algorithms : Optional[str]
+        Comma-separated list of algorithm names to use.
+    groups : str
+        Comma-separated list of groups to consider.
+    algo_groups : Optional[str]
+        Comma-separated list of <algorithm>.<group> entries to add.
+    force : bool
+        Whether to re-run algorithms even if their results already exist.
+    search_mode : str
+        The mode of search to perform ('latency' or 'throughput').
+    search_threads : Optional[str]
+        The number of threads to use for throughput benchmark.
+    dry_run : bool
+        Whether to perform a dry run without actual execution.
+    raft_log_level : str
+        The logging level for the RAFT library.
+    """
+
+    # At this point locals() holds exactly the parsed CLI options.
+    run_benchmark(**locals())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/cuvs_bench/cuvs_bench/run/run.py b/python/cuvs_bench/cuvs_bench/run/run.py
new file mode 100644
index 000000000..dbedcc183
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/run/run.py
@@ -0,0 +1,685 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import itertools
+import os
+import warnings
+from importlib import import_module
+from typing import Any, Dict, Optional, Tuple
+
+import yaml
+from runners import cuvs_bench_cpp
+
+
+def rmm_present() -> bool:
+    """
+    Report whether the ``rmm`` package can be imported.
+
+    Returns
+    -------
+    bool
+        True when ``import rmm`` succeeds, False on ImportError.
+    """
+    try:
+        import rmm  # noqa: F401
+    except ImportError:
+        return False
+    else:
+        return True
+
+
+def load_yaml_file(file_path: str) -> dict:
+    """
+    Read a YAML document from disk using the safe loader.
+
+    Parameters
+    ----------
+    file_path : str
+        Location of the YAML file to read.
+
+    Returns
+    -------
+    dict
+        The parsed content produced by ``yaml.safe_load``.
+    """
+    with open(file_path, mode="r") as yaml_stream:
+        return yaml.safe_load(yaml_stream)
+
+
+def get_dataset_configuration(dataset: str, dataset_conf_all: list) -> dict:
+    """
+    Look up a dataset's configuration entry by its name.
+
+    Parameters
+    ----------
+    dataset : str
+        Name compared against the "name" key of every entry.
+    dataset_conf_all : list
+        The dataset configuration entries to search, in order.
+
+    Returns
+    -------
+    dict
+        The first entry whose "name" equals `dataset`.
+
+    Raises
+    ------
+    ValueError
+        If no entry carries the requested name.
+    """
+    found = next((c for c in dataset_conf_all if dataset == c["name"]), None)
+    if found is None:
+        raise ValueError("Could not find a dataset configuration")
+    return found
+
+
+def prepare_conf_file(
+    dataset_conf: dict, subset_size: Optional[int], count: int, batch_size: int
+) -> dict:
+    """
+    Assemble the top-level benchmark configuration dictionary.
+
+    Parameters
+    ----------
+    dataset_conf : dict
+        Dataset configuration; stored under "dataset" without copying, so
+        a truthy `subset_size` is written into this same dictionary.
+    subset_size : Optional[int]
+        The subset size of the dataset.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+
+    Returns
+    -------
+    dict
+        Mapping with the "dataset" and "search_basic_param" entries.
+    """
+    if subset_size:
+        dataset_conf["subset_size"] = subset_size
+    basic_param = {"k": count, "batch_size": batch_size}
+    return {"dataset": dataset_conf, "search_basic_param": basic_param}
+
+
+def gather_algorithm_configs(
+    scripts_path: str, configuration: Optional[str]
+) -> list:
+    """
+    Collect default and user-supplied algorithm configuration files.
+
+    Parameters
+    ----------
+    scripts_path : str
+        Directory from which "../config/algos" (the defaults) is resolved.
+    configuration : Optional[str]
+        Extra configuration file, or directory of them, appended last.
+
+    Returns
+    -------
+    list
+        Paths of regular files only; sub-directories are never returned.
+    """
+    default_dir = os.path.join(scripts_path, "../config", "algos")
+    candidates = [
+        os.path.join(default_dir, f)
+        for f in os.listdir(default_dir)
+        if ".json" not in f and "constraint" not in f and ".py" not in f
+    ]
+
+    if configuration and os.path.isdir(configuration):
+        candidates += [
+            os.path.join(configuration, f)
+            for f in os.listdir(configuration)
+            if ".json" not in f
+        ]
+    elif configuration and os.path.isfile(configuration):
+        candidates.append(configuration)
+
+    # Directories (e.g. a `__pycache__` that Python may create next to
+    # `__init__.py`) are not configurations; drop them from the result.
+    return [path for path in candidates if os.path.isfile(path)]
+
+
+def load_algorithms_conf(
+    algos_conf_fs: list,
+    allowed_algos: Optional[list],
+    allowed_algo_groups: Optional[tuple],
+) -> dict:
+    """
+    Load algorithm configuration files, keeping only the allowed algorithms.
+
+    Parameters
+    ----------
+    algos_conf_fs : list
+        Paths to YAML files; an unloadable file is skipped with a warning.
+    allowed_algos : Optional[list]
+        Algorithm names to keep; a falsy value keeps every algorithm.
+    allowed_algo_groups : Optional[tuple]
+        Pair indexed as ``[0]`` (algorithm names) and ``[1]`` (group names).
+
+    Returns
+    -------
+    dict
+        Maps each algorithm name to its "groups" and "constraints"; a later
+        file with the same name replaces the earlier entry.
+    """
+    algos_conf = {}
+    for algo_f in algos_conf_fs:
+        try:
+            algo = load_yaml_file(algo_f)
+        except Exception as e:
+            warnings.warn(f"Could not load YAML config {algo_f} due to {e}")
+            continue
+        if allowed_algos and algo["name"] not in allowed_algos:
+            continue
+        algos_conf[algo["name"]] = {
+            "groups": algo.get("groups", {}),
+            "constraints": algo.get("constraints", {}),
+        }
+        if allowed_algo_groups and algo["name"] in allowed_algo_groups[0]:
+            algos_conf[algo["name"]]["groups"].update(
+                {
+                    group: algo["groups"][group]
+                    for group in allowed_algo_groups[1]
+                    if group in algo["groups"]
+                }
+            )
+    return algos_conf
+
+
+def prepare_executables(
+    algos_conf: dict,
+    algos_yaml: dict,
+    gpu_present: bool,
+    conf_file: dict,
+    dataset_path: str,
+    dataset: str,
+    count: int,
+    batch_size: int,
+) -> dict:
+    """
+    Group the index configurations of every algorithm by executable.
+
+    Parameters
+    ----------
+    algos_conf : dict
+        The loaded algorithm configurations.
+    algos_yaml : dict
+        The global algorithms configuration.
+    gpu_present : bool
+        Whether a GPU is present.
+    conf_file : dict
+        The main configuration file.
+    dataset_path : str
+        The path to the dataset directory.
+    dataset : str
+        The name of the dataset.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+
+    Returns
+    -------
+    dict
+        Maps each key returned by `find_executable` to a dictionary whose
+        "index" list accumulates the prepared index configurations.
+    """
+    executables_to_run = {}
+    for algo, algo_conf in algos_conf.items():
+        # NOTE(review): the boolean result is discarded here - confirm intent.
+        validate_algorithm(algos_yaml, algo, gpu_present)
+        for group, group_conf in algo_conf["groups"].items():
+            executable = find_executable(
+                algos_yaml, algo, group, count, batch_size
+            )
+            entry = executables_to_run.setdefault(executable, {"index": []})
+            indexes = prepare_indexes(
+                group_conf,
+                algo,
+                group,
+                conf_file,
+                algos_conf,
+                dataset_path,
+                dataset,
+                count,
+                batch_size,
+            )
+            entry["index"].extend(indexes)
+    return executables_to_run
+
+
+def validate_algorithm(algos_conf: dict, algo: str, gpu_present: bool) -> bool:
+    """
+    Decide whether an algorithm can be used on the current hardware.
+
+    Parameters
+    ----------
+    algos_conf : dict
+        The configuration dictionary for the algorithms.
+    algo : str
+        The name of the algorithm.
+    gpu_present : bool
+        Whether a GPU is present.
+
+    Returns
+    -------
+    bool
+        False for an unknown algorithm; otherwise True when a GPU is
+        present or the entry's "requires_gpu" is exactly False.
+    """
+    if algo not in algos_conf:
+        return False
+    if gpu_present:
+        return True
+    # Without a GPU only algorithms explicitly marked CPU-only qualify.
+    return algos_conf[algo]["requires_gpu"] is False
+
+
+def find_executable(
+    algos_conf: dict, algo: str, group: str, k: int, batch_size: int
+) -> Tuple[str, str, Tuple[str, str]]:
+    """
+    Resolve the benchmark executable for an algorithm/group pair.
+
+    Parameters
+    ----------
+    algos_conf : dict
+        The configuration dictionary for the algorithms.
+    algo : str
+        The name of the algorithm.
+    group : str
+        The name of the group.
+    k : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+
+    Returns
+    -------
+    Tuple[str, str, Tuple[str, str]]
+        The executable name, its build path, and a pair of file names:
+        "<algo>,<group>" and "<algo>,<group>,k<k>,bs<batch_size>".
+    """
+    executable = algos_conf[algo]["executable"]
+    build_path = get_build_path(executable)
+    if not build_path:
+        raise FileNotFoundError(executable)
+    prefix = f"{algo},{group}"
+    return executable, build_path, (prefix, f"{prefix},k{k},bs{batch_size}")
+
+
+def get_build_path(executable: str) -> Optional[str]:
+    """
+    Locate a benchmark executable on disk.
+
+    Parameters
+    ----------
+    executable : str
+        The name of the executable.
+
+    Returns
+    -------
+    Optional[str]
+        Path of the first match (devcontainer, CUVS_HOME, conda) or None.
+    """
+    devcontainer_dir = "/home/coder/cuvs/cpp/build/latest/bench/ann"
+    devcontainer_executable = os.path.join(devcontainer_dir, executable)
+    if os.path.exists(devcontainer_executable):
+        print(f"-- Detected devcontainer artifacts in {devcontainer_dir}.")
+        return devcontainer_executable
+
+    build_path = os.getenv("CUVS_HOME")
+    if build_path:
+        build_path = os.path.join(
+            build_path, "cpp", "build", "release", executable
+        )
+        if os.path.exists(build_path):
+            print(f"-- Using cuVS bench from repository in {build_path}.")
+            return build_path
+
+    conda_path = os.getenv("CONDA_PREFIX")
+    if conda_path:
+        conda_executable = os.path.join(conda_path, "bin", "ann", executable)
+        if os.path.exists(conda_executable):
+            print("-- Using cuVS bench found in conda environment.")
+            return conda_executable
+
+    return None
+
+
+def prepare_indexes(
+    group_conf: dict,
+    algo: str,
+    group: str,
+    conf_file: dict,
+    algos_conf: dict,
+    dataset_path: str,
+    dataset: str,
+    count: int,
+    batch_size: int,
+) -> list:
+    """
+    Expand a group's build/search parameter grid into index configurations.
+
+    Parameters
+    ----------
+    group_conf : dict
+        The configuration for the algorithm group.
+    algo : str
+        The name of the algorithm.
+    group : str
+        The name of the group.
+    conf_file : dict
+        The main configuration file.
+    algos_conf : dict
+        The loaded algorithm configurations (source of the constraints).
+    dataset_path : str
+        The path to the dataset directory.
+    dataset : str
+        The name of the dataset.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+
+    Returns
+    -------
+    list
+        Index configurations that passed the build and search constraints.
+    """
+    import hashlib
+
+    indexes = []
+    build_params = group_conf.get("build") or {}
+    search_params = group_conf.get("search") or {}
+    search_param_names, search_param_lists = (
+        zip(*search_params.items()) if search_params else ([], [])
+    )
+    for params in itertools.product(*build_params.values()):
+        build_param = dict(zip(build_params, params))
+        index = {"algo": algo, "build_param": build_param}
+        index_name = f"{algo}_{group}" if group != "base" else f"{algo}"
+        for name, value in build_param.items():
+            index_name += "." + f"{name}{value}"
+
+        if not validate_constraints(
+            algos_conf,
+            algo,
+            "build",
+            build_param,
+            None,
+            conf_file["dataset"].get("dims"),
+            count,
+            batch_size,
+        ):
+            continue
+
+        # str hashes are salted per process, so hash() would name the file
+        # differently in separate build and search runs; use a digest.
+        stable = hashlib.sha256(index_name.encode("utf-8")).hexdigest()
+        index_filename = index_name if len(index_name) < 128 else stable
+        index["name"] = index_name
+        index["file"] = os.path.join(
+            dataset_path, dataset, "index", index_filename
+        )
+        index["search_params"] = validate_search_params(
+            itertools.product(*search_param_lists),
+            search_param_names,
+            build_param,
+            algo,
+            group_conf,
+            algos_conf,
+            conf_file,
+            count,
+            batch_size,
+        )
+        if index["search_params"]:
+            indexes.append(index)
+    return indexes
+
+
+def validate_search_params(
+    all_search_params,
+    search_param_names,
+    build_params,
+    algo,
+    group_conf,
+    algos_conf,
+    conf_file,
+    count,
+    batch_size,
+) -> list:
+    """
+    Expand the search-parameter grid and keep the combinations that
+    satisfy the algorithm's "search" constraint.
+
+    Parameters
+    ----------
+    all_search_params : itertools.product
+        The Cartesian product of search parameter values.
+    search_param_names : list
+        The names of the search parameters.
+    build_params : dict
+        Build parameters of the index the search parameters apply to.
+    algo : str
+        The name of the algorithm.
+    group_conf : dict
+        The configuration for the algorithm group (not read here).
+    algos_conf : dict
+        The loaded algorithm configurations (source of the constraints).
+    conf_file : dict
+        The main configuration file.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+
+    Returns
+    -------
+    list
+        One dictionary (name -> value) per accepted combination, in the
+        order they were produced by `all_search_params`.
+    """
+    validated = []
+    for values in all_search_params:
+        candidate = dict(zip(search_param_names, values))
+        dims = conf_file["dataset"].get("dims")
+        fixed_args = (algos_conf, algo, "search", candidate, build_params)
+        # Combinations rejected by the "search" constraint are dropped
+        # silently; the rest keep their generation order.
+        if validate_constraints(*fixed_args, dims, count, batch_size):
+            validated.append(candidate)
+    return validated
+
+
+def validate_constraints(
+    algos_conf: Dict[str, Any],
+    algo: str,
+    constraint_type: str,
+    param: Dict[str, Any],
+    build_param: Optional[dict],
+    dims: Any,
+    k: Optional[int],
+    batch_size: Optional[int],
+) -> bool:
+    """
+    Check one parameter combination against an algorithm's constraint.
+
+    The constraint is a dotted path ("package.module.function") stored in
+    ``algos_conf[algo]["constraints"][constraint_type]``; it is imported
+    on demand and called with the arguments relevant to that phase.
+
+    Parameters
+    ----------
+    algos_conf : Dict[str, Any]
+        The configuration dictionary for the algorithms.
+    algo : str
+        The name of the algorithm.
+    constraint_type : str
+        The type of constraint to validate ('build' or 'search').
+    param : Dict[str, Any]
+        The parameters to validate against the constraints.
+    build_param : Optional[dict]
+        Build parameters of the index; passed to search constraints only.
+    dims : Any
+        The dataset's `dims` entry; passed to build constraints only.
+    k : Optional[int]
+        The number of nearest neighbors to search for.
+    batch_size : Optional[int]
+        The size of each batch for processing.
+
+    Returns
+    -------
+    bool
+        The constraint function's result, or True if none is configured.
+    """
+    constraints = algos_conf[algo]["constraints"]
+    if constraint_type not in constraints:
+        # No constraint registered for this phase: every combination passes.
+        return True
+
+    # "pkg.module.func" -> module "pkg.module", attribute "func".
+    module_name, _, func_name = constraints[constraint_type].rpartition(".")
+    constraints_func = getattr(import_module(module_name), func_name)
+
+    # Build and search constraint functions take different arguments.
+    if constraint_type == "build":
+        return constraints_func(param, dims)
+    return constraints_func(param, build_param, k, batch_size)
+
+
+def run_benchmark(
+    subset_size: int,
+    count: int,
+    batch_size: int,
+    dataset_configuration: Optional[str],
+    configuration: Optional[str],
+    dataset: str,
+    dataset_path: str,
+    build: Optional[bool],
+    search: Optional[bool],
+    algorithms: Optional[str],
+    groups: str,
+    algo_groups: Optional[str],
+    force: bool,
+    search_mode: str,
+    search_threads: int,
+    dry_run: bool,
+    raft_log_level: str,
+) -> None:
+    """
+    Runs a benchmarking process based on the provided configurations.
+
+    Parameters
+    ----------
+    subset_size : int
+        The subset size of the dataset.
+    count : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+    dataset_configuration : Optional[str]
+        Path to the dataset configuration file; defaults to the bundled one.
+    configuration : Optional[str]
+        Path to the algorithm configuration directory or file.
+    dataset : str
+        The name of the dataset to use.
+    dataset_path : str
+        The path to the dataset directory.
+    build : Optional[bool]
+        Whether to build the indices (both phases run if neither is set).
+    search : Optional[bool]
+        Whether to perform the search (both phases run if neither is set).
+    algorithms : Optional[str]
+        Comma-separated list of algorithm names to use.
+    groups : str
+        Comma-separated list of groups; not read by this function.
+    algo_groups : Optional[str]
+        Comma-separated list of dot-separated `algorithm.group` entries.
+    force : bool
+        Whether to force the execution regardless of warnings.
+    search_mode : str
+        The mode of search to perform.
+    search_threads : int
+        The number of threads to use for searching.
+    dry_run : bool
+        Whether to perform a dry run without actual execution.
+    raft_log_level : str
+        Name of the RAFT logging level, e.g. 'info' or 'debug'.
+
+    Returns
+    -------
+    None
+    """
+    scripts_path = os.path.dirname(os.path.realpath(__file__))
+    gpu_present = rmm_present()
+
+    if not build and not search:
+        build, search = True, True
+
+    dataset_conf_all = load_yaml_file(
+        dataset_configuration
+        or os.path.join(scripts_path, "../config/datasets", "datasets.yaml")
+    )
+    dataset_conf = get_dataset_configuration(dataset, dataset_conf_all)
+    conf_file = prepare_conf_file(dataset_conf, subset_size, count, batch_size)
+    algos_conf_fs = gather_algorithm_configs(scripts_path, configuration)
+
+    allowed_algos = algorithms.split(",") if algorithms else None
+    allowed_algo_groups = (
+        [algo_group.split(".") for algo_group in algo_groups.split(",")]
+        if algo_groups
+        else None
+    )
+    algos_conf = load_algorithms_conf(
+        algos_conf_fs,
+        allowed_algos,
+        list(zip(*allowed_algo_groups)) if allowed_algo_groups else None,
+    )
+
+    executables_to_run = prepare_executables(
+        algos_conf,
+        load_yaml_file(
+            os.path.join(scripts_path, "../config", "algorithms.yaml")
+        ),
+        gpu_present,
+        conf_file,
+        dataset_path,
+        dataset,
+        count,
+        batch_size,
+    )
+
+    cuvs_bench_cpp(
+        conf_file,
+        dataset,
+        os.path.dirname(configuration)
+        if configuration and os.path.isfile(configuration)
+        else os.path.join(scripts_path, "conf", "algos"),
+        executables_to_run,
+        dataset_path,
+        force,
+        build,
+        search,
+        dry_run,
+        count,
+        batch_size,
+        search_threads,
+        search_mode,
+        raft_log_level,
+    )
diff --git a/python/cuvs_bench/cuvs_bench/run/runners.py b/python/cuvs_bench/cuvs_bench/run/runners.py
new file mode 100644
index 000000000..5a540d2e5
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/run/runners.py
@@ -0,0 +1,273 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import json
+import os
+import subprocess
+import uuid
+from typing import Dict, List, Optional, Tuple
+
+
+def cuvs_bench_cpp(
+    conf_file: Dict,
+    conf_filename: str,
+    conf_filedir: str,
+    executables_to_run: Dict[
+        Tuple[str, str, Tuple[str, str]], Dict[str, List[Dict]]
+    ],
+    dataset_path: str,
+    force: bool,
+    build: bool,
+    search: bool,
+    dry_run: bool,
+    k: int,
+    batch_size: int,
+    search_threads: Optional[int],
+    mode: str = "throughput",
+    raft_log_level: str = "info",
+) -> None:
+    """
+    Run the CUVS benchmarking tool with the provided configuration.
+
+    A temporary JSON configuration is written for every executable. A
+    failing benchmark command is printed and the loop continues.
+
+    Parameters
+    ----------
+    conf_file : Dict
+        Configuration content; must hold "dataset" and "search_basic_param".
+    conf_filename : str
+        Prefix of the temporary configuration file names.
+    conf_filedir : str
+        The directory of the configuration file (not read here).
+    executables_to_run : Dict[Tuple[str, str, Tuple[str, str]],
+                         Dict[str, List[Dict]]]
+        Maps (executable, executable path, (build name, search name)) to
+        a dict whose "index" entry lists the index configurations.
+    dataset_path : str
+        The path to the dataset.
+    force : bool
+        Whether to pass `--force` to the benchmark executable.
+    build : bool
+        Whether to build the indices.
+    search : bool
+        Whether to perform the search.
+    dry_run : bool
+        Whether to print the commands instead of executing them.
+    k : int
+        The number of nearest neighbors to search for.
+    batch_size : int
+        The size of each batch for processing.
+    search_threads : Optional[int]
+        The number of threads to use for searching.
+    mode : str, optional
+        The mode of search to perform ('latency' or 'throughput'),
+        by default 'throughput'.
+    raft_log_level : str, optional
+        The logging level for the RAFT library, by default 'info'.
+
+    Returns
+    -------
+    None
+    """
+    for key, run_conf in executables_to_run.items():
+        _, ann_executable_path, output_filename = key
+        # Need to write temporary configuration
+        temp_conf_filename = (
+            f"{conf_filename}_{output_filename[1]}_{uuid.uuid1()}.json"
+        )
+        with open(temp_conf_filename, "w") as f:
+            temp_conf = {
+                "dataset": conf_file["dataset"],
+                "search_basic_param": conf_file["search_basic_param"],
+                "index": run_conf["index"],
+            }
+            f.write(json.dumps(temp_conf, indent=2))
+
+        legacy_result_folder = os.path.join(
+            dataset_path, conf_file["dataset"]["name"], "result"
+        )
+        os.makedirs(legacy_result_folder, exist_ok=True)
+
+        if build:
+            build_folder = os.path.join(legacy_result_folder, "build")
+            os.makedirs(build_folder, exist_ok=True)
+            build_file = f"{output_filename[0]}.json"
+            temp_build_file = f"{build_file}.lock"
+            benchmark_out = os.path.join(build_folder, temp_build_file)
+            cmd = [
+                ann_executable_path,
+                "--build",
+                f"--data_prefix={dataset_path}",
+                "--benchmark_out_format=json",
+                "--benchmark_counters_tabular=true",
+                f"--benchmark_out={benchmark_out}",
+                f"--raft_log_level={parse_log_level(raft_log_level)}",
+            ]
+            if force:
+                cmd.append("--force")
+            cmd.append(temp_conf_filename)
+
+            if dry_run:
+                print(
+                    f"Benchmark command for {output_filename[0]}:\n"
+                    f"{' '.join(cmd)}\n"
+                )
+            else:
+                try:
+                    subprocess.run(cmd, check=True)
+                    merge_build_files(
+                        build_folder, build_file, temp_build_file
+                    )
+                except Exception as e:
+                    print(f"Error occurred running benchmark: {e}")
+                finally:
+                    # The lock file is absent when the executable failed early.
+                    if os.path.exists(benchmark_out):
+                        os.remove(benchmark_out)
+                    if not search:
+                        os.remove(temp_conf_filename)
+
+        if search:
+            search_folder = os.path.join(legacy_result_folder, "search")
+            os.makedirs(search_folder, exist_ok=True)
+            search_file = f"{output_filename[1]}.json"
+            cmd = [
+                ann_executable_path,
+                "--search",
+                f"--data_prefix={dataset_path}",
+                "--benchmark_counters_tabular=true",
+                f"--override_kv=k:{k}",
+                f"--override_kv=n_queries:{batch_size}",
+                "--benchmark_min_warmup_time=1",
+                "--benchmark_out_format=json",
+                f"--mode={mode}",
+                f"--benchmark_out={os.path.join(search_folder, search_file)}",
+                f"--raft_log_level={parse_log_level(raft_log_level)}",
+            ]
+            if force:
+                cmd.append("--force")
+            if search_threads:
+                cmd.append(f"--threads={search_threads}")
+            cmd.append(temp_conf_filename)
+
+            if dry_run:
+                print(
+                    f"Benchmark command for {output_filename[1]}:\n"
+                    f"{' '.join(cmd)}\n"
+                )
+            else:
+                try:
+                    subprocess.run(cmd, check=True)
+                except Exception as e:
+                    print(f"Error occurred running benchmark: {e}")
+                finally:
+                    os.remove(temp_conf_filename)
+
+
+log_levels = {
+    "off": 0,
+    "error": 1,
+    "warn": 2,
+    "info": 3,
+    "debug": 4,
+    "trace": 5,
+}
+
+
+def parse_log_level(level_str: str) -> int:
+    """
+    Parse the log level from string to integer.
+
+    Parameters
+    ----------
+    level_str : str
+        The log level name, matched case-insensitively.
+
+    Returns
+    -------
+    int
+        The corresponding integer value of the log level.
+
+    Raises
+    ------
+    ValueError
+        If the log level string is invalid.
+    """
+    if str(level_str).lower() not in log_levels:
+        raise ValueError(f"Invalid log level: {level_str}")
+    return log_levels[str(level_str).lower()]
+
+
+def merge_build_files(
+    build_dir: str, build_file: str, temp_build_file: str
+) -> None:
+    """
+    Fold the results of a fresh build run into the persistent build file.
+
+    Benchmarks are keyed by "name": entries from the temporary file replace
+    same-named entries of the existing file, and entries whose "real_time"
+    is not positive are dropped. All other top-level content is taken from
+    the temporary file.
+
+    Parameters
+    ----------
+    build_dir : str
+        Directory holding both files.
+    build_file : str
+        Name of the persistent build file; rewritten by this call.
+    temp_build_file : str
+        Name of the temporary file produced by the latest run.
+
+    Raises
+    ------
+    ValueError
+        If the temporary build file is not found.
+    """
+    target_path = os.path.join(build_dir, build_file)
+    fresh_path = os.path.join(build_dir, temp_build_file)
+
+    # A missing or unreadable existing file simply contributes nothing.
+    previous = {}
+    if os.path.isfile(target_path):
+        try:
+            with open(target_path, "r") as f:
+                previous = json.load(f)
+        except Exception as e:
+            print(f"Error loading existing build file: {target_path} ({e})")
+
+    # Without the fresh results there is nothing to merge.
+    if not os.path.isfile(fresh_path):
+        raise ValueError(f"Temp build file not found: {fresh_path}")
+    with open(fresh_path, "r") as f:
+        fresh = json.load(f)
+
+    fresh_entries = fresh.get("benchmarks", {})
+    previous_entries = previous.get("benchmarks", {})
+
+    # A non-positive build time marks a failed build; such entries are
+    # dropped.
+    merged = {}
+    for entry in previous_entries:
+        if entry["real_time"] > 0:
+            merged[entry["name"]] = entry
+    for entry in fresh_entries:
+        if entry["real_time"] > 0:
+            merged[entry["name"]] = entry
+
+    fresh["benchmarks"] = list(merged.values())
+    with open(target_path, "w") as f:
+        f.write(json.dumps(fresh, indent=2))
diff --git a/python/cuvs_bench/cuvs_bench/tests/test_run.py b/python/cuvs_bench/cuvs_bench/tests/test_run.py
new file mode 100644
index 000000000..7b7a481a0
--- /dev/null
+++ b/python/cuvs_bench/cuvs_bench/tests/test_run.py
@@ -0,0 +1,227 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import itertools
+from unittest.mock import MagicMock, mock_open, patch
+
+import pytest
+from benchmark import (
+    find_executable,
+    gather_algorithm_configs,
+    get_dataset_configuration,
+    load_algorithms_conf,
+    load_yaml_file,
+    prepare_conf_file,
+    prepare_executables,
+    prepare_indexes,
+    rmm_present,
+    validate_algorithm,
+    validate_constraints,
+    validate_search_params,
+)
+
+
+def test_load_yaml_file():
+    """A patched `open` feeds YAML text that must parse into a dict."""
+    fake_open = mock_open(read_data="\n    key: value\n    ")
+    expected = {"key": "value"}
+    with patch("builtins.open", fake_open):
+        parsed = load_yaml_file("dummy_path.yaml")
+        assert parsed == expected
+
+
+def test_get_dataset_configuration():
+    """A known name returns its entry; an unknown name raises ValueError."""
+    pool = [{"name": name} for name in ("dataset1", "dataset2")]
+    assert get_dataset_configuration("dataset1", pool) == {"name": "dataset1"}
+    with pytest.raises(ValueError):
+        get_dataset_configuration("non_existent_dataset", pool)
+
+
+def test_prepare_conf_file():
+    """Checks the assembled config with and without a subset size."""
+    base = {"name": "dataset1"}
+    with_subset = prepare_conf_file(base, 1000, 10, 128)
+    assert with_subset == {
+        "dataset": {"name": "dataset1", "subset_size": 1000},
+        "search_basic_param": {"k": 10, "batch_size": 128},
+    }
+    without_subset = prepare_conf_file(base, None, 10, 128)
+    assert without_subset["dataset"].get("subset_size") is None
+
+
+def test_gather_algorithm_configs(tmpdir):
+    """Counts gathered configs: defaults, plus a custom dir, plus a file."""
+    scripts_dir = tmpdir.mkdir("scripts")
+    default_algos = scripts_dir.mkdir("algos")
+    for yaml_name in ("algo1.yaml", "algo2.yaml"):
+        default_algos.join(yaml_name).write("key: value")
+    assert len(gather_algorithm_configs(str(scripts_dir), None)) == 2
+
+    extra_dir = tmpdir.mkdir("custom_conf")
+    extra_dir.join("custom_algo.yaml").write("key: value")
+    found = gather_algorithm_configs(str(scripts_dir), str(extra_dir))
+    assert len(found) == 3
+
+    extra_file = extra_dir.join("custom_algo_file.yaml")
+    extra_file.write("key: value")
+    found = gather_algorithm_configs(str(scripts_dir), str(extra_file))
+    assert len(found) == 4
+
+
+def test_load_algorithms_conf():
+    """Every path yields the same YAML; checks the algorithm filter."""
+    paths = ["path/to/algo1.yaml", "path/to/algo2.yaml"]
+    algo1_yaml = """
+    name: algo1
+    groups:
+      group1: {}
+    """
+    with patch("builtins.open", mock_open(read_data=algo1_yaml)):
+        assert "algo1" in load_algorithms_conf(paths, None, None)
+
+    with patch("builtins.open", mock_open(read_data=algo1_yaml)):
+        kept = load_algorithms_conf(paths, ["algo1"], None)
+        assert "algo1" in kept
+        dropped = load_algorithms_conf(paths, ["algo2"], None)
+        assert "algo1" not in dropped
+
+
+@patch(
+    "benchmark.find_executable",
+    return_value=("executable", "path", "filename"),
+)
+@patch("benchmark.validate_algorithm", return_value=True)
+@patch(
+    "benchmark.prepare_indexes", return_value=[{"index_key": "index_value"}]
+)
+def test_prepare_executables(
+    mock_prepare_indexes, mock_validate_algorithm, mock_find_executable
+):
+    """
+    Runs prepare_executables with find_executable, validate_algorithm and
+    prepare_indexes patched out.
+    """
+    group = {"build": {}, "search": {}}
+    inputs = {
+        "algos_conf": {"algo1": {"groups": {"group1": group}}},
+        "algos_yaml": {"algo1": {}},
+        "gpu_present": True,
+        "conf_file": {},
+        "dataset_path": "dataset_path",
+        "dataset": "dataset",
+        "count": 10,
+        "batch_size": 128,
+    }
+
+    # prepare_executables takes these positionally, in this order.
+    result = prepare_executables(*inputs.values())
+    assert "executable" in result
+    assert len(result["executable"]["index"]) == 1
+
+
+def test_prepare_indexes():
+    """Two build values for one parameter should give two indexes."""
+    grids = {"build": {"param1": [1, 2]}, "search": {"param2": [3, 4]}}
+    indexes = prepare_indexes(
+        grids,
+        "algo",
+        "group",
+        {"dataset": {"dims": 128}},
+        "dataset_path",
+        "dataset",
+        10,
+        128,
+    )
+    assert len(indexes) == 2
+    assert "param1" in indexes[0]["build_param"]
+
+
+def test_validate_search_params():
+    """With no constraints configured, every combination is kept."""
+    algos_conf = {"algo": {"constraints": {}}}
+    result = validate_search_params(
+        itertools.product([1, 2], [3, 4]),
+        ["param1", "param2"],
+        {},  # build_params
+        "algo",
+        {},  # group_conf
+        algos_conf,
+        {"dataset": {"dims": 128}},  # conf_file
+        10,
+        128,
+    )
+    assert len(result) == 4
+
+
+def test_rmm_present():
+    """An importable `rmm` reports True; a blocked import reports False."""
+    for module, expected in ((MagicMock(), True), (None, False)):
+        with patch.dict("sys.modules", {"rmm": module}):
+            assert rmm_present() is expected
+
+
+@patch("benchmark.get_build_path", return_value="build_path")
+def test_find_executable(mock_get_build_path):
+    """Covers a resolvable executable and one with no build path."""
+    algos_conf = {"algo1": {"executable": "executable1"}}
+    call_args = (algos_conf, "algo1", "group1", 10, 128)
+    output_names = ("algo1,group1", "algo1,group1,k10,bs128")
+    expected = ("executable1", "build_path", output_names)
+    assert find_executable(*call_args) == expected
+
+    mock_get_build_path.return_value = None
+    with pytest.raises(FileNotFoundError):
+        find_executable(*call_args)
+
+
+def test_validate_algorithm():
+    """A non-GPU algorithm passes either way; a GPU one fails without a GPU."""
+    conf = {"algo1": {"requires_gpu": False}}
+    assert validate_algorithm(conf, "algo1", gpu_present=True) is True
+    assert validate_algorithm(conf, "algo1", gpu_present=False) is True
+
+    # Flip the same entry to GPU-only.
+    conf["algo1"]["requires_gpu"] = True
+    assert validate_algorithm(conf, "algo1", gpu_present=False) is False
+
+
+@patch("benchmark.import_module")
+def test_validate_constraints(mock_import_module):
+    """
+    Exercises the 'build' branch: constraint accepted, no constraint
+    configured, and constraint rejected.
+    """
+    mock_validator = MagicMock()
+    mock_import_module.return_value = mock_validator
+    mock_validator.constraint_func.return_value = True
+    algos_conf = {
+        "algo1": {"constraints": {"build": "module.constraint_func"}}
+    }
+    # Positional tail: algo, constraint_type, param, build_param, dims, k,
+    # batch_size.
+    args = ("algo1", "build", {"param1": "value1"}, {}, 128, None, None)
+    assert validate_constraints(algos_conf, *args) is True
+
+    # No constraint of this type: always valid.
+    algos_conf = {"algo1": {"constraints": {}}}
+    assert validate_constraints(algos_conf, *args) is True
+
+    # The constraint function rejects the parameters.
+    mock_validator.constraint_func.return_value = False
+    algos_conf["algo1"]["constraints"]["build"] = "module.constraint_func"
+    assert validate_constraints(algos_conf, *args) is False
diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml
index 7bb9e2f8d..41ebad116 100644
--- a/python/cuvs_bench/pyproject.toml
+++ b/python/cuvs_bench/pyproject.toml
@@ -18,6 +18,10 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
+    "click",
+    "matplotlib",
+    "pandas",
+    "pyyaml",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -31,10 +35,7 @@ classifiers = [
 ]
 
 [project.urls]
-Homepage = "https://github.com/rapidsai/raft"
-
-[tool.setuptools.packages.find]
-where = ["src"]
+Homepage = "https://github.com/rapidsai/cuvs"
 
 [tool.setuptools.package-data]
 "*" = ["*.*", "VERSION"]
@@ -64,7 +65,8 @@ skip = [
 version = { file = "cuvs_bench/VERSION" }
 
 [tool.rapids-build-backend]
-build-backend = "scikit_build_core.build"
-requires = []
+build-backend = "setuptools.build_meta"
+requires = [
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true"