diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d6ab7ee54..6bca1d228e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -81,12 +81,6 @@ repos:
                 verbose: true
                 require_serial: true
                 exclude: .*/thirdparty/.*
-              - id: copyright-check
-                name: copyright-check
-                entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year
-                language: python
-                pass_filenames: false
-                additional_dependencies: [gitpython]
               - id: include-check
                 name: include-check
                 entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/test
@@ -109,6 +103,23 @@ repos:
         rev: v4.5.0
         hooks:
               - id: check-json
+      - repo: https://github.com/rapidsai/pre-commit-hooks
+        rev: v0.0.1
+        hooks:
+          - id: verify-copyright
+            files: |
+              (?x)
+                  [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
+                  CMakeLists[.]txt$|
+                  CMakeLists_standalone[.]txt$|
+                  meta[.]yaml$|
+                  setup[.]cfg$
+            exclude: |
+              (?x)
+                  cpp/include/raft/neighbors/detail/faiss_select/|
+                  cpp/include/raft/thirdparty/|
+                  docs/source/sphinxext/github_link[.]py|
+                  cpp/cmake/modules/FindAVX[.]cmake
 
 default_language_version:
       python: python3
diff --git a/build.sh b/build.sh
index 148d23c9c1..45c7d1380f 100755
--- a/build.sh
+++ b/build.sh
@@ -305,7 +305,7 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 
-if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then
+if hasArg --compile-lib || hasArg pylibraft || (( ${NUMARGS} == 0 )); then
     COMPILE_LIBRARY=ON
     CMAKE_TARGET="${CMAKE_TARGET};raft_lib"
 fi
@@ -405,7 +405,7 @@ fi
 
 ################################################################################
 # Configure for building all C++ targets
-if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then
+if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || ((${COMPILE_LIBRARY} == ON )); then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
deleted file mode 100644
index 2af8b1b8ff..0000000000
--- a/ci/checks/copyright.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import datetime
-import re
-import argparse
-import io
-import os
-import sys
-
-import git
-
-SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-
-# Add the scripts dir for gitutils
-sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR,
-                                              "../../cpp/scripts")))
-
-# Now import gitutils. Ignore flake8 error here since there is no other way to
-# set up imports
-import gitutils  # noqa: E402
-
-FilesToCheck = [
-    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
-    re.compile(r"CMakeLists[.]txt$"),
-    re.compile(r"CMakeLists_standalone[.]txt$"),
-    re.compile(r"setup[.]cfg$"),
-    re.compile(r"meta[.]yaml$")
-]
-ExemptFiles = [
-    re.compile("cpp/include/raft/neighbors/detail/faiss_select/"),
-    re.compile("cpp/include/raft/thirdparty/"),
-    re.compile("docs/source/sphinxext/github_link.py"),
-    re.compile("cpp/cmake/modules/FindAVX.cmake")
-]
-
-# this will break starting at year 10000, which is probably OK :)
-CheckSimple = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")
-CheckDouble = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
-)
-
-
-def checkThisFile(f):
-    if isinstance(f, git.Diff):
-        if f.deleted_file or f.b_blob.size == 0:
-            return False
-        f = f.b_path
-    elif not os.path.exists(f) or os.stat(f).st_size == 0:
-        # This check covers things like symlinks which point to files that DNE
-        return False
-    for exempt in ExemptFiles:
-        if exempt.search(f):
-            return False
-    for checker in FilesToCheck:
-        if checker.search(f):
-            return True
-    return False
-
-
-def modifiedFiles():
-    """Get a set of all modified files, as Diff objects.
-
-    The files returned have been modified in git since the merge base of HEAD
-    and the upstream of the target branch. We return the Diff objects so that
-    we can read only the staged changes.
-    """
-    repo = git.Repo()
-    # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible
-    target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH"))
-    if target_branch is None:
-        # Fall back to the closest branch if not on CI
-        target_branch = repo.git.describe(
-            all=True, tags=True, match="branch-*", abbrev=0
-        ).lstrip("heads/")
-
-    upstream_target_branch = None
-    if target_branch in repo.heads:
-        # Use the tracking branch of the local reference if it exists. This
-        # returns None if no tracking branch is set.
-        upstream_target_branch = repo.heads[target_branch].tracking_branch()
-    if upstream_target_branch is None:
-        # Fall back to the remote with the newest target_branch. This code
-        # path is used on CI because the only local branch reference is
-        # current-pr-branch, and thus target_branch is not in repo.heads.
-        # This also happens if no tracking branch is defined for the local
-        # target_branch. We use the remote with the latest commit if
-        # multiple remotes are defined.
-        candidate_branches = [
-            remote.refs[target_branch] for remote in repo.remotes
-            if target_branch in remote.refs
-        ]
-        if len(candidate_branches) > 0:
-            upstream_target_branch = sorted(
-                candidate_branches,
-                key=lambda branch: branch.commit.committed_datetime,
-            )[-1]
-        else:
-            # If no remotes are defined, try to use the local version of the
-            # target_branch. If this fails, the repo configuration must be very
-            # strange and we can fix this script on a case-by-case basis.
-            upstream_target_branch = repo.heads[target_branch]
-    merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0]
-    diff = merge_base.diff()
-    changed_files = {f for f in diff if f.b_path is not None}
-    return changed_files
-
-
-def getCopyrightYears(line):
-    res = CheckSimple.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(1))
-    res = CheckDouble.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(2))
-    return None, None
-
-
-def replaceCurrentYear(line, start, end):
-    # first turn a simple regex into double (if applicable). then update years
-    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
-    res = CheckDouble.sub(
-        rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION",
-        res,
-    )
-    return res
-
-
-def checkCopyright(f, update_current_year):
-    """Checks for copyright headers and their years."""
-    errs = []
-    thisYear = datetime.datetime.now().year
-    lineNum = 0
-    crFound = False
-    yearMatched = False
-
-    if isinstance(f, git.Diff):
-        path = f.b_path
-        lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True)
-    else:
-        path = f
-        with open(f, encoding="utf-8") as fp:
-            lines = fp.readlines()
-
-    for line in lines:
-        lineNum += 1
-        start, end = getCopyrightYears(line)
-        if start is None:
-            continue
-        crFound = True
-        if start > end:
-            e = [
-                path,
-                lineNum,
-                "First year after second year in the copyright "
-                "header (manual fix required)",
-                None,
-            ]
-            errs.append(e)
-        elif thisYear < start or thisYear > end:
-            e = [
-                path,
-                lineNum,
-                "Current year not included in the copyright header",
-                None,
-            ]
-            if thisYear < start:
-                e[-1] = replaceCurrentYear(line, thisYear, end)
-            if thisYear > end:
-                e[-1] = replaceCurrentYear(line, start, thisYear)
-            errs.append(e)
-        else:
-            yearMatched = True
-    # copyright header itself not found
-    if not crFound:
-        e = [
-            path,
-            0,
-            "Copyright header missing or formatted incorrectly "
-            "(manual fix required)",
-            None,
-        ]
-        errs.append(e)
-    # even if the year matches a copyright header, make the check pass
-    if yearMatched:
-        errs = []
-
-    if update_current_year:
-        errs_update = [x for x in errs if x[-1] is not None]
-        if len(errs_update) > 0:
-            lines_changed = ", ".join(str(x[1]) for x in errs_update)
-            print(f"File: {path}. Changing line(s) {lines_changed}")
-            for _, lineNum, __, replacement in errs_update:
-                lines[lineNum - 1] = replacement
-            with open(path, "w", encoding="utf-8") as out_file:
-                out_file.writelines(lines)
-
-    return errs
-
-
-def getAllFilesUnderDir(root, pathFilter=None):
-    retList = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        for fn in filenames:
-            filePath = os.path.join(dirpath, fn)
-            if pathFilter(filePath):
-                retList.append(filePath)
-    return retList
-
-
-def checkCopyright_main():
-    """
-    Checks for copyright headers in all the modified files. In case of local
-    repo, this script will just look for uncommitted files and in case of CI
-    it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch"
-    """
-    retVal = 0
-
-    argparser = argparse.ArgumentParser(
-        "Checks for a consistent copyright header in git's modified files"
-    )
-    argparser.add_argument(
-        "--update-current-year",
-        dest="update_current_year",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "update the current year if a header is already "
-        "present and well formatted.",
-    )
-    argparser.add_argument(
-        "--git-modified-only",
-        dest="git_modified_only",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "only files seen as modified by git will be "
-        "processed.",
-    )
-
-    args, dirs = argparser.parse_known_args()
-
-    if args.git_modified_only:
-        files = [f for f in modifiedFiles() if checkThisFile(f)]
-    else:
-        files = []
-        for d in [os.path.abspath(d) for d in dirs]:
-            if not os.path.isdir(d):
-                raise ValueError(f"{d} is not a directory.")
-            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
-
-    errors = []
-    for f in files:
-        errors += checkCopyright(f, args.update_current_year)
-
-    if len(errors) > 0:
-        if any(e[-1] is None for e in errors):
-            print("Copyright headers incomplete in some of the files!")
-        for e in errors:
-            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
-        print("")
-        n_fixable = sum(1 for e in errors if e[-1] is not None)
-        path_parts = os.path.abspath(__file__).split(os.sep)
-        file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :])
-        if n_fixable > 0 and not args.update_current_year:
-            print(
-                f"You can run `python {file_from_repo} --git-modified-only "
-                "--update-current-year` and stage the results in git to "
-                f"fix {n_fixable} of these errors.\n"
-            )
-        retVal = 1
-
-    return retVal
-
-
-if __name__ == "__main__":
-    sys.exit(checkCopyright_main())
diff --git a/conda/recipes/libraft/build_libraft_template.sh b/conda/recipes/libraft/build_libraft_template.sh
index bd7719af76..86c0fa11b6 100644
--- a/conda/recipes/libraft/build_libraft_template.sh
+++ b/conda/recipes/libraft/build_libraft_template.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 # Just building template so we verify it uses libraft.so and fail if it doesn't build
-./build.sh template
+./build.sh template --no-nvtx
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 638ceb3b45..6107b9325a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -256,7 +256,7 @@ endif()
 
 if(RAFT_NVTX)
   # This enables NVTX within the project with no option to disable it downstream.
-  target_link_libraries(raft INTERFACE CUDA::nvToolsExt)
+  target_link_libraries(raft INTERFACE CUDA::nvtx3)
   target_compile_definitions(raft INTERFACE NVTX_ENABLED)
 else()
   # Allow enable NVTX downstream if not set here. This creates a new option at build/install time,
@@ -324,6 +324,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
     src/distance/distance.cu
     src/distance/fused_l2_nn.cu
+    src/distance/fused_distance_nn.cu
     src/linalg/detail/coalesced_reduction.cu
     src/matrix/detail/select_k_double_int64_t.cu
     src/matrix/detail/select_k_double_uint32_t.cu
@@ -422,6 +423,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/raft_runtime/cluster/update_centroids.cuh
     src/raft_runtime/cluster/update_centroids_double.cu
     src/raft_runtime/cluster/update_centroids_float.cu
+    src/raft_runtime/distance/fused_distance_min_arg.cu
     src/raft_runtime/distance/fused_l2_min_arg.cu
     src/raft_runtime/distance/pairwise_distance.cu
     src/raft_runtime/matrix/select_k_float_int64_t.cu
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 3a2431cd34..95361e19ca 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -122,21 +122,13 @@ if(BUILD_PRIMS_BENCH)
   )
 
   ConfigureBench(
-    NAME
-    MATRIX_BENCH
-    PATH
-    bench/prims/matrix/argmin.cu
-    bench/prims/matrix/gather.cu
-    bench/prims/matrix/select_k.cu
-    bench/prims/matrix/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu
+    bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
     NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-    bench/prims/random/rng.cu bench/prims/main.cpp
+   bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp
   )
 
   ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)
@@ -156,6 +148,14 @@ if(BUILD_PRIMS_BENCH)
     bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
     bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
     bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
+    src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
     bench/prims/neighbors/refine_float_int64_t.cu
     bench/prims/neighbors/refine_uint8_t_int64_t.cu
     bench/prims/main.cpp
diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu
index e6f26ba925..078f9e6198 100644
--- a/cpp/bench/prims/matrix/gather.cu
+++ b/cpp/bench/prims/matrix/gather.cu
@@ -16,34 +16,48 @@
 
 #include <common/benchmark.hpp>
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft::bench::matrix {
 
 template <typename IdxT>
 struct GatherParams {
   IdxT rows, cols, map_length;
+  bool host;
 };
 
 template <typename IdxT>
 inline auto operator<<(std::ostream& os, const GatherParams<IdxT>& p) -> std::ostream&
 {
-  os << p.rows << "#" << p.cols << "#" << p.map_length;
+  os << p.rows << "#" << p.cols << "#" << p.map_length << (p.host ? "#host" : "#device");
   return os;
 }
 
 template <typename T, typename MapT, typename IdxT, bool Conditional = false>
 struct Gather : public fixture {
   Gather(const GatherParams<IdxT>& p)
-    : params(p), matrix(this->handle), map(this->handle), out(this->handle), stencil(this->handle)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * (1ULL << 30)),
+      matrix(this->handle),
+      map(this->handle),
+      out(this->handle),
+      stencil(this->handle),
+      matrix_h(this->handle)
   {
+    rmm::mr::set_current_device_resource(&pool_mr);
   }
 
+  ~Gather() { rmm::mr::set_current_device_resource(old_mr); }
+
   void allocate_data(const ::benchmark::State& state) override
   {
     matrix  = raft::make_device_matrix<T, IdxT>(handle, params.rows, params.cols);
@@ -59,6 +73,11 @@ struct Gather : public fixture {
     if constexpr (Conditional) {
       raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1));
     }
+
+    if (params.host) {
+      matrix_h = raft::make_host_matrix<T, IdxT>(handle, params.rows, params.cols);
+      raft::copy(matrix_h.data_handle(), matrix.data_handle(), matrix.size(), stream);
+    }
     resource::sync_stream(handle, stream);
   }
 
@@ -77,14 +96,22 @@ struct Gather : public fixture {
         raft::matrix::gather_if(
           handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op);
       } else {
-        raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        if (params.host) {
+          raft::matrix::detail::gather(
+            handle, make_const_mdspan(matrix_h.view()), map_const_view, out.view());
+        } else {
+          raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        }
       }
     });
   }
 
  private:
   GatherParams<IdxT> params;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
   raft::device_matrix<T, IdxT> matrix, out;
+  raft::host_matrix<T, IdxT> matrix_h;
   raft::device_vector<T, IdxT> stencil;
   raft::device_vector<MapT, IdxT> map;
 };  // struct Gather
@@ -100,4 +127,9 @@ RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((Gather<double, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<double, uint32_t, int64_t>), "", gather_inputs_i64);
+
+auto inputs_host = raft::util::itertools::product<GatherParams<int64_t>>(
+  {10000000}, {100}, {1000, 1000000, 10000000}, {true});
+RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "Host", inputs_host);
+
 }  // namespace raft::bench::matrix
diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
index 9534515cbb..1840eca99d 100644
--- a/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
+++ b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Enable instantiation of search with filter
 #include "../knn.cuh"
 
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
 namespace raft::bench::spatial {
 
 KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull);
diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
new file mode 100644
index 0000000000..4c8ca2bf31
--- /dev/null
+++ b/cpp/bench/prims/random/subsample.cu
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/random/permute.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cub/cub.cuh>
+
+namespace raft::bench::random {
+
+struct sample_inputs {
+  int n_samples;
+  int n_train;
+  int method;
+};  // struct sample_inputs
+
+inline auto operator<<(std::ostream& os, const sample_inputs& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_train << "#" << p.method;
+  return os;
+}
+
+// Sample with replacement. We use this as a baseline.
+template <typename IdxT>
+auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
+  -> raft::device_vector<IdxT, IdxT>
+{
+  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+
+  auto indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+  raft::random::RngState state(123456ULL);
+  raft::random::uniformInt(
+    res, state, indices.data_handle(), n_subsamples, IdxT(0), IdxT(n_samples));
+  return indices;
+}
+
+template <typename T>
+struct sample : public fixture {
+  sample(const sample_inputs& p)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * GiB),
+      in(make_device_vector<T, int64_t>(res, p.n_samples)),
+      out(make_device_vector<T, int64_t>(res, p.n_train))
+  {
+    rmm::mr::set_current_device_resource(&pool_mr);
+    raft::random::RngState r(123456ULL);
+  }
+
+  ~sample() { rmm::mr::set_current_device_resource(old_mr); }
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    raft::random::RngState r(123456ULL);
+    loop_on_state(state, [this, &r]() {
+      if (params.method == 1) {
+        this->out =
+          bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+      } else if (params.method == 2) {
+        this->out = raft::random::excess_subsample<T, int64_t>(
+          this->res, r, this->params.n_samples, this->params.n_train);
+      }
+    });
+  }
+
+ private:
+  float GiB = 1073741824.0f;
+  raft::device_resources res;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
+  sample_inputs params;
+  raft::device_vector<T, int64_t> out, in;
+};  // struct sample
+
+const std::vector<sample_inputs> input_vecs = {{100000000, 10000000, 1},
+                                               {100000000, 50000000, 1},
+                                               {100000000, 100000000, 1},
+                                               {100000000, 10000000, 2},
+                                               {100000000, 50000000, 2},
+                                               {100000000, 100000000, 2}};
+
+RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
+
+}  // namespace raft::bench::random
diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
index 176309c8ce..3fb7b3005b 100644
--- a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
+++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -126,7 +126,8 @@ inline dtype_t get_numpy_dtype()
 }
 
 #if defined(_RAFT_HAS_CUDA)
-template <typename T, typename std::enable_if_t<std::is_same_v<T, half>, bool> = true>
+template <typename T,
+          typename std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, half>, bool> = true>
 inline dtype_t get_numpy_dtype()
 {
   return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'e', sizeof(T)};
diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp
index 8afd1f16c6..82db75de84 100644
--- a/cpp/include/raft/core/detail/nvtx.hpp
+++ b/cpp/include/raft/core/detail/nvtx.hpp
@@ -28,6 +28,7 @@
 #include <string>
 #include <type_traits>
 #include <unordered_map>
+#include <vector>
 
 namespace raft::common::nvtx::detail {
 
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn.cuh b/cpp/include/raft/distance/detail/fused_distance_nn.cuh
new file mode 100644
index 0000000000..4fbfdc8755
--- /dev/null
+++ b/cpp/include/raft/distance/detail/fused_distance_nn.cuh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
+#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_cosine_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/helper_structs.cuh>
+#include <raft/distance/detail/fused_distance_nn/simt_kernel.cuh>
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/contractions.cuh>  // Policy
+#include <raft/util/arch.cuh>            // raft::util::arch::SM_*
+#include <raft/util/cuda_utils.cuh>      // raft::ceildiv, raft::shfl
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+
+namespace detail {
+
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedDistanceNNImpl(OutT* min,
+                         const DataT* x,
+                         const DataT* y,
+                         const DataT* xn,
+                         const DataT* yn,
+                         IdxT m,
+                         IdxT n,
+                         IdxT k,
+                         int* workspace,
+                         ReduceOpT redOp,
+                         KVPReduceOpT pairRedOp,
+                         bool sqrt,
+                         bool initOutBuffer,
+                         bool isRowMajor,
+                         raft::distance::DistanceType metric,
+                         float metric_arg,
+                         cudaStream_t stream)
+{
+  // The kernel policy is determined by fusedDistanceNN.
+  typedef Policy P;
+
+  dim3 blk(P::Nthreads);
+  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
+  constexpr auto maxVal = std::numeric_limits<DataT>::max();
+  typedef KeyValuePair<IdxT, DataT> KVPair;
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
+  if (initOutBuffer) {
+    initKernel<DataT, OutT, IdxT, ReduceOpT>
+      <<<nblks, P::Nthreads, 0, stream>>>(min, m, maxVal, redOp);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+
+  switch (metric) {
+    case raft::distance::DistanceType::CosineExpanded:
+      fusedCosineNN<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
+        min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, stream);
+      break;
+    case raft::distance::DistanceType::L2SqrtExpanded:
+    case raft::distance::DistanceType::L2Expanded:
+      // initOutBuffer is take care by fusedDistanceNNImpl() so we set it false to fusedL2NNImpl.
+      fusedL2NNImpl<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
+        min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, false, stream);
+      break;
+    default: assert("only cosine/l2 metric is supported with fusedDistanceNN\n"); break;
+  }
+}
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
index 32e9214a0d..186715851b 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
@@ -611,6 +611,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
         ++tensor_iterator;
       }
     }
+    tensor_iterator.dumpToGmem();
   }
 
   /// Helper to invoke the output functor over each vector of output
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/cutlass_base.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/cutlass_base.cuh
index e06d996c49..b2fc5e0cc7 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/cutlass_base.cuh
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/cutlass_base.cuh
@@ -33,6 +33,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/semaphore>
+
 #include <cutlass/cutlass.h>
 #include <cutlass/gemm/device/gemm.h>
 #include <cutlass/gemm/device/gemm_grouped.h>
@@ -46,6 +48,14 @@ namespace raft {
 namespace distance {
 namespace detail {
 
+template <typename IdxT>
+RAFT_KERNEL initBinMutexKernel(cuda::binary_semaphore<cuda::thread_scope_device>* mut, IdxT m)
+{
+  auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  if (tid < m) { mut[tid].release(); }
+}
+
 template <typename DataT,
           typename AccT,
           typename OutT,
@@ -87,8 +97,14 @@ void cutlassFusedDistanceNN(const DataT* x,
     KVPReduceOpT>;
   constexpr int batch_count = 1;
 
+  rmm::device_uvector<cuda::binary_semaphore<cuda::thread_scope_device>> bin_mutex(m, stream);
+
+  int blks_ = (m / 256) + 1;
+
+  initBinMutexKernel<<<blks_, 256, 0, stream>>>(bin_mutex.data(), m);
+
   typename EpilogueOutputOp::Params epilog_op_param(
-    dist_op, cg_reduce_op, redOp, pairRedOp, mutexes);
+    dist_op, cg_reduce_op, redOp, pairRedOp, mutexes, bin_mutex.data());
 
   // Number of pipelines you want to use
   constexpr int NumStages = 3;
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/epilogue_elementwise.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/epilogue_elementwise.cuh
index f9ab394585..e69b2486df 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/epilogue_elementwise.cuh
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/epilogue_elementwise.cuh
@@ -56,6 +56,8 @@
 
 #pragma once
 
+#include <cuda/semaphore>
+
 #include <cutlass/array.h>
 #include <cutlass/cutlass.h>
 #include <cutlass/epilogue/thread/activation.h>
@@ -121,6 +123,7 @@ class FusedDistanceNNEpilogueElementwise {
     KVPReduceOpT_ pair_redop_;
     ReduceOpT_ red_op_;
     int* mutexes_;
+    cuda::binary_semaphore<cuda::thread_scope_device>* bin_mutex_;
     using CGReduceT = CGReduceOp_;
     //
     // Methods
@@ -130,12 +133,14 @@ class FusedDistanceNNEpilogueElementwise {
            CGReduceOp cg_reduce_op,
            ReduceOpT_ red_op,
            KVPReduceOpT_ pair_redop,
-           int* mutexes)
+           int* mutexes,
+           cuda::binary_semaphore<cuda::thread_scope_device>* bin_mutex)
       : cg_reduce_op(cg_reduce_op),
         dist_op_(dist_op),
         pair_redop_(pair_redop),
         red_op_(red_op),
-        mutexes_(mutexes)
+        mutexes_(mutexes),
+        bin_mutex_(bin_mutex)
     {
     }
 
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/fused_cosine_nn.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/fused_cosine_nn.cuh
new file mode 100644
index 0000000000..f29c8b4d4c
--- /dev/null
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/fused_cosine_nn.cuh
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/cosine.cuh>  // ops::l2_exp_distance_op
+#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
+#include <raft/distance/detail/fused_distance_nn/helper_structs.cuh>
+#include <raft/distance/detail/fused_distance_nn/simt_kernel.cuh>
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/linalg/contractions.cuh>                     // Policy
+#include <raft/util/arch.cuh>                               // raft::util::arch::SM_*
+#include <raft/util/cuda_utils.cuh>                         // raft::ceildiv, raft::shfl
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+
+namespace detail {
+
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedCosineNN(OutT* min,
+                   const DataT* x,
+                   const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   int* workspace,
+                   ReduceOpT redOp,
+                   KVPReduceOpT pairRedOp,
+                   bool sqrt,
+                   cudaStream_t stream)
+{
+  // The kernel policy is determined by fusedL2NN.
+  typedef Policy P;
+
+  dim3 blk(P::Nthreads);
+  constexpr auto maxVal = std::numeric_limits<DataT>::max();
+  typedef KeyValuePair<IdxT, DataT> KVPair;
+
+  namespace arch = raft::util::arch;
+  using AccT     = DataT;
+  ops::cosine_distance_op<DataT, AccT, IdxT> distance_op{};
+
+  raft::identity_op fin_op{};
+
+  auto kernel = fusedDistanceNNkernel<DataT,
+                                      OutT,
+                                      IdxT,
+                                      P,
+                                      ReduceOpT,
+                                      KVPReduceOpT,
+                                      decltype(distance_op),
+                                      decltype(fin_op)>;
+
+  // Get pointer to fp32 SIMT kernel to determine the runtime architecture of the
+  // current system. Other methods to determine the architecture (that do not
+  // require a pointer) can be error prone. See:
+  // https://github.com/NVIDIA/cub/issues/545
+  void* kernel_ptr   = reinterpret_cast<void*>(kernel);
+  auto runtime_arch  = arch::kernel_virtual_arch(kernel_ptr);
+  auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future());
+
+  if (cutlass_range.contains(runtime_arch)) {
+    // If device is SM_80 or later, use CUTLASS-based kernel.
+    using cosineOp              = raft::distance::detail::ops::cosine_cutlass_op<DataT, DataT>;
+    using kvp_cg_min_reduce_op_ = kvp_cg_min_reduce_op<DataT, IdxT, OutT>;
+    kvp_cg_min_reduce_op_ cg_reduce_op;
+    cosineOp cosine_dist_op;
+
+    IdxT lda, ldb, ldd;
+    lda = k, ldb = k, ldd = n;
+
+    cutlassFusedDistanceNN<DataT,
+                           DataT,
+                           OutT,
+                           IdxT,
+                           P::Veclen,
+                           decltype(cg_reduce_op),
+                           decltype(cosine_dist_op),
+                           ReduceOpT,
+                           KVPReduceOpT>(x,
+                                         y,
+                                         xn,
+                                         yn,
+                                         m,
+                                         n,
+                                         k,
+                                         lda,
+                                         ldb,
+                                         ldd,
+                                         min,
+                                         workspace,
+                                         cg_reduce_op,
+                                         cosine_dist_op,
+                                         redOp,
+                                         pairRedOp,
+                                         stream);
+  } else {
+    // If device less than SM_80, use fp32 SIMT kernel.
+    constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
+    dim3 grid                  = launchConfigGenerator<P>(m, n, shmemSize, kernel);
+
+    kernel<<<grid, blk, shmemSize, stream>>>(
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, distance_op, fin_op);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh
new file mode 100644
index 0000000000..65475e73c7
--- /dev/null
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
+#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
+#include <raft/distance/detail/fused_distance_nn/helper_structs.cuh>
+#include <raft/distance/detail/fused_distance_nn/simt_kernel.cuh>
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/linalg/contractions.cuh>                     // Policy
+#include <raft/util/arch.cuh>                               // raft::util::arch::SM_*
+#include <raft/util/cuda_utils.cuh>                         // raft::ceildiv, raft::shfl
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+
+namespace detail {
+
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedL2NNImpl(OutT* min,
+                   const DataT* x,
+                   const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   int* workspace,
+                   ReduceOpT redOp,
+                   KVPReduceOpT pairRedOp,
+                   bool sqrt,
+                   bool initOutBuffer,
+                   cudaStream_t stream)
+{
+  // The kernel policy is determined by fusedL2NN.
+  typedef Policy P;
+
+  dim3 blk(P::Nthreads);
+  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
+  constexpr auto maxVal = std::numeric_limits<DataT>::max();
+  typedef KeyValuePair<IdxT, DataT> KVPair;
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
+  if (initOutBuffer) {
+    initKernel<DataT, OutT, IdxT, ReduceOpT>
+      <<<nblks, P::Nthreads, 0, stream>>>(min, m, maxVal, redOp);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+
+  namespace arch = raft::util::arch;
+  using AccT     = DataT;
+  ops::l2_exp_distance_op<DataT, AccT, IdxT> distance_op{sqrt};
+
+  raft::identity_op fin_op{};
+
+  auto kernel = fusedDistanceNNkernel<DataT,
+                                      OutT,
+                                      IdxT,
+                                      P,
+                                      ReduceOpT,
+                                      KVPReduceOpT,
+                                      decltype(distance_op),
+                                      decltype(fin_op)>;
+
+  // Get pointer to fp32 SIMT kernel to determine the best compute architecture
+  // out of all for which the kernel was compiled for that matches closely
+  // to the current device. Other methods to determine the architecture (that do not
+  // require a pointer) can be error prone. See:
+  // https://github.com/NVIDIA/cub/issues/545
+  void* kernel_ptr   = reinterpret_cast<void*>(kernel);
+  auto runtime_arch  = arch::kernel_virtual_arch(kernel_ptr);
+  auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future());
+
+  if (cutlass_range.contains(runtime_arch)) {
+    // If device is SM_80 or later, use CUTLASS-based kernel.
+    using L2Op                  = raft::distance::detail::ops::l2_exp_cutlass_op<DataT, DataT>;
+    using kvp_cg_min_reduce_op_ = kvp_cg_min_reduce_op<DataT, IdxT, OutT>;
+    kvp_cg_min_reduce_op_ cg_reduce_op;
+    L2Op L2_dist_op(sqrt);
+
+    IdxT lda, ldb, ldd;
+    lda = k, ldb = k, ldd = n;
+
+    cutlassFusedDistanceNN<DataT,
+                           DataT,
+                           OutT,
+                           IdxT,
+                           P::Veclen,
+                           kvp_cg_min_reduce_op_,
+                           L2Op,
+                           ReduceOpT,
+                           KVPReduceOpT>(x,
+                                         y,
+                                         xn,
+                                         yn,
+                                         m,
+                                         n,
+                                         k,
+                                         lda,
+                                         ldb,
+                                         ldd,
+                                         min,
+                                         workspace,
+                                         cg_reduce_op,
+                                         L2_dist_op,
+                                         redOp,
+                                         pairRedOp,
+                                         stream);
+  } else {
+    // If device less than SM_80, use fp32 SIMT kernel.
+    constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
+    dim3 grid                  = launchConfigGenerator<P>(m, n, shmemSize, kernel);
+
+    kernel<<<grid, blk, shmemSize, stream>>>(
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, distance_op, fin_op);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/helper_structs.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/helper_structs.cuh
new file mode 100644
index 0000000000..e056c5d397
--- /dev/null
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/helper_structs.cuh
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
+#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
+#include <raft/distance/detail/fused_distance_nn/simt_kernel.cuh>
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/linalg/contractions.cuh>                     // Policy
+#include <raft/util/arch.cuh>                               // raft::util::arch::SM_*
+#include <raft/util/cuda_utils.cuh>                         // raft::ceildiv, raft::shfl
+#include <raft/util/device_atomics.cuh>
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+
+namespace detail {
+
+template <typename LabelT, typename DataT>
+struct KVPMinReduceImpl {
+  typedef raft::KeyValuePair<LabelT, DataT> KVP;
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+
+};  // KVPMinReduce
+
+template <typename LabelT, typename DataT>
+struct MinAndDistanceReduceOpImpl {
+  typedef typename raft::KeyValuePair<LabelT, DataT> KVP;
+
+  DI void operator()(LabelT rid, KVP* out, const KVP& other) const
+  {
+    if (other.value < out->value) {
+      out->key   = other.key;
+      out->value = other.value;
+    }
+  }
+  DI void operator()(LabelT rid, volatile KVP* out, const KVP& other) const
+  {
+    if (other.value < out->value) {
+      out->key   = other.key;
+      out->value = other.value;
+    }
+  }
+
+  DI void operator()(LabelT rid, DataT* out, const KVP& other) const
+  {
+    if (other.value < *out) { *out = other.value; }
+  }
+
+  DI void operator()(LabelT rid, volatile DataT* out, const KVP& other) const
+  {
+    if (other.value < *out) { *out = other.value; }
+  }
+
+  DI void operator()(LabelT rid, DataT* out, const DataT& other) const
+  {
+    if (other < *out) { *out = other; }
+  }
+
+  DI void operator()(LabelT rid, volatile DataT* out, const DataT& other) const
+  {
+    if (other < *out) { *out = other; }
+  }
+
+  DI void init(DataT* out, DataT maxVal) const { *out = maxVal; }
+  DI void init(KVP* out, DataT maxVal) const
+  {
+    out->value = maxVal;
+    out->key   = 0xfffffff0;
+  }
+
+  DI void init_key(DataT& out, LabelT idx) const { return; }
+  DI void init_key(KVP& out, LabelT idx) const { out.key = idx; }
+
+  DI DataT get_value(KVP& out) const { return out.value; }
+  DI DataT get_value(DataT& out) const { return out; }
+};
+
+template <typename LabelT, typename DataT>
+struct MinReduceOpImpl {
+  typedef typename raft::KeyValuePair<LabelT, DataT> KVP;
+  DI void operator()(LabelT rid, DataT* out, const KVP& other)
+  {
+    if (other.value < *out) { *out = other.value; }
+  }
+
+  DI void init(DataT* out, DataT maxVal) { *out = maxVal; }
+};
+
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
+RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+{
+  auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (tid < m) { redOp.init(min + tid, maxVal); }
+}
+
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
+void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t stream)
+{
+  auto blks = raft::ceildiv(m, 256);
+  initKernel<DataT, OutT, IdxT><<<blks, 256, 0, stream>>>(min, m, maxVal, redOp);
+}
+
+// cg::reduce functor for FusedDistanceNN used in its cutlass version
+// to output the min distance value & key(loc id).
+// This is used in fused_distance_nn/predicated_tile_iterator_reduced_vec.h
+// store_with_byte_offset() passed to cg::reduce() & select_reduce.
+template <typename AccType, typename Index, typename OutType>
+struct kvp_cg_min_reduce_op {
+  typedef typename raft::KeyValuePair<Index, AccType> KVP;
+
+  __host__ __device__ kvp_cg_min_reduce_op() noexcept {};
+
+  using AccTypeT = AccType;
+  using IndexT   = Index;
+  // functor signature.
+  __host__ __device__ KVP operator()(KVP a, KVP b) const { return a.value < b.value ? a : b; }
+
+  __host__ __device__ AccType operator()(AccType a, AccType b) const { return min(a, b); }
+
+  __host__ __device__ bool isAmin(AccType a, AccType b) const { return a < b ? true : false; }
+};
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/persistent_gemm.h b/cpp/include/raft/distance/detail/fused_distance_nn/persistent_gemm.h
index 4f05251705..f1a7c728e9 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/persistent_gemm.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/persistent_gemm.h
@@ -180,8 +180,7 @@ struct FusedDistanceNNPersistent {
     /// Default ctor
     CUTLASS_HOST_DEVICE
     Arguments()
-      :  // problem_count(0),
-        threadblock_count(0),
+      : threadblock_count(0),
         ptr_A(nullptr),
         ptr_B(nullptr),
         ptr_C(nullptr),
@@ -235,7 +234,6 @@ struct FusedDistanceNNPersistent {
 
   /// Parameters structure
   struct Params {
-    // typename ProblemVisitor::Params problem_visitor;
     temp_problem_visitor problem_visitor;
     int threadblock_count;
 
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h b/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h
index 936b9f2c89..d61018593f 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h
@@ -322,10 +322,8 @@ class PredicatedTileIteratorReducedVec {
   /// Parameters structure containing reference and precomputed state.
   Params params_;
 
-  /// Byte-level pointer
-  uint8_t* byte_pointer_;
   /// Byte-level pointer first tile offset of this threadblock.
-  uint8_t* first_tile_byte_pointer_;
+  volatile uint8_t* first_tile_byte_pointer_;
 
   /// Array of boolean values to contain steady-state predicates
   Mask mask_;
@@ -350,6 +348,8 @@ class PredicatedTileIteratorReducedVec {
   /// Scatter indices
   int const* indices_;
 
+  const int do_gmem_reduction_;
+
   //
   // Static asserts about internal strides
   //
@@ -360,7 +360,6 @@ class PredicatedTileIteratorReducedVec {
 
  protected:
   SharedStorage& shared_storage_;
-  const bool& do_gmem_reduction_;
 
  private:
   //
@@ -374,10 +373,10 @@ class PredicatedTileIteratorReducedVec {
   CUTLASS_DEVICE
   PredicatedTileIteratorReducedVec(SharedStorage& shared_storage,
                                    Params const& params,
-                                   Element* pointer,
+                                   volatile Element* pointer,
                                    TensorCoord extent,
                                    int thread_idx,
-                                   const bool& do_gmem_reduction,
+                                   const bool do_gmem_reduction,
                                    TensorCoord threadblock_offset = TensorCoord(),
                                    int const* indices             = nullptr)
     : params_(params),
@@ -409,6 +408,7 @@ class PredicatedTileIteratorReducedVec {
       EpilogueOpParams const& user_params = params_.user_param;
       shared_storage_.initSmem(user_params);
     }
+    __syncthreads();
 
     // Null pointer performs no accesses
     if (!pointer) { mask_.clear(); }
@@ -416,66 +416,53 @@ class PredicatedTileIteratorReducedVec {
     if (ScatterD && !indices) { mask_.clear(); }
 
     // Initialize pointer
-    first_tile_byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+    first_tile_byte_pointer_ = reinterpret_cast<volatile uint8_t*>(pointer) +
                                LongIndex(block_offset.row()) * LongIndex(params_.stride);
 
-    if (ScatterD) {
-      byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
-                      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-    }
-
     // Initialize internal state counter
     state_[0] = state_[1] = state_[2] = 0;
   }
 
-  /// Destructor
-  CUTLASS_DEVICE
-  ~PredicatedTileIteratorReducedVec()
+  CUTLASS_DEVICE void dumpToGmem()
   {
+    if (block_start_row_first_tile_ >= extent_row_) { return; }
+
     if (do_gmem_reduction_) {
       EpilogueOpParams const& user_params = params_.user_param;
-      auto gmem_ptr                       = reinterpret_cast<Element*>(first_tile_byte_pointer_);
-      Element* shared_elem_arr            = shared_storage_.data();
       const uint32_t mutex_id             = (block_start_row_first_tile_ / total_rows);
-      bool useGmemMutex = (gridDim.x != ((extent_row_ - 1 + total_rows) / total_rows));
-      // If this is not optimal grid size perform mutex based gmem reduce.
-      if (useGmemMutex) {
-        // single lock per block for multiple rows
-        if (threadIdx.x == 0 && block_start_row_first_tile_ < extent_row_) {
-          // acquire mutex lock.
-          unsigned int ns = 8;
-          while (atomicCAS(user_params.mutexes_ + mutex_id, 0, 1) == 1) {
-            __nanosleep(ns);
-            if (ns < 256) { ns *= 2; }
-          }
+      const bool useGmemMutex  = (gridDim.x != ((extent_row_ - 1 + total_rows) / total_rows));
+      int row                  = threadIdx.x;
+      Element* shared_elem_arr = shared_storage_.data();
+      Element row_local_min;
+      if (row < total_rows) { row_local_min = shared_elem_arr[row]; }
+
+      // single lock per block for multiple rows
+      if (useGmemMutex && threadIdx.x == 0) { user_params.bin_mutex_[mutex_id].acquire(); }
+      __syncthreads();
+
+      if (row < total_rows) {
+        volatile Element* gmem_ptr = reinterpret_cast<volatile Element*>(first_tile_byte_pointer_);
+
+        if ((block_start_row_first_tile_ + row) < extent_row_) {
+          user_params.red_op_(block_start_row_first_tile_ + row, (gmem_ptr + row), row_local_min);
         }
       }
 
       __syncthreads();
-      for (int row = threadIdx.x; row < total_rows; row += blockDim.x) {
-        if (block_start_row_first_tile_ + row < extent_row_) {
-          user_params.red_op_(
-            block_start_row_first_tile_ + row, &gmem_ptr[row], shared_elem_arr[row]);
-        }
-      }
+      __threadfence();
 
-      if (useGmemMutex) {
-        __threadfence();
-        __syncthreads();
-        if (threadIdx.x == 0 && block_start_row_first_tile_ < extent_row_) {
-          // release mutex lock.
-          atomicExch(user_params.mutexes_ + mutex_id, 0);
-        }
+      if (useGmemMutex && (threadIdx.x == 0)) {
+        // release mutex lock.
+        user_params.bin_mutex_[mutex_id].release();
       }
+      shared_storage_.initSmem(user_params);
+      __syncthreads();
     }
   }
 
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset)
-  {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
+  /// Destructor
+  CUTLASS_DEVICE
+  ~PredicatedTileIteratorReducedVec() {}
 
   /// Performs reduction and Stores a reduced output to memory
   CUTLASS_DEVICE
@@ -515,9 +502,6 @@ class PredicatedTileIteratorReducedVec {
           user_params.red_op_.init(&red_val, maxVal);
 
           if (row_guard) {
-            const int iter_row      = (row_id % total_rows);
-            const auto prev_red_val = user_params.red_op_.get_value(shared_elem_arr[iter_row]);
-
             CUTLASS_PRAGMA_UNROLL
             for (int column = 0; column < ThreadMap::Iterations::kColumn * kElementsPerAccess;
                  ++column) {
@@ -536,6 +520,10 @@ class PredicatedTileIteratorReducedVec {
                 user_params.red_op_(row_id, &red_val, this_val);
               }
             }
+          }
+          const int iter_row      = (row_id % total_rows);
+          const auto prev_red_val = user_params.red_op_.get_value(shared_elem_arr[iter_row]);
+          if (row_guard) {
             // select_reduce doesn't need to use `red_op_` as at the warp level we use cg_reduce_op,
             // this satisfies the requirement of mst/single linkage of checking colors buffer.
             select_reduce<cg_reduce_t, tile32_t, OutIdxT, OutValT, Element> red_obj(
@@ -544,6 +532,7 @@ class PredicatedTileIteratorReducedVec {
         }
       }
     }
+    __syncthreads();
   }
 
   /// Stores a fragment to memory
@@ -575,14 +564,11 @@ class PredicatedTileIteratorReducedVec {
   {
     ++state_[0];
 
-    if (!ScatterD) { byte_pointer_ += params_.advance_row; }
-
     thread_start_row_ += ThreadMap::Shape::kRow;
 
     if (state_[0] == ThreadMap::Count::kRow) {
       state_[0] = 0;
       ++state_[1];
-      byte_pointer_ += params_.advance_group;
 
       thread_start_row_ +=
         (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
@@ -590,18 +576,13 @@ class PredicatedTileIteratorReducedVec {
       if (state_[1] == ThreadMap::Count::kGroup) {
         state_[1] = 0;
         ++state_[2];
-        byte_pointer_ += params_.advance_cluster;
 
         thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup *
                              ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
 
-        if (state_[2] == ThreadMap::Count::kCluster) {
-          state_[2] = 0;
-          byte_pointer_ += params_.advance_tile;
-        }
+        if (state_[2] == ThreadMap::Count::kCluster) { state_[2] = 0; }
       }
     }
-
     return *this;
   }
 
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/simt_kernel.cuh b/cpp/include/raft/distance/detail/fused_distance_nn/simt_kernel.cuh
new file mode 100644
index 0000000000..7417fd5dac
--- /dev/null
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/simt_kernel.cuh
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                                // raft::KeyValuePair
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>     // ops::l2_exp_distance_op
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/linalg/contractions.cuh>                     // Policy
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+namespace detail {
+
+// TODO: specialize this function for MinAndDistanceReduceOp<int, float>
+// with atomicCAS of 64 bit which will eliminate mutex and shfls
+template <typename P, typename OutT, typename IdxT, typename KVPair, typename ReduceOpT>
+DI void updateReducedVal(
+  int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY)
+{
+  const auto lid      = threadIdx.x % raft::WarpSize;
+  const auto accrowid = threadIdx.x / P::AccThCols;
+
+  // Update each output row in order within a warp. This will resolve hang
+  // issues with pre-Volta architectures
+#pragma unroll
+  for (int j = 0; j < (raft::WarpSize / P::AccThCols); j++) {
+    if (lid == j * P::AccThCols) {
+#pragma unroll
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+        auto rid = gridStrideY + accrowid + i * P::AccThRows;
+        if (rid < m) {
+          auto value = val[i];
+          while (atomicCAS(mutex + rid, 0, 1) == 1)
+            ;
+          __threadfence();
+          red_op(rid, min + rid, value);
+          __threadfence();
+          atomicCAS(mutex + rid, 1, 0);
+        }
+      }
+    }
+  }
+}
+
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          typename P,
+          typename ReduceOpT,
+          typename KVPReduceOpT,
+          typename OpT,
+          typename FinalLambda>
+__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedDistanceNNkernel(OutT* min,
+                                                                    const DataT* x,
+                                                                    const DataT* y,
+                                                                    const DataT* xn,
+                                                                    const DataT* yn,
+                                                                    IdxT m,
+                                                                    IdxT n,
+                                                                    IdxT k,
+                                                                    DataT maxVal,
+                                                                    int* mutex,
+                                                                    ReduceOpT redOp,
+                                                                    KVPReduceOpT pairRedOp,
+                                                                    OpT distance_op,
+                                                                    FinalLambda fin_op)
+{
+// compile only if below non-ampere arch.
+#if __CUDA_ARCH__ < 800
+  extern __shared__ char smem[];
+
+  typedef KeyValuePair<IdxT, DataT> KVPair;
+  KVPair val[P::AccRowsPerTh];
+#pragma unroll
+  for (int i = 0; i < P::AccRowsPerTh; ++i) {
+    val[i] = {0, maxVal};
+  }
+
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__(
+                         DataT acc[P::AccRowsPerTh][P::AccColsPerTh],
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
+                         IdxT gridStrideY) {
+    KVPReduceOpT pairRed_op(pairRedOp);
+
+    // intra thread reduce
+    const auto acccolid = threadIdx.x % P::AccThCols;
+    const auto accrowid = threadIdx.x / P::AccThCols;
+#pragma unroll
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < P::AccColsPerTh; ++j) {
+        auto tmpkey = acccolid + j * P::AccThCols + gridStrideX;
+        KVPair tmp  = {tmpkey, acc[i][j]};
+        if (tmpkey < n) {
+          val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        }
+      }
+    }
+  };
+
+  auto rowEpilog_lambda =
+    [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) {
+      KVPReduceOpT pairRed_op(pairRedOp);
+      ReduceOpT red_op(redOp);
+
+      const auto accrowid = threadIdx.x / P::AccThCols;
+      const auto lid      = raft::laneId();
+
+    // reduce
+#pragma unroll
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+        for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
+          // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols,
+          // but the shfl op applies the modulo internally.
+          auto tmpkey   = raft::shfl(val[i].key, lid + j, P::AccThCols);
+          auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols);
+          KVPair tmp    = {tmpkey, tmpvalue};
+          val[i]        = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        }
+      }
+
+      updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op, m, gridStrideY);
+
+    // reset the val array.
+#pragma unroll
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+        val[i] = {0, maxVal};
+      }
+    };
+
+  IdxT lda = k, ldb = k, ldd = n;
+  constexpr bool row_major = true;
+  constexpr bool write_out = false;
+  PairwiseDistances<DataT,
+                    DataT,  // OutT (unused in PairwiseDistances)
+                    IdxT,
+                    P,
+                    decltype(distance_op),
+                    decltype(epilog_lambda),
+                    FinalLambda,
+                    decltype(rowEpilog_lambda),
+                    row_major,
+                    write_out>
+    obj(x,
+        y,
+        m,
+        n,
+        k,
+        lda,
+        ldb,
+        ldd,
+        xn,
+        yn,
+        nullptr,  // Output pointer
+        smem,
+        distance_op,
+        epilog_lambda,
+        fin_op,
+        rowEpilog_lambda);
+  obj.run();
+#endif
+}
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh
index 536e4937ab..3e3699766f 100644
--- a/cpp/include/raft/distance/detail/masked_nn.cuh
+++ b/cpp/include/raft/distance/detail/masked_nn.cuh
@@ -19,7 +19,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/distance/detail/compress_to_bits.cuh>
-#include <raft/distance/detail/fused_l2_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh>
 #include <raft/distance/detail/masked_distance_base.cuh>
 #include <raft/linalg/contractions.cuh>
 #include <raft/util/cuda_utils.cuh>
diff --git a/cpp/include/raft/distance/fused_distance_nn-ext.cuh b/cpp/include/raft/distance/fused_distance_nn-ext.cuh
new file mode 100644
index 0000000000..263bbcea81
--- /dev/null
+++ b/cpp/include/raft/distance/fused_distance_nn-ext.cuh
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                            // raft::KeyValuePair
+#include <raft/core/resources.hpp>                      // raft::resources
+#include <raft/distance/fused_distance_nn_helpers.cuh>  // include initialize and reduce operations
+#include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
+
+#include <cstdint>  // int64_t
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+namespace raft {
+namespace distance {
+
+template <typename DataT, typename OutT, typename IdxT>
+void fusedDistanceNNMinReduce(OutT* min,
+                              const DataT* x,
+                              const DataT* y,
+                              const DataT* xn,
+                              const DataT* yn,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              void* workspace,
+                              bool sqrt,
+                              bool initOutBuffer,
+                              bool isRowMajor,
+                              raft::distance::DistanceType metric,
+                              float metric_arg,
+                              cudaStream_t stream) RAFT_EXPLICIT;
+
+}  // namespace distance
+}  // namespace raft
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#define instantiate_raft_distance_fusedDistanceNNMinReduce(DataT, OutT, IdxT)       \
+  extern template void raft::distance::fusedDistanceNNMinReduce<DataT, OutT, IdxT>( \
+    OutT * min,                                                                     \
+    const DataT* x,                                                                 \
+    const DataT* y,                                                                 \
+    const DataT* xn,                                                                \
+    const DataT* yn,                                                                \
+    IdxT m,                                                                         \
+    IdxT n,                                                                         \
+    IdxT k,                                                                         \
+    void* workspace,                                                                \
+    bool sqrt,                                                                      \
+    bool initOutBuffer,                                                             \
+    bool isRowMajor,                                                                \
+    raft::distance::DistanceType metric,                                            \
+    float metric_arg,                                                               \
+    cudaStream_t stream)
+
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, float, int);
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, float, int64_t);
+
+// We can't have comma's in the macro expansion, so we use the COMMA macro:
+#define COMMA ,
+
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, raft::KeyValuePair<int COMMA float>, int);
+instantiate_raft_distance_fusedDistanceNNMinReduce(float,
+                                                   raft::KeyValuePair<int64_t COMMA float>,
+                                                   int64_t);
+
+#undef COMMA
+
+#undef instantiate_raft_distance_fusedDistanceNNMinReduce
diff --git a/cpp/include/raft/distance/fused_distance_nn-inl.cuh b/cpp/include/raft/distance/fused_distance_nn-inl.cuh
new file mode 100644
index 0000000000..ffe86a1c04
--- /dev/null
+++ b/cpp/include/raft/distance/fused_distance_nn-inl.cuh
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FUSED_DISTANCE_NN_H
+#define __FUSED_DISTANCE_NN_H
+
+#pragma once
+
+#include <raft/core/resources.hpp>
+#include <raft/distance/detail/fused_distance_nn.cuh>
+#include <raft/distance/fused_distance_nn_helpers.cuh>
+#include <raft/linalg/contractions.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <cub/cub.cuh>
+
+#include <stdint.h>
+
+#include <limits>
+#include <type_traits>
+
+namespace raft {
+namespace distance {
+
+/**
+ * \ingroup fused_l2_nn
+ * @{
+ */
+/**
+ * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call.
+ *
+ * The benefits of such a call are 2-fold: 1) eliminate the need for an
+ * intermediate buffer to store the output of gemm 2) reduce the memory read
+ * traffic on this intermediate buffer, otherwise needed during the reduction
+ * phase for 1-NN.
+ *
+ * @tparam DataT      data type
+ * @tparam OutT       output type to either store 1-NN indices and their minimum
+ *                    distances or store only the min distances. Accordingly, one
+ *                    has to pass an appropriate `ReduceOpT`
+ * @tparam IdxT       indexing arithmetic type
+ * @tparam ReduceOpT  A struct to perform the final needed reduction operation
+ *                    and also to initialize the output array elements with the
+ *                    appropriate initial value needed for reduction.
+ * @tparam KVPReduceOpT A struct providing functions for key-value pair comparison.
+ *
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
+ * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
+ * @param[in]  redOp         reduction operator in the epilogue
+ * @param[in]  pairRedOp     reduction operation on key value pairs
+ * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
+ * @param[in]  initOutBuffer whether to initialize the output buffer before the
+ *                           main kernel launch
+ * @param[in]  isRowMajor    whether the input/output is row or column major.
+ * @param[in]  metric        Distance metric to be used (supports L2, cosine)
+ * @param[in]  metric_arg    power argument for distances like Minkowski (not supported for now)
+ * @param[in]  stream        cuda stream
+ */
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
+void fusedDistanceNN(OutT* min,
+                     const DataT* x,
+                     const DataT* y,
+                     const DataT* xn,
+                     const DataT* yn,
+                     IdxT m,
+                     IdxT n,
+                     IdxT k,
+                     void* workspace,
+                     ReduceOpT redOp,
+                     KVPReduceOpT pairRedOp,
+                     bool sqrt,
+                     bool initOutBuffer,
+                     bool isRowMajor,
+                     raft::distance::DistanceType metric,
+                     float metric_arg,
+                     cudaStream_t stream)
+{
+  ASSERT(isRowMajor, "fusedDistanceNN only supports row major inputs");
+  // When k is smaller than 32, the Policy4x4 results in redundant calculations
+  // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead
+  // that uses tiles with a smaller value of k.
+  bool is_skinny = k < 32;
+
+  size_t bytes = sizeof(DataT) * k;
+  auto px      = reinterpret_cast<uintptr_t>(x);
+  auto py      = reinterpret_cast<uintptr_t>(y);
+  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0 && px % 16 == 0 && py % 16 == 0) {
+    if (is_skinny) {
+      detail::fusedDistanceNNImpl<
+        DataT,
+        OutT,
+        IdxT,
+        typename linalg::Policy4x4Skinny<DataT, 16 / sizeof(DataT)>::Policy,
+        ReduceOpT>(min,
+                   x,
+                   y,
+                   xn,
+                   yn,
+                   m,
+                   n,
+                   k,
+                   (int*)workspace,
+                   redOp,
+                   pairRedOp,
+                   sqrt,
+                   initOutBuffer,
+                   isRowMajor,
+                   metric,
+                   metric_arg,
+                   stream);
+    } else {
+      detail::fusedDistanceNNImpl<DataT,
+                                  OutT,
+                                  IdxT,
+                                  typename linalg::Policy4x4<DataT, 16 / sizeof(DataT)>::Policy,
+                                  ReduceOpT>(min,
+                                             x,
+                                             y,
+                                             xn,
+                                             yn,
+                                             m,
+                                             n,
+                                             k,
+                                             (int*)workspace,
+                                             redOp,
+                                             pairRedOp,
+                                             sqrt,
+                                             initOutBuffer,
+                                             isRowMajor,
+                                             metric,
+                                             metric_arg,
+                                             stream);
+    }
+  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0 && px % 8 == 0 && py % 8 == 0) {
+    if (is_skinny) {
+      detail::fusedDistanceNNImpl<
+        DataT,
+        OutT,
+        IdxT,
+        typename linalg::Policy4x4Skinny<DataT, 8 / sizeof(DataT)>::Policy,
+        ReduceOpT>(min,
+                   x,
+                   y,
+                   xn,
+                   yn,
+                   m,
+                   n,
+                   k,
+                   (int*)workspace,
+                   redOp,
+                   pairRedOp,
+                   sqrt,
+                   initOutBuffer,
+                   isRowMajor,
+                   metric,
+                   metric_arg,
+                   stream);
+    } else {
+      detail::fusedDistanceNNImpl<DataT,
+                                  OutT,
+                                  IdxT,
+                                  typename linalg::Policy4x4<DataT, 8 / sizeof(DataT)>::Policy,
+                                  ReduceOpT>(min,
+                                             x,
+                                             y,
+                                             xn,
+                                             yn,
+                                             m,
+                                             n,
+                                             k,
+                                             (int*)workspace,
+                                             redOp,
+                                             pairRedOp,
+                                             sqrt,
+                                             initOutBuffer,
+                                             isRowMajor,
+                                             metric,
+                                             metric_arg,
+                                             stream);
+    }
+  } else {
+    if (is_skinny) {
+      detail::fusedDistanceNNImpl<DataT,
+                                  OutT,
+                                  IdxT,
+                                  typename linalg::Policy4x4Skinny<DataT, 1>::Policy,
+                                  ReduceOpT>(min,
+                                             x,
+                                             y,
+                                             xn,
+                                             yn,
+                                             m,
+                                             n,
+                                             k,
+                                             (int*)workspace,
+                                             redOp,
+                                             pairRedOp,
+                                             sqrt,
+                                             initOutBuffer,
+                                             isRowMajor,
+                                             metric,
+                                             metric_arg,
+                                             stream);
+    } else {
+      detail::fusedDistanceNNImpl<DataT,
+                                  OutT,
+                                  IdxT,
+                                  typename linalg::Policy4x4<DataT, 1>::Policy,
+                                  ReduceOpT>(min,
+                                             x,
+                                             y,
+                                             xn,
+                                             yn,
+                                             m,
+                                             n,
+                                             k,
+                                             (int*)workspace,
+                                             redOp,
+                                             pairRedOp,
+                                             sqrt,
+                                             initOutBuffer,
+                                             isRowMajor,
+                                             metric,
+                                             metric_arg,
+                                             stream);
+    }
+  }
+}
+
+/**
+ * @brief Wrapper around fusedDistanceNN with minimum reduction operators.
+ *
+ * fusedDistanceNN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ *
+ * @tparam DataT     data type
+ * @tparam OutT      output type to either store 1-NN indices and their minimum
+ *                   distances (e.g. raft::KeyValuePair<int, float>) or store only the min
+ * distances.
+ * @tparam IdxT      indexing arithmetic type
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
+ * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
+ * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
+ * @param[in]  initOutBuffer whether to initialize the output buffer before the
+ *                           main kernel launch
+ * @param[in]  isRowMajor    whether the input/output is row or column major.
+ * @param[in]  metric        Distance metric to be used (supports L2, cosine)
+ * @param[in]  metric_arg    power argument for distances like Minkowski (not supported for now)
+ * @param[in]  stream        cuda stream
+ */
+template <typename DataT, typename OutT, typename IdxT>
+void fusedDistanceNNMinReduce(OutT* min,
+                              const DataT* x,
+                              const DataT* y,
+                              const DataT* xn,
+                              const DataT* yn,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              void* workspace,
+                              bool sqrt,
+                              bool initOutBuffer,
+                              bool isRowMajor,
+                              raft::distance::DistanceType metric,
+                              float metric_arg,
+                              cudaStream_t stream)
+{
+  MinAndDistanceReduceOp<IdxT, DataT> redOp;
+  KVPMinReduce<IdxT, DataT> pairRedOp;
+
+  fusedDistanceNN<DataT, OutT, IdxT>(min,
+                                     x,
+                                     y,
+                                     xn,
+                                     yn,
+                                     m,
+                                     n,
+                                     k,
+                                     workspace,
+                                     redOp,
+                                     pairRedOp,
+                                     sqrt,
+                                     initOutBuffer,
+                                     isRowMajor,
+                                     metric,
+                                     metric_arg,
+                                     stream);
+}
+
+/** @} */
+
+}  // namespace distance
+}  // namespace raft
+
+#endif
diff --git a/cpp/include/raft/distance/fused_distance_nn.cuh b/cpp/include/raft/distance/fused_distance_nn.cuh
new file mode 100755
index 0000000000..04c42e49a1
--- /dev/null
+++ b/cpp/include/raft/distance/fused_distance_nn.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
+#include "fused_distance_nn-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "fused_distance_nn-ext.cuh"
+#endif
diff --git a/cpp/include/raft/distance/fused_l2_nn_helpers.cuh b/cpp/include/raft/distance/fused_distance_nn_helpers.cuh
similarity index 92%
rename from cpp/include/raft/distance/fused_l2_nn_helpers.cuh
rename to cpp/include/raft/distance/fused_distance_nn_helpers.cuh
index 996f696ef6..3a570c681c 100644
--- a/cpp/include/raft/distance/fused_l2_nn_helpers.cuh
+++ b/cpp/include/raft/distance/fused_distance_nn_helpers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/core/resource/cuda_stream.hpp>
-#include <raft/distance/detail/fused_l2_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/helper_structs.cuh>
 
 namespace raft::distance {
 
diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
index c7dc4c5fc6..d0ac83cd51 100644
--- a/cpp/include/raft/distance/fused_l2_nn-ext.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include <raft/core/kvp.hpp>                      // raft::KeyValuePair
-#include <raft/core/resources.hpp>                // raft::resources
-#include <raft/distance/fused_l2_nn_helpers.cuh>  // include initialize and reduce operations
-#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
+#include <raft/core/kvp.hpp>                            // raft::KeyValuePair
+#include <raft/core/resources.hpp>                      // raft::resources
+#include <raft/distance/fused_distance_nn_helpers.cuh>  // include initialize and reduce operations
+#include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
 
 #include <cstdint>  // int64_t
 
diff --git a/cpp/include/raft/distance/fused_l2_nn-inl.cuh b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
index 6f4db15c72..bf9a49d813 100644
--- a/cpp/include/raft/distance/fused_l2_nn-inl.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
@@ -20,8 +20,8 @@
 #pragma once
 
 #include <raft/core/resources.hpp>
-#include <raft/distance/detail/fused_l2_nn.cuh>
-#include <raft/distance/fused_l2_nn_helpers.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh>
+#include <raft/distance/fused_distance_nn_helpers.cuh>
 #include <raft/linalg/contractions.cuh>
 #include <raft/util/cuda_utils.cuh>
 
diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index 651fec81c3..05cc9204bf 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -16,9 +16,19 @@
 
 #pragma once
 
+#include <raft/common/nvtx.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/pinned_mdarray.hpp>
+#include <raft/core/pinned_mdspan.hpp>
+#include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <omp.h>
+
 #include <functional>
 
 namespace raft {
@@ -336,6 +346,83 @@ void gather_if(const InputIteratorT in,
   gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream);
 }
 
+/**
+ * Helper function to gather a set of vectors from a (host) dataset.
+ */
+template <typename T, typename IdxT, typename MatIdxT = int64_t>
+void gather_buff(host_matrix_view<const T, MatIdxT> dataset,
+                 host_vector_view<const IdxT, MatIdxT> indices,
+                 MatIdxT offset,
+                 pinned_matrix_view<T, MatIdxT> buff)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather_host_buff");
+  IdxT batch_size = std::min<IdxT>(buff.extent(0), indices.extent(0) - offset);
+
+#pragma omp for
+  for (IdxT i = 0; i < batch_size; i++) {
+    IdxT in_idx = indices(offset + i);
+    for (IdxT k = 0; k < buff.extent(1); k++) {
+      buff(i, k) = dataset(in_idx, k);
+    }
+  }
+}
+
+template <typename T, typename IdxT, typename MatIdxT = int64_t>
+void gather(raft::resources const& res,
+            host_matrix_view<const T, MatIdxT> dataset,
+            device_vector_view<const IdxT, MatIdxT> indices,
+            raft::device_matrix_view<T, MatIdxT> output)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather");
+  IdxT n_dim        = output.extent(1);
+  IdxT n_train      = output.extent(0);
+  auto indices_host = raft::make_host_vector<IdxT, MatIdxT>(n_train);
+  raft::copy(
+    indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res));
+  resource::sync_stream(res);
+
+  const size_t buffer_size = 32768 * 1024;  // bytes
+  const size_t max_batch_size =
+    std::min<size_t>(round_up_safe<size_t>(buffer_size / n_dim, 32), n_train);
+  RAFT_LOG_DEBUG("Gathering data with batch size %zu", max_batch_size);
+
+  // Gather the vector on the host in tmp buffers. We use two buffers to overlap H2D sync
+  // and gathering the data.
+  auto out_tmp1 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
+  auto out_tmp2 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
+
+  // Usually a limited number of threads provide sufficient bandwidth for gathering data.
+  int n_threads = std::min(omp_get_max_threads(), 32);
+
+  // The gather_buff function has a parallel for loop. We start the the omp parallel
+  // region here, to avoid repeated overhead within the device_offset loop.
+#pragma omp parallel num_threads(n_threads)
+  {
+    auto view1 = out_tmp1.view();
+    auto view2 = out_tmp2.view();
+    gather_buff(dataset, make_const_mdspan(indices_host.view()), (MatIdxT)0, view1);
+    for (MatIdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) {
+      MatIdxT batch_size = std::min<IdxT>(max_batch_size, n_train - device_offset);
+
+#pragma omp master
+      raft::copy(output.data_handle() + device_offset * n_dim,
+                 view1.data_handle(),
+                 batch_size * n_dim,
+                 resource::get_cuda_stream(res));
+      // Start gathering the next batch on the host.
+      MatIdxT host_offset = device_offset + batch_size;
+      batch_size          = std::min<IdxT>(max_batch_size, n_train - host_offset);
+      if (batch_size > 0) {
+        gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2);
+      }
+#pragma omp master
+      resource::sync_stream(res);
+#pragma omp barrier
+      std::swap(view1, view2);
+    }
+  }
+}
+
 }  // namespace detail
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/include/raft/matrix/detail/sample_rows.cuh b/cpp/include/raft/matrix/detail/sample_rows.cuh
new file mode 100644
index 0000000000..e28ad648da
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/sample_rows.cuh
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::matrix::detail {
+
+/** Select rows randomly from input and copy to output. */
+template <typename T, typename IdxT = int64_t>
+void sample_rows(raft::resources const& res,
+                 random::RngState random_state,
+                 const T* input,
+                 IdxT n_rows_input,
+                 raft::device_matrix_view<T, IdxT> output)
+{
+  IdxT n_dim     = output.extent(1);
+  IdxT n_samples = output.extent(0);
+
+  raft::device_vector<IdxT, IdxT> train_indices =
+    raft::random::excess_subsample<IdxT, int64_t>(res, random_state, n_rows_input, n_samples);
+
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
+  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
+  if (ptr != nullptr) {
+    raft::matrix::gather(res,
+                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_rows_input, n_dim),
+                         raft::make_const_mdspan(train_indices.view()),
+                         output);
+  } else {
+    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_rows_input, n_dim);
+    raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
+  }
+}
+}  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
new file mode 100644
index 0000000000..7925d344e4
--- /dev/null
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/detail/sample_rows.cuh>
+#include <raft/random/rng.cuh>
+
+namespace raft::matrix {
+
+/** @brief Select rows randomly from input and copy to output.
+ *
+ * The rows are selected randomly. The random sampling method does not guarantee completely unique
+ * selection of rows, but it is close to being unique.
+ *
+ * @param res RAFT resource handle
+ * @param random_state
+ * @param dataset input dataset
+ * @param output subsampled dataset
+ */
+template <typename T, typename IdxT = int64_t, typename accessor>
+void sample_rows(raft::resources const& res,
+                 random::RngState random_state,
+                 mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
+                 raft::device_matrix_view<T, IdxT> output)
+{
+  RAFT_EXPECTS(dataset.extent(1) == output.extent(1),
+               "dataset dims must match, but received %ld vs %ld",
+               static_cast<long>(dataset.extent(1)),
+               static_cast<long>(output.extent(1)));
+  detail::sample_rows<T, IdxT>(res, random_state, dataset.data_handle(), dataset.extent(0), output);
+}
+
+/** @brief Select rows randomly from input and copy to output.
+ *
+ * The rows are selected randomly. The random sampling method does not guarantee completely unique
+ * selection of rows, but it is close to being unique.
+ *
+ * @param res RAFT resource handle
+ * @param random_state
+ * @param dataset input dataset
+ * @param n_samples number of rows in the returned matrix
+ *
+ * @return subsampled dataset
+ * */
+template <typename T, typename IdxT = int64_t, typename accessor>
+raft::device_matrix<T, IdxT> sample_rows(
+  raft::resources const& res,
+  random::RngState random_state,
+  mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
+  IdxT n_samples)
+{
+  auto output = raft::make_device_matrix<T, IdxT>(res, n_samples, dataset.extent(1));
+  sample_rows(res, random_state, dataset, output.view());
+  return output;
+}
+
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index b8258297e6..b7e362f704 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/mdspan.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/dataset.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -279,62 +280,6 @@ index<T, IdxT> build(raft::resources const& res,
   return detail::build<T, IdxT, Accessor>(res, params, dataset);
 }
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [cagra::build](#cagra::build) documentation for a usage example.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] res raft resources
- * @param[in] params configure the search
- * @param[in] idx cagra index
- * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
- * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
- * k]
- */
-template <typename T, typename IdxT>
-void search(raft::resources const& res,
-            const search_params& params,
-            const index<T, IdxT>& idx,
-            raft::device_matrix_view<const T, int64_t, row_major> queries,
-            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
-            raft::device_matrix_view<float, int64_t, row_major> distances)
-{
-  RAFT_EXPECTS(
-    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
-    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
-
-  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
-               "Number of columns in output neighbors and distances matrices must equal k");
-  RAFT_EXPECTS(queries.extent(1) == idx.dim(),
-               "Number of query dimensions should equal number of dimensions in the index.");
-
-  using internal_IdxT   = typename std::make_unsigned<IdxT>::type;
-  auto queries_internal = raft::make_device_matrix_view<const T, int64_t, row_major>(
-    queries.data_handle(), queries.extent(0), queries.extent(1));
-  auto neighbors_internal = raft::make_device_matrix_view<internal_IdxT, int64_t, row_major>(
-    reinterpret_cast<internal_IdxT*>(neighbors.data_handle()),
-    neighbors.extent(0),
-    neighbors.extent(1));
-  auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
-    distances.data_handle(), distances.extent(0), distances.extent(1));
-
-  cagra::detail::search_main<T,
-                             internal_IdxT,
-                             decltype(raft::neighbors::filtering::none_cagra_sample_filter()),
-                             IdxT>(res,
-                                   params,
-                                   idx,
-                                   queries_internal,
-                                   neighbors_internal,
-                                   distances_internal,
-                                   raft::neighbors::filtering::none_cagra_sample_filter());
-}
-
 /**
  * @brief Search ANN using the constructed index with the given sample filter.
  *
@@ -401,10 +346,40 @@ void search_with_filtering(raft::resources const& res,
   auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
     distances.data_handle(), distances.extent(0), distances.extent(1));
 
-  cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
+  return cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
     res, params, idx, queries_internal, neighbors_internal, distances_internal, sample_filter);
 }
 
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [cagra::build](#cagra::build) documentation for a usage example.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the search
+ * @param[in] idx cagra index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
+ * k]
+ */
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const search_params& params,
+            const index<T, IdxT>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances)
+{
+  using none_filter_type = raft::neighbors::filtering::none_cagra_sample_filter;
+  return cagra::search_with_filtering<T, IdxT, none_filter_type>(
+    res, params, idx, queries, neighbors, distances, none_filter_type{});
+}
+
 /** @} */  // end group cagra
 
 }  // namespace raft::neighbors::cagra
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 0f574ae5bb..807f89fd65 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "ann_types.hpp"
+#include "dataset.hpp"
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/error.hpp>
@@ -35,6 +36,7 @@
 #include <optional>
 #include <string>
 #include <type_traits>
+
 namespace raft::neighbors::cagra {
 /**
  * @addtogroup cagra
@@ -61,6 +63,12 @@ struct index_params : ann::index_params {
   graph_build_algo build_algo = graph_build_algo::IVF_PQ;
   /** Number of Iterations to run if building with NN_DESCENT */
   size_t nn_descent_niter = 20;
+  /**
+   * Specify compression params if compression is desired.
+   *
+   * NOTE: this is experimental new API, consider it unsafe.
+   */
+  std::optional<vpq_params> compression = std::nullopt;
 };
 
 enum class search_algo {
@@ -145,25 +153,37 @@ struct index : ann::index {
   /** Total length of the index (number of vectors). */
   [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
   {
-    return dataset_view_.extent(0) ? dataset_view_.extent(0) : graph_view_.extent(0);
+    auto data_rows = dataset_->n_rows();
+    return data_rows > 0 ? data_rows : graph_view_.extent(0);
   }
 
   /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
-  {
-    return dataset_view_.extent(1);
-  }
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dataset_->dim(); }
   /** Graph degree */
   [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t
   {
     return graph_view_.extent(1);
   }
 
-  /** Dataset [size, dim] */
-  [[nodiscard]] inline auto dataset() const noexcept
+  /**
+   * DEPRECATED: please use data() instead.
+   *   If you need to query dataset dimensions, use the dim() and size() of the cagra index.
+   *   The data_handle() is not always available: you need to do a dynamic_cast to the expected
+   *   dataset type at runtime.
+   */
+  [[nodiscard]] [[deprecated("Use data()")]] inline auto dataset() const noexcept
     -> device_matrix_view<const T, int64_t, layout_stride>
   {
-    return dataset_view_;
+    auto p = dynamic_cast<strided_dataset<T, int64_t>*>(dataset_.get());
+    if (p != nullptr) { return p->view(); }
+    auto d = dataset_->dim();
+    return make_device_strided_matrix_view<const T, int64_t>(nullptr, 0, d, d);
+  }
+
+  /** Dataset [size, dim] */
+  [[nodiscard]] inline auto data() const noexcept -> const neighbors::dataset<int64_t>&
+  {
+    return *dataset_;
   }
 
   /** neighborhood graph [size, graph-degree] */
@@ -185,7 +205,7 @@ struct index : ann::index {
         raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
       metric_(metric),
-      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      dataset_(new neighbors::empty_dataset<int64_t>(0)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
   }
@@ -251,12 +271,11 @@ struct index : ann::index {
         mdspan<const IdxT, matrix_extent<int64_t>, row_major, graph_accessor> knn_graph)
     : ann::index(),
       metric_(metric),
-      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      dataset_(make_aligned_dataset(res, dataset, 16)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
     RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
                  "Dataset and knn_graph must have equal number of rows");
-    update_dataset(res, dataset);
     update_graph(res, knn_graph);
     resource::sync_stream(res);
   }
@@ -271,21 +290,14 @@ struct index : ann::index {
   void update_dataset(raft::resources const& res,
                       raft::device_matrix_view<const T, int64_t, row_major> dataset)
   {
-    if (dataset.extent(1) * sizeof(T) % 16 != 0) {
-      RAFT_LOG_DEBUG("Creating a padded copy of CAGRA dataset in device memory");
-      copy_padded(res, dataset);
-    } else {
-      dataset_view_ = make_device_strided_matrix_view<const T, int64_t>(
-        dataset.data_handle(), dataset.extent(0), dataset.extent(1), dataset.extent(1));
-    }
+    dataset_ = make_aligned_dataset(res, dataset, 16);
   }
 
   /** Set the dataset reference explicitly to a device matrix view with padding. */
-  void update_dataset(raft::resources const&,
+  void update_dataset(raft::resources const& res,
                       raft::device_matrix_view<const T, int64_t, layout_stride> dataset)
   {
-    RAFT_EXPECTS(dataset.stride(0) * sizeof(T) % 16 == 0, "Incorrect data padding.");
-    dataset_view_ = dataset;
+    dataset_ = make_aligned_dataset(res, dataset, 16);
   }
 
   /**
@@ -296,8 +308,22 @@ struct index : ann::index {
   void update_dataset(raft::resources const& res,
                       raft::host_matrix_view<const T, int64_t, row_major> dataset)
   {
-    RAFT_LOG_DEBUG("Copying CAGRA dataset from host to device");
-    copy_padded(res, dataset);
+    dataset_ = make_aligned_dataset(res, dataset, 16);
+  }
+
+  /** Replace the dataset with a new dataset. */
+  template <typename DatasetT>
+  auto update_dataset(raft::resources const& res, DatasetT&& dataset)
+    -> std::enable_if_t<std::is_base_of_v<neighbors::dataset<int64_t>, DatasetT>>
+  {
+    dataset_ = std::make_unique<DatasetT>(std::move(dataset));
+  }
+
+  template <typename DatasetT>
+  auto update_dataset(raft::resources const& res, std::unique_ptr<DatasetT>&& dataset)
+    -> std::enable_if_t<std::is_base_of_v<neighbors::dataset<int64_t>, DatasetT>>
+  {
+    dataset_ = std::move(dataset);
   }
 
   /**
@@ -334,26 +360,10 @@ struct index : ann::index {
   }
 
  private:
-  /** Create a device copy of the dataset, and pad it if necessary. */
-  template <typename data_accessor>
-  void copy_padded(raft::resources const& res,
-                   mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset)
-  {
-    detail::copy_with_padding(res, dataset_, dataset);
-
-    dataset_view_ = make_device_strided_matrix_view<const T, int64_t>(
-      dataset_.data_handle(), dataset_.extent(0), dataset.extent(1), dataset_.extent(1));
-    RAFT_LOG_DEBUG("CAGRA dataset strided matrix view %zux%zu, stride %zu",
-                   static_cast<size_t>(dataset_view_.extent(0)),
-                   static_cast<size_t>(dataset_view_.extent(1)),
-                   static_cast<size_t>(dataset_view_.stride(0)));
-  }
-
   raft::distance::DistanceType metric_;
-  raft::device_matrix<T, int64_t, row_major> dataset_;
   raft::device_matrix<IdxT, int64_t, row_major> graph_;
-  raft::device_matrix_view<const T, int64_t, layout_stride> dataset_view_;
   raft::device_matrix_view<const IdxT, int64_t, row_major> graph_view_;
+  std::unique_ptr<neighbors::dataset<int64_t>> dataset_;
 };
 
 /** @} */
diff --git a/cpp/include/raft/neighbors/dataset.hpp b/cpp/include/raft/neighbors/dataset.hpp
new file mode 100644
index 0000000000..e7a3ba97a4
--- /dev/null
+++ b/cpp/include/raft/neighbors/dataset.hpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>   // get_device_for_address
+#include <raft/util/integer_utils.hpp>  // rounding up
+
+#include <memory>
+#include <numeric>
+#include <type_traits>
+
+#ifdef __cpp_lib_bitops
+#include <bit>
+#endif
+
+namespace raft::neighbors {
+
+/** Two-dimensional dataset; maybe owning, maybe compressed, maybe strided. */
+template <typename IdxT>
+struct dataset {
+  using index_type = IdxT;
+  /**  Size of the dataset. */
+  [[nodiscard]] virtual auto n_rows() const noexcept -> index_type = 0;
+  /** Dimensionality of the dataset. */
+  [[nodiscard]] virtual auto dim() const noexcept -> uint32_t = 0;
+  /** Whether the object owns the data. */
+  [[nodiscard]] virtual auto is_owning() const noexcept -> bool = 0;
+  virtual ~dataset() noexcept                                   = default;
+};
+
+template <typename IdxT>
+struct empty_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  uint32_t suggested_dim;
+  explicit empty_dataset(uint32_t dim) noexcept : suggested_dim(dim) {}
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return 0; }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final { return suggested_dim; }
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+};
+
+template <typename DataT, typename IdxT>
+struct strided_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using view_type  = device_matrix_view<const value_type, index_type, layout_stride>;
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return view().extent(0); }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final
+  {
+    return static_cast<uint32_t>(view().extent(1));
+  }
+  /** Leading dimension of the dataset. */
+  [[nodiscard]] constexpr auto stride() const noexcept -> uint32_t
+  {
+    auto v = view();
+    return static_cast<uint32_t>(v.stride(0) > 0 ? v.stride(0) : v.extent(1));
+  }
+  /** Get the view of the data. */
+  [[nodiscard]] virtual auto view() const noexcept -> view_type;
+};
+
+template <typename DataT, typename IdxT>
+struct non_owning_dataset : public strided_dataset<DataT, IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using typename strided_dataset<value_type, index_type>::view_type;
+  view_type data;
+  explicit non_owning_dataset(view_type v) noexcept : data(v) {}
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return false; }
+  [[nodiscard]] auto view() const noexcept -> view_type final { return data; };
+};
+
+template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy>
+struct owning_dataset : public strided_dataset<DataT, IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using typename strided_dataset<value_type, index_type>::view_type;
+  using storage_type =
+    mdarray<value_type, matrix_extent<index_type>, LayoutPolicy, ContainerPolicy>;
+  using mapping_type = typename view_type::mapping_type;
+  storage_type data;
+  mapping_type view_mapping;
+  owning_dataset(storage_type&& store, mapping_type view_mapping) noexcept
+    : data{std::move(store)}, view_mapping{view_mapping}
+  {
+  }
+
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+  [[nodiscard]] auto view() const noexcept -> view_type final
+  {
+    return view_type{data.data_handle(), view_mapping};
+  };
+};
+
+/**
+ * @brief Contstruct a strided matrix from any mdarray or mdspan.
+ *
+ * This function constructs a non-owning view if the input satisfied two conditions:
+ *
+ *   1) The data is accessible from the current device
+ *   2) The memory layout is the same as expected (row-major matrix with the required stride)
+ *
+ * Otherwise, this function constructs an owning device matrix and copies the data.
+ * When the data is copied, padding elements are filled with zeroes.
+ *
+ * @tparam SrcT the source mdarray or mdspan
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] required_stride the leading dimension (in elements)
+ * @return maybe owning current-device-accessible strided matrix
+ */
+template <typename SrcT>
+auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t required_stride)
+  -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
+{
+  using extents_type = typename SrcT::extents_type;
+  using value_type   = typename SrcT::value_type;
+  using index_type   = typename SrcT::index_type;
+  using layout_type  = typename SrcT::layout_type;
+  static_assert(extents_type::rank() == 2, "The input must be a matrix.");
+  static_assert(std::is_same_v<layout_type, layout_right> ||
+                  std::is_same_v<layout_type, layout_right_padded<value_type>> ||
+                  std::is_same_v<layout_type, layout_stride>,
+                "The input must be row-major");
+  RAFT_EXPECTS(src.extent(1) <= required_stride,
+               "The input row length must be not larger than the desired stride.");
+  cudaPointerAttributes ptr_attrs;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&ptr_attrs, src.data_handle()));
+  auto* device_ptr             = reinterpret_cast<value_type*>(ptr_attrs.devicePointer);
+  const uint32_t src_stride    = src.stride(0) > 0 ? src.stride(0) : src.extent(1);
+  const bool device_accessible = device_ptr != nullptr;
+  const bool row_major         = src.stride(1) <= 1;
+  const bool stride_matches    = required_stride == src_stride;
+
+  if (device_accessible && row_major && stride_matches) {
+    // Everything matches: make a non-owning dataset
+    return std::make_unique<non_owning_dataset<value_type, index_type>>(
+      make_device_strided_matrix_view<const value_type, index_type>(
+        device_ptr, src.extent(0), src.extent(1), required_stride));
+  }
+  // Something is wrong: have to make a copy and produce an owning dataset
+  auto out_layout =
+    make_strided_layout(src.extents(), std::array<index_type, 2>{required_stride, 1});
+  auto out_array = make_device_matrix<value_type, index_type>(res, src.extent(0), required_stride);
+
+  using out_mdarray_type          = decltype(out_array);
+  using out_layout_type           = typename out_mdarray_type::layout_type;
+  using out_container_policy_type = typename out_mdarray_type::container_policy_type;
+  using out_owning_type =
+    owning_dataset<value_type, index_type, out_layout_type, out_container_policy_type>;
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_array.data_handle(),
+                                0,
+                                out_array.size() * sizeof(value_type),
+                                resource::get_cuda_stream(res)));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
+                                  sizeof(value_type) * required_stride,
+                                  src.data_handle(),
+                                  sizeof(value_type) * src_stride,
+                                  sizeof(value_type) * src.extent(1),
+                                  src.extent(0),
+                                  cudaMemcpyDefault,
+                                  resource::get_cuda_stream(res)));
+
+  return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
+}
+
+/**
+ * @brief Contstruct a strided matrix from any mdarray or mdspan.
+ *
+ * A variant `make_strided_dataset` that allows specifying the byte alignment instead of the
+ * explicit stride length.
+ *
+ * @tparam SrcT the source mdarray or mdspan
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] align_bytes the required byte alignment for the dataset rows.
+ * @return maybe owning current-device-accessible strided matrix
+ */
+template <typename SrcT>
+auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t align_bytes = 16)
+  -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
+{
+  using value_type       = typename SrcT::value_type;
+  constexpr size_t kSize = sizeof(value_type);
+  uint32_t required_stride =
+    raft::round_up_safe<size_t>(src.extent(1) * kSize, std::lcm(align_bytes, kSize)) / kSize;
+  return make_strided_dataset(res, src, required_stride);
+}
+
+/** Parameters for VPQ compression. */
+struct vpq_params {
+  /**
+   * The bit length of the vector element after compression by PQ.
+   *
+   * Possible values: [4, 5, 6, 7, 8].
+   *
+   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+   * performance, but the lower the recall.
+   */
+  uint32_t pq_bits = 8;
+  /**
+   * The dimensionality of the vector after compression by PQ.
+   * When zero, an optimal value is selected using a heuristic.
+   *
+   * TODO: at the moment `dim` must be a multiple `pq_dim`.
+   */
+  uint32_t pq_dim = 0;
+  /**
+   * Vector Quantization (VQ) codebook size - number of "coarse cluster centers".
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  uint32_t vq_n_centers = 0;
+  /** The number of iterations searching for kmeans centers (both VQ & PQ phases). */
+  uint32_t kmeans_n_iters = 25;
+  /**
+   * The fraction of data to use during iterative kmeans building (VQ phase).
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  double vq_kmeans_trainset_fraction = 0;
+  /**
+   * The fraction of data to use during iterative kmeans building (PQ phase).
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  double pq_kmeans_trainset_fraction = 0;
+};
+
+/**
+ * @brief VPQ compressed dataset.
+ *
+ * The dataset is compressed using two level quantization
+ *
+ *   1. Vector Quantization
+ *   2. Product Quantization of residuals
+ *
+ * @tparam MathT the type of elements in the codebooks
+ * @tparam IdxT type of the vector indices (represent dataset.extent(0))
+ *
+ */
+template <typename MathT, typename IdxT>
+struct vpq_dataset : public dataset<IdxT> {
+  /** Vector Quantization codebook - "coarse cluster centers". */
+  device_matrix<MathT, uint32_t, row_major> vq_code_book;
+  /** Product Quantization codebook - "fine cluster centers".  */
+  device_matrix<MathT, uint32_t, row_major> pq_code_book;
+  /** Compressed dataset.  */
+  device_matrix<uint8_t, IdxT, row_major> data;
+
+  vpq_dataset(device_matrix<MathT, uint32_t, row_major>&& vq_code_book,
+              device_matrix<MathT, uint32_t, row_major>&& pq_code_book,
+              device_matrix<uint8_t, IdxT, row_major>&& data)
+    : vq_code_book{std::move(vq_code_book)},
+      pq_code_book{std::move(pq_code_book)},
+      data{std::move(data)}
+  {
+  }
+
+  [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); }
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+
+  /** Row length of the encoded data in bytes. */
+  [[nodiscard]] constexpr inline auto encoded_row_length() const noexcept -> uint32_t
+  {
+    return data.extent(1);
+  }
+  /** The number of "coarse cluster centers" */
+  [[nodiscard]] constexpr inline auto vq_n_centers() const noexcept -> uint32_t
+  {
+    return vq_code_book.extent(0);
+  }
+  /** The bit length of an encoded vector element after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_bits() const noexcept -> uint32_t
+  {
+    /*
+    NOTE: pq_bits and the book size
+
+    Normally, we'd store `pq_bits` as a part of the index.
+    However, we know there's an invariant `pq_n_centers = 1 << pq_bits`, i.e. the codebook size is
+    the same as the number of possible code values. Hence, we don't store the pq_bits and derive it
+    from the array dimensions instead.
+     */
+    auto pq_width = pq_n_centers();
+#ifdef __cpp_lib_bitops
+    return std::countr_zero(pq_width);
+#else
+    uint32_t pq_bits = 0;
+    while (pq_width > 1) {
+      pq_bits++;
+      pq_width >>= 1;
+    }
+    return pq_bits;
+#endif
+  }
+  /** The dimensionality of an encoded vector after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_dim() const noexcept -> uint32_t
+  {
+    return raft::div_rounding_up_unsafe(dim(), pq_len());
+  }
+  /** Dimensionality of a subspaces, i.e. the number of vector components mapped to a subspace */
+  [[nodiscard]] constexpr inline auto pq_len() const noexcept -> uint32_t
+  {
+    return pq_code_book.extent(1);
+  }
+  /** The number of vectors in a PQ codebook (`1 << pq_bits`). */
+  [[nodiscard]] constexpr inline auto pq_n_centers() const noexcept -> uint32_t
+  {
+    return pq_code_book.extent(0);
+  }
+};
+
+}  // namespace raft::neighbors
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index 08cc2beaeb..d91e45257e 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "../../cagra_types.hpp"
+#include "../../vpq_dataset.cuh"
 #include "graph_core.cuh"
 
 #include <raft/core/device_mdarray.hpp>
@@ -344,6 +345,15 @@ index<T, IdxT> build(
   RAFT_LOG_INFO("Graph optimized, creating index");
   // Construct an index from dataset and optimized knn graph.
   if (construct_index_with_dataset) {
+    if (params.compression.has_value()) {
+      index<T, IdxT> idx(res, params.metric);
+      idx.update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+      idx.update_dataset(
+        res,
+        // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
+        neighbors::vpq_build<decltype(dataset), half, int64_t>(res, *params.compression, dataset));
+      return idx;
+    }
     return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
   } else {
     // We just add the graph. User is expected to update dataset separately. This branch is used
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index d7bd27222b..600c8785e0 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -23,6 +23,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/detail/dataset_serialize.hpp>
 
 #include <cstddef>
 #include <cstdint>
@@ -31,7 +32,7 @@
 
 namespace raft::neighbors::cagra::detail {
 
-constexpr int serialization_version = 3;
+constexpr int serialization_version = 4;
 
 /**
  * Save the index to file.
@@ -65,26 +66,14 @@ void serialize(raft::resources const& res,
   serialize_scalar(res, os, index_.metric());
   serialize_mdspan(res, os, index_.graph());
 
-  include_dataset &= (index_.dataset().extent(0) > 0);
+  include_dataset &= (index_.data().n_rows() > 0);
 
   serialize_scalar(res, os, include_dataset);
   if (include_dataset) {
     RAFT_LOG_INFO("Saving CAGRA index with dataset");
-    auto dataset = index_.dataset();
-    // Remove padding before saving the dataset
-    auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.data_handle(),
-                                    sizeof(T) * dataset.stride(0),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.extent(0),
-                                    cudaMemcpyDefault,
-                                    resource::get_cuda_stream(res)));
-    resource::sync_stream(res);
-    serialize_mdspan(res, os, host_dataset.view());
+    neighbors::detail::serialize(res, os, index_.data());
   } else {
-    RAFT_LOG_INFO("Saving CAGRA index WITHOUT dataset");
+    RAFT_LOG_DEBUG("Saving CAGRA index WITHOUT dataset");
   }
 }
 
@@ -256,19 +245,13 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
   auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
   deserialize_mdspan(res, is, graph.view());
 
+  index<T, IdxT> idx(res, metric);
+  idx.update_graph(res, raft::make_const_mdspan(graph.view()));
   bool has_dataset = deserialize_scalar<bool>(res, is);
   if (has_dataset) {
-    auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
-    deserialize_mdspan(res, is, dataset.view());
-    return index<T, IdxT>(
-      res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
-  } else {
-    // create a new index with no dataset - the user must supply via update_dataset themselves
-    // later (this avoids allocating GPU memory in the meantime)
-    index<T, IdxT> idx(res, metric);
-    idx.update_graph(res, raft::make_const_mdspan(graph.view()));
-    return idx;
+    idx.update_dataset(res, neighbors::detail::deserialize_dataset<int64_t>(res, is));
   }
+  return idx;
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/detail/dataset_serialize.hpp b/cpp/include/raft/neighbors/detail/dataset_serialize.hpp
new file mode 100644
index 0000000000..a6a6ae59a5
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/dataset_serialize.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../dataset.hpp"
+
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/core/serialize.hpp>
+
+#include <cuda_fp16.h>
+
+#include <fstream>
+#include <memory>
+
+namespace raft::neighbors::detail {
+
+using dataset_instance_tag                              = uint32_t;
+constexpr dataset_instance_tag kSerializeEmptyDataset   = 1;
+constexpr dataset_instance_tag kSerializeStridedDataset = 2;
+constexpr dataset_instance_tag kSerializeVPQDataset     = 3;
+
+template <typename IdxT>
+void serialize(const raft::resources& res, std::ostream& os, const empty_dataset<IdxT>& dataset)
+{
+  serialize_scalar(res, os, dataset.suggested_dim);
+}
+
+template <typename DataT, typename IdxT>
+void serialize(const raft::resources& res,
+               std::ostream& os,
+               const strided_dataset<DataT, IdxT>& dataset)
+{
+  auto n_rows = dataset.n_rows();
+  auto dim    = dataset.dim();
+  auto stride = dataset.stride();
+  serialize_scalar(res, os, n_rows);
+  serialize_scalar(res, os, dim);
+  serialize_scalar(res, os, stride);
+  // Remove padding before saving the dataset
+  auto src = dataset.view();
+  auto dst = make_host_matrix<DataT, IdxT>(n_rows, dim);
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(),
+                                  sizeof(DataT) * dim,
+                                  src.data_handle(),
+                                  sizeof(DataT) * stride,
+                                  sizeof(DataT) * dim,
+                                  n_rows,
+                                  cudaMemcpyDefault,
+                                  resource::get_cuda_stream(res)));
+  resource::sync_stream(res);
+  serialize_mdspan(res, os, dst.view());
+}
+
+template <typename MathT, typename IdxT>
+void serialize(const raft::resources& res,
+               std::ostream& os,
+               const vpq_dataset<MathT, IdxT>& dataset)
+{
+  serialize_scalar(res, os, dataset.n_rows());
+  serialize_scalar(res, os, dataset.dim());
+  serialize_scalar(res, os, dataset.vq_n_centers());
+  serialize_scalar(res, os, dataset.pq_n_centers());
+  serialize_scalar(res, os, dataset.pq_len());
+  serialize_scalar(res, os, dataset.encoded_row_length());
+  serialize_mdspan(res, os, make_const_mdspan(dataset.vq_code_book.view()));
+  serialize_mdspan(res, os, make_const_mdspan(dataset.pq_code_book.view()));
+  serialize_mdspan(res, os, make_const_mdspan(dataset.data.view()));
+}
+
+template <typename IdxT>
+void serialize(const raft::resources& res, std::ostream& os, const dataset<IdxT>& dataset)
+{
+  if (auto x = dynamic_cast<const empty_dataset<IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeEmptyDataset);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<float, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_32F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<half, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_16F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<int8_t, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_8I);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<uint8_t, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_8U);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const vpq_dataset<float, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeVPQDataset);
+    serialize_scalar(res, os, CUDA_R_32F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const vpq_dataset<half, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeVPQDataset);
+    serialize_scalar(res, os, CUDA_R_16F);
+    return serialize(res, os, *x);
+  }
+  RAFT_FAIL("unsupported dataset type.");
+}
+
+template <typename IdxT>
+auto deserialize_empty(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<empty_dataset<IdxT>>
+{
+  auto suggested_dim = deserialize_scalar<uint32_t>(res, is);
+  return std::make_unique<empty_dataset<IdxT>>(suggested_dim);
+}
+
+template <typename DataT, typename IdxT>
+auto deserialize_strided(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<strided_dataset<DataT, IdxT>>
+{
+  auto n_rows     = deserialize_scalar<IdxT>(res, is);
+  auto dim        = deserialize_scalar<uint32_t>(res, is);
+  auto stride     = deserialize_scalar<uint32_t>(res, is);
+  auto host_array = make_host_matrix<DataT, IdxT>(n_rows, dim);
+  deserialize_mdspan(res, is, host_array.view());
+  return make_strided_dataset(res, host_array, stride);
+}
+
+template <typename MathT, typename IdxT>
+auto deserialize_vpq(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<vpq_dataset<MathT, IdxT>>
+{
+  auto n_rows             = deserialize_scalar<IdxT>(res, is);
+  auto dim                = deserialize_scalar<uint32_t>(res, is);
+  auto vq_n_centers       = deserialize_scalar<uint32_t>(res, is);
+  auto pq_n_centers       = deserialize_scalar<uint32_t>(res, is);
+  auto pq_len             = deserialize_scalar<uint32_t>(res, is);
+  auto encoded_row_length = deserialize_scalar<uint32_t>(res, is);
+
+  auto vq_code_book = make_device_matrix<MathT, uint32_t, row_major>(res, vq_n_centers, dim);
+  auto pq_code_book = make_device_matrix<MathT, uint32_t, row_major>(res, pq_n_centers, pq_len);
+  auto data         = make_device_matrix<uint8_t, IdxT, row_major>(res, n_rows, encoded_row_length);
+
+  deserialize_mdspan(res, is, vq_code_book.view());
+  deserialize_mdspan(res, is, pq_code_book.view());
+  deserialize_mdspan(res, is, data.view());
+
+  return std::make_unique<vpq_dataset<MathT, IdxT>>(
+    std::move(vq_code_book), std::move(pq_code_book), std::move(data));
+}
+
+template <typename IdxT>
+auto deserialize_dataset(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<dataset<IdxT>>
+{
+  switch (deserialize_scalar<dataset_instance_tag>(res, is)) {
+    case kSerializeEmptyDataset: return deserialize_empty<IdxT>(res, is);
+    case kSerializeStridedDataset:
+      switch (deserialize_scalar<cudaDataType_t>(res, is)) {
+        case CUDA_R_32F: return deserialize_strided<float, IdxT>(res, is);
+        case CUDA_R_16F: return deserialize_strided<half, IdxT>(res, is);
+        case CUDA_R_8I: return deserialize_strided<int8_t, IdxT>(res, is);
+        case CUDA_R_8U: return deserialize_strided<uint8_t, IdxT>(res, is);
+        default: break;
+      }
+    case kSerializeVPQDataset:
+      switch (deserialize_scalar<cudaDataType_t>(res, is)) {
+        case CUDA_R_32F: return deserialize_vpq<float, IdxT>(res, is);
+        case CUDA_R_16F: return deserialize_vpq<half, IdxT>(res, is);
+        default: break;
+      }
+    default: break;
+  }
+  RAFT_FAIL("Failed to deserialize dataset: unsupported combination of instance tags.");
+}
+
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index b7796d52fa..8e3f7dbaf3 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -61,6 +61,8 @@ namespace raft::neighbors::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
+using internal_extents_t = int64_t;  // The default mdspan extent type used internally.
+
 template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
@@ -442,15 +444,16 @@ void train_per_subset(raft::resources const& handle,
                  stream);
 
     // train PQ codebook for this subspace
-    auto sub_trainset_view =
-      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
-    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+    auto sub_trainset_view = raft::make_device_matrix_view<const float, internal_extents_t>(
+      sub_trainset.data(), n_rows, index.pq_len());
+    auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
       pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
       index.pq_book_size(),
       index.pq_len());
-    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
-    auto cluster_sizes_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+    auto sub_labels_view =
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(sub_labels.data(), n_rows);
+    auto cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
+      pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -525,17 +528,17 @@ void train_per_cluster(raft::resources const& handle,
     size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
     auto pq_n_rows        = uint32_t(std::min(big_enough, available_rows));
     // train PQ codebook for this cluster
-    auto rot_vectors_view = raft::make_device_matrix_view<const float, IdxT>(
+    auto rot_vectors_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       rot_vectors.data(), pq_n_rows, index.pq_len());
-    auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
+    auto centers_tmp_view = raft::make_device_matrix_view<float, internal_extents_t>(
       pq_centers_tmp.data() + static_cast<size_t>(index.pq_book_size()) *
                                 static_cast<size_t>(index.pq_len()) * static_cast<size_t>(l),
       index.pq_book_size(),
       index.pq_len());
     auto pq_labels_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_labels.data(), pq_n_rows);
-    auto pq_cluster_sizes_view =
-      raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(pq_labels.data(), pq_n_rows);
+    auto pq_cluster_sizes_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
+      pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = kmeans_n_iters;
     kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
@@ -1587,11 +1590,11 @@ void extend(raft::resources const& handle,
                                     cudaMemcpyDefault,
                                     stream));
     for (const auto& batch : vec_batches) {
-      auto batch_data_view =
-        raft::make_device_matrix_view<const T, IdxT>(batch.data(), batch.size(), index->dim());
-      auto batch_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(
+      auto batch_data_view = raft::make_device_matrix_view<const T, internal_extents_t>(
+        batch.data(), batch.size(), index->dim());
+      auto batch_labels_view = raft::make_device_vector_view<uint32_t, internal_extents_t>(
         new_data_labels.data() + batch.offset(), batch.size());
-      auto centers_view = raft::make_device_matrix_view<const float, IdxT>(
+      auto centers_view = raft::make_device_matrix_view<const float, internal_extents_t>(
         cluster_centers.data(), n_clusters, index->dim());
       raft::cluster::kmeans_balanced_params kmeans_params;
       kmeans_params.metric = index->metric();
@@ -1767,10 +1770,10 @@ auto build(raft::resources const& handle,
     auto cluster_centers = cluster_centers_buf.data();
 
     // Train balanced hierarchical kmeans clustering
-    auto trainset_const_view =
-      raft::make_device_matrix_view<const float, IdxT>(trainset.data(), n_rows_train, index.dim());
-    auto centers_view =
-      raft::make_device_matrix_view<float, IdxT>(cluster_centers, index.n_lists(), index.dim());
+    auto trainset_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
+      trainset.data(), n_rows_train, index.dim());
+    auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
+      cluster_centers, index.n_lists(), index.dim());
     raft::cluster::kmeans_balanced_params kmeans_params;
     kmeans_params.n_iters = params.kmeans_n_iters;
     kmeans_params.metric  = index.metric();
@@ -1779,9 +1782,10 @@ auto build(raft::resources const& handle,
 
     // Trainset labels are needed for training PQ codebooks
     rmm::device_uvector<uint32_t> labels(n_rows_train, stream, device_memory);
-    auto centers_const_view = raft::make_device_matrix_view<const float, IdxT>(
+    auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       cluster_centers, index.n_lists(), index.dim());
-    auto labels_view = raft::make_device_vector_view<uint32_t, IdxT>(labels.data(), n_rows_train);
+    auto labels_view =
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
     raft::cluster::kmeans_balanced::predict(handle,
                                             kmeans_params,
                                             trainset_const_view,
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh
new file mode 100644
index 0000000000..83dd994bd6
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh
@@ -0,0 +1,71 @@
+
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is to be used in source files generated by
+ * src/neighbors/detailivf_pq_compute_similarity_00_generate.py
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+#include <raft/neighbors/sample_filter.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
+  OutT, LutT, IvfSampleFilterT)                                                             \
+  template auto                                                                             \
+  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
+    const cudaDeviceProp& dev_props,                                                        \
+    bool manage_local_topk,                                                                 \
+    int locality_hint,                                                                      \
+    double preferred_shmem_carveout,                                                        \
+    uint32_t pq_bits,                                                                       \
+    uint32_t pq_dim,                                                                        \
+    uint32_t precomp_data_count,                                                            \
+    uint32_t n_queries,                                                                     \
+    uint32_t n_probes,                                                                      \
+    uint32_t topk)                                                                          \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
+                                                                                            \
+  template void                                                                             \
+  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
+    rmm::cuda_stream_view stream,                                                           \
+    uint32_t dim,                                                                           \
+    uint32_t n_probes,                                                                      \
+    uint32_t pq_dim,                                                                        \
+    uint32_t n_queries,                                                                     \
+    uint32_t queries_offset,                                                                \
+    raft::distance::DistanceType metric,                                                    \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
+    uint32_t topk,                                                                          \
+    uint32_t max_samples,                                                                   \
+    const float* cluster_centers,                                                           \
+    const float* pq_centers,                                                                \
+    const uint8_t* const* pq_dataset,                                                       \
+    const uint32_t* cluster_labels,                                                         \
+    const uint32_t* _chunk_indices,                                                         \
+    const float* queries,                                                                   \
+    const uint32_t* index_list,                                                             \
+    float* query_kths,                                                                      \
+    IvfSampleFilterT sample_filter,                                                         \
+    LutT* lut_scores,                                                                       \
+    OutT* _out_scores,                                                                      \
+    uint32_t* _out_indices);
+
+#define COMMA ,
diff --git a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh
index 0395e86e43..33324714fd 100644
--- a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/error.hpp>
 #include <raft/neighbors/detail/faiss_select/DistanceUtils.h>
 #include <raft/neighbors/detail/faiss_select/Select.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -168,5 +169,7 @@ inline void knn_merge_parts(const value_t* inK,
   else if (k <= 1024)
     knn_merge_parts_impl<value_idx, value_t, 1024, 8>(
       inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
+  else
+    THROW("Unimplemented for k=%d, knn_merge_parts works for k<=1024", k);
 }
 }  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
new file mode 100644
index 0000000000..f6cd2a1ceb
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../dataset.hpp"
+
+#include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/neighbors/detail/ivf_pq_build.cuh>  // pq_bits-bitfield
+#include <raft/spatial/knn/detail/ann_utils.cuh>   // utils::mapping etc
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+// A temporary stub till https://github.com/rapidsai/raft/pull/2077 is re-merged
+namespace raft::util {
+
+/**
+ * Subsample the dataset to create a training set.
+ *
+ * @tparam DatasetT a row-major mdspan or mdarray (device or host)
+ *
+ * @param res raft handle
+ * @param dataset input row-major mdspan or mdarray (device or host)
+ * @param n_samples the size of the output mdarray
+ *
+ * @return a newly allocated subset of the dataset.
+ */
+template <typename DatasetT>
+auto subsample(raft::resources const& res,
+               const DatasetT& dataset,
+               typename DatasetT::index_type n_samples)
+  -> raft::device_matrix<typename DatasetT::value_type, typename DatasetT::index_type>
+{
+  using value_type = typename DatasetT::value_type;
+  using index_type = typename DatasetT::index_type;
+  static_assert(std::is_same_v<typename DatasetT::layout_type, row_major>,
+                "Only row-major layout is supported at the moment");
+  RAFT_EXPECTS(n_samples <= dataset.extent(0),
+               "The number of samples must be smaller than the number of input rows in the current "
+               "implementation.");
+  size_t dim            = dataset.extent(1);
+  size_t trainset_ratio = dataset.extent(0) / n_samples;
+  auto result = raft::make_device_matrix<value_type, index_type>(res, n_samples, dataset.extent(1));
+
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(result.data_handle(),
+                                  sizeof(value_type) * dim,
+                                  dataset.data_handle(),
+                                  sizeof(value_type) * dim * trainset_ratio,
+                                  sizeof(value_type) * dim,
+                                  n_samples,
+                                  cudaMemcpyDefault,
+                                  raft::resource::get_cuda_stream(res)));
+  return result;
+}
+
+}  // namespace raft::util
+
+namespace raft::neighbors::detail {
+
+template <typename DatasetT>
+auto fill_missing_params_heuristics(const vpq_params& params, const DatasetT& dataset) -> vpq_params
+{
+  vpq_params r  = params;
+  double n_rows = dataset.extent(0);
+  size_t dim    = dataset.extent(1);
+  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{4}); }
+  if (r.pq_bits == 0) { r.pq_bits = 8; }
+  if (r.vq_n_centers == 0) { r.vq_n_centers = raft::round_up_safe<uint32_t>(std::sqrt(n_rows), 8); }
+  if (r.vq_kmeans_trainset_fraction == 0) {
+    double vq_trainset_size       = 100.0 * r.vq_n_centers;
+    r.vq_kmeans_trainset_fraction = std::min(1.0, vq_trainset_size / n_rows);
+  }
+  if (r.pq_kmeans_trainset_fraction == 0) {
+    // NB: we'll have actually `pq_dim` times more samples than this
+    //     (because the dataset is reinterpreted as `[n_rows * pq_dim, pq_len]`)
+    double pq_trainset_size       = 1000.0 * (1u << r.pq_bits);
+    r.pq_kmeans_trainset_fraction = std::min(1.0, pq_trainset_size / n_rows);
+  }
+  return r;
+}
+
+template <typename T, typename DatasetT>
+auto transform_data(const raft::resources& res, DatasetT dataset)
+  -> device_mdarray<T, typename DatasetT::extents_type, typename DatasetT::layout_type>
+{
+  using index_type       = typename DatasetT::index_type;
+  using extents_type     = typename DatasetT::extents_type;
+  using layout_type      = typename DatasetT::layout_type;
+  using out_mdarray_type = device_mdarray<T, extents_type, layout_type>;
+  if constexpr (std::is_same_v<out_mdarray_type, std::decay<DatasetT>>) { return dataset; }
+
+  auto result = raft::make_device_mdarray<T, index_type, layout_type>(res, dataset.extents());
+
+  linalg::map(res,
+              result.view(),
+              spatial::knn::detail::utils::mapping<T>{},
+              raft::make_const_mdspan(dataset.view()));
+
+  return result;
+}
+
+/** Fix the internal indexing type to avoid integer underflows/overflows */
+using ix_t = int64_t;
+
+template <typename MathT, typename DatasetT>
+auto train_vq(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> device_matrix<MathT, uint32_t, row_major>
+{
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t vq_n_centers = params.vq_n_centers;
+  const ix_t dim          = dataset.extent(1);
+  const ix_t n_rows_train = n_rows * params.vq_kmeans_trainset_fraction;
+
+  // Subsample the dataset and transform into the required type if necessary
+  auto vq_trainset = raft::util::subsample(res, dataset, n_rows_train);
+  auto vq_centers  = raft::make_device_matrix<MathT, uint32_t, row_major>(res, vq_n_centers, dim);
+
+  using kmeans_in_type = typename DatasetT::value_type;
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.n_iters = params.kmeans_n_iters;
+  kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+  auto vq_centers_view =
+    raft::make_device_matrix_view<MathT, ix_t>(vq_centers.data_handle(), vq_n_centers, dim);
+  auto vq_trainset_view = raft::make_device_matrix_view<const kmeans_in_type, ix_t>(
+    vq_trainset.data_handle(), n_rows_train, dim);
+  raft::cluster::kmeans_balanced::fit<kmeans_in_type, MathT, ix_t>(
+    res,
+    kmeans_params,
+    vq_trainset_view,
+    vq_centers_view,
+    spatial::knn::detail::utils::mapping<MathT>{});
+
+  return vq_centers;
+}
+
+template <typename LabelT, typename DatasetT, typename VqCentersT>
+auto predict_vq(const raft::resources& res, const DatasetT& dataset, const VqCentersT& vq_centers)
+  -> device_vector<LabelT, typename DatasetT::index_type>
+{
+  using kmeans_data_type = typename DatasetT::value_type;
+  using kmeans_math_type = typename VqCentersT::value_type;
+  using index_type       = typename DatasetT::index_type;
+  using label_type       = LabelT;
+
+  auto vq_labels = raft::make_device_vector<label_type, index_type>(res, dataset.extent(0));
+
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
+
+  auto vq_centers_view = raft::make_device_matrix_view<const kmeans_math_type, index_type>(
+    vq_centers.data_handle(), vq_centers.extent(0), vq_centers.extent(1));
+
+  auto vq_dataset_view = raft::make_device_matrix_view<const kmeans_data_type, index_type>(
+    dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+
+  raft::cluster::kmeans_balanced::
+    predict<kmeans_data_type, kmeans_math_type, index_type, label_type>(
+      res,
+      kmeans_params,
+      vq_dataset_view,
+      vq_centers_view,
+      vq_labels.view(),
+      spatial::knn::detail::utils::mapping<kmeans_math_type>{});
+
+  return vq_labels;
+}
+
+template <typename MathT, typename DatasetT>
+auto train_pq(const raft::resources& res,
+              const vpq_params& params,
+              const DatasetT& dataset,
+              const device_matrix_view<const MathT, uint32_t, row_major>& vq_centers)
+  -> device_matrix<MathT, uint32_t, row_major>
+{
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t dim          = dataset.extent(1);
+  const ix_t pq_dim       = params.pq_dim;
+  const ix_t pq_bits      = params.pq_bits;
+  const ix_t pq_n_centers = ix_t{1} << pq_bits;
+  const ix_t pq_len       = raft::div_rounding_up_safe(dim, pq_dim);
+  const ix_t n_rows_train = n_rows * params.pq_kmeans_trainset_fraction;
+
+  // Subsample the dataset and transform into the required type if necessary
+  auto pq_trainset = transform_data<MathT>(res, raft::util::subsample(res, dataset, n_rows_train));
+
+  // Subtract VQ centers
+  {
+    auto vq_labels   = predict_vq<uint32_t>(res, pq_trainset, vq_centers);
+    using index_type = typename DatasetT::index_type;
+    linalg::map_offset(
+      res,
+      pq_trainset.view(),
+      [labels = vq_labels.view(), centers = vq_centers, dim] __device__(index_type off, MathT x) {
+        index_type i = off / dim;
+        index_type j = off % dim;
+        return x - centers(labels(i), j);
+      },
+      raft::make_const_mdspan(pq_trainset.view()));
+  }
+
+  auto pq_centers = raft::make_device_matrix<MathT, uint32_t, row_major>(res, pq_n_centers, pq_len);
+
+  // Train PQ centers
+  {
+    raft::cluster::kmeans_balanced_params kmeans_params;
+    kmeans_params.n_iters = params.kmeans_n_iters;
+    kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+
+    auto pq_centers_view =
+      raft::make_device_matrix_view<MathT, ix_t>(pq_centers.data_handle(), pq_n_centers, pq_len);
+
+    auto pq_trainset_view = raft::make_device_matrix_view<const MathT, ix_t>(
+      pq_trainset.data_handle(), n_rows_train * pq_dim, pq_len);
+
+    raft::cluster::kmeans_balanced::fit<MathT, MathT, ix_t>(
+      res, kmeans_params, pq_trainset_view, pq_centers_view);
+  }
+
+  return pq_centers;
+}
+
+template <uint32_t SubWarpSize, typename DataT, typename MathT, typename IdxT, typename LabelT>
+__device__ auto compute_code(device_matrix_view<const DataT, IdxT, row_major> dataset,
+                             device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                             device_matrix_view<const MathT, uint32_t, row_major> pq_centers,
+                             IdxT i,
+                             uint32_t j,
+                             LabelT vq_label) -> uint8_t
+{
+  auto data_mapping = spatial::knn::detail::utils::mapping<MathT>{};
+  uint32_t lane_id  = Pow2<SubWarpSize>::mod(laneId());
+
+  const uint32_t pq_book_size = pq_centers.extent(0);
+  const uint32_t pq_len       = pq_centers.extent(1);
+  float min_dist              = std::numeric_limits<float>::infinity();
+  uint8_t code                = 0;
+  // calculate the distance for each PQ cluster, find the minimum for each thread
+  for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) {
+    // NB: the L2 quantifiers on residuals are always trained on L2 metric.
+    float d = 0.0f;
+    for (uint32_t k = 0; k < pq_len; k++) {
+      auto jk = j * pq_len + k;
+      auto x  = data_mapping(dataset(i, jk)) - vq_centers(vq_label, jk);
+      auto t  = x - pq_centers(l, k);
+      d += t * t;
+    }
+    if (d < min_dist) {
+      min_dist = d;
+      code     = uint8_t(l);
+    }
+  }
+  // reduce among threads
+#pragma unroll
+  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+    const auto other_code = shfl_xor(code, stride, SubWarpSize);
+    if (other_dist < min_dist) {
+      min_dist = other_dist;
+      code     = other_code;
+    }
+  }
+  return code;
+}
+
+template <uint32_t BlockSize,
+          uint32_t PqBits,
+          typename DataT,
+          typename MathT,
+          typename IdxT,
+          typename LabelT>
+__launch_bounds__(BlockSize) RAFT_KERNEL
+  process_and_fill_codes_kernel(device_matrix_view<uint8_t, IdxT, row_major> out_codes,
+                                device_matrix_view<const DataT, IdxT, row_major> dataset,
+                                device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                                device_vector_view<const LabelT, IdxT, row_major> vq_labels,
+                                device_matrix_view<const MathT, uint32_t, row_major> pq_centers)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  using subwarp_align             = Pow2<kSubWarpSize>;
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
+  if (row_ix >= out_codes.extent(0)) { return; }
+
+  const uint32_t pq_dim = raft::div_rounding_up_unsafe(vq_centers.extent(1), pq_centers.extent(1));
+
+  const uint32_t lane_id = Pow2<kSubWarpSize>::mod(threadIdx.x);
+  const LabelT vq_label  = vq_labels(row_ix);
+
+  // write label
+  auto* out_label_ptr = reinterpret_cast<LabelT*>(&out_codes(row_ix, 0));
+  if (lane_id == 0) { *out_label_ptr = vq_label; }
+
+  auto* out_codes_ptr = reinterpret_cast<uint8_t*>(out_label_ptr + 1);
+  ivf_pq::detail::bitfield_view_t<PqBits> code_view{out_codes_ptr};
+  for (uint32_t j = 0; j < pq_dim; j++) {
+    // find PQ label
+    uint8_t code = compute_code<kSubWarpSize>(dataset, vq_centers, pq_centers, row_ix, j, vq_label);
+    // TODO: this writes in global memory one byte per warp, which is very slow.
+    //  It's better to keep the codes in the shared memory or registers and dump them at once.
+    if (lane_id == 0) { code_view[j] = code; }
+  }
+}
+
+template <typename MathT, typename IdxT, typename DatasetT>
+auto process_and_fill_codes(const raft::resources& res,
+                            const vpq_params& params,
+                            const DatasetT& dataset,
+                            device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                            device_matrix_view<const MathT, uint32_t, row_major> pq_centers)
+  -> device_matrix<uint8_t, IdxT, row_major>
+{
+  using data_t     = typename DatasetT::value_type;
+  using cdataset_t = vpq_dataset<MathT, IdxT>;
+  using label_t    = uint32_t;
+
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t dim          = dataset.extent(1);
+  const ix_t pq_dim       = params.pq_dim;
+  const ix_t pq_bits      = params.pq_bits;
+  const ix_t pq_n_centers = ix_t{1} << pq_bits;
+  // NB: codes must be aligned at least to sizeof(label_t) to be able to read labels.
+  const ix_t codes_rowlen =
+    sizeof(label_t) * (1 + raft::div_rounding_up_safe<ix_t>(pq_dim * pq_bits, 8 * sizeof(label_t)));
+
+  auto codes = raft::make_device_matrix<uint8_t, IdxT, row_major>(res, n_rows, codes_rowlen);
+
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  // TODO: with scaling workspace we could choose the batch size dynamically
+  constexpr ix_t kReasonableMaxBatchSize = 65536;
+  constexpr ix_t kBlockSize              = 256;
+  const ix_t threads_per_vec             = std::min<ix_t>(WarpSize, pq_n_centers);
+  dim3 threads(kBlockSize, 1, 1);
+  ix_t max_batch_size = std::min<ix_t>(n_rows, kReasonableMaxBatchSize);
+  auto kernel         = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return process_and_fill_codes_kernel<kBlockSize, 4, data_t, MathT, IdxT, label_t>;
+      case 5: return process_and_fill_codes_kernel<kBlockSize, 5, data_t, MathT, IdxT, label_t>;
+      case 6: return process_and_fill_codes_kernel<kBlockSize, 6, data_t, MathT, IdxT, label_t>;
+      case 7: return process_and_fill_codes_kernel<kBlockSize, 7, data_t, MathT, IdxT, label_t>;
+      case 8: return process_and_fill_codes_kernel<kBlockSize, 8, data_t, MathT, IdxT, label_t>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(pq_bits);
+  for (const auto& batch :
+       spatial::knn::detail::utils::batch_load_iterator(dataset.data_handle(),
+                                                        n_rows,
+                                                        dim,
+                                                        max_batch_size,
+                                                        stream,
+                                                        rmm::mr::get_current_device_resource())) {
+    auto batch_view = raft::make_device_matrix_view(batch.data(), ix_t(batch.size()), dim);
+    auto labels     = predict_vq<label_t>(res, batch_view, vq_centers);
+    dim3 blocks(div_rounding_up_safe<ix_t>(n_rows, kBlockSize / threads_per_vec), 1, 1);
+    kernel<<<blocks, threads, 0, stream>>>(
+      make_device_matrix_view<uint8_t, IdxT>(
+        codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen),
+      batch_view,
+      vq_centers,
+      make_const_mdspan(labels.view()),
+      pq_centers);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+  return codes;
+}
+
+template <typename NewMathT, typename OldMathT, typename IdxT>
+auto vpq_convert_math_type(const raft::resources& res, vpq_dataset<OldMathT, IdxT>&& src)
+  -> vpq_dataset<NewMathT, IdxT>
+{
+  auto vq_code_book = make_device_mdarray<NewMathT>(res, src.vq_code_book.extents());
+  auto pq_code_book = make_device_mdarray<NewMathT>(res, src.pq_code_book.extents());
+
+  linalg::map(res,
+              vq_code_book.view(),
+              spatial::knn::detail::utils::mapping<NewMathT>{},
+              raft::make_const_mdspan(src.vq_code_book.view()));
+  linalg::map(res,
+              pq_code_book.view(),
+              spatial::knn::detail::utils::mapping<NewMathT>{},
+              raft::make_const_mdspan(src.pq_code_book.view()));
+  return vpq_dataset<NewMathT, IdxT>{
+    std::move(vq_code_book), std::move(pq_code_book), std::move(src.data)};
+}
+
+template <typename DatasetT, typename MathT, typename IdxT>
+auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> vpq_dataset<MathT, IdxT>
+{
+  // Use a heuristic to impute missing parameters.
+  auto ps = fill_missing_params_heuristics(params, dataset);
+
+  // Train codes
+  auto vq_code_book = train_vq<MathT>(res, ps, dataset);
+  auto pq_code_book =
+    train_pq<MathT>(res, ps, dataset, raft::make_const_mdspan(vq_code_book.view()));
+
+  // Encode dataset
+  auto codes = process_and_fill_codes<MathT, IdxT>(res,
+                                                   ps,
+                                                   dataset,
+                                                   raft::make_const_mdspan(vq_code_book.view()),
+                                                   raft::make_const_mdspan(pq_code_book.view()));
+
+  return vpq_dataset<MathT, IdxT>{
+    std::move(vq_code_book), std::move(pq_code_book), std::move(codes)};
+}
+
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/vpq_dataset.cuh b/cpp/include/raft/neighbors/vpq_dataset.cuh
new file mode 100644
index 0000000000..73ee6c52ed
--- /dev/null
+++ b/cpp/include/raft/neighbors/vpq_dataset.cuh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "dataset.hpp"
+#include "detail/vpq_dataset.cuh"
+
+#include <raft/core/resources.hpp>
+
+namespace raft::neighbors {
+
+/**
+ * @brief Compress a dataset for use in CAGRA-Q search in place of the original data.
+ *
+ * @tparam DatasetT a row-major mdspan or mdarray (device or host).
+ * @tparam MathT a type of the codebook elements and internal math ops.
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] params VQ and PQ parameters for compressing the data
+ * @param[in] dataset a row-major mdspan or mdarray (device or host) [n_rows, dim].
+ */
+template <typename DatasetT,
+          typename MathT = typename DatasetT::value_type,
+          typename IdxT  = typename DatasetT::index_type>
+auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> vpq_dataset<MathT, IdxT>
+{
+  if constexpr (std::is_same_v<MathT, half>) {
+    return detail::vpq_convert_math_type<half, float, IdxT>(
+      res, detail::vpq_build<DatasetT, float, IdxT>(res, params, dataset));
+  } else {
+    return detail::vpq_build<DatasetT, MathT, IdxT>(res, params, dataset);
+  }
+}
+
+}  // namespace raft::neighbors
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 57f4c8d33d..61a944e9b6 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,12 +17,20 @@
 #pragma once
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/math.hpp>
+#include <raft/core/operators.cuh>
+#include <raft/linalg/map.cuh>
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/detail/cub_wrappers.cuh>
 #include <raft/util/scatter.cuh>
 
+#include <rmm/device_scalar.hpp>
+
+#include <cub/cub.cuh>
+
 namespace raft {
 namespace random {
 namespace detail {
@@ -278,6 +286,7 @@ std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
                      len);
 }
 
+/** Note the memory space requirements are O(4*len) */
 template <typename DataT, typename WeightsT, typename IdxT = int>
 void sampleWithoutReplacement(RngState& rng_state,
                               DataT* out,
@@ -328,6 +337,133 @@ void affine_transform_params(RngState const& rng_state, IdxT n, IdxT& a, IdxT& b
   b = mt_rng() % n;
 }
 
+/** @brief Sample without replacement from range 0..N-1.
+ *
+ * Elements are sampled uniformly.
+ * The algorithm will allocate a workspace of size O(4*n_samples) internally.
+ *
+ * We use max N random numbers. Depending on how large n_samples is w.r.t to N, we
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
+ *
+ * @tparam IdxT type of indices that we sample
+ * @tparam MatIdxT extent type of the returned mdarray
+ *
+ * @param res RAFT resource handle
+ * @param state random number generator state
+ * @param N number of elements to sample from. We will sample values in range 0..N-1
+ * @param n_samples number of samples to return
+ *
+ * @return device mdarray with the random samples
+ */
+template <typename IdxT, typename MatIdxT = IdxT>
+auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT n_samples)
+  -> raft::device_vector<IdxT, MatIdxT>
+{
+  RAFT_EXPECTS(n_samples <= N, "Cannot have more training samples than dataset vectors");
+
+  // Number of samples we'll need to sample (with replacement), to expect 'k'
+  // unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
+  // https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
+  IdxT n_excess_samples =
+    n_samples < N
+      ? std::ceil(raft::log(1 - double(n_samples) / double(N)) / (raft::log(1 - 1 / double(N))))
+      : N;
+
+  // There is a variance of n_excess_samples, we take 10% more elements.
+  n_excess_samples += std::max<IdxT>(0.1 * n_samples, 100);
+
+  // n_excess_sampless will be larger than N around k = 0.64*N. When we reach N, then instead of
+  // doing rejection sampling, we simply shuffle the range [0..N-1] using N random numbers.
+  n_excess_samples = std::min<IdxT>(n_excess_samples, N);
+  auto rnd_idx     = raft::make_device_vector<IdxT, IdxT>(res, n_excess_samples);
+
+  auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  raft::linalg::map_offset(res, linear_idx.view(), identity_op());
+
+  uniformInt(res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(N));
+
+  // Sort indices according to rnd keys
+  size_t workspace_size = 0;
+  auto stream           = resource::get_cuda_stream(res);
+  cub::DeviceMergeSort::SortPairs(nullptr,
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{},
+                                  stream);
+  auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{},
+                                  stream);
+
+  if (rnd_idx.size() == static_cast<size_t>(N)) {
+    // We shuffled the linear_idx array by sorting it according to rnd_idx.
+    // We return the first n_samples elements.
+    if (n_samples == N) { return linear_idx; }
+    rnd_idx = raft::make_device_vector<IdxT, IdxT>(res, n_samples);
+    raft::copy(rnd_idx.data_handle(), linear_idx.data_handle(), n_samples, stream);
+    return rnd_idx;
+  }
+  // Else we do a rejection sampling (or excess sampling): we generated more random indices than
+  // needed and reject the duplicates.
+  auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  auto values_out = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  rmm::device_scalar<IdxT> num_selected(stream);
+  size_t worksize2 = 0;
+  cub::DeviceSelect::UniqueByKey(nullptr,
+                                 worksize2,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  if (worksize2 > workspace.size()) {
+    workspace      = raft::make_device_vector<char, IdxT>(res, worksize2);
+    workspace_size = workspace.size();
+  }
+
+  cub::DeviceSelect::UniqueByKey(workspace.data_handle(),
+                                 workspace_size,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  IdxT selected = num_selected.value(stream);
+
+  if (selected < n_samples) {
+    RAFT_LOG_DEBUG("Subsampling returned with less unique indices (%zu) than requested (%zu)",
+                   (size_t)selected,
+                   (size_t)n_samples);
+
+    // We continue to select n_samples elements, this will now contains a few duplicates.
+  }
+
+  // After duplicates are removed, we need to shuffle back to random order
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
+                                  workspace_size,
+                                  values_out.data_handle(),
+                                  keys_out.data_handle(),
+                                  n_samples,
+                                  raft::less_op{},
+                                  stream);
+
+  values_out = raft::make_device_vector<IdxT, IdxT>(res, n_samples);
+  raft::copy(values_out.data_handle(), keys_out.data_handle(), n_samples, stream);
+  return values_out;
+}
+
 };  // end namespace detail
 };  // end namespace random
 };  // end namespace raft
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 4e63669f98..7fd461980f 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -813,6 +813,32 @@ void sampleWithoutReplacement(raft::resources const& handle,
     rng_state, out, outIdx, in, wts, sampledLen, len, resource::get_cuda_stream(handle));
 }
 
+/** @brief Sample from range 0..N-1.
+ *
+ * Elements are sampled uniformly. The method aims to sample without replacement,
+ * but there is a small probability of a few having duplicate elements.
+ *
+ * The algorithm will allocate a workspace of size 4*n_samples*sizeof(IdxT) internally.
+ *
+ * We use max N random numbers. Depending on how large n_samples is w.r.t to N, we
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
+ *
+ * @tparam IdxT type of indices that we sample
+ * @tparam MatIdxT extent type of the returned mdarray
+ *
+ * @param res RAFT resource handle
+ * @param state random number generator state
+ * @param N number of elements to sample from. We will sample values in range 0..N-1.
+ * @param n_samples number of samples to return
+ *
+ * @return device mdarray with the random samples
+ */
+template <typename IdxT, typename MatIdxT = IdxT>
+auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT n_samples)
+{
+  return detail::excess_subsample(res, state, N, n_samples);
+}
+
 /**
  * @brief Generates the 'a' and 'b' parameters for a modulo affine
  *        transformation equation: `(ax + b) % n`
diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index cf4dbc7aa3..6c330acb26 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
   __syncthreads();
   raft::myAtomicAdd(smu + thisColId, thread_data);
   __syncthreads();
-  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]);
+  if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
 }
 
 template <typename Type, typename IdxType, int TPB>
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index acee4a944e..bc2644a233 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType
   __syncthreads();
   raft::myAtomicAdd(sstd + thisColId, thread_data);
   __syncthreads();
-  if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(std + colId, sstd[thisColId]);
+  if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]);
 }
 
 template <typename Type, typename IdxType, int TPB>
diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh
index bb45eb50f4..4f85536e6c 100644
--- a/cpp/include/raft/stats/detail/sum.cuh
+++ b/cpp/include/raft/stats/detail/sum.cuh
@@ -34,30 +34,72 @@ RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
   IdxType thisRowId           = threadIdx.x / ColsPerBlk;
   IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
   IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data            = Type(0);
+  Type thread_sum             = Type(0);
   const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
-  for (IdxType i = rowId; i < N; i += stride)
-    thread_data += (colId < D) ? data[i * D + colId] : Type(0);
+  for (IdxType i = rowId; i < N; i += stride) {
+    thread_sum += (colId < D) ? data[i * D + colId] : Type(0);
+  }
   __shared__ Type smu[ColsPerBlk];
   if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
   __syncthreads();
-  raft::myAtomicAdd(smu + thisColId, thread_data);
+  raft::myAtomicAdd(smu + thisColId, thread_sum);
+  __syncthreads();
+  if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
+}
+
+template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
+RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
+  constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk;
+  IdxType thisColId               = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId               = threadIdx.x / ColsPerBlk;
+  IdxType colId                   = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId                   = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_sum                 = Type(0);
+  Type thread_c                   = Type(0);
+  const IdxType stride            = RowsPerBlkPerIter * gridDim.x;
+  for (IdxType i = rowId; i < N; i += stride) {
+    // KahanBabushkaNeumaierSum
+    const Type cur_value = (colId < D) ? data[i * D + colId] : Type(0);
+    const Type t         = thread_sum + cur_value;
+    if (abs(thread_sum) >= abs(cur_value)) {
+      thread_c += (thread_sum - t) + cur_value;
+    } else {
+      thread_c += (cur_value - t) + thread_sum;
+    }
+    thread_sum = t;
+  }
+  thread_sum += thread_c;
+  __shared__ Type smu[ColsPerBlk];
+  if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
+  __syncthreads();
+  raft::myAtomicAdd(smu + thisColId, thread_sum);
   __syncthreads();
   if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
 }
 
 template <typename Type, typename IdxType, int TPB>
-RAFT_KERNEL sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
 {
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  Type thread_data = Type(0);
+  Type thread_sum  = Type(0);
+  Type thread_c    = Type(0);
   IdxType colStart = N * blockIdx.x;
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
-    IdxType idx = colStart + i;
-    thread_data += data[idx];
+    // KahanBabushkaNeumaierSum
+    IdxType idx          = colStart + i;
+    const Type cur_value = data[idx];
+    const Type t         = thread_sum + cur_value;
+    if (abs(thread_sum) >= abs(cur_value)) {
+      thread_c += (thread_sum - t) + cur_value;
+    } else {
+      thread_c += (cur_value - t) + thread_sum;
+    }
+    thread_sum = t;
   }
-  Type acc = BlockReduce(temp_storage).Sum(thread_data);
+  thread_sum += thread_c;
+  Type acc = BlockReduce(temp_storage).Sum(thread_sum);
   if (threadIdx.x == 0) { mu[blockIdx.x] = acc; }
 }
 
@@ -66,15 +108,21 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
 {
   static const int TPB = 256;
   if (rowMajor) {
-    static const int RowsPerThread = 4;
-    static const int ColsPerBlk    = 32;
-    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk       = 8;
+    static const int MinRowsPerThread = 16;
+    static const int MinRowsPerBlk    = (TPB / ColsPerBlk) * MinRowsPerThread;
+    static const int MaxBlocksDimX    = 8192;
+
+    const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk);
+    const IdxType grid_x =
+      raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk));
+
+    dim3 grid(grid_x, grid_y);
     RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D));
-    sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+    sumKahanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
       <<<grid, TPB, 0, stream>>>(output, input, D, N);
   } else {
-    sumKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
+    sumKahanKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
   }
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh
index 7e897e67f5..2c2b1aa228 100644
--- a/cpp/include/raft/util/reduction.cuh
+++ b/cpp/include/raft/util/reduction.cuh
@@ -157,11 +157,10 @@ DI std::pair<T, i_t> blockRankedReduce(T val,
     val = values[lane];
     idx = indices[lane];
   } else {
-    // get the min if it is a max op, get the max if it is a min op
-    val = reduce_op(std::numeric_limits<T>::min(), std::numeric_limits<T>::max()) ==
-              std::numeric_limits<T>::min()
-            ? std::numeric_limits<T>::max()
-            : std::numeric_limits<T>::min();
+    // get the lower_bound of the type if it is a max op,
+    // get the upper bound of the type if it is a min op
+    val = reduce_op(lower_bound<T>(), upper_bound<T>()) == lower_bound<T>() ? upper_bound<T>()
+                                                                            : lower_bound<T>();
     idx = -1;
   }
   __syncthreads();
diff --git a/cpp/include/raft_runtime/distance/fused_distance_nn.hpp b/cpp/include/raft_runtime/distance/fused_distance_nn.hpp
new file mode 100644
index 0000000000..7c309d6fc7
--- /dev/null
+++ b/cpp/include/raft_runtime/distance/fused_distance_nn.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::runtime::distance {
+
+/**
+ * @defgroup fused_distance_nn_min_arg_runtime Fused Distance 1NN Runtime API
+ * @{
+ */
+
+/**
+ * @brief Wrapper around fusedDistanceNN with minimum reduction operators.
+ *
+ * fusedDistanceNN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ *
+ * @param[in] handle         raft handle
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
+ * @param[in]  metric        Distance metric to be used (supports L2, cosine)
+ * @param[in]  isRowMajor    whether the input/output is row or column major.
+ * @param[in]  metric_arg    power argument for distances like Minkowski (not supported for now)
+ */
+void fused_distance_nn_min_arg(raft::resources const& handle,
+                               int* min,
+                               const float* x,
+                               const float* y,
+                               int m,
+                               int n,
+                               int k,
+                               bool sqrt,
+                               raft::distance::DistanceType metric,
+                               bool isRowMajor,
+                               float metric_arg);
+
+/** @} */  // end group fused_distance_nn_min_arg_runtime
+
+}  // end namespace raft::runtime::distance
diff --git a/cpp/include/raft_runtime/distance/fused_l2_nn.hpp b/cpp/include/raft_runtime/distance/fused_l2_nn.hpp
index 6154e03f4c..e46b3c5271 100644
--- a/cpp/include/raft_runtime/distance/fused_l2_nn.hpp
+++ b/cpp/include/raft_runtime/distance/fused_l2_nn.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,23 +42,25 @@ namespace raft::runtime::distance {
  * @param[in]  k             gemm k
  * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
  */
-void fused_l2_nn_min_arg(raft::resources const& handle,
-                         int* min,
-                         const float* x,
-                         const float* y,
-                         int m,
-                         int n,
-                         int k,
-                         bool sqrt);
+[[deprecated("use fused_distance_nn_min_arg instead")]] void fused_l2_nn_min_arg(
+  raft::resources const& handle,
+  int* min,
+  const float* x,
+  const float* y,
+  int m,
+  int n,
+  int k,
+  bool sqrt);
 
-void fused_l2_nn_min_arg(raft::resources const& handle,
-                         int* min,
-                         const double* x,
-                         const double* y,
-                         int m,
-                         int n,
-                         int k,
-                         bool sqrt);
+[[deprecated("use fused_distance_nn_min_arg instead")]] void fused_l2_nn_min_arg(
+  raft::resources const& handle,
+  int* min,
+  const double* x,
+  const double* y,
+  int m,
+  int n,
+  int k,
+  bool sqrt);
 
 /** @} */  // end group fused_l2_nn_min_arg_runtime
 
diff --git a/cpp/internal/raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh b/cpp/internal/raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh
new file mode 100644
index 0000000000..aa14ab19b8
--- /dev/null
+++ b/cpp/internal/raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/detail/macros.hpp>       // RAFT_WEAK_FUNCTION
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/neighbors/detail/ivf_pq_compute_similarity.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>  // raft::neighbors::ivf_pq::detail::fp_8bit
+#include <raft/neighbors/sample_filter.cuh>          // none_ivf_sample_filter
+#include <raft/neighbors/sample_filter_types.hpp>    // none_ivf_sample_filter
+
+#include <rmm/cuda_stream_view.hpp>  // rmm::cuda_stream_view
+
+#include <cuda_fp16.h>  // __half
+
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
+  OutT, LutT, IvfSampleFilterT)                                                             \
+  extern template auto                                                                      \
+  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
+    const cudaDeviceProp& dev_props,                                                        \
+    bool manage_local_topk,                                                                 \
+    int locality_hint,                                                                      \
+    double preferred_shmem_carveout,                                                        \
+    uint32_t pq_bits,                                                                       \
+    uint32_t pq_dim,                                                                        \
+    uint32_t precomp_data_count,                                                            \
+    uint32_t n_queries,                                                                     \
+    uint32_t n_probes,                                                                      \
+    uint32_t topk)                                                                          \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
+                                                                                            \
+  extern template void                                                                      \
+  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
+    rmm::cuda_stream_view stream,                                                           \
+    uint32_t dim,                                                                           \
+    uint32_t n_probes,                                                                      \
+    uint32_t pq_dim,                                                                        \
+    uint32_t n_queries,                                                                     \
+    uint32_t queries_offset,                                                                \
+    raft::distance::DistanceType metric,                                                    \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
+    uint32_t topk,                                                                          \
+    uint32_t max_samples,                                                                   \
+    const float* cluster_centers,                                                           \
+    const float* pq_centers,                                                                \
+    const uint8_t* const* pq_dataset,                                                       \
+    const uint32_t* cluster_labels,                                                         \
+    const uint32_t* _chunk_indices,                                                         \
+    const float* queries,                                                                   \
+    const uint32_t* index_list,                                                             \
+    float* query_kths,                                                                      \
+    IvfSampleFilterT sample_filter,                                                         \
+    LutT* lut_scores,                                                                       \
+    OutT* _out_scores,                                                                      \
+    uint32_t* _out_indices);
+
+#define COMMA ,
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
+
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh
new file mode 100644
index 0000000000..7a65e2d2f8
--- /dev/null
+++ b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>  // raft::device_matrix_view
+#include <raft/core/resources.hpp>      // raft::resources
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+#include <raft/neighbors/sample_filter.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
+
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+
+#include <cstdint>  // int64_t
+
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    raft::device_matrix_view<const T, uint32_t, row_major> queries,  \
+    raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,   \
+    raft::device_matrix_view<float, uint32_t, row_major> distances); \
+                                                                     \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    const T* queries,                                                \
+    uint32_t n_queries,                                              \
+    uint32_t k,                                                      \
+    IdxT* neighbors,                                                 \
+    float* distances,                                                \
+    rmm::mr::device_memory_resource* mr);                            \
+                                                                     \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    const T* queries,                                                \
+    uint32_t n_queries,                                              \
+    uint32_t k,                                                      \
+    IdxT* neighbors,                                                 \
+    float* distances)
+
+instantiate_raft_neighbors_ivf_pq_search(float, uint32_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_search
+
+#define instantiate_raft_neighbors_ivf_pq_search_with_filtering(T, IdxT, FilterT)        \
+  extern template void raft::neighbors::ivf_pq::search_with_filtering<T, IdxT, FilterT>( \
+    raft::resources const& handle,                                                       \
+    const search_params& params,                                                         \
+    const index<IdxT>& idx,                                                              \
+    raft::device_matrix_view<const T, uint32_t, row_major> queries,                      \
+    raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,                       \
+    raft::device_matrix_view<float, uint32_t, row_major> distances,                      \
+    FilterT sample_filter)
+
+#define COMMA ,
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, uint32_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>);
+
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, uint32_t, raft::neighbors::filtering::none_ivf_sample_filter);
+
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, int64_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>);
+
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  int8_t, int64_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>);
+
+#undef COMMA
+#undef instantiate_raft_neighbors_ivf_pq_search_with_filtering
diff --git a/cpp/src/distance/fused_distance_nn.cu b/cpp/src/distance/fused_distance_nn.cu
new file mode 100644
index 0000000000..dc722d929c
--- /dev/null
+++ b/cpp/src/distance/fused_distance_nn.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/kvp.hpp>  // raft::KeyValuePair
+#include <raft/distance/fused_distance_nn-inl.cuh>
+
+#include <cstdint>  // int64_t
+
+#define instantiate_raft_distance_fusedDistanceNNMinReduce(DataT, OutT, IdxT) \
+  template void raft::distance::fusedDistanceNNMinReduce<DataT, OutT, IdxT>(  \
+    OutT * min,                                                               \
+    const DataT* x,                                                           \
+    const DataT* y,                                                           \
+    const DataT* xn,                                                          \
+    const DataT* yn,                                                          \
+    IdxT m,                                                                   \
+    IdxT n,                                                                   \
+    IdxT k,                                                                   \
+    void* workspace,                                                          \
+    bool sqrt,                                                                \
+    bool initOutBuffer,                                                       \
+    bool isRowMajor,                                                          \
+    raft::distance::DistanceType metric,                                      \
+    float metric_arg,                                                         \
+    cudaStream_t stream)
+
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, float, int);
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, float, int64_t);
+
+// We can't have comma's in the macro expansion, so we use the COMMA macro:
+#define COMMA ,
+
+instantiate_raft_distance_fusedDistanceNNMinReduce(float, raft::KeyValuePair<int COMMA float>, int);
+instantiate_raft_distance_fusedDistanceNNMinReduce(float,
+                                                   raft::KeyValuePair<int64_t COMMA float>,
+                                                   int64_t);
+
+#undef COMMA
+
+#undef instantiate_raft_distance_fusedDistanceNNMinReduce
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
index 670ed57ed1..9825a48f81 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-header = """
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+header = """/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,78 +30,56 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
-
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT, IvfSampleFilterT) \\
-    template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \\
-        const cudaDeviceProp& dev_props,                                \\
-        bool manage_local_topk,                                         \\
-        int locality_hint,                                              \\
-        double preferred_shmem_carveout,                                \\
-        uint32_t pq_bits,                                               \\
-        uint32_t pq_dim,                                                \\
-        uint32_t precomp_data_count,                                    \\
-        uint32_t n_queries,                                             \\
-        uint32_t n_probes,                                              \\
-        uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>; \\
-\\
-    template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>( \\
-        raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,        \\
-        rmm::cuda_stream_view stream,                                   \\
-        uint32_t dim,                                                   \\
-        uint32_t n_probes,                                              \\
-        uint32_t pq_dim,                                                \\
-        uint32_t n_queries,                                             \\
-        uint32_t queries_offset,                                        \\
-        raft::distance::DistanceType metric,                                  \\
-        raft::neighbors::ivf_pq::codebook_gen codebook_kind,            \\
-        uint32_t topk,                                                  \\
-        uint32_t max_samples,                                           \\
-        const float* cluster_centers,                                   \\
-        const float* pq_centers,                                        \\
-        const uint8_t* const* pq_dataset,                               \\
-        const uint32_t* cluster_labels,                                 \\
-        const uint32_t* _chunk_indices,                                 \\
-        const float* queries,                                           \\
-        const uint32_t* index_list,                                     \\
-        float* query_kths,                                              \\
-        IvfSampleFilterT sample_filter,                                    \\
-        LutT* lut_scores,                                               \\
-        OutT* _out_scores,                                              \\
-        uint32_t* _out_indices);
-
-
-#define COMMA ,
+          
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 """
 
-trailer = """
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
-"""
+none_filter_int64 = "raft::neighbors::filtering::ivf_to_sample_filter" \
+                    "<int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>"
+none_filter_int32 = "raft::neighbors::filtering::ivf_to_sample_filter" \
+                    "<uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>"
+bitset_filter32 = "raft::neighbors::filtering::ivf_to_sample_filter" \
+                  "<uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>"
+bitset_filter64 = "raft::neighbors::filtering::ivf_to_sample_filter" \
+                  "<int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>"
 
 types = dict(
-    half_fp8_false=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>"),
-    half_fp8_true=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>"),
-    half_half=("half", "half"),
-    float_half=("float", "half"),
-    float_float= ("float", "float"),
-    float_fp8_false=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>"),
-    float_fp8_true=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>"),
+    half_fp8_false=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", none_filter_int64),
+    half_fp8_true=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", none_filter_int64),
+    half_half=("half", "half", none_filter_int64),
+    float_half=("float", "half", none_filter_int64),
+    float_float= ("float", "float", none_filter_int64),
+    float_fp8_false=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", none_filter_int64),
+    float_fp8_true=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", none_filter_int64),
+    half_fp8_false_filt32=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", none_filter_int32),
+    half_fp8_true_filt32=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", none_filter_int32),
+    half_half_filt32=("half", "half", none_filter_int32),
+    float_half_filt32=("float", "half", none_filter_int32),
+    float_float_filt32= ("float", "float", none_filter_int32),
+    float_fp8_false_filt32=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", none_filter_int32),
+    float_fp8_true_filt32=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", none_filter_int32),
+    half_fp8_false_bitset32=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", bitset_filter32),
+    half_fp8_true_bitset32=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", bitset_filter32),
+    half_half_bitset32=("half", "half", bitset_filter32),
+    float_half_bitset32=("float", "half", bitset_filter32),
+    float_float_bitset32= ("float", "float", bitset_filter32),
+    float_fp8_false_bitset32=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", bitset_filter32),
+    float_fp8_true_bitset32=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", bitset_filter32),
+    half_fp8_false_bitset64=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", bitset_filter64),
+    half_fp8_true_bitset64=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", bitset_filter64),
+    half_half_bitset64=("half", "half", bitset_filter64),
+    float_half_bitset64=("float", "half", bitset_filter64),
+    float_float_bitset64= ("float", "float", bitset_filter64),
+    float_fp8_false_bitset64=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>", bitset_filter64),
+    float_fp8_true_bitset64=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>", bitset_filter64)
 )
 
-for path_key, (OutT, LutT) in types.items():
+for path_key, (OutT, LutT, FilterT) in types.items():
     path = f"ivf_pq_compute_similarity_{path_key}.cu"
     with open(path, "w") as f:
         f.write(header)
-        f.write(f"instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select({OutT}, {LutT}, raft::neighbors::filtering::ivf_to_sample_filter<int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);\n")
-        f.write(trailer)
+        f.write(f"instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select({OutT}, {LutT}, {FilterT});\n")
     print(f"src/neighbors/detail/{path}")
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
index 7e17d6822a..db51608ae1 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   float,
   float,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu
new file mode 100644
index 0000000000..caaf40abdf
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
new file mode 100644
index 0000000000..7801c25e9f
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu
new file mode 100644
index 0000000000..45ae348849
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  float,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
index c1b72dab33..2f5bcf8f92 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   float,
   raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu
new file mode 100644
index 0000000000..e7f2c44254
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
new file mode 100644
index 0000000000..01b6900bb8
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu
new file mode 100644
index 0000000000..9f8d453364
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
index fdff0860fc..06d21bcd50 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   float,
   raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu
new file mode 100644
index 0000000000..8b733a23c1
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
new file mode 100644
index 0000000000..77e4f9a023
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu
new file mode 100644
index 0000000000..3e036e3df4
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
index 7205544370..ff42f5e041 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   float,
   half,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu
new file mode 100644
index 0000000000..40b6313865
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
new file mode 100644
index 0000000000..9cedabdb11
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu
new file mode 100644
index 0000000000..61422bbc36
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
index 2ac6c3527b..d2064cfe97 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   half,
   raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu
new file mode 100644
index 0000000000..1127f39f71
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
new file mode 100644
index 0000000000..0330bf58d6
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu
new file mode 100644
index 0000000000..d20f7921d5
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
index 70f3ffdb0c..9dc954406e 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   half,
   raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu
new file mode 100644
index 0000000000..9131fa25a8
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
new file mode 100644
index 0000000000..8b4521b31b
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu
new file mode 100644
index 0000000000..71b63cf4a0
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
index 5cc1cb8038..f527d879be 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,65 +16,13 @@
 
 /*
  * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
- *
  * Make changes there and run in this directory:
- *
  * > python ivf_pq_compute_similarity_00_generate.py
- *
  */
 
-#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-
-#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(                 \
-  OutT, LutT, IvfSampleFilterT)                                                             \
-  template auto                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT, IvfSampleFilterT>( \
-    const cudaDeviceProp& dev_props,                                                        \
-    bool manage_local_topk,                                                                 \
-    int locality_hint,                                                                      \
-    double preferred_shmem_carveout,                                                        \
-    uint32_t pq_bits,                                                                       \
-    uint32_t pq_dim,                                                                        \
-    uint32_t precomp_data_count,                                                            \
-    uint32_t n_queries,                                                                     \
-    uint32_t n_probes,                                                                      \
-    uint32_t topk)                                                                          \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT>;              \
-                                                                                            \
-  template void                                                                             \
-  raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT, IvfSampleFilterT>(    \
-    raft::neighbors::ivf_pq::detail::selected<OutT, LutT, IvfSampleFilterT> s,              \
-    rmm::cuda_stream_view stream,                                                           \
-    uint32_t dim,                                                                           \
-    uint32_t n_probes,                                                                      \
-    uint32_t pq_dim,                                                                        \
-    uint32_t n_queries,                                                                     \
-    uint32_t queries_offset,                                                                \
-    raft::distance::DistanceType metric,                                                    \
-    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                    \
-    uint32_t topk,                                                                          \
-    uint32_t max_samples,                                                                   \
-    const float* cluster_centers,                                                           \
-    const float* pq_centers,                                                                \
-    const uint8_t* const* pq_dataset,                                                       \
-    const uint32_t* cluster_labels,                                                         \
-    const uint32_t* _chunk_indices,                                                         \
-    const float* queries,                                                                   \
-    const uint32_t* index_list,                                                             \
-    float* query_kths,                                                                      \
-    IvfSampleFilterT sample_filter,                                                         \
-    LutT* lut_scores,                                                                       \
-    OutT* _out_scores,                                                                      \
-    uint32_t* _out_indices);
-
-#define COMMA ,
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
   half,
   half,
   raft::neighbors::filtering::ivf_to_sample_filter<
     int64_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
-
-#undef COMMA
-
-#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu
new file mode 100644
index 0000000000..8e1962e2bb
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
new file mode 100644
index 0000000000..e9671703e7
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    int64_t COMMA raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu
new file mode 100644
index 0000000000..b66a07d1a9
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ * Make changes there and run in this directory:
+ * > python ivf_pq_compute_similarity_00_generate.py
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity_template.cuh>
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half,
+  half,
+  raft::neighbors::filtering::ivf_to_sample_filter<
+    uint32_t COMMA raft::neighbors::filtering::none_ivf_sample_filter>);
diff --git a/cpp/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu b/cpp/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
new file mode 100644
index 0000000000..39af78f12e
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_mdspan.hpp>  // raft::device_matrix_view
+#include <raft/core/resources.hpp>      // raft::resources
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+#include <raft/neighbors/sample_filter.cuh>
+#include <raft/neighbors/sample_filter_types.hpp>
+
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+
+#include <cstdint>  // int64_t
+
+#define instantiate_raft_neighbors_ivf_pq_search_with_filtering(T, IdxT, FilterT) \
+  template void raft::neighbors::ivf_pq::search_with_filtering<T, IdxT, FilterT>( \
+    raft::resources const& handle,                                                \
+    const search_params& params,                                                  \
+    const index<IdxT>& idx,                                                       \
+    raft::device_matrix_view<const T, uint32_t, row_major> queries,               \
+    raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,                \
+    raft::device_matrix_view<float, uint32_t, row_major> distances,               \
+    FilterT sample_filter)
+
+#define COMMA ,
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, int64_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>);
+
+#undef COMMA
+#undef instantiate_raft_neighbors_ivf_pq_search_with_filtering
diff --git a/cpp/src/raft_runtime/distance/fused_distance_min_arg.cu b/cpp/src/raft_runtime/distance/fused_distance_min_arg.cu
new file mode 100644
index 0000000000..dfdff4e94b
--- /dev/null
+++ b/cpp/src/raft_runtime/distance/fused_distance_min_arg.cu
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fused_distance_min_arg.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/kvp.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/norm.cuh>
+
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+
+namespace raft::runtime::distance {
+
+void fused_distance_nn_min_arg(raft::resources const& handle,
+                               int* min,
+                               const float* x,
+                               const float* y,
+                               int m,
+                               int n,
+                               int k,
+                               bool sqrt,
+                               raft::distance::DistanceType metric,
+                               bool isRowMajor,
+                               float metric_arg)
+{
+  switch (metric) {
+    case raft::distance::DistanceType::CosineExpanded:
+      compute_fused_cosine_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
+      break;
+    case raft::distance::DistanceType::L2Expanded:
+    case raft::distance::DistanceType::L2SqrtExpanded:
+      compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
+      break;
+    default: assert("only Cosine/L2 metric is supported with fusedDistanceNN\n"); break;
+  }
+}
+
+}  // end namespace raft::runtime::distance
diff --git a/cpp/src/raft_runtime/distance/fused_distance_min_arg.hpp b/cpp/src/raft_runtime/distance/fused_distance_min_arg.hpp
new file mode 100644
index 0000000000..6452752a79
--- /dev/null
+++ b/cpp/src/raft_runtime/distance/fused_distance_min_arg.hpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/kvp.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/fused_distance_nn.cuh>
+#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/linalg/norm.cuh>
+
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+
+namespace raft::runtime::distance {
+
+template <typename IndexT, typename DataT>
+struct KeyValueIndexOp {
+  __host__ __device__ __forceinline__ IndexT
+  operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
+  {
+    return a.key;
+  }
+};
+
+template <typename value_t, typename idx_t>
+void compute_fused_l2_nn_min_arg(raft::resources const& handle,
+                                 idx_t* min,
+                                 const value_t* x,
+                                 const value_t* y,
+                                 idx_t m,
+                                 idx_t n,
+                                 idx_t k,
+                                 bool sqrt)
+{
+  rmm::device_uvector<int> workspace(m, resource::get_cuda_stream(handle));
+  auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
+  constexpr bool is_row_major = true;
+
+  rmm::device_uvector<value_t> x_norms(m, resource::get_cuda_stream(handle));
+  rmm::device_uvector<value_t> y_norms(n, resource::get_cuda_stream(handle));
+  raft::linalg::rowNorm(
+    x_norms.data(), x, k, m, raft::linalg::L2Norm, is_row_major, resource::get_cuda_stream(handle));
+  raft::linalg::rowNorm(
+    y_norms.data(), y, k, n, raft::linalg::L2Norm, is_row_major, resource::get_cuda_stream(handle));
+
+  raft::distance::fusedL2NNMinReduce(kvp.data_handle(),
+                                     x,
+                                     y,
+                                     x_norms.data(),
+                                     y_norms.data(),
+                                     m,
+                                     n,
+                                     k,
+                                     (void*)workspace.data(),
+                                     sqrt,
+                                     true,
+                                     resource::get_cuda_stream(handle));
+
+  KeyValueIndexOp<idx_t, value_t> conversion_op;
+  thrust::transform(resource::get_thrust_policy(handle),
+                    kvp.data_handle(),
+                    kvp.data_handle() + m,
+                    min,
+                    conversion_op);
+  resource::sync_stream(handle);
+}
+
+template <typename value_t, typename idx_t>
+void compute_fused_cosine_nn_min_arg(raft::resources const& handle,
+                                     idx_t* min,
+                                     const value_t* x,
+                                     const value_t* y,
+                                     idx_t m,
+                                     idx_t n,
+                                     idx_t k,
+                                     bool sqrt)
+{
+  rmm::device_uvector<int> workspace(m, resource::get_cuda_stream(handle));
+  auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
+
+  rmm::device_uvector<value_t> x_norms(m, resource::get_cuda_stream(handle));
+  rmm::device_uvector<value_t> y_norms(n, resource::get_cuda_stream(handle));
+  constexpr bool is_row_major = true;
+  raft::linalg::rowNorm(x_norms.data(),
+                        x,
+                        k,
+                        m,
+                        raft::linalg::L2Norm,
+                        is_row_major,
+                        resource::get_cuda_stream(handle),
+                        raft::sqrt_op{});
+  raft::linalg::rowNorm(y_norms.data(),
+                        y,
+                        k,
+                        n,
+                        raft::linalg::L2Norm,
+                        is_row_major,
+                        resource::get_cuda_stream(handle),
+                        raft::sqrt_op{});
+
+  raft::distance::fusedDistanceNNMinReduce(kvp.data_handle(),
+                                           x,
+                                           y,
+                                           x_norms.data(),
+                                           y_norms.data(),
+                                           m,
+                                           n,
+                                           k,
+                                           (void*)workspace.data(),
+                                           sqrt,
+                                           true,
+                                           is_row_major,
+                                           raft::distance::DistanceType::CosineExpanded,
+                                           0.0f,
+                                           resource::get_cuda_stream(handle));
+
+  KeyValueIndexOp<idx_t, value_t> conversion_op;
+  thrust::transform(resource::get_thrust_policy(handle),
+                    kvp.data_handle(),
+                    kvp.data_handle() + m,
+                    min,
+                    conversion_op);
+  resource::sync_stream(handle);
+}
+
+}  // end namespace raft::runtime::distance
diff --git a/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu b/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
index bf4b1a431a..870757dca1 100644
--- a/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
+++ b/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "fused_distance_min_arg.hpp"
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/kvp.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -28,77 +30,28 @@
 
 namespace raft::runtime::distance {
 
-template <typename IndexT, typename DataT>
-struct KeyValueIndexOp {
-  __host__ __device__ __forceinline__ IndexT
-  operator()(const raft::KeyValuePair<IndexT, DataT>& a) const
-  {
-    return a.key;
-  }
-};
-
-template <typename value_t, typename idx_t>
-void compute_fused_l2_nn_min_arg(raft::resources const& handle,
-                                 idx_t* min,
-                                 const value_t* x,
-                                 const value_t* y,
-                                 idx_t m,
-                                 idx_t n,
-                                 idx_t k,
-                                 bool sqrt)
-{
-  rmm::device_uvector<int> workspace(m, resource::get_cuda_stream(handle));
-  auto kvp = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
-
-  rmm::device_uvector<value_t> x_norms(m, resource::get_cuda_stream(handle));
-  rmm::device_uvector<value_t> y_norms(n, resource::get_cuda_stream(handle));
-  raft::linalg::rowNorm(
-    x_norms.data(), x, k, m, raft::linalg::L2Norm, true, resource::get_cuda_stream(handle));
-  raft::linalg::rowNorm(
-    y_norms.data(), y, k, n, raft::linalg::L2Norm, true, resource::get_cuda_stream(handle));
-
-  raft::distance::fusedL2NNMinReduce(kvp.data_handle(),
-                                     x,
-                                     y,
-                                     x_norms.data(),
-                                     y_norms.data(),
-                                     m,
-                                     n,
-                                     k,
-                                     (void*)workspace.data(),
-                                     sqrt,
-                                     true,
-                                     resource::get_cuda_stream(handle));
-
-  KeyValueIndexOp<idx_t, value_t> conversion_op;
-  thrust::transform(resource::get_thrust_policy(handle),
-                    kvp.data_handle(),
-                    kvp.data_handle() + m,
-                    min,
-                    conversion_op);
-  resource::sync_stream(handle);
-}
-
-void fused_l2_nn_min_arg(raft::resources const& handle,
-                         int* min,
-                         const float* x,
-                         const float* y,
-                         int m,
-                         int n,
-                         int k,
-                         bool sqrt)
+[[deprecated("use fused_distance_nn_min_arg instead")]] void fused_l2_nn_min_arg(
+  raft::resources const& handle,
+  int* min,
+  const float* x,
+  const float* y,
+  int m,
+  int n,
+  int k,
+  bool sqrt)
 {
   compute_fused_l2_nn_min_arg<float, int>(handle, min, x, y, m, n, k, sqrt);
 }
 
-void fused_l2_nn_min_arg(raft::resources const& handle,
-                         int* min,
-                         const double* x,
-                         const double* y,
-                         int m,
-                         int n,
-                         int k,
-                         bool sqrt)
+[[deprecated("use fused_distance_nn_min_arg instead")]] void fused_l2_nn_min_arg(
+  raft::resources const& handle,
+  int* min,
+  const double* x,
+  const double* y,
+  int m,
+  int n,
+  int k,
+  bool sqrt)
 {
   compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
 }
diff --git a/cpp/template/README.md b/cpp/template/README.md
index 348dff270a..05ec48964f 100644
--- a/cpp/template/README.md
+++ b/cpp/template/README.md
@@ -8,7 +8,7 @@ Once the minimum requirements are satisfied, this example template application c
 
 This directory (`RAFT_SOURCE/cpp/template`) can be copied directly in order to build a new application with RAFT.
 
-RAFT can be integrated into an existing CMake project by copying the contents in the `configure rapids-cmake` and `configure raft` sections of the provided `CMakeLists.txt` into your project, along with `cmake/thirdparty/get_raft.cmake`. 
+RAFT can be integrated into an existing CMake project by copying the contents in the `configure rapids-cmake` and `configure raft` sections of the provided `CMakeLists.txt` into your project, along with `cmake/thirdparty/get_raft.cmake`.
 
 Make sure to link against the appropriate Cmake targets. Use `raft::raft`to add make the headers available and `raft::compiled` when utilizing the shared library.
 
diff --git a/cpp/template/build.sh b/cpp/template/build.sh
index 3ac00fc9af..49c17f7499 100755
--- a/cpp/template/build.sh
+++ b/cpp/template/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 # raft empty project template build script
 
diff --git a/cpp/template/cmake/thirdparty/get_raft.cmake b/cpp/template/cmake/thirdparty/get_raft.cmake
index 6128b5c43c..07b0897be0 100644
--- a/cpp/template/cmake/thirdparty/get_raft.cmake
+++ b/cpp/template/cmake/thirdparty/get_raft.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -33,6 +33,12 @@ function(find_and_configure_raft)
     #-----------------------------------------------------
     # Invoke CPM find_package()
     #-----------------------------------------------------
+    # Since the RAFT_NVTX option is used by targets generated by
+    # find_package(RAFT_NVTX) and when building from source we want to
+    # make `RAFT_NVTX` a  cache variable so we get consistent
+    # behavior
+    #
+    set(RAFT_NVTX ${PKG_ENABLE_NVTX} CACHE BOOL "Enable raft nvtx logging" FORCE)
     rapids_cpm_find(raft ${PKG_VERSION}
             GLOBAL_TARGETS      raft::raft
             BUILD_EXPORT_SET    raft-template-exports
@@ -46,7 +52,6 @@ function(find_and_configure_raft)
             "BUILD_TESTS OFF"
             "BUILD_PRIMS_BENCH OFF"
             "BUILD_ANN_BENCH OFF"
-            "RAFT_NVTX   ${ENABLE_NVTX}"
             "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_LIBRARY}"
             )
 endfunction()
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index dd7eb839ab..ecb871fccc 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -171,6 +171,7 @@ if(BUILD_TESTS)
     test/distance/masked_nn.cu
     test/distance/masked_nn_compress_to_bits.cu
     test/distance/fused_l2_nn.cu
+    test/distance/fused_cosine_nn.cu
     test/distance/gram.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
@@ -266,6 +267,7 @@ if(BUILD_TESTS)
     test/matrix/matrix.cu
     test/matrix/norm.cu
     test/matrix/reverse.cu
+    test/matrix/sample_rows.cu
     test/matrix/slice.cu
     test/matrix/triangular.cu
     test/sparse/spectral_matrix.cu
@@ -293,6 +295,7 @@ if(BUILD_TESTS)
     test/random/rng_int.cu
     test/random/rmat_rectangular_generator.cu
     test/random/sample_without_replacement.cu
+    test/random/excess_sampling.cu
   )
 
   ConfigureTest(
@@ -401,6 +404,30 @@ if(BUILD_TESTS)
     test/neighbors/ann_ivf_flat/test_float_int64_t.cu
     test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
     test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu
+    test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
+    src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
     test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
     test/neighbors/ann_ivf_pq/test_float_int64_t.cu
     test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
diff --git a/cpp/test/distance/fused_cosine_nn.cu b/cpp/test/distance/fused_cosine_nn.cu
new file mode 100644
index 0000000000..d4d632e1dc
--- /dev/null
+++ b/cpp/test/distance/fused_cosine_nn.cu
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Search with filter instantiation
+
+#include "../test_utils.cuh"
+
+#include <raft/core/kvp.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/distance/detail/fused_distance_nn.cuh>
+#include <raft/distance/fused_distance_nn.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+namespace raft {
+namespace distance {
+
+template <typename LabelT, typename DataT>
+struct RaftKVPMinReduce {
+  typedef raft::KeyValuePair<LabelT, DataT> KVP;
+
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+
+};  // KVPMinReduce
+
+template <typename DataT, typename ReduceOpT, int NWARPS>
+__global__ void naiveCosKernel(raft::KeyValuePair<int, DataT>* min,
+                               DataT* x,
+                               DataT* y,
+                               int m,
+                               int n,
+                               int k,
+                               int* workspace,
+                               DataT maxVal)
+{
+  int midx     = threadIdx.y + blockIdx.y * blockDim.y;
+  int nidx     = threadIdx.x + blockIdx.x * blockDim.x;
+  DataT acc_a  = DataT(0);
+  DataT acc_b  = DataT(0);
+  DataT acc_ab = DataT(0);
+  // if (midx >= m || nidx >= n) { return; }
+
+  for (int i = 0; i < k; ++i) {
+    int xidx = i + midx * k;
+    int yidx = i + nidx * k;
+    auto a   = x[xidx];
+    auto b   = y[yidx];
+    acc_a += a * a;
+    acc_b += b * b;
+    acc_ab += a * b;
+  }
+
+  // Use 1.0 - (cosine similarity) to calc the distance
+  DataT acc = maxVal;
+  if (midx < m || nidx < n) { acc = (DataT)1.0 - acc_ab / (raft::sqrt(acc_a) * raft::sqrt(acc_b)); }
+
+  ReduceOpT redOp;
+  typedef cub::WarpReduce<raft::KeyValuePair<int, DataT>> WarpReduce;
+  __shared__ typename WarpReduce::TempStorage temp[NWARPS];
+  int warpId = threadIdx.x / raft::WarpSize;
+  raft::KeyValuePair<int, DataT> tmp;
+  tmp.key   = nidx;
+  tmp.value = midx >= m || nidx >= n ? maxVal : acc;
+  tmp       = WarpReduce(temp[warpId]).Reduce(tmp, RaftKVPMinReduce<int, DataT>());
+  if (threadIdx.x % raft::WarpSize == 0 && midx < m) {
+    while (atomicCAS(workspace + midx, 0, 1) == 1)
+      ;
+    __threadfence();
+    redOp(midx, min + midx, tmp);
+    __threadfence();
+    atomicCAS(workspace + midx, 1, 0);
+  }
+}
+
+template <typename DataT>
+void naive(raft::KeyValuePair<int, DataT>* min,
+           DataT* x,
+           DataT* y,
+           int m,
+           int n,
+           int k,
+           int* workspace,
+           cudaStream_t stream)
+{
+  static const dim3 TPB(32, 16, 1);
+  dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1);
+  RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
+  auto blks = raft::ceildiv(m, 256);
+  MinAndDistanceReduceOp<int, DataT> op;
+  detail::initKernel<DataT, raft::KeyValuePair<int, DataT>, int>
+    <<<blks, 256, 0, stream>>>(min, m, std::numeric_limits<DataT>::max(), op);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  naiveCosKernel<DataT, MinAndDistanceReduceOp<int, DataT>, 16>
+    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace, std::numeric_limits<DataT>::max());
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename DataT>
+struct Inputs {
+  DataT tolerance;
+  int m, n, k;
+  unsigned long long int seed;
+
+  friend std::ostream& operator<<(std::ostream& os, const Inputs& p)
+  {
+    return os << "m: " << p.m
+              << ", "
+                 "n: "
+              << p.n
+              << ", "
+                 "k: "
+              << p.k
+              << ", "
+                 "seed: "
+              << p.seed
+              << ", "
+                 "tol: "
+              << p.tolerance;
+  }
+};
+
+template <typename DataT>
+class FusedCosineNNTest : public ::testing::TestWithParam<Inputs<DataT>> {
+ public:
+  FusedCosineNNTest()
+    : params(::testing::TestWithParam<Inputs<DataT>>::GetParam()),
+      stream(resource::get_cuda_stream(handle)),
+      x(params.m * params.k, stream),
+      y(params.n * params.k, stream),
+      xn(params.m, stream),
+      yn(params.n, stream),
+      min(params.m, stream),
+      min_ref(params.m, stream),
+      workspace(params.m * sizeof(int), stream)
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    raft::random::RngState r(params.seed);
+    int m = params.m;
+    int n = params.n;
+    int k = params.k;
+    uniform(handle, r, x.data(), m * k, DataT(-1.0), DataT(1.0));
+    uniform(handle, r, y.data(), n * k, DataT(-1.0), DataT(1.0));
+    generateGoldenResult();
+    raft::linalg::rowNorm(
+      xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, stream, raft::sqrt_op{});
+    raft::linalg::rowNorm(
+      yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, stream, raft::sqrt_op{});
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+  Inputs<DataT> params;
+  rmm::device_uvector<DataT> x;
+  rmm::device_uvector<DataT> y;
+  rmm::device_uvector<DataT> xn;
+  rmm::device_uvector<DataT> yn;
+  rmm::device_uvector<raft::KeyValuePair<int, DataT>> min;
+  rmm::device_uvector<raft::KeyValuePair<int, DataT>> min_ref;
+  rmm::device_uvector<char> workspace;
+
+  virtual void generateGoldenResult()
+  {
+    int m = params.m;
+    int n = params.n;
+    int k = params.k;
+    naive<DataT>(min_ref.data(), x.data(), y.data(), m, n, k, (int*)workspace.data(), stream);
+  }
+
+  void runTest(raft::KeyValuePair<int, DataT>* out)
+  {
+    int m                               = params.m;
+    int n                               = params.n;
+    int k                               = params.k;
+    raft::distance::DistanceType metric = raft::distance::DistanceType::CosineExpanded;
+    constexpr bool init_out_buffer      = true;
+    fusedDistanceNNMinReduce<DataT, raft::KeyValuePair<int, DataT>, int>(out,
+                                                                         x.data(),
+                                                                         y.data(),
+                                                                         xn.data(),
+                                                                         yn.data(),
+                                                                         m,
+                                                                         n,
+                                                                         k,
+                                                                         (void*)workspace.data(),
+                                                                         false,
+                                                                         init_out_buffer,
+                                                                         true,
+                                                                         metric,
+                                                                         0.0f,
+                                                                         stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+};
+
+template <typename T>
+struct CompareApproxAbsKVP {
+  typedef typename raft::KeyValuePair<int, T> KVP;
+  CompareApproxAbsKVP(T eps_) : eps(eps_) {}
+  bool operator()(const KVP& a, const KVP& b) const
+  {
+    T diff  = std::abs(std::abs(a.value) - std::abs(b.value));
+    T m     = std::max(std::abs(a.value), std::abs(b.value));
+    T ratio = m >= eps ? diff / m : diff;
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
+template <typename T>
+struct CompareExactKVP {
+  typedef typename raft::KeyValuePair<int, T> KVP;
+  bool operator()(const KVP& a, const KVP& b) const
+  {
+    if (a.value != b.value) return false;
+    return true;
+  }
+};
+
+template <typename K, typename V, typename L>
+::testing::AssertionResult devArrMatch(const raft::KeyValuePair<K, V>* expected,
+                                       const raft::KeyValuePair<K, V>* actual,
+                                       size_t size,
+                                       L eq_compare,
+                                       cudaStream_t stream = 0)
+{
+  typedef typename raft::KeyValuePair<K, V> KVP;
+  std::shared_ptr<KVP> exp_h(new KVP[size]);
+  std::shared_ptr<KVP> act_h(new KVP[size]);
+  raft::update_host<KVP>(exp_h.get(), expected, size, stream);
+  raft::update_host<KVP>(act_h.get(), actual, size, stream);
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  for (size_t i(0); i < size; ++i) {
+    auto exp = exp_h.get()[i];
+    auto act = act_h.get()[i];
+    if (!eq_compare(exp, act)) {
+      return ::testing::AssertionFailure()
+             << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << ","
+             << exp.value << " @" << i;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
+const std::vector<Inputs<float>> inputsf = {
+  {0.001f, 32, 32, 32, 1234ULL},
+  {0.001f, 32, 64, 32, 1234ULL},
+  {0.001f, 64, 32, 32, 1234ULL},
+  {0.001f, 64, 64, 32, 1234ULL},
+  {0.001f, 128, 32, 32, 1234ULL},
+  {0.001f, 128, 64, 32, 1234ULL},
+  {0.001f, 128, 128, 64, 1234ULL},
+  {0.001f, 64, 128, 128, 1234ULL},
+
+  {0.001f, 32, 32, 34, 1234ULL},
+  {0.001f, 32, 64, 34, 1234ULL},
+  {0.001f, 64, 32, 34, 1234ULL},
+  {0.001f, 64, 64, 34, 1234ULL},
+  {0.001f, 128, 32, 34, 1234ULL},
+  {0.001f, 128, 64, 34, 1234ULL},
+  {0.001f, 128, 128, 66, 1234ULL},
+  {0.001f, 64, 128, 130, 1234ULL},
+
+  {0.001f, 32, 32, 33, 1234ULL},
+  {0.001f, 32, 64, 33, 1234ULL},
+  {0.001f, 64, 32, 33, 1234ULL},
+  {0.001f, 64, 64, 33, 1234ULL},
+  {0.001f, 128, 32, 33, 1234ULL},
+  {0.001f, 128, 64, 33, 1234ULL},
+  {0.001f, 128, 128, 65, 1234ULL},
+  {0.001f, 64, 128, 129, 1234ULL},
+  {0.006f, 1805, 134, 2, 1234ULL},
+  {0.006f, 8192, 1024, 64, 1234ULL},
+  {0.006f, 8192, 1025, 64, 1234ULL},
+
+  // Repeat with smaller values of k
+  {0.006f, 32, 32, 1, 1234ULL},
+  {0.001f, 32, 64, 2, 1234ULL},
+  {0.001f, 64, 32, 3, 1234ULL},
+  {0.001f, 64, 64, 4, 1234ULL},
+  {0.001f, 128, 32, 5, 1234ULL},
+  {0.001f, 128, 64, 6, 1234ULL},
+  {0.001f, 128, 128, 7, 1234ULL},
+  {0.001f, 64, 128, 8, 1234ULL},
+
+  {0.001f, 32, 32, 9, 1234ULL},
+  {0.001f, 32, 64, 10, 1234ULL},
+  {0.001f, 64, 32, 11, 1234ULL},
+  {0.001f, 64, 64, 12, 1234ULL},
+  {0.001f, 128, 32, 13, 1234ULL},
+  {0.001f, 128, 64, 14, 1234ULL},
+  {0.001f, 128, 128, 15, 1234ULL},
+  {0.001f, 64, 128, 16, 1234ULL},
+
+  {0.001f, 32, 32, 17, 1234ULL},
+  {0.001f, 32, 64, 18, 1234ULL},
+  {0.001f, 64, 32, 19, 1234ULL},
+  {0.001f, 64, 64, 20, 1234ULL},
+  {0.001f, 128, 32, 21, 1234ULL},
+  {0.001f, 128, 64, 22, 1234ULL},
+  {0.001f, 128, 128, 23, 1234ULL},
+  {0.00001, 64, 128, 24, 1234ULL},
+  {0.001f, 1805, 134, 25, 1234ULL},
+  {0.006f, 8192, 1024, 25, 1234ULL},
+  {0.006f, 8192, 1024, 66, 1234ULL},
+};
+typedef FusedCosineNNTest<float> FusedCosineNNTestF;
+TEST_P(FusedCosineNNTestF, Result)
+{
+  runTest(min.data());
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<float>(params.tolerance), stream));
+}
+INSTANTIATE_TEST_CASE_P(FusedCosineNNTests, FusedCosineNNTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<Inputs<double>> inputsd = {
+  {0.00001, 32, 32, 32, 1234ULL},   {0.00001, 32, 64, 32, 1234ULL},
+  {0.00001, 64, 32, 32, 1234ULL},   {0.00001, 64, 64, 32, 1234ULL},
+  {0.00001, 128, 32, 32, 1234ULL},  {0.00001, 128, 64, 32, 1234ULL},
+  {0.00001, 128, 128, 64, 1234ULL}, {0.00001, 64, 128, 128, 1234ULL},
+
+  {0.00001, 32, 32, 34, 1234ULL},   {0.00001, 32, 64, 34, 1234ULL},
+  {0.00001, 64, 32, 34, 1234ULL},   {0.00001, 64, 64, 34, 1234ULL},
+  {0.00001, 128, 32, 34, 1234ULL},  {0.00001, 128, 64, 34, 1234ULL},
+  {0.00001, 128, 128, 66, 1234ULL}, {0.00001, 64, 128, 130, 1234ULL},
+
+  {0.00001, 32, 32, 33, 1234ULL},   {0.00001, 32, 64, 33, 1234ULL},
+  {0.00001, 64, 32, 33, 1234ULL},   {0.00001, 64, 64, 33, 1234ULL},
+  {0.00001, 128, 32, 33, 1234ULL},  {0.00001, 128, 64, 33, 1234ULL},
+  {0.00001, 128, 128, 65, 1234ULL}, {0.00001, 64, 128, 129, 1234ULL},
+
+  {0.00001, 1805, 134, 2, 1234ULL}, {0.00001, 8192, 1024, 25, 1234ULL},
+};
+typedef FusedCosineNNTest<double> FusedCosineNNTestD;
+TEST_P(FusedCosineNNTestD, Result)
+{
+  runTest(min.data());
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<double>(params.tolerance), stream));
+}
+INSTANTIATE_TEST_CASE_P(FusedCosineNNTests, FusedCosineNNTestD, ::testing::ValuesIn(inputsd));
+
+/// This is to test output determinism of the prim
+template <typename DataT>
+class FusedCosineNNDetTest : public FusedCosineNNTest<DataT> {
+ public:
+  FusedCosineNNDetTest() : stream(resource::get_cuda_stream(handle)), min1(0, stream) {}
+
+  void SetUp() override
+  {
+    FusedCosineNNTest<DataT>::SetUp();
+    int m = this->params.m;
+    min1.resize(m, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  void TearDown() override { FusedCosineNNTest<DataT>::TearDown(); }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  rmm::device_uvector<raft::KeyValuePair<int, DataT>> min1;
+
+  static const int NumRepeats = 3;
+
+  void generateGoldenResult() override {}
+};
+
+typedef FusedCosineNNDetTest<float> FusedCosineNNDetTestF;
+TEST_P(FusedCosineNNDetTestF, Result)
+{
+  runTest(min.data());  // assumed to be golden
+  for (int i = 0; i < NumRepeats; ++i) {
+    runTest(min1.data());
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<float>(), stream));
+    cudaMemsetAsync(min1.data(), 0, sizeof(*min.data()) * params.m, stream);
+  }
+}
+INSTANTIATE_TEST_CASE_P(FusedCosineNNDetTests, FusedCosineNNDetTestF, ::testing::ValuesIn(inputsf));
+
+typedef FusedCosineNNDetTest<double> FusedCosineNNDetTestD;
+TEST_P(FusedCosineNNDetTestD, Result)
+{
+  runTest(min.data());  // assumed to be golden
+  for (int i = 0; i < NumRepeats; ++i) {
+    runTest(min1.data());
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<double>(), stream));
+  }
+}
+INSTANTIATE_TEST_CASE_P(FusedCosineNNDetTests, FusedCosineNNDetTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index d21a525d88..6fd8f15808 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -18,7 +18,6 @@
 
 #include <raft/core/kvp.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
-#include <raft/distance/detail/fused_l2_nn.cuh>
 #include <raft/distance/fused_l2_nn.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
new file mode 100644
index 0000000000..e332a918fe
--- /dev/null
+++ b/cpp/test/matrix/sample_rows.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/sample_rows.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/itertools.hpp>
+
+#include <gtest/gtest.h>
+
+#include <unordered_set>
+
+namespace raft {
+namespace matrix {
+
+struct inputs {
+  int N;
+  int dim;
+  int n_samples;
+  bool host;
+};
+
+::std::ostream& operator<<(::std::ostream& os, const inputs p)
+{
+  os << p.N << "#" << p.dim << "#" << p.n_samples << (p.host ? "#host" : "#device");
+  return os;
+}
+
+template <typename T>
+class SampleRowsTest : public ::testing::TestWithParam<inputs> {
+ public:
+  SampleRowsTest()
+    : params(::testing::TestWithParam<inputs>::GetParam()),
+      stream(resource::get_cuda_stream(res)),
+      state{137ULL},
+      in(make_device_matrix<T, int64_t>(res, params.N, params.dim)),
+      out(make_device_matrix<T, int64_t>(res, 0, 0)),
+      in_h(make_host_matrix<T, int64_t>(res, params.N, params.dim)),
+      out_h(make_host_matrix<T, int64_t>(res, params.n_samples, params.dim))
+  {
+    raft::random::uniform(res, state, in.data_handle(), in.size(), T(-1.0), T(1.0));
+    for (int64_t i = 0; i < params.N; i++) {
+      for (int64_t k = 0; k < params.dim; k++)
+        in_h(i, k) = i * 1000 + k;
+    }
+    raft::copy(in.data_handle(), in_h.data_handle(), in_h.size(), stream);
+  }
+
+  void check()
+  {
+    if (params.host) {
+      out = raft::matrix::sample_rows<T, int64_t>(
+        res, state, make_const_mdspan(in_h.view()), (int64_t)params.n_samples);
+    } else {
+      out = raft::matrix::sample_rows<T, int64_t>(
+        res, state, make_const_mdspan(in.view()), (int64_t)params.n_samples);
+    }
+
+    raft::copy(out_h.data_handle(), out.data_handle(), out.size(), stream);
+    resource::sync_stream(res, stream);
+
+    ASSERT_TRUE(out.extent(0) == params.n_samples);
+    ASSERT_TRUE(out.extent(1) == params.dim);
+
+    std::unordered_set<int> occurrence;
+
+    for (int64_t i = 0; i < params.n_samples; ++i) {
+      T val = out_h(i, 0) / 1000;
+      ASSERT_TRUE(0 <= val && val < params.N)
+        << "out-of-range index @i=" << i << " val=" << val << " params=" << params;
+      EXPECT_TRUE(occurrence.find(val) == occurrence.end())
+        << "repeated index @i=" << i << " idx=" << val << " params=" << params;
+      occurrence.insert(val);
+      for (int64_t k = 0; k < params.dim; k++) {
+        ASSERT_TRUE(raft::match(out_h(i, k), val * 1000 + k, raft::CompareApprox<T>(1e-6)));
+      }
+    }
+  }
+
+ protected:
+  inputs params;
+  raft::resources res;
+  cudaStream_t stream;
+  random::RngState state;
+  device_matrix<T, int64_t> in, out;
+  host_matrix<T, int64_t> in_h, out_h;
+};
+
+inline std::vector<inputs> generate_inputs()
+{
+  std::vector<inputs> input1 =
+    raft::util::itertools::product<inputs>({10}, {1, 17, 96}, {1, 6, 9, 10}, {false});
+
+  std::vector<inputs> input2 =
+    raft::util::itertools::product<inputs>({137}, {1, 17, 128}, {1, 10, 100, 137}, {false});
+  input1.insert(input1.end(), input2.begin(), input2.end());
+
+  input2 = raft::util::itertools::product<inputs>(
+    {100000}, {1, 42}, {1, 137, 1000, 10000, 50000, 62000, 100000}, {false});
+
+  input1.insert(input1.end(), input2.begin(), input2.end());
+
+  int n = input1.size();
+  // Add same tests for host data
+  for (int i = 0; i < n; i++) {
+    inputs x = input1[i];
+    x.host   = true;
+    input1.push_back(x);
+  }
+  return input1;
+}
+
+const std::vector<inputs> inputs1 = generate_inputs();
+
+using SampleRowsTestInt64 = SampleRowsTest<float>;
+TEST_P(SampleRowsTestInt64, SamplingTest) { check(); }
+INSTANTIATE_TEST_SUITE_P(SampleRowsTests, SampleRowsTestInt64, ::testing::ValuesIn(inputs1));
+
+}  // namespace matrix
+}  // namespace raft
diff --git a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu
new file mode 100644
index 0000000000..5ba21c3c2f
--- /dev/null
+++ b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+#include <raft/neighbors/sample_filter.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::resources const& handle,                                                       \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                         \
+                                                                                         \
+  template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::resources const& handle,                                                       \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    const T* dataset,                                                                    \
+    IdxT n_rows,                                                                         \
+    uint32_t dim)                                                                        \
+    ->raft::neighbors::ivf_pq::index<IdxT>;
+
+instantiate_raft_neighbors_ivf_pq_build(float, uint32_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
diff --git a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_test-ext.cuh b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_test-ext.cuh
new file mode 100644
index 0000000000..cd5435ab2e
--- /dev/null
+++ b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_build_test-ext.cuh
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+#include <raft/neighbors/sample_filter.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                        \
+  extern template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::resources const& handle,                                                              \
+    const raft::neighbors::ivf_pq::index_params& params,                                        \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                                \
+                                                                                                \
+  extern template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::resources const& handle,                                                              \
+    const raft::neighbors::ivf_pq::index_params& params,                                        \
+    const T* dataset,                                                                           \
+    IdxT n_rows,                                                                                \
+    uint32_t dim)                                                                               \
+    ->raft::neighbors::ivf_pq::index<IdxT>;
+
+instantiate_raft_neighbors_ivf_pq_build(float, uint32_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
diff --git a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
new file mode 100644
index 0000000000..942d0fcc44
--- /dev/null
+++ b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+#include <raft/neighbors/sample_filter.cuh>
+
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(            \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    raft::device_matrix_view<const T, uint32_t, row_major> queries,  \
+    raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,   \
+    raft::device_matrix_view<float, uint32_t, row_major> distances); \
+                                                                     \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(            \
+    raft::resources const& handle,                                   \
+    const raft::neighbors::ivf_pq::search_params& params,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                 \
+    const T* queries,                                                \
+    uint32_t n_queries,                                              \
+    uint32_t k,                                                      \
+    IdxT* neighbors,                                                 \
+    float* distances,                                                \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_neighbors_ivf_pq_search(float, uint32_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_search
+
+#define instantiate_raft_neighbors_ivf_pq_search_with_filtering(T, IdxT, FilterT) \
+  template void raft::neighbors::ivf_pq::search_with_filtering<T, IdxT, FilterT>( \
+    raft::resources const& handle,                                                \
+    const search_params& params,                                                  \
+    const index<IdxT>& idx,                                                       \
+    raft::device_matrix_view<const T, uint32_t, row_major> queries,               \
+    raft::device_matrix_view<IdxT, uint32_t, row_major> neighbors,                \
+    raft::device_matrix_view<float, uint32_t, row_major> distances,               \
+    FilterT sample_filter)
+
+#define COMMA ,
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, uint32_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA uint32_t>);
+
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  int8_t, int64_t, raft::neighbors::filtering::bitset_filter<uint32_t COMMA int64_t>);
+
+instantiate_raft_neighbors_ivf_pq_search_with_filtering(
+  float, uint32_t, raft::neighbors::filtering::none_ivf_sample_filter);
+
+#undef COMMA
+#undef instantiate_raft_neighbors_ivf_pq_search_with_filtering
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
index 17f72fb08a..70d5d8761f 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
-#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Enable instantiation of search with filter
 #include "../ann_ivf_pq.cuh"
 
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
+
 namespace raft::neighbors::ivf_pq {
 
 using f32_f32_i64_filter = ivf_pq_filter_test<float, float, int64_t>;
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
index 537dbb4979..ba96a8db0b 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
-#undef RAFT_EXPLICIT_INSTANTIATE_ONLY  // Enable instantiation of search with filter
 #include "../ann_ivf_pq.cuh"
 
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
+
 namespace raft::neighbors::ivf_pq {
 
 using f32_i08_i64_filter = ivf_pq_filter_test<float, int8_t, int64_t>;
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
index a6cfab1f19..b8ada2249a 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
@@ -14,15 +14,11 @@
  * limitations under the License.
  */
 
-// XXX: the uint32_t instance is not compiled in libraft.so. So we allow
-// instantiating the template here.
-//
-// TODO: consider removing this test or consider adding an instantiation to the
-// library.
-
-#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
-
 #include "../ann_ivf_pq.cuh"
+#include "ivf_pq_build_test-ext.cuh"
+
+#include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
 
 namespace raft::neighbors::ivf_pq {
 
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
index 014e96a2db..970bdd6a12 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #include "../ann_ivf_pq.cuh"
 
+#include <raft_internal/neighbors/ivf_pq_search_test-ext.cuh>
 namespace raft::neighbors::ivf_pq {
 
 using f32_i08_i64 = ivf_pq_test<float, int8_t, int64_t>;
diff --git a/cpp/test/random/excess_sampling.cu b/cpp/test/random/excess_sampling.cu
new file mode 100644
index 0000000000..e86436fb7d
--- /dev/null
+++ b/cpp/test/random/excess_sampling.cu
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <unordered_set>
+#include <vector>
+
+namespace raft {
+namespace random {
+
+using namespace raft::random;
+
+struct inputs {
+  int64_t N;
+  int64_t n_samples;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const inputs p)
+{
+  os << p.N << "/" << p.n_samples;
+  return os;
+}
+
+template <typename T>
+class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
+ public:
+  ExcessSamplingTest()
+    : params(::testing::TestWithParam<inputs>::GetParam()),
+      stream(resource::get_cuda_stream(res)),
+      state{137ULL}
+  {
+  }
+
+  void check()
+  {
+    device_vector<T, int64_t> out =
+      raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
+    ASSERT_TRUE(out.extent(0) == params.n_samples);
+
+    auto h_out = make_host_vector<T, int64_t>(res, params.n_samples);
+    raft::copy(h_out.data_handle(), out.data_handle(), out.size(), stream);
+    resource::sync_stream(res, stream);
+
+    std::unordered_set<int> occurrence;
+    int64_t sum = 0;
+    for (int64_t i = 0; i < params.n_samples; ++i) {
+      T val = h_out(i);
+      sum += val;
+      ASSERT_TRUE(0 <= val && val < params.N)
+        << "out-of-range index @i=" << i << " val=" << val << " n_samples=" << params.n_samples;
+      ASSERT_TRUE(occurrence.find(val) == occurrence.end())
+        << "repeated index @i=" << i << " idx=" << val;
+      occurrence.insert(val);
+    }
+    float avg = sum / (float)params.n_samples;
+    if (params.n_samples >= 100 && params.N / params.n_samples < 100) {
+      ASSERT_TRUE(raft::match(avg, (params.N - 1) / 2.0f, raft::CompareApprox<float>(0.2)))
+        << "non-uniform sample";
+    }
+  }
+
+ protected:
+  inputs params;
+  raft::resources res;
+  cudaStream_t stream;
+  RngState state;
+};
+
+const std::vector<inputs> input1 = {{1, 0},
+                                    {1, 1},
+                                    {10, 0},
+                                    {10, 1},
+                                    {10, 2},
+                                    {10, 10},
+                                    {137, 42},
+                                    {200, 0},
+                                    {200, 1},
+                                    {200, 100},
+                                    {200, 130},
+                                    {200, 200},
+                                    {10000, 893},
+                                    {10000000000, 1023}};
+
+using ExcessSamplingTestInt64 = ExcessSamplingTest<int64_t>;
+TEST_P(ExcessSamplingTestInt64, SamplingTest) { check(); }
+INSTANTIATE_TEST_SUITE_P(ExcessSamplingTests, ExcessSamplingTestInt64, ::testing::ValuesIn(input1));
+
+}  // namespace random
+}  // namespace raft
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 67931e2eed..61b57ce739 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -95,39 +95,39 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
-const std::vector<MeanInputs<float>> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
-
-const std::vector<MeanInputs<double>> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
+const std::vector<MeanInputs<float>> inputsf = {
+  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},    {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
+  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},   {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
+  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},     {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
+  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},    {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
+  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},   {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},  {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, false, 1234ULL},   {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, false, 1234ULL},    {2.0f, -1.f, 1, 130, true, false, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, true, 1234ULL},    {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, true, 1234ULL},     {2.0f, -1.f, 1, 130, false, true, 1234ULL},
+  {2.0f, -1.f, 1, 1, false, false, 1234ULL},       {2.0f, -1.f, 1, 1, false, true, 1234ULL},
+  {2.0f, -1.f, 7, 23, false, false, 1234ULL},      {2.0f, -1.f, 7, 23, false, true, 1234ULL},
+  {2.0f, -1.f, 17, 5, false, false, 1234ULL},      {2.0f, -1.f, 17, 5, false, true, 1234ULL}};
+
+const std::vector<MeanInputs<double>> inputsd = {
+  {0.15, 1.0, 1024, 32, true, false, 1234ULL},    {0.15, 1.0, 1024, 64, true, false, 1234ULL},
+  {0.15, 1.0, 1024, 128, true, false, 1234ULL},   {0.15, 1.0, 1024, 256, true, false, 1234ULL},
+  {0.15, -1.0, 1024, 32, false, false, 1234ULL},  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, -1.0, 1024, 256, false, false, 1234ULL},
+  {0.15, 1.0, 1024, 32, true, true, 1234ULL},     {0.15, 1.0, 1024, 64, true, true, 1234ULL},
+  {0.15, 1.0, 1024, 128, true, true, 1234ULL},    {0.15, 1.0, 1024, 256, true, true, 1234ULL},
+  {0.15, -1.0, 1024, 32, false, true, 1234ULL},   {0.15, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.15, -1.0, 1024, 128, false, true, 1234ULL},  {0.15, -1.0, 1024, 256, false, true, 1234ULL},
+  {0.15, -1.0, 1030, 1, false, false, 1234ULL},   {0.15, -1.0, 1030, 60, true, false, 1234ULL},
+  {2.0, -1.0, 31, 120, false, false, 1234ULL},    {2.0, -1.0, 1, 130, true, false, 1234ULL},
+  {0.15, -1.0, 1030, 1, false, true, 1234ULL},    {0.15, -1.0, 1030, 60, true, true, 1234ULL},
+  {2.0, -1.0, 31, 120, false, true, 1234ULL},     {2.0, -1.0, 1, 130, false, true, 1234ULL},
+  {2.0, -1.0, 1, 1, false, false, 1234ULL},       {2.0, -1.0, 1, 1, false, true, 1234ULL},
+  {2.0, -1.0, 7, 23, false, false, 1234ULL},      {2.0, -1.0, 7, 23, false, true, 1234ULL},
+  {2.0, -1.0, 17, 5, false, false, 1234ULL},      {2.0, -1.0, 17, 5, false, true, 1234ULL}};
 
 typedef MeanTest<float> MeanTestF;
 TEST_P(MeanTestF, Result)
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
index 7563cb12be..fd909ebb90 100644
--- a/cpp/test/stats/minmax.cu
+++ b/cpp/test/stats/minmax.cu
@@ -145,45 +145,33 @@ class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
   rmm::device_uvector<T> minmax_ref;
 };
 
-const std::vector<MinMaxInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
-                                                  {0.00001f, 1024, 64, 1234ULL},
-                                                  {0.00001f, 1024, 128, 1234ULL},
-                                                  {0.00001f, 1024, 256, 1234ULL},
-                                                  {0.00001f, 1024, 512, 1234ULL},
-                                                  {0.00001f, 1024, 1024, 1234ULL},
-                                                  {0.00001f, 4096, 32, 1234ULL},
-                                                  {0.00001f, 4096, 64, 1234ULL},
-                                                  {0.00001f, 4096, 128, 1234ULL},
-                                                  {0.00001f, 4096, 256, 1234ULL},
-                                                  {0.00001f, 4096, 512, 1234ULL},
-                                                  {0.00001f, 4096, 1024, 1234ULL},
-                                                  {0.00001f, 8192, 32, 1234ULL},
-                                                  {0.00001f, 8192, 64, 1234ULL},
-                                                  {0.00001f, 8192, 128, 1234ULL},
-                                                  {0.00001f, 8192, 256, 1234ULL},
-                                                  {0.00001f, 8192, 512, 1234ULL},
-                                                  {0.00001f, 8192, 1024, 1234ULL},
-                                                  {0.00001f, 1024, 8192, 1234ULL}};
-
-const std::vector<MinMaxInputs<double>> inputsd = {{0.0000001, 1024, 32, 1234ULL},
-                                                   {0.0000001, 1024, 64, 1234ULL},
-                                                   {0.0000001, 1024, 128, 1234ULL},
-                                                   {0.0000001, 1024, 256, 1234ULL},
-                                                   {0.0000001, 1024, 512, 1234ULL},
-                                                   {0.0000001, 1024, 1024, 1234ULL},
-                                                   {0.0000001, 4096, 32, 1234ULL},
-                                                   {0.0000001, 4096, 64, 1234ULL},
-                                                   {0.0000001, 4096, 128, 1234ULL},
-                                                   {0.0000001, 4096, 256, 1234ULL},
-                                                   {0.0000001, 4096, 512, 1234ULL},
-                                                   {0.0000001, 4096, 1024, 1234ULL},
-                                                   {0.0000001, 8192, 32, 1234ULL},
-                                                   {0.0000001, 8192, 64, 1234ULL},
-                                                   {0.0000001, 8192, 128, 1234ULL},
-                                                   {0.0000001, 8192, 256, 1234ULL},
-                                                   {0.0000001, 8192, 512, 1234ULL},
-                                                   {0.0000001, 8192, 1024, 1234ULL},
-                                                   {0.0000001, 1024, 8192, 1234ULL}};
+const std::vector<MinMaxInputs<float>> inputsf = {
+  {0.00001f, 1024, 32, 1234ULL},   {0.00001f, 1024, 64, 1234ULL},  {0.00001f, 1024, 128, 1234ULL},
+  {0.00001f, 1024, 256, 1234ULL},  {0.00001f, 1024, 512, 1234ULL}, {0.00001f, 1024, 1024, 1234ULL},
+  {0.00001f, 4096, 32, 1234ULL},   {0.00001f, 4096, 64, 1234ULL},  {0.00001f, 4096, 128, 1234ULL},
+  {0.00001f, 4096, 256, 1234ULL},  {0.00001f, 4096, 512, 1234ULL}, {0.00001f, 4096, 1024, 1234ULL},
+  {0.00001f, 8192, 32, 1234ULL},   {0.00001f, 8192, 64, 1234ULL},  {0.00001f, 8192, 128, 1234ULL},
+  {0.00001f, 8192, 256, 1234ULL},  {0.00001f, 8192, 512, 1234ULL}, {0.00001f, 8192, 1024, 1234ULL},
+  {0.00001f, 1024, 8192, 1234ULL}, {0.00001f, 1023, 5, 1234ULL},   {0.00001f, 1025, 30, 1234ULL},
+  {0.00001f, 2047, 65, 1234ULL},   {0.00001f, 2049, 22, 1234ULL},  {0.00001f, 31, 644, 1234ULL},
+  {0.00001f, 33, 999, 1234ULL},    {0.00001f, 1, 1, 1234ULL},      {0.00001f, 7, 23, 1234ULL},
+  {0.00001f, 17, 5, 1234ULL}};
+
+const std::vector<MinMaxInputs<double>> inputsd = {
+  {0.0000001, 1024, 32, 1234ULL},   {0.0000001, 1024, 64, 1234ULL},
+  {0.0000001, 1024, 128, 1234ULL},  {0.0000001, 1024, 256, 1234ULL},
+  {0.0000001, 1024, 512, 1234ULL},  {0.0000001, 1024, 1024, 1234ULL},
+  {0.0000001, 4096, 32, 1234ULL},   {0.0000001, 4096, 64, 1234ULL},
+  {0.0000001, 4096, 128, 1234ULL},  {0.0000001, 4096, 256, 1234ULL},
+  {0.0000001, 4096, 512, 1234ULL},  {0.0000001, 4096, 1024, 1234ULL},
+  {0.0000001, 8192, 32, 1234ULL},   {0.0000001, 8192, 64, 1234ULL},
+  {0.0000001, 8192, 128, 1234ULL},  {0.0000001, 8192, 256, 1234ULL},
+  {0.0000001, 8192, 512, 1234ULL},  {0.0000001, 8192, 1024, 1234ULL},
+  {0.0000001, 1024, 8192, 1234ULL}, {0.0000001, 1023, 5, 1234ULL},
+  {0.0000001, 1025, 30, 1234ULL},   {0.0000001, 2047, 65, 1234ULL},
+  {0.0000001, 2049, 22, 1234ULL},   {0.0000001, 31, 644, 1234ULL},
+  {0.0000001, 33, 999, 1234ULL},    {0.0000001, 1, 1, 1234ULL},
+  {0.0000001, 7, 23, 1234ULL},      {0.0000001, 17, 5, 1234ULL}};
 
 typedef MinMaxTest<float> MinMaxTestF;
 TEST_P(MinMaxTestF, Result)
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index cf57d3a923..641621c1c6 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -141,7 +141,19 @@ const std::vector<StdDevInputs<float>> inputsf = {
   {0.1f, -1.f, 2.f, 1024, 32, false, true, 1234ULL},
   {0.1f, -1.f, 2.f, 1024, 64, false, true, 1234ULL},
   {0.1f, -1.f, 2.f, 1024, 128, false, true, 1234ULL},
-  {0.1f, -1.f, 2.f, 1024, 256, false, true, 1234ULL}};
+  {0.1f, -1.f, 2.f, 1024, 256, false, true, 1234ULL},
+  {0.1f, -1.f, 2.f, 1099, 97, false, false, 1234ULL},
+  {0.1f, -1.f, 2.f, 1022, 694, true, false, 1234ULL},
+  {0.5f, -1.f, 2.f, 31, 1, true, true, 1234ULL},
+  {1.f, -1.f, 2.f, 1, 257, false, true, 1234ULL},
+  {0.5f, -1.f, 2.f, 31, 1, false, false, 1234ULL},
+  {1.f, -1.f, 2.f, 1, 257, true, false, 1234ULL},
+  {1.f, -1.f, 2.f, 1, 1, false, false, 1234ULL},
+  {1.f, -1.f, 2.f, 7, 23, false, false, 1234ULL},
+  {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL},
+  {1.f, -1.f, 2.f, 1, 1, false, true, 1234ULL},
+  {1.f, -1.f, 2.f, 7, 23, false, true, 1234ULL},
+  {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}};
 
 const std::vector<StdDevInputs<double>> inputsd = {
   {0.1, 1.0, 2.0, 1024, 32, true, false, 1234ULL},
@@ -159,13 +171,33 @@ const std::vector<StdDevInputs<double>> inputsd = {
   {0.1, -1.0, 2.0, 1024, 32, false, true, 1234ULL},
   {0.1, -1.0, 2.0, 1024, 64, false, true, 1234ULL},
   {0.1, -1.0, 2.0, 1024, 128, false, true, 1234ULL},
-  {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
+  {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL},
+  {0.1, -1.0, 2.0, 1099, 97, false, false, 1234ULL},
+  {0.1, -1.0, 2.0, 1022, 694, true, false, 1234ULL},
+  {0.5, -1.0, 2.0, 31, 1, true, true, 1234ULL},
+  {1.0, -1.0, 2.0, 1, 257, false, true, 1234ULL},
+  {0.5, -1.0, 2.0, 31, 1, false, false, 1234ULL},
+  {1.0, -1.0, 2.0, 1, 257, true, false, 1234ULL},
+  {1.0, -1.0, 2.0, 1, 1, false, false, 1234ULL},
+  {1.0, -1.0, 2.0, 7, 23, false, false, 1234ULL},
+  {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL},
+  {1.0, -1.0, 2.0, 1, 1, false, true, 1234ULL},
+  {1.0, -1.0, 2.0, 7, 23, false, true, 1234ULL},
+  {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}};
 
 typedef StdDevTest<float> StdDevTestF;
 TEST_P(StdDevTestF, Result)
 {
-  ASSERT_TRUE(devArrMatch(
-    params.stddev, stddev_act.data(), params.cols, CompareApprox<float>(params.tolerance), stream));
+  if (params.rows == 1) {
+    ASSERT_TRUE(devArrMatch(
+      float(0), stddev_act.data(), params.cols, CompareApprox<float>(params.tolerance), stream));
+  } else {
+    ASSERT_TRUE(devArrMatch(params.stddev,
+                            stddev_act.data(),
+                            params.cols,
+                            CompareApprox<float>(params.tolerance),
+                            stream));
+  }
 
   ASSERT_TRUE(devArrMatch(stddev_act.data(),
                           vars_act.data(),
@@ -177,11 +209,16 @@ TEST_P(StdDevTestF, Result)
 typedef StdDevTest<double> StdDevTestD;
 TEST_P(StdDevTestD, Result)
 {
-  ASSERT_TRUE(devArrMatch(params.stddev,
-                          stddev_act.data(),
-                          params.cols,
-                          CompareApprox<double>(params.tolerance),
-                          stream));
+  if (params.rows == 1) {
+    ASSERT_TRUE(devArrMatch(
+      double(0), stddev_act.data(), params.cols, CompareApprox<double>(params.tolerance), stream));
+  } else {
+    ASSERT_TRUE(devArrMatch(params.stddev,
+                            stddev_act.data(),
+                            params.cols,
+                            CompareApprox<double>(params.tolerance),
+                            stream));
+  }
 
   ASSERT_TRUE(devArrMatch(stddev_act.data(),
                           vars_act.data(),
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index 5a549f8ba4..bf2aa44a2c 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -33,7 +33,8 @@ template <typename T>
 struct SumInputs {
   T tolerance;
   int rows, cols;
-  unsigned long long int seed;
+  bool rowMajor;
+  T value = T(1);
 };
 
 template <typename T>
@@ -56,20 +57,34 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
   }
 
  protected:
-  void SetUp() override
+  void runTest()
   {
     int len = rows * cols;
 
-    T data_h[len];
+    std::vector<T> data_h(len);
     for (int i = 0; i < len; i++) {
-      data_h[i] = T(1);
+      data_h[i] = T(params.value);
     }
 
-    raft::update_device(data.data(), data_h, len, stream);
-    sum(handle,
-        raft::make_device_matrix_view<const T>(data.data(), rows, cols),
-        raft::make_device_vector_view(sum_act.data(), cols));
+    raft::update_device(data.data(), data_h.data(), len, stream);
+
+    if (params.rowMajor) {
+      using layout = raft::row_major;
+      sum(handle,
+          raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+          raft::make_device_vector_view(sum_act.data(), cols));
+    } else {
+      using layout = raft::col_major;
+      sum(handle,
+          raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+          raft::make_device_vector_view(sum_act.data(), cols));
+    }
     resource::sync_stream(handle, stream);
+
+    double expected = double(params.rows) * params.value;
+
+    ASSERT_TRUE(raft::devArrMatch(
+      T(expected), sum_act.data(), params.cols, raft::CompareApprox<T>(params.tolerance)));
   }
 
  protected:
@@ -81,27 +96,49 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
   rmm::device_uvector<T> data, sum_act;
 };
 
-const std::vector<SumInputs<float>> inputsf = {
-  {0.05f, 4, 5, 1234ULL}, {0.05f, 1024, 32, 1234ULL}, {0.05f, 1024, 256, 1234ULL}};
-
-const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
-                                                {0.05, 1024, 256, 1234ULL}};
+const std::vector<SumInputs<float>> inputsf = {{0.0001f, 4, 5, true, 1},
+                                               {0.0001f, 1024, 32, true, 1},
+                                               {0.0001f, 1024, 256, true, 1},
+                                               {0.0001f, 100000000, 1, true, 0.001},
+                                               {0.0001f, 1, 30, true, 0.001},
+                                               {0.0001f, 1, 1, true, 0.001},
+                                               {0.0001f, 17, 5, true, 0.001},
+                                               {0.0001f, 7, 23, true, 0.001},
+                                               {0.0001f, 3, 97, true, 0.001},
+                                               {0.0001f, 4, 5, false, 1},
+                                               {0.0001f, 1024, 32, false, 1},
+                                               {0.0001f, 1024, 256, false, 1},
+                                               {0.0001f, 100000000, 1, false, 0.001},
+                                               {0.0001f, 1, 30, false, 0.001},
+                                               {0.0001f, 1, 1, false, 0.001},
+                                               {0.0001f, 17, 5, false, 0.001},
+                                               {0.0001f, 7, 23, false, 0.001},
+                                               {0.0001f, 3, 97, false, 0.001}};
+
+const std::vector<SumInputs<double>> inputsd = {{0.000001, 1024, 32, true, 1},
+                                                {0.000001, 1024, 256, true, 1},
+                                                {0.000001, 1024, 256, true, 1},
+                                                {0.000001, 100000000, 1, true, 0.001},
+                                                {0.000001, 1, 30, true, 0.0001},
+                                                {0.000001, 1, 1, true, 0.0001},
+                                                {0.000001, 17, 5, true, 0.0001},
+                                                {0.000001, 7, 23, true, 0.0001},
+                                                {0.000001, 3, 97, true, 0.0001},
+                                                {0.000001, 1024, 32, false, 1},
+                                                {0.000001, 1024, 256, false, 1},
+                                                {0.000001, 1024, 256, false, 1},
+                                                {0.000001, 100000000, 1, false, 0.001},
+                                                {0.000001, 1, 30, false, 0.0001},
+                                                {0.000001, 1, 1, false, 0.0001},
+                                                {0.000001, 17, 5, false, 0.0001},
+                                                {0.000001, 7, 23, false, 0.0001},
+                                                {0.000001, 3, 97, false, 0.0001}};
 
 typedef SumTest<float> SumTestF;
-TEST_P(SumTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    float(params.rows), sum_act.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
-}
-
 typedef SumTest<double> SumTestD;
-TEST_P(SumTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(double(params.rows),
-                                sum_act.data(),
-                                params.cols,
-                                raft::CompareApprox<double>(params.tolerance)));
-}
+
+TEST_P(SumTestF, Result) { runTest(); }
+TEST_P(SumTestD, Result) { runTest(); }
 
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
 
diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt
index 14f0cc441a..2530e07a98 100644
--- a/python/pylibraft/pylibraft/distance/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -13,7 +13,7 @@
 # =============================================================================
 
 # Set the list of Cython files to build
-set(cython_sources pairwise_distance.pyx fused_l2_nn.pyx)
+set(cython_sources pairwise_distance.pyx fused_l2_nn.pyx fused_distance_nn.pyx)
 set(linked_libraries raft::raft raft::compiled)
 
 # Build all of the Cython targets
diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py
index f059b5f3dd..d16ab30b2f 100644
--- a/python/pylibraft/pylibraft/distance/__init__.py
+++ b/python/pylibraft/pylibraft/distance/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,12 @@
 # limitations under the License.
 #
 
+from .fused_distance_nn import fused_distance_nn_argmin
 from .fused_l2_nn import fused_l2_nn_argmin
 from .pairwise_distance import DISTANCE_TYPES, distance as pairwise_distance
 
-__all__ = ["fused_l2_nn_argmin", "pairwise_distance"]
+__all__ = [
+    "fused_distance_nn_argmin",
+    "fused_l2_nn_argmin",
+    "pairwise_distance",
+]
diff --git a/python/pylibraft/pylibraft/distance/fused_distance_nn.pyx b/python/pylibraft/pylibraft/distance/fused_distance_nn.pyx
new file mode 100644
index 0000000000..0e9fa4b366
--- /dev/null
+++ b/python/pylibraft/pylibraft/distance/fused_distance_nn.pyx
@@ -0,0 +1,200 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import numpy as np
+
+from cython.operator cimport dereference as deref
+from libc.stdint cimport uintptr_t
+from libcpp cimport bool
+
+from .distance_type cimport DistanceType
+
+from pylibraft.common import (
+    Handle,
+    auto_convert_output,
+    cai_wrapper,
+    device_ndarray,
+)
+from pylibraft.common.handle import auto_sync_handle
+
+from pylibraft.common.handle cimport device_resources
+
+
+cdef extern from "raft_runtime/distance/fused_distance_nn.hpp" \
+        namespace "raft::runtime::distance" nogil:
+
+    void fused_distance_nn_min_arg(
+        const device_resources &handle,
+        int* min,
+        const float* x,
+        const float* y,
+        int m,
+        int n,
+        int k,
+        bool sqrt,
+        DistanceType metric,
+        bool isRowMajor,
+        float metric_arg) except +
+
+
+from pylibraft.distance.pairwise_distance import DISTANCE_TYPES
+
+SUPPORTED_DISTANCES = ["euclidean", "l2", "cosine", "sqeuclidean"]
+
+
+@auto_sync_handle
+@auto_convert_output
+def fused_distance_nn_argmin(X, Y, out=None, sqrt=True, metric="euclidean",
+                             handle=None):
+    """
+    Compute the 1-nearest neighbors between X and Y using the distance metrics
+
+    Valid values for metric:
+        ["euclidean", "l2", "cosine", "sqeuclidean"]
+
+    Parameters
+    ----------
+
+    X : CUDA array interface compliant matrix shape (m, k)
+    Y : CUDA array interface compliant matrix shape (n, k)
+    out : Writable CUDA array interface matrix shape (m, 1)
+    metric : string denoting the metric type (default="euclidean")
+
+    {handle_docstring}
+
+    Examples
+    --------
+    To compute the 1-nearest neighbors argmin:
+
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import fused_distance_nn_argmin
+    >>> n_samples = 5000
+    >>> n_clusters = 5
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_clusters, n_features),
+    ...                               dtype=cp.float32)
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
+
+    >>> output = fused_distance_nn_argmin(in1, in2, handle=handle)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
+
+    The output can also be computed in-place on a preallocated
+    array:
+
+    >>> import cupy as cp
+    >>> from pylibraft.common import Handle
+    >>> from pylibraft.distance import fused_distance_nn_argmin
+    >>> n_samples = 5000
+    >>> n_clusters = 5
+    >>> n_features = 50
+    >>> in1 = cp.random.random_sample((n_samples, n_features),
+    ...                               dtype=cp.float32)
+    >>> in2 = cp.random.random_sample((n_clusters, n_features),
+    ...                               dtype=cp.float32)
+    >>> output = cp.empty((n_samples, 1), dtype=cp.int32)
+    >>> # A single RAFT handle can optionally be reused across
+    >>> # pylibraft functions.
+    >>> handle = Handle()
+
+    >>> fused_distance_nn_argmin(in1, in2, out=output, handle=handle)
+    array(...)
+
+    >>> # pylibraft functions are often asynchronous so the
+    >>> # handle needs to be explicitly synchronized
+    >>> handle.sync()
+   """
+
+    x_cai = cai_wrapper(X)
+    y_cai = cai_wrapper(Y)
+
+    x_dt = x_cai.dtype
+    y_dt = y_cai.dtype
+
+    m = x_cai.shape[0]
+    n = y_cai.shape[0]
+
+    if out is None:
+        output = device_ndarray.empty((m,), dtype="int32")
+    else:
+        output = out
+
+    output_cai = cai_wrapper(output)
+
+    x_k = x_cai.shape[1]
+    y_k = y_cai.shape[1]
+
+    if x_k != y_k:
+        raise ValueError("Inputs must have same number of columns. "
+                         "a=%s, b=%s" % (x_k, y_k))
+
+    if metric not in SUPPORTED_DISTANCES:
+        raise ValueError("metric %s is not supported" % metric)
+
+    cdef DistanceType distance_type = DISTANCE_TYPES[metric]
+
+    x_ptr = <uintptr_t>x_cai.data
+    y_ptr = <uintptr_t>y_cai.data
+
+    d_ptr = <uintptr_t>output_cai.data
+
+    handle = handle if handle is not None else Handle()
+    cdef device_resources *h = <device_resources*><size_t>handle.getHandle()
+
+    d_dt = output_cai.dtype
+
+    x_c_contiguous = x_cai.c_contiguous
+    y_c_contiguous = y_cai.c_contiguous
+
+    if x_c_contiguous != y_c_contiguous:
+        raise ValueError("Inputs must have matching strides")
+
+    if not x_c_contiguous:
+        raise ValueError("Inputs must be C contiguous")
+
+    if x_dt != y_dt:
+        raise ValueError("Inputs must have the same dtypes")
+    if d_dt != np.int32:
+        raise ValueError("Output array must be int32")
+    # unused arg for now.
+    metric_arg = 0.0
+    if x_dt == np.float32:
+        fused_distance_nn_min_arg(deref(h),
+                                  <int*> d_ptr,
+                                  <float*> x_ptr,
+                                  <float*> y_ptr,
+                                  <int>m,
+                                  <int>n,
+                                  <int>x_k,
+                                  <bool>sqrt,
+                                  <DistanceType>distance_type,
+                                  <bool>x_c_contiguous,
+                                  <float>metric_arg)
+    else:
+        raise ValueError("dtype %s not supported" % x_dt)
+
+    return output
diff --git a/python/pylibraft/pylibraft/test/test_fused_distance_argmin.py b/python/pylibraft/pylibraft/test/test_fused_distance_argmin.py
new file mode 100755
index 0000000000..6736128242
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_fused_distance_argmin.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pytest
+from scipy.spatial.distance import cdist
+
+from pylibraft.common import DeviceResources, device_ndarray
+from pylibraft.distance import fused_distance_nn_argmin
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize("n_rows", [10, 100])
+@pytest.mark.parametrize("n_clusters", [50, 100])
+@pytest.mark.parametrize("n_cols", [128, 31])
+@pytest.mark.parametrize("dtype", [np.float32])
+@pytest.mark.parametrize(
+    "metric",
+    [
+        "euclidean",
+        "cosine",
+        "sqeuclidean",
+    ],
+)
+def test_fused_distance_nn_minarg(
+    n_rows, n_cols, n_clusters, dtype, inplace, metric
+):
+    input1 = np.random.random_sample((n_rows, n_cols))
+    input1 = np.asarray(input1, order="C").astype(dtype)
+
+    input2 = np.random.random_sample((n_clusters, n_cols))
+    input2 = np.asarray(input2, order="C").astype(dtype)
+
+    output = np.zeros((n_rows), dtype="int32")
+    expected = cdist(input1, input2, metric)
+
+    expected = expected.argmin(axis=1)
+
+    input1_device = device_ndarray(input1)
+    input2_device = device_ndarray(input2)
+    output_device = device_ndarray(output) if inplace else None
+
+    is_sqrt = True if metric == "sqeuclidean" else False
+    handle = DeviceResources()
+    ret_output = fused_distance_nn_argmin(
+        input1_device,
+        input2_device,
+        output_device,
+        is_sqrt,
+        metric,
+        handle=handle,
+    )
+    handle.sync()
+    output_device = ret_output if not inplace else output_device
+    actual = output_device.copy_to_host()
+
+    assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
index f4d97edea5..a5ebb76635 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -62,17 +62,12 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
 
         X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)
 
-        D, Ind = knn(
-            X,
-            queries,
-            k,
-            metric=metric,
-            handle=handle,
-            global_id_offset=i,  # shift neighbor index by offset i
-        )
+        D, Ind = knn(X, queries, k, metric=metric, handle=handle)
         handle.sync()
 
         D, Ind = cp.asarray(D), cp.asarray(Ind)
+        Ind += i  # shift neighbor index by offset i
+
         if distances is None:
             distances = D
             indices = Ind
diff --git a/python/raft-dask/raft_dask/common/comms.py b/python/raft-dask/raft_dask/common/comms.py
index 118293c093..b2f7d1fb74 100644
--- a/python/raft-dask/raft_dask/common/comms.py
+++ b/python/raft-dask/raft_dask/common/comms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,8 +18,7 @@
 import time
 import uuid
 import warnings
-from collections import Counter, OrderedDict, defaultdict
-from typing import Dict
+from collections import OrderedDict
 
 from dask.distributed import default_client
 from dask_cuda.utils import nvml_device_index
@@ -691,9 +690,11 @@ def _func_ucp_ports(client, workers):
 
 def _func_worker_ranks(client, workers):
     """
-    For each worker connected to the client,
-    compute a global rank which is the sum
-    of the NVML device index and the worker rank offset.
+    For each worker connected to the client, compute a global rank which takes
+    into account the NVML device index and the worker IP
+    (group workers on same host and order by NVML device).
+    Note that the reason for sorting was nvbug 4149999 and is presumably
+    fixed afterNCCL 2.19.3.
 
     Parameters
     ----------
@@ -703,13 +704,13 @@ def _func_worker_ranks(client, workers):
     # TODO: Add Test this function
     # Running into build issues preventing testing
     nvml_device_index_d = client.run(_get_nvml_device_index, workers=workers)
-    worker_ips = [
-        _get_worker_ip(worker_address)
-        for worker_address in nvml_device_index_d
+    # Sort workers first by IP and then by the nvml device index:
+    worker_info_list = [
+        (_get_worker_ip(worker), nvml_device_index, worker)
+        for worker, nvml_device_index in nvml_device_index_d.items()
     ]
-    ranks = _map_nvml_device_id_to_contiguous_range(nvml_device_index_d)
-    worker_ip_offset_dict = _get_rank_offset_across_nodes(worker_ips)
-    return _append_rank_offset(ranks, worker_ip_offset_dict)
+    worker_info_list.sort()
+    return {wi[2]: i for i, wi in enumerate(worker_info_list)}
 
 
 def _get_nvml_device_index():
@@ -730,73 +731,3 @@ def _get_worker_ip(worker_address):
         worker_address (str): Full address string of the worker
     """
     return ":".join(worker_address.split(":")[0:2])
-
-
-def _map_nvml_device_id_to_contiguous_range(nvml_device_index_d: dict) -> dict:
-    """
-    For each worker address in nvml_device_index_d, map the corresponding
-    worker rank in the range(0, num_workers_per_node) where rank is decided
-    by the NVML device index. Worker with the lowest NVML device index gets
-    rank 0, and worker with the highest NVML device index gets rank
-    num_workers_per_node-1.
-
-    Parameters
-    ----------
-    nvml_device_index_d : dict
-        Dictionary of worker addresses mapped to their nvml device index.
-
-    Returns
-    -------
-    dict
-        Updated dictionary with worker addresses mapped to their rank.
-    """
-
-    rank_per_ip: Dict[str, int] = defaultdict(int)
-
-    # Sort by NVML index to ensure that the worker
-    # with the lowest NVML index gets rank 0.
-    for worker, _ in sorted(nvml_device_index_d.items(), key=lambda x: x[1]):
-        ip = _get_worker_ip(worker)
-
-        nvml_device_index_d[worker] = rank_per_ip[ip]
-        rank_per_ip[ip] += 1
-
-    return nvml_device_index_d
-
-
-def _get_rank_offset_across_nodes(worker_ips):
-    """
-    Get a dictionary of worker IP addresses mapped to the cumulative count of
-    their occurrences in the worker_ips list. The cumulative count serves as
-    the rank offset.
-
-    Parameters
-    ----------
-        worker_ips (list): List of worker IP addresses.
-    """
-    worker_count_dict = Counter(worker_ips)
-    worker_offset_dict = {}
-    current_offset = 0
-    for worker_ip, worker_count in worker_count_dict.items():
-        worker_offset_dict[worker_ip] = current_offset
-        current_offset += worker_count
-    return worker_offset_dict
-
-
-def _append_rank_offset(rank_dict, worker_ip_offset_dict):
-    """
-    For each worker address in the rank dictionary, add the
-    corresponding worker offset from the worker_ip_offset_dict
-    to the rank value.
-
-    Parameters
-    ----------
-        rank_dict (dict): Dictionary of worker addresses mapped to their ranks.
-        worker_ip_offset_dict (dict): Dictionary of worker IP addresses
-                                      mapped to their offsets.
-    """
-    for worker_ip, worker_offset in worker_ip_offset_dict.items():
-        for worker_address in rank_dict:
-            if worker_ip in worker_address:
-                rank_dict[worker_address] += worker_offset
-    return rank_dict
diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/test/test_comms.py
index 68c9fee556..b62d7185b2 100644
--- a/python/raft-dask/raft_dask/test/test_comms.py
+++ b/python/raft-dask/raft_dask/test/test_comms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -354,3 +354,16 @@ def test_device_multicast_sendrecv(n_trials, client):
     wait(dfs, timeout=5)
 
     assert list(map(lambda x: x.result(), dfs))
+
+
+@pytest.mark.nccl
+@pytest.mark.parametrize(
+    "subset", [slice(-1, None), slice(1), slice(None, None, -2)]
+)
+def test_comm_init_worker_subset(client, subset):
+    # Basic test that initializing a subset of workers is fine
+    cb = Comms(comms_p2p=True, verbose=True)
+
+    workers = list(client.scheduler_info()["workers"].keys())
+    workers = workers[subset]
+    cb.init(workers=workers)