From 50719712889137fb451e5bdb8eab4f8f6fb80408 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Wed, 31 Jan 2024 08:06:14 -0800
Subject: [PATCH 01/12] [FEA] Add support for `select_k` on CSR matrix

- This PR is one part of the feature of #1969
- Add the API of 'select_k' accepting CSR as input
- Add the API of 'segmented_copy'

Authors:
  - James Rong (https://github.com/rhdong)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)
---
 cpp/bench/prims/CMakeLists.txt                |   1 +
 cpp/bench/prims/matrix/select_k_csr.cu        | 257 +++++++++++++
 cpp/include/raft/matrix/copy.cuh              |  37 +-
 cpp/include/raft/matrix/detail/matrix.cuh     |  67 +++-
 .../raft/matrix/detail/select_k-ext.cuh       |  30 ++
 .../raft/matrix/detail/select_k-inl.cuh       |  99 +++++
 cpp/include/raft/matrix/select_k.cuh          |  39 ++
 .../matrix/detail/select_k_double_int64_t.cu  |  14 +
 .../matrix/detail/select_k_double_uint32_t.cu |  14 +
 cpp/src/matrix/detail/select_k_float_int32.cu |  14 +
 .../matrix/detail/select_k_float_int64_t.cu   |  14 +
 .../matrix/detail/select_k_float_uint32_t.cu  |  14 +
 .../matrix/detail/select_k_half_int64_t.cu    |  14 +
 .../matrix/detail/select_k_half_uint32_t.cu   |  14 +
 cpp/test/CMakeLists.txt                       |   9 +-
 cpp/test/matrix/copy.cu                       | 253 +++++++++++++
 cpp/test/matrix/select_k_csr.cu               | 350 ++++++++++++++++++
 17 files changed, 1237 insertions(+), 3 deletions(-)
 create mode 100644 cpp/bench/prims/matrix/select_k_csr.cu
 create mode 100644 cpp/test/matrix/copy.cu
 create mode 100644 cpp/test/matrix/select_k_csr.cu
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 3a2431cd34..253bc6c2e0 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -128,6 +128,7 @@ if(BUILD_PRIMS_BENCH)
     bench/prims/matrix/argmin.cu
     bench/prims/matrix/gather.cu
     bench/prims/matrix/select_k.cu
+    bench/prims/matrix/select_k_csr.cu
     bench/prims/matrix/main.cpp
     OPTIONAL
     LIB
diff --git a/cpp/bench/prims/matrix/select_k_csr.cu b/cpp/bench/prims/matrix/select_k_csr.cu
new file mode 100644
index 0000000000..99c59f4fde
--- /dev/null
+++ b/cpp/bench/prims/matrix/select_k_csr.cu
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <rmm/device_uvector.hpp>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/util/itertools.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+
+#include <random>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/matrix/select_k.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {  // one benchmark configuration
+  index_t n_rows;                   // rows of the logical dense matrix
+  index_t n_cols;                   // columns of the logical dense matrix
+  index_t top_k;                    // entries selected per row (width of the outputs)
+  float sparsity;                   // fraction of cells generated as stored (non-zero) entries
+  bool select_min         = true;   // forwarded to select_k as its `select_min` flag
+  bool customized_indices = false;  // when true, a separate in_idx vector is passed to select_k
+};
+
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{
+  os << " rows*cols=" << params.n_rows << "*" << params.n_cols;
+  os << "\ttop_k=" << params.top_k << "\tsparsity=" << params.sparsity;
+  return os;
+}
+
+// Benchmark fixture: builds a random CSR matrix once and runs raft::matrix::select_k on it.
+template <typename value_t, typename index_t>
+struct SelectKCsrTest : public fixture {
+  SelectKCsrTest(const bench_param<index_t>& p)
+    : fixture(true),
+      handle(stream),  // listed in declaration order: `handle` is declared before `params`
+      params(p),
+      values_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      dst_values_d(0, stream),
+      dst_indices_d(0, stream)
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> customized_indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    // Output buffers: one row of top_k entries per input row. They are only ever written by
+    // select_k, so no host-side initial contents are prepared for them.
+    // values_d gets one slot per stored (non-zero) element.
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    if (nnz) {
+      auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, nnz);
+      auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
+
+      raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
+                                                 labels.data_handle(),
+                                                 1,
+                                                 nnz,
+                                                 1,
+                                                 stream,
+                                                 false,
+                                                 nullptr,
+                                                 nullptr,
+                                                 value_t(1.0),
+                                                 false,
+                                                 value_t(-10.0f),
+                                                 value_t(10.0f),
+                                                 uint64_t(2024));
+      raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream);
+      resource::sync_stream(handle);
+    }
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+  }
+  // Sets floor(m * n * sparsity) distinct random cells of `matrix` to true; returns that count.
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;
+
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis_idx(0, total_elements - 1);
+
+    while (num_ones > 0) {
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+  // Writes the column index of every true cell to `indices` and the row offsets to `indptr`.
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+  // Wraps `x` in an optional only when customized indices are enabled; otherwise nullopt.
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) {
+      return x;
+    } else {
+      return std::nullopt;
+    }
+  }
+  // Wraps the device buffers in views, calls select_k once, then again inside loop_on_state.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+    resource::sync_stream(handle);
+    loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
+      raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+      resource::sync_stream(handle);
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+
+  bench_param<index_t> params;
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<index_t> dst_indices_d;
+};  // struct SelectKCsrTest
+
+// Enumerates every (rows, cols, top_k, sparsity) combination the benchmark is registered with.
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{
+  struct TestParams {
+    index_t m;
+    index_t n;
+    index_t k;
+    float sparsity;
+  };
+  const std::vector<TestParams> combos =
+    raft::util::itertools::product<TestParams>({index_t(10), index_t(1024)},
+                                               {index_t(1024 * 10), index_t(1024 * 1024)},
+                                               {index_t(128), index_t(100), index_t(2048)},
+                                               {0.1f, 0.2f, 0.5f});
+
+  std::vector<bench_param<index_t>> inputs;
+  inputs.reserve(combos.size());
+  for (const TestParams& c : combos) {
+    inputs.push_back(bench_param<index_t>({c.m, c.n, c.k, c.sparsity}));
+  }
+  return inputs;
+}
+
+RAFT_BENCH_REGISTER((SelectKCsrTest<float, int>), "", getInputs<int>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh
index be83a4a19e..785ff84b56 100644
--- a/cpp/include/raft/matrix/copy.cuh
+++ b/cpp/include/raft/matrix/copy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -122,6 +122,41 @@ void trunc_zero_origin(raft::resources const& handle,
                                       resource::get_cuda_stream(handle));
 }
 
+/**
+ * @brief Copy up to `max_len_per_row` leading elements of every segment of the source vector
+ * into the corresponding row of the target matrix; segments are delimited by `offsets`.
+ *
+ * @tparam m_t the type of the copied items.
+ * @tparam idx_t the index type of vectors and matrix.
+ * @param[in] handle raft handle
+ * @param[in] max_len_per_row Maximum number of copies per row; must not exceed dst.extent(1)
+ * @param[in] src Source vector
+ * @param[in] offsets Start/end index of each row's segment in src; dst.extent(0) + 1 entries
+ * @param[out] dst Destination matrix in row major order
+ *
+ * @note When the length of one segment is less than max_len_per_row, the remaining position values
+ * of dst will remain unchanged.
+ */
+template <typename m_t, typename idx_t>
+void segmented_copy(raft::resources const& handle,
+                    idx_t max_len_per_row,
+                    raft::device_vector_view<m_t, idx_t> src,
+                    raft::device_vector_view<idx_t, idx_t> offsets,
+                    raft::device_matrix_view<m_t, idx_t, row_major> dst)
+{
+  RAFT_EXPECTS(static_cast<idx_t>(offsets.size()) == (dst.extent(0) + 1),
+               "Number of offsets must be larger than number of output rows by 1");
+  RAFT_EXPECTS(dst.extent(1) >= max_len_per_row,
+               "Number of columns in the output must be equal or larger than max_len_per_row");
+  detail::segmented_copy(handle,
+                         src.data_handle(),
+                         dst.extent(0),
+                         dst.extent(1),
+                         max_len_per_row,
+                         offsets.data_handle(),
+                         dst.data_handle());
+}
+
 /** @} */  // end of group matrix_copy
 
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index 2fa741fd96..415ef31965 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -316,6 +316,71 @@ m_t getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStre
   return normval;
 }
 
+// Block sizes (threads per block) that segmented_copy_kernel is instantiated with.
+static const constexpr int SEGMENTED_COPY_TPB_256 = 256;
+static const constexpr int SEGMENTED_COPY_TPB_32  = 32;
+// Copies min(segment length, max_len_per_row) items of every segment into the matching dst row.
+template <typename m_t, typename idx_t, idx_t TPB>
+RAFT_KERNEL __launch_bounds__(TPB) segmented_copy_kernel(
+  const m_t* src, idx_t n_rows, idx_t n_cols, idx_t max_len_per_row, idx_t* offsets, m_t* dst)
+{
+#pragma unroll
+  for (idx_t r = blockIdx.y; r < n_rows; r += gridDim.y) {
+    const idx_t begin  = offsets[r];
+    const idx_t n_copy = min(offsets[r + 1] - begin, max_len_per_row);
+    for (idx_t c = threadIdx.x + blockIdx.x * blockDim.x; c < n_copy;
+         c += blockDim.x * gridDim.x) {
+      dst[r * n_cols + c] = src[begin + c];
+    }
+  }
+}
+
+// Host launcher: copies up to max_len_per_row leading items of each of the n_rows segments of
+// `src` (bounded by `offsets`) into the rows of the row-major n_rows x n_cols matrix `dst`.
+template <typename m_t, typename idx_t>
+void segmented_copy(raft::resources const& handle,
+                    const m_t* src,
+                    idx_t n_rows,
+                    idx_t n_cols,
+                    idx_t max_len_per_row,
+                    idx_t* offsets,
+                    m_t* dst)
+{
+  // Nothing to copy; this also keeps the grid-size divisions below away from zero.
+  if (n_rows < 1 || max_len_per_row < 1) { return; }
+  auto stream = resource::get_cuda_stream(handle);
+  idx_t tpb   = max_len_per_row >= 256 ? SEGMENTED_COPY_TPB_256 : SEGMENTED_COPY_TPB_32;
+  int dev_id = 0, sm_count = 0, blocks_per_sm = 0;
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id));
+  if (tpb == SEGMENTED_COPY_TPB_32) {
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>, tpb, 0));
+  } else if (tpb == SEGMENTED_COPY_TPB_256) {
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>, tpb, 0));
+  }
+  // `max threads number = sm_count * blocks_per_sm * tpb`
+  // `problem size = n_rows * max_len_per_row`
+  idx_t max_active_blocks = sm_count * blocks_per_sm;
+  idx_t required_active_blocks =
+    raft::min(max_active_blocks, raft::ceildiv(n_rows * max_len_per_row, tpb));
+  // The kernel strides over rows and columns, so one block is still a correct launch.
+  if (required_active_blocks < 1) { required_active_blocks = 1; }
+  idx_t blocks_per_row = raft::ceildiv(required_active_blocks, n_rows);
+  idx_t grid_rows      = raft::ceildiv(required_active_blocks, blocks_per_row);
+  dim3 block(tpb, 1);
+  dim3 grid(blocks_per_row, grid_rows);
+  if (tpb == SEGMENTED_COPY_TPB_32) {
+    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>
+      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
+  } else if (tpb == SEGMENTED_COPY_TPB_256) {
+    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>
+      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 }  // end namespace detail
 }  // end namespace matrix
 }  // end namespace raft
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index dfdbfa2d07..af47d45685 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -18,6 +18,7 @@
 
 #include <cstdint>      // uint32_t
 #include <cuda_fp16.h>  // __half
+#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/matrix/select_k_types.hpp>
 #include <raft/util/raft_explicit.hpp>               // RAFT_EXPLICIT
@@ -41,6 +42,15 @@ void select_k(raft::resources const& handle,
               rmm::mr::device_memory_resource* mr = nullptr,
               bool sorted                         = false,
               SelectAlgo algo                     = SelectAlgo::kAuto) RAFT_EXPLICIT;
+
+template <typename T, typename IdxT>  // CSR-input overload; documented in select_k-inl.cuh
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -70,3 +80,23 @@ instantiate_raft_matrix_detail_select_k(double, int64_t);
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  extern template void raft::matrix::detail::select_k(                \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+// Extern declarations of the CSR select_k overload, one per supported <T, IdxT> pair.
+instantiate_raft_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_matrix_detail_select_k(float, int64_t);
+instantiate_raft_matrix_detail_select_k(float, uint32_t);
+instantiate_raft_matrix_detail_select_k(float, int);
+instantiate_raft_matrix_detail_select_k(double, int64_t);
+instantiate_raft_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index 0a6f292e68..a9d1456e29 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -17,12 +17,17 @@
 
 #pragma once
 
+#include <type_traits>
+
 #include "select_radix.cuh"
 #include "select_warpsort.cuh"
 
+#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/matrix/gather.cuh>
 #include <raft/matrix/init.cuh>
 #include <raft/matrix/select_k_types.hpp>
 
@@ -320,4 +325,98 @@ void select_k(raft::resources const& handle,
     default: RAFT_FAIL("K-selection Algorithm not supported.");
   }
 }
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of
+ * `batch_size` x `len`, selecting the k smallest or largest stored elements from each row.
+ * The selected elements are then stored in a row-major output matrix `out_val` with
+ * dimensions `batch_size` x k.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the CSR column indices of `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`. Entries of rows with fewer than k stored elements, and the
+ *   whole output when `in_val` has no stored elements, are not written.
+ * @param[out] out_idx
+ *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] mr
+ *   An optional memory resource to use across the calls (you can provide a large enough
+ *           memory pool here to avoid memory allocations within the call).
+ *   NOTE(review): only resolved to a default here; the temporaries below do not receive it.
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              rmm::mr::device_memory_resource* mr = nullptr)
+{
+  auto csr_view = in_val.structure_view();
+  auto nnz      = csr_view.get_nnz();
+
+  if (nnz == 0) return;
+
+  auto batch_size = csr_view.get_n_rows();
+  auto k          = IdxT(out_val.extent(1));
+
+  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
+  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
+               "output k must fit the int type.");
+
+  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
+  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
+
+  if (in_idx.has_value()) {
+    RAFT_EXPECTS(size_t(nnz) == in_idx->size(),
+                 "nnz of in_val must be equal to the length of in_idx");
+  }
+  RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
+
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  rmm::device_uvector<IdxT> offsets(batch_size + 1, stream);
+  rmm::device_uvector<T> keys(nnz, stream);
+  rmm::device_uvector<IdxT> values(nnz, stream);
+  // Scratch copies: the inputs are const views, and the sorted result is read back from these.
+  raft::copy(offsets.data(), csr_view.get_indptr().data(), batch_size + 1, stream);
+  raft::copy(keys.data(), in_val.get_elements().data(), nnz, stream);
+  raft::copy(values.data(),
+             (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+             nnz,
+             stream);
+
+  segmented_sort_by_key(handle,
+                        keys.data(),
+                        values.data(),
+                        size_t(batch_size),
+                        size_t(nnz),
+                        offsets.data(),
+                        select_min);
+
+  auto src_val      = raft::make_device_vector_view<T, IdxT>(keys.data(), nnz);
+  auto offsets_view = raft::make_device_vector_view<IdxT, IdxT>(offsets.data(), batch_size + 1);
+  raft::matrix::segmented_copy<T, IdxT>(handle, k, src_val, offsets_view, out_val);
+
+  auto src_idx = raft::make_device_vector_view<IdxT, IdxT>(values.data(), nnz);
+  raft::matrix::segmented_copy<IdxT, IdxT>(handle, k, src_idx, offsets_view, out_idx);
+}
+
 }  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 92d7db006d..1f8136290b 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -19,6 +19,7 @@
 #include "detail/select_k.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 
+#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/resources.hpp>
@@ -117,6 +118,44 @@ void select_k(raft::resources const& handle,
                                    algo);
 }
 
+/**
+ * Selects the k smallest or largest stored elements from each row of a CSR matrix.
+ *
+ * `in_val` is a CSR matrix with a logical dense shape of `batch_size` x `len`. For every row,
+ * up to k of its stored elements are selected and written, together with their indices, to
+ * the row-major outputs `out_val` and `out_idx`, both of dimensions `batch_size` x k.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices stored in `in_val` are used.
+ * @param[out] out_val
+ *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min)
+{
+  detail::select_k<T, IdxT>(handle, in_val, in_idx, out_val, out_idx, select_min);
+}
 /** @} */  // end of group select_k
 
 }  // namespace raft::matrix
diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
index 87e5d49d29..7f8aed2506 100644
--- a/cpp/src/matrix/detail/select_k_double_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(double, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of the CSR select_k overload for <double, int64_t>.
+instantiate_raft_matrix_detail_select_k(double, int64_t);
+
+#undef instantiate_raft_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
index 67dce0e166..73338e7578 100644
--- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -34,3 +34,17 @@
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of the CSR select_k overload for <double, uint32_t>.
+instantiate_raft_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu
index 4be7c54839..a2d796a7c5 100644
--- a/cpp/src/matrix/detail/select_k_float_int32.cu
+++ b/cpp/src/matrix/detail/select_k_float_int32.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(float, int);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of the CSR select_k overload for <float, int>.
+instantiate_raft_matrix_detail_select_k(float, int);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
index 6337994e86..c7d93ab463 100644
--- a/cpp/src/matrix/detail/select_k_float_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(float, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of the CSR select_k overload; must match the extern <float, int64_t>.
+instantiate_raft_matrix_detail_select_k(float, int64_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
index ad26547812..dbf7afa06e 100644
--- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
index e3c29a2033..9923088a84 100644
--- a/cpp/src/matrix/detail/select_k_half_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(__half, int64_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
index 3e3a738915..e90fe42c3e 100644
--- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -33,3 +33,17 @@
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
+  template void raft::matrix::detail::select_k(                       \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(__half, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index fe29409d9b..af283cf60c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -256,6 +256,7 @@ if(BUILD_TESTS)
     test/matrix/argmax.cu
     test/matrix/argmin.cu
     test/matrix/columnSort.cu
+    test/matrix/copy.cu
     test/matrix/diagonal.cu
     test/matrix/gather.cu
     test/matrix/scatter.cu
@@ -272,7 +273,13 @@ if(BUILD_TESTS)
     EXPLICIT_INSTANTIATE_ONLY
   )
 
-  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+  ConfigureTest(
+    NAME
+    MATRIX_SELECT_TEST
+    PATH test/matrix/select_k.cu
+    PATH test/matrix/select_k_csr.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY)
 
   ConfigureTest(
     NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/test/matrix/copy.cu b/cpp/test/matrix/copy.cu
new file mode 100644
index 0000000000..58bc8970a4
--- /dev/null
+++ b/cpp/test/matrix/copy.cu
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include <gtest/gtest.h>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <iostream>
+
+namespace raft {
+namespace sparse {
+
+template <typename index_t>
+struct SegmentedCopyInputs {
+  index_t n_rows;
+  index_t n_cols;
+  index_t top_k;
+  float sparsity;
+};
+
+template <typename value_t, typename index_t>
+class SegmentedCopyTest : public ::testing::TestWithParam<SegmentedCopyInputs<index_t>> {
+ public:
+  SegmentedCopyTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<SegmentedCopyInputs<index_t>>::GetParam()),
+      indices_d(0, stream),
+      indptr_d(0, stream),
+      values_d(0, stream),
+      dst_values_d(0, stream),
+      dst_values_expected_d(0, stream),
+      dst_indices_d(0, stream),
+      dst_indices_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;
+    // Clear the mask, then set `num_ones` distinct random positions; returns that count (nnz).
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+    // The distribution must use index_t: the default result type (int) cannot hold 64-bit bounds.
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis_idx(0, total_elements - 1);
+
+    while (num_ones > 0) {
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  template <typename dst_t>
+  void cpu_segmented_copy(index_t rows,
+                          index_t max_len_per_row,
+                          const std::vector<dst_t>& src,
+                          const std::vector<index_t>& offsets,
+                          std::vector<dst_t>& dst)
+  {
+    for (index_t row = 0; row < rows; ++row) {
+      index_t start  = offsets[row];
+      index_t end    = offsets[row + 1];  //(row < rows - 1) ? offsets[row + 1] : src.size();
+      index_t length = std::min(end - start, max_len_per_row);
+      if (length == 0) continue;
+      std::copy(
+        src.begin() + start, src.begin() + start + length, dst.begin() + row * max_len_per_row);
+    }
+  }
+
+  void SetUp() override
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<value_t> values_h(nnz);
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
+
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
+                                       static_cast<index_t>(params.n_rows * params.n_cols + 1));
+
+    // sync up the initial values in advance to 2.0 which is out of random range [-1.0, 1.0].
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+
+    update_device(dst_values_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(dst_indices_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+    resource::sync_stream(handle);
+
+    auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, dst_values_h.size());
+    auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
+
+    raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
+                                               labels.data_handle(),
+                                               1,
+                                               dst_values_h.size(),
+                                               1,
+                                               stream,
+                                               false,
+                                               nullptr,
+                                               nullptr,
+                                               value_t(1.0),
+                                               false,
+                                               value_t(-1.0f),
+                                               value_t(1.0f),
+                                               uint64_t(2024));
+    raft::copy(dst_values_h.data(), blobs_values.data_handle(), dst_values_h.size(), stream);
+    raft::copy(dst_values_d.data(), blobs_values.data_handle(), dst_values_h.size(), stream);
+    resource::sync_stream(handle);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    cpu_segmented_copy<value_t>(params.n_rows, params.top_k, values_h, indptr_h, dst_values_h);
+    cpu_segmented_copy<index_t>(params.n_rows, params.top_k, indices_h, indptr_h, dst_indices_h);
+
+    values_d.resize(nnz, stream);
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+    dst_values_expected_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_expected_d.resize(params.n_rows * params.top_k, stream);
+
+    update_device(values_d.data(), values_h.data(), values_h.size(), stream);
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(dst_values_expected_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(
+      dst_indices_expected_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  void Run()
+  {
+    auto src_values  = raft::make_device_vector_view<value_t, index_t>(values_d.data(), nnz);
+    auto src_indices = raft::make_device_vector_view<index_t, index_t>(indices_d.data(), nnz);
+    auto offsets =
+      raft::make_device_vector_view<index_t, index_t>(indptr_d.data(), params.n_rows + 1);
+    auto dst_values = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto dst_indices = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::matrix::segmented_copy(handle, params.top_k, src_values, offsets, dst_values);
+    raft::matrix::segmented_copy(handle, params.top_k, src_indices, offsets, dst_indices);
+
+    resource::sync_stream(handle);
+
+    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
+                                           dst_values_d.data(),
+                                           params.n_rows * params.top_k,
+                                           raft::CompareApprox<value_t>(1e-6f),
+                                           stream));
+
+    ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
+                                           dst_indices_d.data(),
+                                           params.n_rows * params.top_k,
+                                           raft::Compare<index_t>(),
+                                           stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  SegmentedCopyInputs<index_t> params;
+
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<value_t> dst_values_expected_d;
+
+  rmm::device_uvector<index_t> dst_indices_d;
+  rmm::device_uvector<index_t> dst_indices_expected_d;
+};
+
+using SegmentedCopyTest_float_int = SegmentedCopyTest<float, int>;
+TEST_P(SegmentedCopyTest_float_int, Result) { Run(); }
+
+using SegmentedCopyTest_double_int64 = SegmentedCopyTest<double, int64_t>;
+TEST_P(SegmentedCopyTest_double_int64, Result) { Run(); }
+
+template <typename index_t>
+const std::vector<SegmentedCopyInputs<index_t>> segmentedcopy_inputs = {
+  {10, 32, 10, 0.0},
+  {10, 32, 10, 0.3},
+  {32, 1024, 63, 0.3},
+  {1024, 1024, 128, 0.2},
+  {1024, 1024 * 2000, 251, 0.2},
+  {2048, 1024 * 100, 1000, 0.3},
+  {2048, 1024 * 100, 2100, 0.5}};
+
+INSTANTIATE_TEST_CASE_P(SegmentedCopyTest,
+                        SegmentedCopyTest_float_int,
+                        ::testing::ValuesIn(segmentedcopy_inputs<int>));
+INSTANTIATE_TEST_CASE_P(SegmentedCopyTest,
+                        SegmentedCopyTest_double_int64,
+                        ::testing::ValuesIn(segmentedcopy_inputs<int64_t>));
+
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/matrix/select_k_csr.cu
new file mode 100644
index 0000000000..b0b24fae08
--- /dev/null
+++ b/cpp/test/matrix/select_k_csr.cu
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include <gtest/gtest.h>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/matrix/select_k.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/util/cuda_utils.cuh>
+
+#include <algorithm>
+#include <cmath>
+#include <optional>
+#include <queue>
+#include <random>
+#include <unordered_set>
+#include <vector>
+
+namespace raft {
+namespace sparse {
+
+template <typename index_t>
+struct SelectKCsrInputs {
+  index_t n_rows;
+  index_t n_cols;
+  index_t top_k;
+  float sparsity;
+  bool select_min;
+  bool customized_indices;
+};
+
+template <typename value_t, typename index_t>
+class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>> {
+ public:
+  SelectKCsrTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<SelectKCsrInputs<index_t>>::GetParam()),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      indptr_d(0, stream),
+      values_d(0, stream),
+      dst_values_d(0, stream),
+      dst_values_expected_d(0, stream),
+      dst_indices_d(0, stream),
+      dst_indices_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;
+    // Clear the mask, then set `num_ones` distinct random positions; returns that count (nnz).
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+    // The distribution must use index_t: the default result type (int) cannot hold 64-bit bounds.
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis_idx(0, total_elements - 1);
+
+    while (num_ones > 0) {
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  void cpu_select_k(const std::vector<index_t>& indptr_h,
+                    const std::vector<index_t>& indices_h,
+                    const std::vector<value_t>& values_h,
+                    std::optional<std::vector<index_t>>& in_idx_h,
+                    index_t n_rows,
+                    index_t n_cols,
+                    index_t top_k,
+                    std::vector<value_t>& out_values_h,
+                    std::vector<index_t>& out_indices_h,
+                    bool select_min = true)
+  {
+    auto comp = [select_min](const std::pair<value_t, index_t>& a,
+                             const std::pair<value_t, index_t>& b) {
+      return select_min ? a.first < b.first : a.first > b.first;
+    };
+    // CPU reference: keep at most top_k (value, index) pairs per CSR row in a bounded heap.
+    for (index_t row = 0; row < n_rows; ++row) {
+      std::priority_queue<std::pair<value_t, index_t>,
+                          std::vector<std::pair<value_t, index_t>>,
+                          decltype(comp)>
+        pq(comp);
+
+      for (index_t idx = indptr_h[row]; idx < indptr_h[row + 1]; ++idx) {
+        pq.push({values_h[idx], (in_idx_h.has_value()) ? (*in_idx_h)[idx] : indices_h[idx]});
+        if (pq.size() > size_t(top_k)) { pq.pop(); }
+      }
+
+      std::vector<std::pair<value_t, index_t>> row_pairs;
+      while (!pq.empty()) {
+        row_pairs.push_back(pq.top());
+        pq.pop();
+      }
+      // Comparators must be strict (<, >): std::sort requires a strict weak ordering.
+      if (select_min) {
+        std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) {
+          return a.first < b.first;
+        });
+      } else {
+        std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) {
+          return a.first > b.first;
+        });
+      }
+      for (index_t col = 0; col < top_k; col++) {
+        if (col < index_t(row_pairs.size())) {
+          out_values_h[row * top_k + col]  = row_pairs[col].first;
+          out_indices_h[row * top_k + col] = row_pairs[col].second;
+        }
+      }
+    }
+  }
+
+  void random_array(value_t* array, size_t size)
+  {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<value_t> dis(-10.0, 10.0);
+    std::unordered_set<value_t> uset;
+
+    while (uset.size() < size) {
+      uset.insert(dis(gen));
+    }
+    typename std::unordered_set<value_t>::iterator it = uset.begin();
+    for (size_t i = 0; i < size; ++i) {
+      array[i] = *(it++);
+    }
+  }
+
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) {
+      return x;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  void SetUp() override
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<value_t> values_h(nnz);
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> customized_indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
+                                       static_cast<index_t>(params.n_rows * params.n_cols * 100));
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    update_device(dst_values_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(dst_indices_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+
+    resource::sync_stream(handle);
+
+    if (values_h.size()) {
+      random_array(values_h.data(), values_h.size());
+      raft::copy(values_d.data(), values_h.data(), values_h.size(), stream);
+      resource::sync_stream(handle);
+    }
+
+    auto optional_indices_h = get_opt_var(customized_indices_h);
+
+    cpu_select_k(indptr_h,
+                 indices_h,
+                 values_h,
+                 optional_indices_h,
+                 params.n_rows,
+                 params.n_cols,
+                 params.top_k,
+                 dst_values_h,
+                 dst_indices_h,
+                 params.select_min);
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    dst_values_expected_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_expected_d.resize(params.n_rows * params.top_k, stream);
+
+    update_device(values_d.data(), values_h.data(), values_h.size(), stream);
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(dst_values_expected_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(
+      dst_indices_expected_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  void Run()
+  {
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+
+    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
+                                           out_val.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           raft::CompareApprox<value_t>(1e-6f),
+                                           stream));
+
+    ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
+                                           out_idx.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           raft::Compare<index_t>(),
+                                           stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  SelectKCsrInputs<index_t> params;
+
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<value_t> dst_values_expected_d;
+
+  rmm::device_uvector<index_t> dst_indices_d;
+  rmm::device_uvector<index_t> dst_indices_expected_d;
+};
+
+using SelectKCsrTest_float_int = SelectKCsrTest<float, int>;
+TEST_P(SelectKCsrTest_float_int, Result) { Run(); }
+
+using SelectKCsrTest_double_int64 = SelectKCsrTest<double, int64_t>;
+TEST_P(SelectKCsrTest_double_int64, Result) { Run(); }
+
+template <typename index_t>
+const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
+  {10, 32, 10, 0.0, true, false},
+  {10, 32, 10, 0.0, true, true},
+  {10, 32, 10, 0.01, true, false},
+  {10, 32, 10, 0.1, true, true},
+  {10, 32, 251, 0.1, true, false},
+  {10, 32, 251, 0.6, true, true},
+  {1024, 1024, 258, 0.3, true, false},
+  {1024, 1024, 600, 0.2, true, true},
+  {100, 1024 * 1000, 251, 0.1, true, false},
+  {100, 1024 * 1000, 251, 0.2, true, true},
+  {1024, 1024 * 10, 251, 0.3, true, false},
+  {1024, 1024 * 10, 251, 0.2, true, true},
+  {2048, 1024 * 10, 1000, 0.2, true, false},
+  {2048, 1024 * 10, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 2100, 0.1, true, false},
+  {2048, 1024 * 10, 2100, 0.2, true, true}};
+
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
+                        SelectKCsrTest_float_int,
+                        ::testing::ValuesIn(selectk_inputs<int>));
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
+                        SelectKCsrTest_double_int64,
+                        ::testing::ValuesIn(selectk_inputs<int64_t>));
+
+}  // namespace sparse
+}  // namespace raft

From 435286a6880a3b2f6c0d763f3d19e687d2f9f7b8 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Wed, 31 Jan 2024 09:24:38 -0800
Subject: [PATCH 02/12] add more comments on the select_k API

---
 cpp/include/raft/matrix/select_k.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 5c20227d23..7df1430455 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -123,6 +123,8 @@ void select_k(raft::resources const& handle,
  * This function operates on a row-major matrix `in_val` with dimensions `batch_size` x `len`,
  * selecting the k smallest or largest elements from each row. The selected elements are then stored
  * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ * If a row holds fewer than k values, the remaining positions in the corresponding row of
+ * `out_val` keep their original contents. The same applies to `out_idx`.
  *
  * @tparam T
  *   Type of the elements being compared (keys).

From 977ccee26674a72145c1879bffecb3a991e92d9a Mon Sep 17 00:00:00 2001
From: hrong <hrong@nvidia.com>
Date: Mon, 4 Mar 2024 13:04:42 -0800
Subject: [PATCH 03/12] remove mr argument

---
 cpp/include/raft/matrix/detail/matrix.cuh         |  6 +++---
 cpp/include/raft/matrix/detail/select_k-ext.cuh   |  6 ++----
 cpp/include/raft/matrix/detail/select_k-inl.cuh   | 14 +++++---------
 cpp/src/matrix/detail/select_k_double_int64_t.cu  |  3 +--
 cpp/src/matrix/detail/select_k_double_uint32_t.cu |  3 +--
 cpp/src/matrix/detail/select_k_float_int32.cu     |  3 +--
 cpp/src/matrix/detail/select_k_float_int64_t.cu   |  3 +--
 cpp/src/matrix/detail/select_k_float_uint32_t.cu  |  3 +--
 cpp/src/matrix/detail/select_k_half_int64_t.cu    |  3 +--
 cpp/src/matrix/detail/select_k_half_uint32_t.cu   |  3 +--
 10 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index f0cb4aaef8..69d38a68da 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -33,6 +33,7 @@
 
 #include <algorithm>
 #include <cstddef>
+#include <raft/core/resource/device_properties.hpp>
 
 namespace raft {
 namespace matrix {
@@ -349,9 +350,8 @@ void segmented_copy(raft::resources const& handle,
 
   idx_t tpb = max_len_per_row >= 256 ? SEGMENTED_COPY_TPB_256 : SEGMENTED_COPY_TPB_32;
 
-  int dev_id, sm_count, blocks_per_sm;
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
+  int blocks_per_sm;
+  int sm_count = resource::get_device_properties(handle).multiProcessorCount;
 
   if (tpb == SEGMENTED_COPY_TPB_32) {
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 572abf6564..23ab3113e5 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -51,8 +51,7 @@ void select_k(raft::resources const& handle,
               std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min,
-              rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+              bool select_min) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -89,8 +88,7 @@ instantiate_raft_matrix_detail_select_k(double, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index d1b50dd2db..86d835a0f0 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -348,9 +348,6 @@ void select_k(raft::resources const& handle,
  *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
  * @param[in] select_min
  *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
- * @param[in] mr
- *   An optional memory resource to use across the calls (you can provide a large enough
- *           memory pool here to avoid memory allocations within the call).
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& handle,
@@ -358,8 +355,7 @@ void select_k(raft::resources const& handle,
               std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min,
-              rmm::mr::device_memory_resource* mr = nullptr)
+              bool select_min)
 {
   auto csr_view = in_val.structure_view();
   auto nnz      = csr_view.get_nnz();
@@ -370,7 +366,7 @@ void select_k(raft::resources const& handle,
   auto len        = csr_view.get_n_cols();
   auto k          = IdxT(out_val.extent(1));
 
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
+  auto mr = resource::get_workspace_resource(handle);
   RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
                "output k must fit the int type.");
 
@@ -385,9 +381,9 @@ void select_k(raft::resources const& handle,
 
   auto stream = raft::resource::get_cuda_stream(handle);
 
-  rmm::device_uvector<IdxT> offsets(batch_size + 1, stream);
-  rmm::device_uvector<T> keys(nnz, stream);
-  rmm::device_uvector<IdxT> values(nnz, stream);
+  rmm::device_uvector<IdxT> offsets(batch_size + 1, stream, mr);
+  rmm::device_uvector<T> keys(nnz, stream, mr);
+  rmm::device_uvector<IdxT> values(nnz, stream, mr);
 
   raft::copy(offsets.data(), csr_view.get_indptr().data(), batch_size + 1, stream);
   raft::copy(keys.data(), in_val.get_elements().data(), nnz, stream);
diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
index 5ed77fed3c..f90d518f71 100644
--- a/cpp/src/matrix/detail/select_k_double_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(double, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(double, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
index 5cf075f0e7..b88e81e2e7 100644
--- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -42,8 +42,7 @@ instantiate_raft_matrix_detail_select_k(double, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu
index 90613b696f..2ba7d41146 100644
--- a/cpp/src/matrix/detail/select_k_float_int32.cu
+++ b/cpp/src/matrix/detail/select_k_float_int32.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(float, int);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(float, int);
 
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
index 771a851e82..c62121d70e 100644
--- a/cpp/src/matrix/detail/select_k_float_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(float, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(float, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
index f337574b8c..6b5cb6927d 100644
--- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(float, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
 
diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
index 0a4dd0f668..78a7cb7a7e 100644
--- a/cpp/src/matrix/detail/select_k_half_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(__half, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
index f546984690..58c1668bf1 100644
--- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -40,8 +40,7 @@ instantiate_raft_matrix_detail_select_k(__half, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    rmm::mr::device_memory_resource* mr)
+    bool select_min)
 
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 

From bc544e3d82bf99ae5fd7b8a1629ba79f427c9c97 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Fri, 15 Mar 2024 15:24:04 -0700
Subject: [PATCH 04/12] fix format issue

---
 cpp/bench/prims/matrix/select_k_csr.cu        | 25 +++++++++----------
 cpp/include/raft/matrix/detail/matrix.cuh     |  2 +-
 .../raft/matrix/detail/select_k-inl.cuh       |  4 +--
 cpp/test/matrix/copy.cu                       |  4 ++-
 cpp/test/matrix/select_k_csr.cu               |  3 ++-
 5 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/cpp/bench/prims/matrix/select_k_csr.cu b/cpp/bench/prims/matrix/select_k_csr.cu
index 99c59f4fde..0282f873c2 100644
--- a/cpp/bench/prims/matrix/select_k_csr.cu
+++ b/cpp/bench/prims/matrix/select_k_csr.cu
@@ -14,28 +14,27 @@
  * limitations under the License.
  */
 #include <common/benchmark.hpp>
-#include <raft/sparse/convert/csr.cuh>
-#include <rmm/device_uvector.hpp>
-
-#include <raft/core/device_resources.hpp>
-#include <raft/util/itertools.hpp>
-
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/core/resources.hpp>
-
-#include <random>
-#include <sstream>
-#include <unordered_set>
-#include <vector>
 
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/matrix/copy.cuh>
 #include <raft/matrix/select_k.cuh>
 #include <raft/random/make_blobs.cuh>
 #include <raft/random/rng_state.hpp>
+#include <raft/sparse/convert/csr.cuh>
 #include <raft/util/cuda_utils.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <random>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
 
 namespace raft::bench::sparse {
 
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index 69d38a68da..a9109d37ba 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/util/cache_util.cuh>
@@ -33,7 +34,6 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <raft/core/resource/device_properties.hpp>
 
 namespace raft {
 namespace matrix {
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index 86d835a0f0..e2490cca0b 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -17,8 +17,6 @@
 
 #pragma once
 
-#include <type_traits>
-
 #include "select_radix.cuh"
 #include "select_warpsort.cuh"
 
@@ -35,6 +33,8 @@
 
 #include <cub/cub.cuh>
 
+#include <type_traits>
+
 namespace raft::matrix::detail {
 
 /**
diff --git a/cpp/test/matrix/copy.cu b/cpp/test/matrix/copy.cu
index 58bc8970a4..adeeae73f5 100644
--- a/cpp/test/matrix/copy.cu
+++ b/cpp/test/matrix/copy.cu
@@ -15,7 +15,7 @@
  */
 
 #include "../test_utils.cuh"
-#include <gtest/gtest.h>
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -24,6 +24,8 @@
 #include <raft/random/make_blobs.cuh>
 #include <raft/util/cuda_utils.cuh>
 
+#include <gtest/gtest.h>
+
 #include <iostream>
 
 namespace raft {
diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/matrix/select_k_csr.cu
index b0b24fae08..ece773ee97 100644
--- a/cpp/test/matrix/select_k_csr.cu
+++ b/cpp/test/matrix/select_k_csr.cu
@@ -15,7 +15,6 @@
  */
 
 #include "../test_utils.cuh"
-#include <gtest/gtest.h>
 
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
@@ -28,6 +27,8 @@
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
 
+#include <gtest/gtest.h>
+
 #include <algorithm>
 #include <cmath>
 #include <optional>

From 558b69e7e8bbe103f2e1e8387d827338596d1a91 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Fri, 29 Mar 2024 23:59:33 -0700
Subject: [PATCH 05/12] Optimizing the performance by reusing the dense
 `select_k`

---
 cpp/bench/prims/matrix/select_k_csr.cu        |  54 ++-
 .../raft/matrix/detail/select_k-ext.cuh       |   8 +-
 .../raft/matrix/detail/select_k-inl.cuh       | 160 +++++--
 .../raft/matrix/detail/select_radix.cuh       | 427 ++++++++++--------
 .../raft/matrix/detail/select_warpsort.cuh    |  55 ++-
 cpp/include/raft/matrix/select_k.cuh          |   7 +-
 .../matrix/detail/select_k_double_int64_t.cu  |   4 +-
 .../matrix/detail/select_k_double_uint32_t.cu |   4 +-
 cpp/src/matrix/detail/select_k_float_int32.cu |   4 +-
 .../matrix/detail/select_k_float_int64_t.cu   |   4 +-
 .../matrix/detail/select_k_float_uint32_t.cu  |   4 +-
 .../matrix/detail/select_k_half_int64_t.cu    |   4 +-
 .../matrix/detail/select_k_half_uint32_t.cu   |   4 +-
 .../matrix/select_k_float_int64_t.cu          |   5 +-
 cpp/test/matrix/select_k_csr.cu               |  40 +-
 15 files changed, 521 insertions(+), 263 deletions(-)

diff --git a/cpp/bench/prims/matrix/select_k_csr.cu b/cpp/bench/prims/matrix/select_k_csr.cu
index 0282f873c2..4ab706f471 100644
--- a/cpp/bench/prims/matrix/select_k_csr.cu
+++ b/cpp/bench/prims/matrix/select_k_csr.cu
@@ -51,8 +51,7 @@ struct bench_param {
 template <typename index_t>
 inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
 {
-  os << " rows*cols=" << params.n_rows << "*" << params.n_cols << "\ttop_k=" << params.top_k
-     << "\tsparsity=" << params.sparsity;
+  os << params.n_rows << "#" << params.n_cols << "#" << params.top_k << "#" << params.sparsity;
   return os;
 }
 
@@ -69,7 +68,7 @@ struct SelectKCsrTest : public fixture {
       dst_values_d(0, stream),
       dst_indices_d(0, stream)
   {
-    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols);
     nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
 
     std::vector<index_t> indices_h(nnz);
@@ -207,7 +206,7 @@ struct SelectKCsrTest : public fixture {
     raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
     resource::sync_stream(handle);
     loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
-      raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+      raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min, false);
       resource::sync_stream(handle);
     });
   }
@@ -235,22 +234,53 @@ const std::vector<bench_param<index_t>> getInputs()
     index_t m;
     index_t n;
     index_t k;
-    float sparsity;
   };
 
-  const std::vector<TestParams> params_group =
-    raft::util::itertools::product<TestParams>({index_t(10), index_t(1024)},
-                                               {index_t(1024 * 10), index_t(1024 * 1024)},
-                                               {index_t(128), index_t(100), index_t(2048)},
-                                               {0.1f, 0.2f, 0.5f});
+  const std::vector<TestParams> params_group{
+    {20000, 500, 1},    {20000, 500, 2},    {20000, 500, 4},   {20000, 500, 8},
+    {20000, 500, 16},   {20000, 500, 32},   {20000, 500, 64},  {20000, 500, 128},
+    {20000, 500, 256},
+
+    {1000, 10000, 1},   {1000, 10000, 2},   {1000, 10000, 4},  {1000, 10000, 8},
+    {1000, 10000, 16},  {1000, 10000, 32},  {1000, 10000, 64}, {1000, 10000, 128},
+    {1000, 10000, 256},
+
+    {100, 100000, 1},   {100, 100000, 2},   {100, 100000, 4},  {100, 100000, 8},
+    {100, 100000, 16},  {100, 100000, 32},  {100, 100000, 64}, {100, 100000, 128},
+    {100, 100000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256}};
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
-    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, params.sparsity}));
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.1}));
+  }
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.2}));
+  }
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.5}));
   }
   return param_vec;
 }
 
-RAFT_BENCH_REGISTER((SelectKCsrTest<float, int>), "", getInputs<int>());
+RAFT_BENCH_REGISTER((SelectKCsrTest<float, uint32_t>), "", getInputs<uint32_t>());
 
 }  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 1097935e3b..95d806dd43 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -52,7 +52,9 @@ void select_k(raft::resources const& handle,
               std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min) RAFT_EXPLICIT;
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -90,7 +92,9 @@ instantiate_raft_matrix_detail_select_k(double, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index 7683c03283..bcf00db709 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -324,9 +324,9 @@ void select_k(raft::resources const& handle,
 }
 
 /**
- * Selects the k smallest or largest keys/values from each row of the input matrix.
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
  *
- * This function operates on a row-major matrix `in_val` with dimensions `batch_size` x `len`,
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
  * selecting the k smallest or largest elements from each row. The selected elements are then stored
  * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
  *
@@ -352,6 +352,10 @@ void select_k(raft::resources const& handle,
  *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
  * @param[in] select_min
  *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   Flag indicating whether to make sure the selected pairs are sorted by value.
+ * @param[in] algo
+ *   The selection algorithm to use (`SelectAlgo::kAuto` lets the implementation choose one).
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& handle,
@@ -359,7 +363,9 @@ void select_k(raft::resources const& handle,
               std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min)
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
 {
   auto csr_view = in_val.structure_view();
   auto nnz      = csr_view.get_nnz();
@@ -383,33 +389,127 @@ void select_k(raft::resources const& handle,
   }
   RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
 
-  auto stream = raft::resource::get_cuda_stream(handle);
-
-  rmm::device_uvector<IdxT> offsets(batch_size + 1, stream, mr);
-  rmm::device_uvector<T> keys(nnz, stream, mr);
-  rmm::device_uvector<IdxT> values(nnz, stream, mr);
-
-  raft::copy(offsets.data(), csr_view.get_indptr().data(), batch_size + 1, stream);
-  raft::copy(keys.data(), in_val.get_elements().data(), nnz, stream);
-  raft::copy(values.data(),
-             (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-             nnz,
-             stream);
-
-  segmented_sort_by_key(handle,
-                        keys.data(),
-                        values.data(),
-                        size_t(batch_size),
-                        size_t(nnz),
-                        offsets.data(),
-                        select_min);
-
-  auto src_val      = raft::make_device_vector_view<T, IdxT>(keys.data(), nnz);
-  auto offsets_view = raft::make_device_vector_view<IdxT, IdxT>(offsets.data(), batch_size + 1);
-  raft::matrix::segmented_copy<T, IdxT>(handle, k, src_val, offsets_view, out_val);
-
-  auto src_idx = raft::make_device_vector_view<IdxT, IdxT>(values.data(), nnz);
-  raft::matrix::segmented_copy<IdxT, IdxT>(handle, k, src_idx, offsets_view, out_idx);
+  if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
+
+  auto indptr = csr_view.get_indptr().data();
+
+  switch (algo) {
+    case SelectAlgo::kRadix8bits:
+    case SelectAlgo::kRadix11bits:
+    case SelectAlgo::kRadix11bitsExtraPass: {
+      if (algo == SelectAlgo::kRadix8bits) {
+        detail::select::radix::select_k<T, IdxT, 8, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          true,
+          indptr);
+      } else {
+        bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
+        detail::select::radix::select_k<T, IdxT, 11, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          fused_last_filter,
+          indptr);
+      }
+
+      if (sorted) {
+        auto offsets = make_device_mdarray<IdxT, IdxT>(
+          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
+        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
+
+        auto keys =
+          raft::make_device_vector_view<T, IdxT>(out_val.data_handle(), (IdxT)(batch_size * k));
+        auto vals =
+          raft::make_device_vector_view<IdxT, IdxT>(out_idx.data_handle(), (IdxT)(batch_size * k));
+
+        segmented_sort_by_key<T, IdxT>(
+          handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min);
+      }
+
+      return;
+    }
+    case SelectAlgo::kWarpDistributed:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          indptr);
+    case SelectAlgo::kWarpDistributedShm:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          indptr);
+    case SelectAlgo::kWarpAuto:
+      return detail::select::warpsort::select_k<T, IdxT>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpImmediate:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_immediate>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          indptr);
+    case SelectAlgo::kWarpFiltered:
+      return detail::select::warpsort::
+        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_filtered>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          indptr);
+    default: RAFT_FAIL("K-selection Algorithm not supported.");
+  }
+
+  return;
 }
 
 }  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 36a346fda3..83d4845c31 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -442,14 +442,76 @@ _RAFT_DEVICE void last_filter(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass>
+template <typename T, typename IdxT>  // Chooses the read/write pointers for pass number `pass`.
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   int pass,
+                                   const T*& in_buf,
+                                   const IdxT*& in_idx_buf,
+                                   T*& out_buf,
+                                   IdxT*& out_idx_buf)
+{
+  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 (buf_len elements each)
+  if (pass == 0) {  // pass 0: read `in` only; no index input and no output buffers
+    in_buf      = in;
+    in_idx_buf  = nullptr;
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+  } else if (pass == 1) {  // pass 1: read in/in_idx, write buf1/idx_buf1
+    in_buf      = in;
+    in_idx_buf  = in_idx;
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  } else if (pass % 2 == 0) {  // even pass >= 2: read buf1/idx_buf1, write buf2/idx_buf2
+    in_buf      = reinterpret_cast<T*>(bufs);
+    in_idx_buf  = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+    out_buf     = const_cast<T*>(in_buf + buf_len);
+    out_idx_buf = const_cast<IdxT*>(in_idx_buf + buf_len);
+  } else {  // odd pass >= 3: read buf2/idx_buf2, write buf1/idx_buf1
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+    in_buf      = out_buf + buf_len;
+    in_idx_buf  = out_idx_buf + buf_len;
+  }
+}
+
+template <typename T, typename IdxT>  // Overload yielding only the buffers written by `pass`.
+_RAFT_DEVICE void set_buf_pointers(const T* in,  // not referenced by this overload
+                                   const IdxT* in_idx,  // not referenced by this overload
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   const int pass,
+                                   const T*& out_buf,
+                                   const IdxT*& out_idx_buf)
+{
+  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 (buf_len elements each)
+  if (pass == 0) {  // pass 0: no output buffers
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+  } else if (pass == 1) {  // pass 1: buf1/idx_buf1
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  } else if (pass % 2 == 0) {  // even pass >= 2: buf2/idx_buf2
+    out_buf = const_cast<T*>(reinterpret_cast<T*>(bufs) + buf_len);
+    out_idx_buf =
+      const_cast<IdxT*>(reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len) + buf_len);
+  } else {  // odd pass >= 3: buf1/idx_buf1
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  }
+}
+
+template <typename T, typename IdxT, int BitsPerPass, bool len_or_indptr = true>
 RAFT_KERNEL last_filter_kernel(const T* in,
                                const IdxT* in_idx,
-                               const T* in_buf,
-                               const IdxT* in_idx_buf,
+                               char* bufs,
+                               size_t offset,
                                T* out,
                                IdxT* out_idx,
                                const IdxT len,
+                               const IdxT* len_i,
                                const IdxT k,
                                Counter<T, IdxT>* counters,
                                const bool select_min)
@@ -458,22 +520,31 @@ RAFT_KERNEL last_filter_kernel(const T* in,
 
   Counter<T, IdxT>* counter = counters + batch_id;
   IdxT previous_len         = counter->previous_len;
+
   if (previous_len == 0) { return; }
+
+  const IdxT l_len    = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
+
   const IdxT buf_len = calc_buf_len<T>(len);
-  if (previous_len > buf_len || in_buf == in) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
-  }
-  out += batch_id * k;
-  out_idx += batch_id * k;
+
+  const T* in_buf        = nullptr;
+  const IdxT* in_idx_buf = nullptr;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
   constexpr int pass      = calc_num_passes<T, BitsPerPass>() - 1;
   constexpr int start_bit = calc_start_bit<T, BitsPerPass>(pass);
 
+  set_buf_pointers(in + l_offset, in_idx + l_offset, bufs, buf_len, pass, in_buf, in_idx_buf);
+
+  if (previous_len > buf_len || in_buf == in + l_offset) {
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
+  }
+  out += batch_id * k;
+  out_idx += batch_id * k;
+
   const auto kth_value_bits    = counter->kth_value_bits;
   const IdxT num_of_kth_needed = counter->k;
   IdxT* p_out_cnt              = &counter->out_cnt;
@@ -510,6 +581,29 @@ RAFT_KERNEL last_filter_kernel(const T* in,
                      f);
 }
 
+template <typename T, typename IdxT, typename S>  // Fills dest[0, k) from src[0, len), padded.
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_val(
+  T* dest, const T* src, S len, IdxT k, const bool select_min)
+{
+  S idx               = S(threadIdx.x);
+  S stride            = S(blockDim.x);
+  const T default_val = select_min ? upper_bound<T>() : lower_bound<T>();  // filler for i >= len
+  for (S i = idx; i < k; i += stride) {  // threads of the block stride over all k output slots
+    dest[i] = i < len ? src[i] : default_val;
+  }
+}
+
+template <typename T, typename S>  // Writes the first `len` entries of dest from src.
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_idx(T* dest, const T* src, S len)
+{
+  S idx    = S(threadIdx.x);
+  S stride = S(blockDim.x);
+
+  for (S i = idx; i < len; i += stride) {  // slots at or beyond len are left untouched
+    dest[i] = src ? src[i] : i;  // null src: store the position i itself
+  }
+}
+
 /**
  *
  * It is expected to call this kernel multiple times (passes), in each pass we process a radix,
@@ -545,13 +639,16 @@ RAFT_KERNEL last_filter_kernel(const T* in,
  * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and
  * their indices.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool fused_last_filter>
+template <typename T,
+          typename IdxT,
+          int BitsPerPass,
+          int BlockSize,
+          bool fused_last_filter,
+          bool len_or_indptr>
 RAFT_KERNEL radix_kernel(const T* in,
                          const IdxT* in_idx,
-                         const T* in_buf,
-                         const IdxT* in_idx_buf,
-                         T* out_buf,
-                         IdxT* out_idx_buf,
+                         char* bufs,
+                         size_t offset,
                          T* out,
                          IdxT* out_idx,
                          Counter<T, IdxT>* counters,
@@ -567,21 +664,38 @@ RAFT_KERNEL radix_kernel(const T* in,
   IdxT current_k;
   IdxT previous_len;
   IdxT current_len;
+
+  const IdxT l_len    = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
+
   if (pass == 0) {
     current_k    = k;
-    previous_len = len;
+    previous_len = l_len;
     // Need to do this so setting counter->previous_len for the next pass is correct.
     // This value is meaningless for pass 0, but it's fine because pass 0 won't be the
     // last pass in this implementation so pass 0 won't hit the "if (pass ==
     // num_passes - 1)" branch.
     // Maybe it's better to reload counter->previous_len and use it rather than
     // current_len in last_filter()
-    current_len = len;
+    current_len = l_len;
   } else {
     current_k    = counter->k;
     current_len  = counter->len;
     previous_len = counter->previous_len;
   }
+  if constexpr (!len_or_indptr) {
+    if (pass == 0 && l_len <= k) {
+      copy_in_val(out + batch_id * k, in + l_offset, l_len, k, select_min);
+      copy_in_idx(out_idx + batch_id * k, (in_idx ? (in_idx + l_offset) : nullptr), l_len);
+      if (threadIdx.x == 0) {
+        counter->previous_len = 0;
+        counter->len          = 0;
+      }
+      __syncthreads();
+      return;
+    }
+  }
+
   if (current_len == 0) { return; }
 
   // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle
@@ -590,20 +704,33 @@ RAFT_KERNEL radix_kernel(const T* in,
   const bool early_stop = (current_len == current_k);
   const IdxT buf_len    = calc_buf_len<T>(len);
 
+  const T* in_buf;
+  const IdxT* in_idx_buf;
+  T* out_buf;
+  IdxT* out_idx_buf;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
+
+  set_buf_pointers(in + l_offset,
+                   (in_idx ? (in_idx + l_offset) : nullptr),
+                   bufs,
+                   buf_len,
+                   pass,
+                   in_buf,
+                   in_idx_buf,
+                   out_buf,
+                   out_idx_buf);
+
   // "previous_len > buf_len" means previous pass skips writing buffer
   if (pass == 0 || pass == 1 || previous_len > buf_len) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
   }
 
   // in case we have individual len for each query defined we want to make sure
   // that we only iterate valid elements.
   if (len_i != nullptr) {
-    const IdxT max_len = max(len_i[batch_id], k);
+    const IdxT max_len = max(l_len, k);
     if (max_len < previous_len) previous_len = max_len;
   }
 
@@ -611,9 +738,6 @@ RAFT_KERNEL radix_kernel(const T* in,
   if (pass == 0 || current_len > buf_len) {
     out_buf     = nullptr;
     out_idx_buf = nullptr;
-  } else {
-    out_buf += batch_id * buf_len;
-    out_idx_buf += batch_id * buf_len;
   }
   out += batch_id * k;
   out_idx += batch_id * k;
@@ -640,7 +764,6 @@ RAFT_KERNEL radix_kernel(const T* in,
     unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1);
     isLastBlock           = (finished == (gridDim.x - 1));
   }
-
   if (__syncthreads_or(isLastBlock)) {
     if (early_stop) {
       if (threadIdx.x == 0) {
@@ -676,7 +799,7 @@ RAFT_KERNEL radix_kernel(const T* in,
                                           out_idx_buf ? out_idx_buf : in_idx_buf,
                                           out,
                                           out_idx,
-                                          out_buf ? current_len : len,
+                                          out_buf ? current_len : l_len,
                                           k,
                                           counter,
                                           select_min,
@@ -726,7 +849,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
 
   int active_blocks;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>, BlockSize, 0));
+    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, true>, BlockSize, 0));
   active_blocks *= sm_cnt;
 
   IdxT best_num_blocks         = 0;
@@ -757,78 +880,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
   return best_num_blocks;
 }
 
-template <typename T, typename IdxT>
-_RAFT_HOST void set_buf_pointers(const T* in,
-                                 const IdxT* in_idx,
-                                 T* buf1,
-                                 IdxT* idx_buf1,
-                                 T* buf2,
-                                 IdxT* idx_buf2,
-                                 int pass,
-                                 const T*& in_buf,
-                                 const IdxT*& in_idx_buf,
-                                 T*& out_buf,
-                                 IdxT*& out_idx_buf)
-{
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  } else if (pass % 2 == 0) {
-    in_buf      = buf1;
-    in_idx_buf  = idx_buf1;
-    out_buf     = buf2;
-    out_idx_buf = idx_buf2;
-  } else {
-    in_buf      = buf2;
-    in_idx_buf  = idx_buf2;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  }
-}
-
-template <typename T, typename IdxT>
-_RAFT_DEVICE void set_buf_pointers(const T* in,
-                                   const IdxT* in_idx,
-                                   char* bufs,
-                                   IdxT buf_len,
-                                   int pass,
-                                   const T*& in_buf,
-                                   const IdxT*& in_idx_buf,
-                                   T*& out_buf,
-                                   IdxT*& out_idx_buf)
-{
-  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-  } else if (pass % 2 == 0) {
-    in_buf      = reinterpret_cast<T*>(bufs);
-    in_idx_buf  = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    out_buf     = const_cast<T*>(in_buf + buf_len);
-    out_idx_buf = const_cast<IdxT*>(in_idx_buf + buf_len);
-  } else {
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    in_buf      = out_buf + buf_len;
-    in_idx_buf  = out_idx_buf + buf_len;
-  }
-}
-
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk(const T* in,
                 const IdxT* in_idx,
                 int batch_size,
@@ -850,7 +902,7 @@ void radix_topk(const T* in,
 
   if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
 
-  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, false);
   if (max_chunk_size != static_cast<size_t>(batch_size)) {
@@ -862,55 +914,33 @@ void radix_topk(const T* in,
 
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
   rmm::device_uvector<IdxT> histograms(max_chunk_size * num_buckets, stream, mr);
-  rmm::device_uvector<T> buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<T> buf2(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf2(max_chunk_size * buf_len, stream, mr);
+
+  rmm::device_uvector<char> bufs(
+    max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size = std::min(max_chunk_size, batch_size - offset);
     RAFT_CUDA_TRY(
       cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter<T, IdxT>), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream));
-    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
 
-    const T* chunk_in        = in + offset * len;
-    const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr;
-    T* chunk_out             = out + offset * k;
-    IdxT* chunk_out_idx      = out_idx + offset * k;
-    const IdxT* chunk_len_i  = len_i ? (len_i + offset) : nullptr;
-
-    const T* in_buf        = nullptr;
-    const IdxT* in_idx_buf = nullptr;
-    T* out_buf             = nullptr;
-    IdxT* out_idx_buf      = nullptr;
+    T* chunk_out            = out + offset * k;
+    IdxT* chunk_out_idx     = out_idx + offset * k;
+    const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
 
     dim3 blocks(grid_dim, chunk_size);
     constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
 
     for (int pass = 0; pass < num_passes; ++pass) {
-      set_buf_pointers(chunk_in,
-                       chunk_in_idx,
-                       buf1.data(),
-                       idx_buf1.data(),
-                       buf2.data(),
-                       idx_buf2.data(),
-                       pass,
-                       in_buf,
-                       in_idx_buf,
-                       out_buf,
-                       out_idx_buf);
-
       if (fused_last_filter && pass == num_passes - 1) {
-        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true>;
+        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true, len_or_indptr>;
       }
 
-      kernel<<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                               chunk_in_idx,
-                                               in_buf,
-                                               in_idx_buf,
-                                               out_buf,
-                                               out_idx_buf,
+      kernel<<<blocks, BlockSize, 0, stream>>>(in,
+                                               in_idx,
+                                               bufs.data(),
+                                               offset,
                                                chunk_out,
                                                chunk_out_idx,
                                                counters.data(),
@@ -924,16 +954,18 @@ void radix_topk(const T* in,
     }
 
     if (!fused_last_filter) {
-      last_filter_kernel<T, IdxT, BitsPerPass><<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                                                                 chunk_in_idx,
-                                                                                 out_buf,
-                                                                                 out_idx_buf,
-                                                                                 chunk_out,
-                                                                                 chunk_out_idx,
-                                                                                 len,
-                                                                                 k,
-                                                                                 counters.data(),
-                                                                                 select_min);
+      last_filter_kernel<T, IdxT, BitsPerPass, len_or_indptr>
+        <<<blocks, BlockSize, 0, stream>>>(in,
+                                           in_idx,
+                                           bufs.data(),
+                                           offset,
+                                           chunk_out,
+                                           chunk_out_idx,
+                                           len,
+                                           chunk_len_i,
+                                           k,
+                                           counters.data(),
+                                           select_min);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
   }
@@ -1015,7 +1047,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         const IdxT* in_idx,
                                         const IdxT len,
@@ -1024,30 +1056,48 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         T* out,
                                         IdxT* out_idx,
                                         const bool select_min,
-                                        char* bufs)
+                                        char* bufs,
+                                        size_t offset)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   __shared__ Counter<T, IdxT> counter;
   __shared__ IdxT histogram[num_buckets];
 
+  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
+
+  IdxT l_len    = len;
+  IdxT l_offset = (offset + batch_id) * len;
+  if constexpr (!len_or_indptr) {
+    l_offset = len_i[batch_id];
+    l_len    = len_i[batch_id + 1] - l_offset;
+  }
+
   if (threadIdx.x == 0) {
     counter.k              = k;
-    counter.len            = len;
-    counter.previous_len   = len;
+    counter.len            = l_len;
+    counter.previous_len   = l_len;
     counter.kth_value_bits = 0;
     counter.out_cnt        = 0;
     counter.out_back_cnt   = 0;
   }
   __syncthreads();
 
-  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
-  in += batch_id * len;
-  if (in_idx) { in_idx += batch_id * len; }
+  in += l_offset;
+  if (in_idx) { in_idx += l_offset; }
   out += batch_id * k;
   out_idx += batch_id * k;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
+  if constexpr (!len_or_indptr) {
+    if (l_len <= k) {
+      copy_in_val(out, in, l_len, k, select_min);
+      copy_in_idx(out_idx, in_idx, l_len);
+      __syncthreads();
+      return;
+    }
+  }
+
   constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
   for (int pass = 0; pass < num_passes; ++pass) {
     const T* in_buf;
@@ -1073,7 +1123,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
     // in case we have individual len for each query defined we want to make sure
     // that we only iterate valid elements.
     if (len_i != nullptr) {
-      const IdxT max_len = max(len_i[batch_id], k);
+      const IdxT max_len = max(l_len, k);
       if (max_len < previous_len) previous_len = max_len;
     }
 
@@ -1102,7 +1152,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         out_buf ? out_idx_buf : in_idx,
                                         out,
                                         out_idx,
-                                        out_buf ? current_len : len,
+                                        out_buf ? current_len : l_len,
                                         k,
                                         &counter,
                                         select_min,
@@ -1117,7 +1167,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
 // counters and global histograms, can be kept in shared memory and cheap sync operations can be
 // used. It's used when len is relatively small or when the number of blocks per row calculated by
 // `calc_grid_dim()` is 1.
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk_one_block(const T* in,
                           const IdxT* in_idx,
                           int batch_size,
@@ -1133,7 +1183,7 @@ void radix_topk_one_block(const T* in,
 {
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
 
-  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize>;
+  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, true);
@@ -1144,15 +1194,16 @@ void radix_topk_one_block(const T* in,
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size          = std::min(max_chunk_size, batch_size - offset);
     const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
-    kernel<<<chunk_size, BlockSize, 0, stream>>>(in + offset * len,
-                                                 in_idx ? (in_idx + offset * len) : nullptr,
+    kernel<<<chunk_size, BlockSize, 0, stream>>>(in,
+                                                 in_idx,
                                                  len,
                                                  chunk_len_i,
                                                  k,
                                                  out + offset * k,
                                                  out_idx + offset * k,
                                                  select_min,
-                                                 bufs.data());
+                                                 bufs.data(),
+                                                 offset);
   }
 }
 
@@ -1182,6 +1233,10 @@ void radix_topk_one_block(const T* in,
  *   it affects the number of passes and number of buckets.
  * @tparam BlockSize
  *   Number of threads in a kernel thread block.
+ * @tparam len_or_indptr
+ *   Flag to interpret `len_i` as either direct row lengths (true) or CSR format
+ *   index pointers (false). When true, each `len_i` element denotes the length of a row. When
+ *   false, `len_i` represents the index pointers for a CSR matrix with shape of `batch_size + 1`.
  *
  * @param[in] res container of reusable resources
  * @param[in] in
@@ -1212,9 +1267,12 @@ void radix_topk_one_block(const T* in,
  *   same. That is, when the value range of input data is narrow. In such case, there could be a
  *   large number of inputs for the last filter, hence using multiple thread blocks is beneficial.
  * @param len_i
- *   optional array of size (batch_size) providing lengths for each individual row
+ *   Optional array used differently based on `len_or_indptr`:
+ *   When `len_or_indptr` is true, `len_i` holds the length of each row; its size is `batch_size`.
+ *   When `len_or_indptr` is false, `len_i` works like an indptr for a CSR matrix. The length of
+ *   each row would be (`len_i[row_id + 1] - len_i[row_id]`). `len_i` size is `batch_size + 1`.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr = true>
 void select_k(raft::resources const& res,
               const T* in,
               const IdxT* in_idx,
@@ -1227,9 +1285,12 @@ void select_k(raft::resources const& res,
               bool fused_last_filter,
               const IdxT* len_i)
 {
+  RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)),
+               "When `len_or_indptr` is false, `len_i` must not be nullptr!");
+
   auto stream = resource::get_cuda_stream(res);
   auto mr     = resource::get_workspace_resource(res);
-  if (k == len) {
+  if (k == len && len_or_indptr) {
     RAFT_CUDA_TRY(
       cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream));
     if (in_idx) {
@@ -1248,29 +1309,29 @@ void select_k(raft::resources const& res,
   constexpr int items_per_thread = 32;
 
   if (len <= BlockSize * items_per_thread) {
-    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
       in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
   } else {
     unsigned grid_dim =
       impl::calc_grid_dim<T, IdxT, BitsPerPass, BlockSize>(batch_size, len, sm_cnt);
     if (grid_dim == 1) {
-      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
         in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
     } else {
-      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize>(in,
-                                                        in_idx,
-                                                        batch_size,
-                                                        len,
-                                                        k,
-                                                        out,
-                                                        out_idx,
-                                                        select_min,
-                                                        fused_last_filter,
-                                                        len_i,
-                                                        grid_dim,
-                                                        sm_cnt,
-                                                        stream,
-                                                        mr);
+      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(in,
+                                                                       in_idx,
+                                                                       batch_size,
+                                                                       len,
+                                                                       k,
+                                                                       out,
+                                                                       out_idx,
+                                                                       select_min,
+                                                                       fused_last_filter,
+                                                                       len_i,
+                                                                       grid_dim,
+                                                                       sm_cnt,
+                                                                       stream,
+                                                                       mr);
     }
   }
 }
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index 572558153d..2cb32585d5 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -754,22 +754,32 @@ template <template <int, bool, typename, typename> class WarpSortClass,
           bool Ascending,
           typename T,
           typename IdxT>
-__launch_bounds__(256) RAFT_KERNEL
-  block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx)
+__launch_bounds__(256) RAFT_KERNEL block_kernel(const T* in,
+                                                const IdxT* in_idx,
+                                                const IdxT* in_indptr,
+                                                size_t offset,
+                                                IdxT len,
+                                                int k,
+                                                T* out,
+                                                IdxT* out_idx)
 {
   extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
   using bq_t         = block_sort<WarpSortClass, Capacity, Ascending, T, IdxT>;
   uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
   bq_t queue(k, warp_smem);
+  const size_t batch_id = blockIdx.y;
 
-  in += blockIdx.y * len;
-  if (in_idx != nullptr) { in_idx += blockIdx.y * len; }
+  const IdxT l_len    = in_indptr ? (in_indptr[batch_id + 1] - in_indptr[batch_id]) : len;
+  const IdxT l_offset = in_indptr ? in_indptr[batch_id] : (offset + batch_id) * len;
+
+  in += l_offset;
+  if (in_idx != nullptr) { in_idx += l_offset; }
 
   const IdxT stride         = gridDim.x * blockDim.x;
-  const IdxT per_thread_lim = len + laneId();
+  const IdxT per_thread_lim = l_len + laneId();
   for (IdxT i = threadIdx.x + blockIdx.x * blockDim.x; i < per_thread_lim; i += stride) {
-    queue.add(i < len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
-              (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
+    queue.add(i < l_len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
+              (i < l_len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
   }
 
   queue.done(smem_buf_bytes);
@@ -832,6 +842,7 @@ struct launch_setup {
                      int smem_size,
                      const T* in_key,
                      const IdxT* in_idx,
+                     const IdxT* in_indptr,
                      T* out_key,
                      IdxT* out_idx,
                      rmm::cuda_stream_view stream)
@@ -848,6 +859,7 @@ struct launch_setup {
                                                                           smem_size,
                                                                           in_key,
                                                                           in_idx,
+                                                                          in_indptr,
                                                                           out_key,
                                                                           out_idx,
                                                                           stream);
@@ -858,21 +870,23 @@ struct launch_setup {
     // This is less than cuda's max block dim along Y axis (65535), but it's a
     // power-of-two, which ensures the alignment of batches in memory.
     constexpr size_t kMaxGridDimY = 32768;
+    size_t g_offset               = 0;
     for (size_t offset = 0; offset < batch_size; offset += kMaxGridDimY) {
       size_t batch_chunk = std::min<size_t>(kMaxGridDimY, batch_size - offset);
       dim3 gs(num_blocks, batch_chunk, 1);
       if (select_min) {
-        block_kernel<WarpSortClass, Capacity, true, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, true, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       } else {
-        block_kernel<WarpSortClass, Capacity, false, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, false, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       }
       RAFT_CUDA_TRY(cudaPeekAtLastError());
       out_key += batch_chunk * num_blocks * k;
       out_idx += batch_chunk * num_blocks * k;
-      in_key += batch_chunk * len;
-      if (in_idx != nullptr) { in_idx += batch_chunk * len; }
+
+      if (in_indptr != nullptr) { in_indptr += batch_chunk; };
+      g_offset += batch_chunk;
     }
   }
 };
@@ -1010,6 +1024,7 @@ void select_k_(int num_of_block,
                int num_of_warp,
                const T* in,
                const IdxT* in_idx,
+               const IdxT* in_indptr,
                size_t batch_size,
                size_t len,
                int k,
@@ -1041,6 +1056,7 @@ void select_k_(int num_of_block,
                                                smem_size,
                                                in,
                                                in_idx,
+                                               in_indptr,
                                                result_val,
                                                result_idx,
                                                stream);
@@ -1056,6 +1072,7 @@ void select_k_(int num_of_block,
                                                  smem_size,
                                                  tmp_val.data(),
                                                  tmp_idx.data(),
+                                                 nullptr,
                                                  out,
                                                  out_idx,
                                                  stream);
@@ -1071,7 +1088,8 @@ void select_k_impl(raft::resources const& res,
                    int k,
                    T* out,
                    IdxT* out_idx,
-                   bool select_min)
+                   bool select_min,
+                   const IdxT* in_indptr = nullptr)
 {
   int num_of_block = 0;
   int num_of_warp  = 0;
@@ -1082,6 +1100,7 @@ void select_k_impl(raft::resources const& res,
                                     num_of_warp,
                                     in,
                                     in_idx,
+                                    in_indptr,
                                     batch_size,
                                     len,
                                     k,
@@ -1126,6 +1145,9 @@ void select_k_impl(raft::resources const& res,
  *   the payload selected together with `out`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
+ * @param[in] in_indptr
+ *   CSR indptr of the input; row `i` has length `in_indptr[i + 1] - in_indptr[i]`.
+ *   When it is `nullptr` (the default), @p len is used as the length of every row.
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& res,
@@ -1136,7 +1158,8 @@ void select_k(raft::resources const& res,
               int k,
               T* out,
               IdxT* out_idx,
-              bool select_min)
+              bool select_min,
+              const IdxT* in_indptr = nullptr)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
@@ -1155,6 +1178,7 @@ void select_k(raft::resources const& res,
                                             num_of_warp,
                                             in,
                                             in_idx,
+                                            in_indptr,
                                             batch_size,
                                             len,
                                             k,
@@ -1170,6 +1194,7 @@ void select_k(raft::resources const& res,
                                            num_of_warp,
                                            in,
                                            in_idx,
+                                           in_indptr,
                                            batch_size,
                                            len,
                                            k,
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 7df1430455..2b3c75cec3 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -146,6 +146,8 @@ void select_k(raft::resources const& handle,
  *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
  * @param[in] select_min
  *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& handle,
@@ -153,9 +155,10 @@ void select_k(raft::resources const& handle,
               std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min)
+              bool select_min,
+              bool sorted = false)
 {
-  return detail::select_k<T, IdxT>(handle, in_val, in_idx, out_val, out_idx, select_min);
+  return detail::select_k<T, IdxT>(handle, in_val, in_idx, out_val, out_idx, select_min, sorted);
 }
 /** @} */  // end of group select_k
 
diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
index 238c961165..c17018efe0 100644
--- a/cpp/src/matrix/detail/select_k_double_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(double, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(double, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
index 48309d4744..fcc3e5d5a7 100644
--- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -43,7 +43,9 @@ instantiate_raft_matrix_detail_select_k(double, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu
index 02397444d2..82041a9b2d 100644
--- a/cpp/src/matrix/detail/select_k_float_int32.cu
+++ b/cpp/src/matrix/detail/select_k_float_int32.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(float, int);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, int);
 
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
index 6e9160e9f9..4d381b417f 100644
--- a/cpp/src/matrix/detail/select_k_float_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(float, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
index 6867baf019..775807cfac 100644
--- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(float, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
 
diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
index abae835b05..cfd260326b 100644
--- a/cpp/src/matrix/detail/select_k_half_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(__half, int64_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
 
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
index 3e8f0b8ea0..c252337f97 100644
--- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -41,7 +41,9 @@ instantiate_raft_matrix_detail_select_k(__half, uint32_t);
     std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
     raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
     raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min)
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
 
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 
diff --git a/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
index 551a51f6b6..bfeaa2b75c 100644
--- a/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
+++ b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,7 +29,8 @@ void select_k(const resources& handle,
               std::optional<raft::device_matrix_view<const int64_t, int64_t, row_major>> in_idx,
               raft::device_matrix_view<float, int64_t, row_major> out_val,
               raft::device_matrix_view<int64_t, int64_t, row_major> out_idx,
-              bool select_min)
+              bool select_min,
+              raft::matrix::SelectAlgo algo)
 {
   raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, select_min);
 }
diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/matrix/select_k_csr.cu
index ece773ee97..57426f97dc 100644
--- a/cpp/test/matrix/select_k_csr.cu
+++ b/cpp/test/matrix/select_k_csr.cu
@@ -31,6 +31,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <limits>
 #include <optional>
 #include <queue>
 #include <random>
@@ -50,6 +51,23 @@ struct SelectKCsrInputs {
   bool customized_indices;
 };
 
+template <typename T>
+struct CompareApproxWithInf {  // approximate equality functor that also accepts inf vs. inf
+  CompareApproxWithInf(T eps_) : eps(eps_) {}  // eps_: tolerance applied by operator()
+  bool operator()(const T& a, const T& b) const
+  {
+    if (std::isinf(a) && std::isinf(b)) return true;  // two infinities match, whatever their signs
+    T diff  = std::abs(a - b);
+    T m     = std::max(std::abs(a), std::abs(b));
+    T ratio = diff > eps ? diff / m : diff;  // relative error when diff > eps, else absolute error
+
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;  // tolerance against which `ratio` is compared
+};
+
 template <typename value_t, typename index_t>
 class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>> {
  public:
@@ -202,9 +220,9 @@ class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>
 
     convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
 
-    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
-    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
-                                       static_cast<index_t>(params.n_rows * params.n_cols * 100));
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k,
+                                      std::numeric_limits<value_t>::infinity());
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k, static_cast<index_t>(0));
 
     dst_values_d.resize(params.n_rows * params.top_k, stream);
     dst_indices_d.resize(params.n_rows * params.top_k, stream);
@@ -280,19 +298,19 @@ class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>
     auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
       dst_indices_d.data(), params.n_rows, params.top_k);
 
-    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
-
-    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
-                                           out_val.data_handle(),
-                                           params.n_rows * params.top_k,
-                                           raft::CompareApprox<value_t>(1e-6f),
-                                           stream));
+    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
 
     ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
                                            out_idx.data_handle(),
                                            params.n_rows * params.top_k,
                                            raft::Compare<index_t>(),
                                            stream));
+
+    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
+                                           out_val.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           CompareApproxWithInf<value_t>(1e-6f),
+                                           stream));
   }
 
  protected:
@@ -331,6 +349,8 @@ const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
   {10, 32, 251, 0.6, true, true},
   {1024, 1024, 258, 0.3, true, false},
   {1024, 1024, 600, 0.2, true, true},
+  {1024, 1024, 1024, 0.3, true, false},
+  {1024, 1024, 1024, 0.2, true, true},
   {100, 1024 * 1000, 251, 0.1, true, false},
   {100, 1024 * 1000, 251, 0.2, true, true},
   {1024, 1024 * 10, 251, 0.3, true, false},

From 5abbd4e59cdbce6c9ef4502412c66ea1ea87747f Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Sat, 30 Mar 2024 10:17:28 -0700
Subject: [PATCH 06/12] add `algo` to new select_k

---
 cpp/include/raft/matrix/select_k.cuh                  | 8 ++++++--
 cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu | 5 ++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 2b3c75cec3..6a8cab5c90 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -148,6 +148,8 @@ void select_k(raft::resources const& handle,
  *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
  * @param[in] sorted
  *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& handle,
@@ -156,9 +158,11 @@ void select_k(raft::resources const& handle,
               raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
               bool select_min,
-              bool sorted = false)
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
 {
-  return detail::select_k<T, IdxT>(handle, in_val, in_idx, out_val, out_idx, select_min, sorted);
+  return detail::select_k<T, IdxT>(
+    handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
 }
 /** @} */  // end of group select_k
 
diff --git a/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
index bfeaa2b75c..551a51f6b6 100644
--- a/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
+++ b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,8 +29,7 @@ void select_k(const resources& handle,
               std::optional<raft::device_matrix_view<const int64_t, int64_t, row_major>> in_idx,
               raft::device_matrix_view<float, int64_t, row_major> out_val,
               raft::device_matrix_view<int64_t, int64_t, row_major> out_idx,
-              bool select_min,
-              raft::matrix::SelectAlgo algo)
+              bool select_min)
 {
   raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, select_min);
 }

From ae7076242d78de071425dee9b12700552b50b337 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Mon, 1 Apr 2024 14:11:03 -0700
Subject: [PATCH 07/12] add more test cases for coverage

---
 cpp/test/matrix/select_k_csr.cu | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/matrix/select_k_csr.cu
index 57426f97dc..5ea4e89a25 100644
--- a/cpp/test/matrix/select_k_csr.cu
+++ b/cpp/test/matrix/select_k_csr.cu
@@ -343,21 +343,25 @@ template <typename index_t>
 const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
   {10, 32, 10, 0.0, true, false},
   {10, 32, 10, 0.0, true, true},
-  {10, 32, 10, 0.01, true, false},
+  {10, 32, 10, 0.01, true, false},  // kWarpImmediate
   {10, 32, 10, 0.1, true, true},
-  {10, 32, 251, 0.1, true, false},
+  {10, 32, 251, 0.1, true, false},  // kWarpImmediate
   {10, 32, 251, 0.6, true, true},
-  {1024, 1024, 258, 0.3, true, false},
+  {1000, 1024 * 100, 1, 0.1, true, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, true, true},
+  {1024, 1024, 258, 0.3, true, false},  // kRadix11bitsExtraPass
   {1024, 1024, 600, 0.2, true, true},
-  {1024, 1024, 1024, 0.3, true, false},
+  {1024, 1024, 1024, 0.3, true, false},  // kRadix11bitsExtraPass
   {1024, 1024, 1024, 0.2, true, true},
-  {100, 1024 * 1000, 251, 0.1, true, false},
+  {100, 1024 * 1000, 251, 0.1, true, false},  // kWarpDistributedShm
   {100, 1024 * 1000, 251, 0.2, true, true},
-  {1024, 1024 * 10, 251, 0.3, true, false},
+  {1024, 1024 * 10, 251, 0.3, true, false},  // kWarpImmediate
   {1024, 1024 * 10, 251, 0.2, true, true},
-  {2048, 1024 * 10, 1000, 0.2, true, false},
+  {1000, 1024 * 20, 1000, 0.2, true, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 1000, 0.2, true, false},  // kRadix11bitsExtraPass
   {2048, 1024 * 10, 1000, 0.3, true, true},
-  {2048, 1024 * 10, 2100, 0.1, true, false},
+  {2048, 1024 * 10, 2100, 0.1, true, false},  // kRadix11bitsExtraPass
   {2048, 1024 * 10, 2100, 0.2, true, true}};
 
 INSTANTIATE_TEST_CASE_P(SelectKCsrTest,

From db24068982c5185b94b88c59211c5d7b7e7b0daf Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Tue, 2 Apr 2024 15:37:02 -0700
Subject: [PATCH 08/12] Response to the review comments.

---
 cpp/include/raft/matrix/copy.cuh              |  37 +--
 .../raft/matrix/detail/select_k-inl.cuh       |   1 -
 cpp/test/CMakeLists.txt                       |   1 -
 cpp/test/matrix/copy.cu                       | 255 ------------------
 cpp/test/matrix/select_k_csr.cu               |  24 +-
 5 files changed, 24 insertions(+), 294 deletions(-)
 delete mode 100644 cpp/test/matrix/copy.cu

diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh
index 785ff84b56..be83a4a19e 100644
--- a/cpp/include/raft/matrix/copy.cuh
+++ b/cpp/include/raft/matrix/copy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -122,41 +122,6 @@ void trunc_zero_origin(raft::resources const& handle,
                                       resource::get_cuda_stream(handle));
 }
 
-/**
- * @brief Copy a specific number of elements row by row from the source vector to the target matrix
- * according to the segment indicated by offsets
- *
- * @tparam m_t the type of the copied items.
- * @tparam idx_t the index type of vectors and matrix.
- * @param[in] handle raft handle
- * @param[in] max_len_per_row Maximum number of copies per row
- * @param[in] src Source vector
- * @param[in] offsets Indicates the starting and ending index of each row in the vector
- * @param[out] dst Destination matrix in row major order
- *
- * @note When the length of one segment is less than max_len_per_row, the remaining position values
- * of dst will remain unchanged.
- */
-template <typename m_t, typename idx_t>
-void segmented_copy(raft::resources const& handle,
-                    idx_t max_len_per_row,
-                    raft::device_vector_view<m_t, idx_t> src,
-                    raft::device_vector_view<idx_t, idx_t> offsets,
-                    raft::device_matrix_view<m_t, idx_t, row_major> dst)
-{
-  RAFT_EXPECTS(static_cast<idx_t>(offsets.size()) == (dst.extent(0) + 1),
-               "Number of offsets must be larger than number of output rows by 1");
-  RAFT_EXPECTS(dst.extent(1) >= max_len_per_row,
-               "Number of rows in the out must be equal or larger than max_len_per_row");
-  detail::segmented_copy(handle,
-                         src.data_handle(),
-                         dst.extent(0),
-                         dst.extent(1),
-                         max_len_per_row,
-                         offsets.data_handle(),
-                         dst.data_handle());
-}
-
 /** @} */  // end of group matrix_copy
 
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index bcf00db709..f75929573d 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -376,7 +376,6 @@ void select_k(raft::resources const& handle,
   auto len        = csr_view.get_n_cols();
   auto k          = IdxT(out_val.extent(1));
 
-  auto mr = resource::get_workspace_resource(handle);
   RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
                "output k must fit the int type.");
 
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 6f2c4f0147..be7e469da6 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -258,7 +258,6 @@ if(BUILD_TESTS)
     test/matrix/argmax.cu
     test/matrix/argmin.cu
     test/matrix/columnSort.cu
-    test/matrix/copy.cu
     test/matrix/diagonal.cu
     test/matrix/gather.cu
     test/matrix/scatter.cu
diff --git a/cpp/test/matrix/copy.cu b/cpp/test/matrix/copy.cu
deleted file mode 100644
index adeeae73f5..0000000000
--- a/cpp/test/matrix/copy.cu
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../test_utils.cuh"
-
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/core/resources.hpp>
-#include <raft/matrix/copy.cuh>
-#include <raft/random/make_blobs.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-#include <gtest/gtest.h>
-
-#include <iostream>
-
-namespace raft {
-namespace sparse {
-
-template <typename index_t>
-struct SegmentedCopyInputs {
-  index_t n_rows;
-  index_t n_cols;
-  index_t top_k;
-  float sparsity;
-};
-
-template <typename value_t, typename index_t>
-class SegmentedCopyTest : public ::testing::TestWithParam<SegmentedCopyInputs<index_t>> {
- public:
-  SegmentedCopyTest()
-    : stream(resource::get_cuda_stream(handle)),
-      params(::testing::TestWithParam<SegmentedCopyInputs<index_t>>::GetParam()),
-      indices_d(0, stream),
-      indptr_d(0, stream),
-      values_d(0, stream),
-      dst_values_d(0, stream),
-      dst_values_expected_d(0, stream),
-      dst_indices_d(0, stream),
-      dst_indices_expected_d(0, stream)
-  {
-  }
-
- protected:
-  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
-  {
-    index_t total_elements = static_cast<index_t>(m * n);
-    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
-    index_t res            = num_ones;
-
-    for (index_t i = 0; i < total_elements; ++i) {
-      matrix[i] = false;
-    }
-
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<> dis_idx(0, total_elements - 1);
-
-    while (num_ones > 0) {
-      size_t index = dis_idx(gen);
-      if (matrix[index] == false) {
-        matrix[index] = true;
-        num_ones--;
-      }
-    }
-    return res;
-  }
-  void convert_to_csr(std::vector<bool>& matrix,
-                      index_t rows,
-                      index_t cols,
-                      std::vector<index_t>& indices,
-                      std::vector<index_t>& indptr)
-  {
-    index_t offset_indptr   = 0;
-    index_t offset_values   = 0;
-    indptr[offset_indptr++] = 0;
-
-    for (index_t i = 0; i < rows; ++i) {
-      for (index_t j = 0; j < cols; ++j) {
-        if (matrix[i * cols + j]) {
-          indices[offset_values] = static_cast<index_t>(j);
-          offset_values++;
-        }
-      }
-      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
-    }
-  }
-
-  template <typename dst_t>
-  void cpu_segmented_copy(index_t rows,
-                          index_t max_len_per_row,
-                          const std::vector<dst_t>& src,
-                          const std::vector<index_t>& offsets,
-                          std::vector<dst_t>& dst)
-  {
-    for (index_t row = 0; row < rows; ++row) {
-      index_t start  = offsets[row];
-      index_t end    = offsets[row + 1];  //(row < rows - 1) ? offsets[row + 1] : src.size();
-      index_t length = std::min(end - start, max_len_per_row);
-      if (length == 0) continue;
-      std::copy(
-        src.begin() + start, src.begin() + start + length, dst.begin() + row * max_len_per_row);
-    }
-  }
-
-  void SetUp() override
-  {
-    std::vector<bool> dense_values_h(params.n_rows * params.n_cols);
-    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
-
-    std::vector<value_t> values_h(nnz);
-    std::vector<index_t> indices_h(nnz);
-    std::vector<index_t> indptr_h(params.n_rows + 1);
-    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
-
-    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
-                                       static_cast<index_t>(params.n_rows * params.n_cols + 1));
-
-    // sync up the initial values in advance to 2.0 which is out of random range [-1.0, 1.0].
-    dst_values_d.resize(params.n_rows * params.top_k, stream);
-    dst_indices_d.resize(params.n_rows * params.top_k, stream);
-
-    update_device(dst_values_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
-    update_device(dst_indices_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
-    resource::sync_stream(handle);
-
-    auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, dst_values_h.size());
-    auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
-
-    raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
-                                               labels.data_handle(),
-                                               1,
-                                               dst_values_h.size(),
-                                               1,
-                                               stream,
-                                               false,
-                                               nullptr,
-                                               nullptr,
-                                               value_t(1.0),
-                                               false,
-                                               value_t(-1.0f),
-                                               value_t(1.0f),
-                                               uint64_t(2024));
-    raft::copy(dst_values_h.data(), blobs_values.data_handle(), dst_values_h.size(), stream);
-    raft::copy(dst_values_d.data(), blobs_values.data_handle(), dst_values_h.size(), stream);
-    resource::sync_stream(handle);
-
-    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
-
-    cpu_segmented_copy<value_t>(params.n_rows, params.top_k, values_h, indptr_h, dst_values_h);
-    cpu_segmented_copy<index_t>(params.n_rows, params.top_k, indices_h, indptr_h, dst_indices_h);
-
-    values_d.resize(nnz, stream);
-    indices_d.resize(nnz, stream);
-    indptr_d.resize(params.n_rows + 1, stream);
-    dst_values_expected_d.resize(params.n_rows * params.top_k, stream);
-    dst_indices_expected_d.resize(params.n_rows * params.top_k, stream);
-
-    update_device(values_d.data(), values_h.data(), values_h.size(), stream);
-    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
-    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
-    update_device(dst_values_expected_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
-    update_device(
-      dst_indices_expected_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
-
-    resource::sync_stream(handle);
-  }
-
-  void Run()
-  {
-    auto src_values  = raft::make_device_vector_view<value_t, index_t>(values_d.data(), nnz);
-    auto src_indices = raft::make_device_vector_view<index_t, index_t>(indices_d.data(), nnz);
-    auto offsets =
-      raft::make_device_vector_view<index_t, index_t>(indptr_d.data(), params.n_rows + 1);
-    auto dst_values = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
-      dst_values_d.data(), params.n_rows, params.top_k);
-    auto dst_indices = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
-      dst_indices_d.data(), params.n_rows, params.top_k);
-
-    raft::matrix::segmented_copy(handle, params.top_k, src_values, offsets, dst_values);
-    raft::matrix::segmented_copy(handle, params.top_k, src_indices, offsets, dst_indices);
-
-    resource::sync_stream(handle);
-
-    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
-                                           dst_values_d.data(),
-                                           params.n_rows * params.top_k,
-                                           raft::CompareApprox<value_t>(1e-6f),
-                                           stream));
-
-    ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
-                                           dst_indices_d.data(),
-                                           params.n_rows * params.top_k,
-                                           raft::Compare<index_t>(),
-                                           stream));
-  }
-
- protected:
-  raft::resources handle;
-  cudaStream_t stream;
-
-  SegmentedCopyInputs<index_t> params;
-
-  index_t nnz;
-
-  rmm::device_uvector<value_t> values_d;
-  rmm::device_uvector<index_t> indptr_d;
-  rmm::device_uvector<index_t> indices_d;
-
-  rmm::device_uvector<value_t> dst_values_d;
-  rmm::device_uvector<value_t> dst_values_expected_d;
-
-  rmm::device_uvector<index_t> dst_indices_d;
-  rmm::device_uvector<index_t> dst_indices_expected_d;
-};
-
-using SegmentedCopyTest_float_int = SegmentedCopyTest<float, int>;
-TEST_P(SegmentedCopyTest_float_int, Result) { Run(); }
-
-using SegmentedCopyTest_double_int64 = SegmentedCopyTest<double, int64_t>;
-TEST_P(SegmentedCopyTest_double_int64, Result) { Run(); }
-
-template <typename index_t>
-const std::vector<SegmentedCopyInputs<index_t>> segmentedcopy_inputs = {
-  {10, 32, 10, 0.0},
-  {10, 32, 10, 0.3},
-  {32, 1024, 63, 0.3},
-  {1024, 1024, 128, 0.2},
-  {1024, 1024 * 2000, 251, 0.2},
-  {2048, 1024 * 100, 1000, 0.3},
-  {2048, 1024 * 100, 2100, 0.5}};
-
-INSTANTIATE_TEST_CASE_P(SegmentedCopyTest,
-                        SegmentedCopyTest_float_int,
-                        ::testing::ValuesIn(segmentedcopy_inputs<int>));
-INSTANTIATE_TEST_CASE_P(SegmentedCopyTest,
-                        SegmentedCopyTest_double_int64,
-                        ::testing::ValuesIn(segmentedcopy_inputs<int64_t>));
-
-}  // namespace sparse
-}  // namespace raft
diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/matrix/select_k_csr.cu
index 5ea4e89a25..ed58e6d80d 100644
--- a/cpp/test/matrix/select_k_csr.cu
+++ b/cpp/test/matrix/select_k_csr.cu
@@ -362,7 +362,29 @@ const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
   {2048, 1024 * 10, 1000, 0.2, true, false},  // kRadix11bitsExtraPass
   {2048, 1024 * 10, 1000, 0.3, true, true},
   {2048, 1024 * 10, 2100, 0.1, true, false},  // kRadix11bitsExtraPass
-  {2048, 1024 * 10, 2100, 0.2, true, true}};
+  {2048, 1024 * 10, 2100, 0.2, true, true},
+  {10, 32, 10, 0.0, false, false},
+  {10, 32, 10, 0.0, false, true},
+  {10, 32, 10, 0.01, false, false},  // kWarpImmediate
+  {10, 32, 10, 0.1, false, true},
+  {10, 32, 251, 0.1, false, false},  // kWarpImmediate
+  {10, 32, 251, 0.6, false, true},
+  {1000, 1024 * 100, 1, 0.1, false, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, false, true},
+  {1024, 1024, 258, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 600, 0.2, false, true},
+  {1024, 1024, 1024, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 1024, 0.2, false, true},
+  {100, 1024 * 1000, 251, 0.1, false, false},  // kWarpDistributedShm
+  {100, 1024 * 1000, 251, 0.2, false, true},
+  {1024, 1024 * 10, 251, 0.3, false, false},  // kWarpImmediate
+  {1024, 1024 * 10, 251, 0.2, false, true},
+  {1000, 1024 * 20, 1000, 0.2, false, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 1000, 0.2, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 2100, 0.1, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 2100, 0.2, false, true}};
 
 INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
                         SelectKCsrTest_float_int,

From 96ded4d40056207877fcaf2dcb43086b3fcb1c3c Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Tue, 2 Apr 2024 20:29:10 -0700
Subject: [PATCH 09/12] fix CI by removing segmented_copy totally.

---
 cpp/include/raft/matrix/detail/matrix.cuh     | 65 -------------------
 .../raft/matrix/detail/select_k-inl.cuh       |  2 -
 2 files changed, 67 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index a9109d37ba..6ed9ffa645 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <raft/core/resource/cublas_handle.hpp>
-#include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/util/cache_util.cuh>
@@ -318,70 +317,6 @@ m_t getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStre
   return normval;
 }
 
-// Threads per block in segmented_copy_kernel.
-static const constexpr int SEGMENTED_COPY_TPB_256 = 256;
-static const constexpr int SEGMENTED_COPY_TPB_32  = 32;
-
-template <typename m_t, typename idx_t, idx_t TPB>
-RAFT_KERNEL __launch_bounds__(TPB) segmented_copy_kernel(
-  const m_t* src, idx_t n_rows, idx_t n_cols, idx_t max_len_per_row, idx_t* offsets, m_t* dst)
-{
-#pragma unroll
-  for (idx_t row_id = blockIdx.y; row_id < n_rows; row_id += gridDim.y) {
-    idx_t segment_start = offsets[row_id];
-    idx_t len           = min(offsets[row_id + 1] - segment_start, max_len_per_row);
-    for (idx_t col_id = threadIdx.x + blockIdx.x * blockDim.x; col_id < len;
-         col_id += blockDim.x * gridDim.x) {
-      dst[row_id * n_cols + col_id] = src[segment_start + col_id];
-    }
-  }
-}
-
-template <typename m_t, typename idx_t>
-void segmented_copy(raft::resources const& handle,
-                    const m_t* src,
-                    idx_t n_rows,
-                    idx_t n_cols,
-                    idx_t max_len_per_row,
-                    idx_t* offsets,
-                    m_t* dst)
-{
-  auto stream = resource::get_cuda_stream(handle);
-
-  idx_t tpb = max_len_per_row >= 256 ? SEGMENTED_COPY_TPB_256 : SEGMENTED_COPY_TPB_32;
-
-  int blocks_per_sm;
-  int sm_count = resource::get_device_properties(handle).multiProcessorCount;
-
-  if (tpb == SEGMENTED_COPY_TPB_32) {
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>, tpb, 0);
-  } else if (tpb == SEGMENTED_COPY_TPB_256) {
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>, tpb, 0);
-  }
-
-  idx_t max_active_blocks = sm_count * blocks_per_sm;
-  // `max threads number = sm_count * blocks_per_sm * tpb`
-  // `problem size = n_rows * max_len_per_row`
-  idx_t required_active_blocks =
-    raft::min(max_active_blocks, raft::ceildiv(n_rows * max_len_per_row, tpb));
-
-  idx_t blocks_per_row = raft::ceildiv(required_active_blocks, n_rows);
-  idx_t grid_rows      = raft::ceildiv(required_active_blocks, blocks_per_row);
-  dim3 block(tpb, 1);
-  dim3 grid(blocks_per_row, grid_rows);
-
-  if (tpb == SEGMENTED_COPY_TPB_32) {
-    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>
-      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
-  } else if (tpb == SEGMENTED_COPY_TPB_256) {
-    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>
-      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
-  }
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
 }  // end namespace detail
 }  // end namespace matrix
 }  // end namespace raft
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index f75929573d..7b52199530 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -27,8 +27,6 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/linalg/map.cuh>
-#include <raft/matrix/copy.cuh>
-#include <raft/matrix/gather.cuh>
 #include <raft/matrix/select_k_types.hpp>
 
 #include <cub/cub.cuh>

From 9e8ae31d343690c9527c7fba77e1a07ca4016f01 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Thu, 4 Apr 2024 11:25:22 -0700
Subject: [PATCH 10/12] move public API to naming scope of sparse

---
 cpp/bench/prims/CMakeLists.txt                |  2 +-
 .../prims/{matrix => sparse}/select_k_csr.cu  |  7 +-
 cpp/include/raft/matrix/select_k.cuh          | 48 -----------
 cpp/include/raft/sparse/matrix/select_k.cuh   | 81 +++++++++++++++++++
 cpp/test/CMakeLists.txt                       |  2 +-
 cpp/test/{matrix => sparse}/select_k_csr.cu   |  5 +-
 6 files changed, 90 insertions(+), 55 deletions(-)
 rename cpp/bench/prims/{matrix => sparse}/select_k_csr.cu (97%)
 create mode 100644 cpp/include/raft/sparse/matrix/select_k.cuh
 rename cpp/test/{matrix => sparse}/select_k_csr.cu (98%)

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 1b28e7d0b9..063d69a737 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -128,7 +128,6 @@ if(BUILD_PRIMS_BENCH)
     bench/prims/matrix/argmin.cu
     bench/prims/matrix/gather.cu
     bench/prims/matrix/select_k.cu
-    bench/prims/matrix/select_k_csr.cu
     bench/prims/main.cpp
     OPTIONAL
     LIB
@@ -146,6 +145,7 @@ if(BUILD_PRIMS_BENCH)
     PATH
     bench/prims/sparse/bitmap_to_csr.cu
     bench/prims/sparse/convert_csr.cu
+    bench/prims/sparse/select_k_csr.cu
     bench/prims/main.cpp
   )
 
diff --git a/cpp/bench/prims/matrix/select_k_csr.cu b/cpp/bench/prims/sparse/select_k_csr.cu
similarity index 97%
rename from cpp/bench/prims/matrix/select_k_csr.cu
rename to cpp/bench/prims/sparse/select_k_csr.cu
index 4ab706f471..a91e6c8514 100644
--- a/cpp/bench/prims/matrix/select_k_csr.cu
+++ b/cpp/bench/prims/sparse/select_k_csr.cu
@@ -22,10 +22,10 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/copy.cuh>
-#include <raft/matrix/select_k.cuh>
 #include <raft/random/make_blobs.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/matrix/select_k.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/itertools.hpp>
 
@@ -203,10 +203,11 @@ struct SelectKCsrTest : public fixture {
     auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
       dst_indices_d.data(), params.n_rows, params.top_k);
 
-    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+    raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
     resource::sync_stream(handle);
     loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
-      raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min, false);
+      raft::sparse::matrix::select_k(
+        handle, in_val, in_idx, out_val, out_idx, params.select_min, false);
       resource::sync_stream(handle);
     });
   }
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 6a8cab5c90..2efa146495 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -18,7 +18,6 @@
 
 #include "detail/select_k.cuh"
 
-#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -117,53 +116,6 @@ void select_k(raft::resources const& handle,
                                    algo);
 }
 
-/**
- * Selects the k smallest or largest keys/values from each row of the input matrix.
- *
- * This function operates on a row-major matrix `in_val` with dimensions `batch_size` x `len`,
- * selecting the k smallest or largest elements from each row. The selected elements are then stored
- * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
- * If the total number of values in a row is less than K, then the extra position in the
- * corresponding row of out_val will maintain the original value. This applies to out_idx
- *
- * @tparam T
- *   Type of the elements being compared (keys).
- * @tparam IdxT
- *   Type of the indices associated with the keys.
- *
- * @param[in] handle
- *   Container for managing reusable resources.
- * @param[in] in_val
- *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
- *   containing the elements to be compared and selected.
- * @param[in] in_idx
- *   Optional input indices [in_val.nnz] associated with `in_val.values`.
- *   If `in_idx` is `std::nullopt`, it defaults to a contiguous array from 0 to len-1.
- * @param[out] out_val
- *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
- *   from each row of `in_val`.
- * @param[out] out_idx
- *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
- * @param[in] select_min
- *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
- * @param[in] sorted
- *   whether to make sure selected pairs are sorted by value
- * @param[in] algo
- *   the selection algorithm to use
- */
-template <typename T, typename IdxT>
-void select_k(raft::resources const& handle,
-              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
-              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
-              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
-              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min,
-              bool sorted     = false,
-              SelectAlgo algo = SelectAlgo::kAuto)
-{
-  return detail::select_k<T, IdxT>(
-    handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
-}
 /** @} */  // end of group select_k
 
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/sparse/matrix/select_k.cuh b/cpp/include/raft/sparse/matrix/select_k.cuh
new file mode 100644
index 0000000000..f6c8bbe0c7
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/select_k.cuh
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/select_k_types.hpp>
+
+#include <optional>
+
+namespace raft::sparse::matrix {
+
+using SelectAlgo = raft::matrix::SelectAlgo;
+/**
+ * Selects the k smallest or largest keys/values from each row of the input matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ * If a row holds fewer than k values, the remaining positions in the corresponding row of
+ * `out_val` keep their original contents. The same applies to `out_idx`.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices stored in `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [in_val.get_n_rows(), k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [in_val.get_n_rows(), k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  return raft::matrix::detail::select_k<T, IdxT>(
+    handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
+}
+/** @} */  // end of group select_k
+
+}  // namespace raft::sparse::matrix
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index be7e469da6..17990700e6 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -279,7 +279,6 @@ if(BUILD_TESTS)
     NAME
     MATRIX_SELECT_TEST
     PATH test/matrix/select_k.cu
-    PATH test/matrix/select_k_csr.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY)
 
@@ -326,6 +325,7 @@ if(BUILD_TESTS)
     test/sparse/reduce.cu
     test/sparse/row_op.cu
     test/sparse/sddmm.cu
+    test/sparse/select_k_csr.cu
     test/sparse/sort.cu
     test/sparse/spgemmi.cu
     test/sparse/spmm.cu
diff --git a/cpp/test/matrix/select_k_csr.cu b/cpp/test/sparse/select_k_csr.cu
similarity index 98%
rename from cpp/test/matrix/select_k_csr.cu
rename to cpp/test/sparse/select_k_csr.cu
index ed58e6d80d..fc1061d7bb 100644
--- a/cpp/test/matrix/select_k_csr.cu
+++ b/cpp/test/sparse/select_k_csr.cu
@@ -22,9 +22,9 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/copy.cuh>
-#include <raft/matrix/select_k.cuh>
 #include <raft/random/make_blobs.cuh>
 #include <raft/random/rng_state.hpp>
+#include <raft/sparse/matrix/select_k.cuh>
 #include <raft/util/cuda_utils.cuh>
 
 #include <gtest/gtest.h>
@@ -298,7 +298,8 @@ class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>
     auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
       dst_indices_d.data(), params.n_rows, params.top_k);
 
-    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
+    raft::sparse::matrix::select_k(
+      handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
 
     ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
                                            out_idx.data_handle(),

From b53cee3663226caab0c90f5117e5415045357dce Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Thu, 4 Apr 2024 12:49:13 -0700
Subject: [PATCH 11/12] add back the missed '@defgroup select_k'

---
 cpp/include/raft/sparse/matrix/select_k.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpp/include/raft/sparse/matrix/select_k.cuh b/cpp/include/raft/sparse/matrix/select_k.cuh
index f6c8bbe0c7..030b5a354f 100644
--- a/cpp/include/raft/sparse/matrix/select_k.cuh
+++ b/cpp/include/raft/sparse/matrix/select_k.cuh
@@ -29,6 +29,12 @@
 namespace raft::sparse::matrix {
 
 using SelectAlgo = raft::matrix::SelectAlgo;
+
+/**
+ * @defgroup select_k Batched-select k smallest or largest key/values
+ * @{
+ */
+
 /**
  * Selects the k smallest or largest keys/values from each row of the input matrix.
  *

From 04a8bb42c4953a1b02838f57e1da97bc7848a184 Mon Sep 17 00:00:00 2001
From: hrong <hrong@nvidia.com>
Date: Thu, 4 Apr 2024 23:50:56 -0700
Subject: [PATCH 12/12] Move all of the impl over to sparse name scope

---
 cpp/bench/prims/CMakeLists.txt                |  12 +-
 .../raft/matrix/detail/select_k-ext.cuh       |  32 ---
 .../raft/matrix/detail/select_k-inl.cuh       | 192 ---------------
 .../sparse/matrix/detail/select_k-ext.cuh     |  67 ++++++
 .../sparse/matrix/detail/select_k-inl.cuh     | 225 ++++++++++++++++++
 .../raft/sparse/matrix/detail/select_k.cuh    |  24 ++
 cpp/include/raft/sparse/matrix/select_k.cuh   |   6 +-
 .../matrix/detail/select_k_double_int64_t.cu  |  15 --
 .../matrix/detail/select_k_double_uint32_t.cu |  15 --
 cpp/src/matrix/detail/select_k_float_int32.cu |  15 --
 .../matrix/detail/select_k_float_int64_t.cu   |  15 --
 .../matrix/detail/select_k_float_uint32_t.cu  |  15 --
 .../matrix/detail/select_k_half_int64_t.cu    |  15 --
 .../matrix/detail/select_k_half_uint32_t.cu   |  15 --
 .../matrix/detail/select_k_double_int64_t.cu  |  32 +++
 .../matrix/detail/select_k_double_uint32_t.cu |  34 +++
 .../matrix/detail/select_k_float_int32.cu     |  32 +++
 .../matrix/detail/select_k_float_int64_t.cu   |  32 +++
 .../matrix/detail/select_k_float_uint32_t.cu  |  32 +++
 .../matrix/detail/select_k_half_int64_t.cu    |  32 +++
 .../matrix/detail/select_k_half_uint32_t.cu   |  32 +++
 cpp/test/CMakeLists.txt                       |   7 +-
 22 files changed, 548 insertions(+), 348 deletions(-)
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k.cuh
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int32.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 063d69a737..0c5521d447 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -122,16 +122,8 @@ if(BUILD_PRIMS_BENCH)
   )
 
   ConfigureBench(
-    NAME
-    MATRIX_BENCH
-    PATH
-    bench/prims/matrix/argmin.cu
-    bench/prims/matrix/gather.cu
-    bench/prims/matrix/select_k.cu
-    bench/prims/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu
+    bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 95d806dd43..506cbffcb9 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/matrix/select_k_types.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
@@ -45,16 +44,6 @@ void select_k(raft::resources const& handle,
               bool sorted       = false,
               SelectAlgo algo   = SelectAlgo::kAuto,
               const IdxT* len_i = nullptr) RAFT_EXPLICIT;
-
-template <typename T, typename IdxT>
-void select_k(raft::resources const& handle,
-              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
-              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
-              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
-              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min,
-              bool sorted     = false,
-              SelectAlgo algo = SelectAlgo::kAuto) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -84,24 +73,3 @@ instantiate_raft_matrix_detail_select_k(double, int64_t);
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  extern template void raft::matrix::detail::select_k(                \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(__half, uint32_t);
-instantiate_raft_matrix_detail_select_k(__half, int64_t);
-instantiate_raft_matrix_detail_select_k(float, int64_t);
-instantiate_raft_matrix_detail_select_k(float, uint32_t);
-instantiate_raft_matrix_detail_select_k(float, int);
-instantiate_raft_matrix_detail_select_k(double, int64_t);
-instantiate_raft_matrix_detail_select_k(double, uint32_t);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
index 7b52199530..93d233152b 100644
--- a/cpp/include/raft/matrix/detail/select_k-inl.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -20,7 +20,6 @@
 #include "select_radix.cuh"
 #include "select_warpsort.cuh"
 
-#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
@@ -31,8 +30,6 @@
 
 #include <cub/cub.cuh>
 
-#include <type_traits>
-
 namespace raft::matrix::detail {
 
 /**
@@ -320,193 +317,4 @@ void select_k(raft::resources const& handle,
     default: RAFT_FAIL("K-selection Algorithm not supported.");
   }
 }
-
-/**
- * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
- *
- * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
- * selecting the k smallest or largest elements from each row. The selected elements are then stored
- * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
- *
- * @tparam T
- *   Type of the elements being compared (keys).
- * @tparam IdxT
- *   Type of the indices associated with the keys.
- * @tparam NZType
- *   Type representing non-zero elements of `in_val`.
- *
- * @param[in] handle
- *   Container for managing reusable resources.
- * @param[in] in_val
- *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
- *   containing the elements to be compared and selected.
- * @param[in] in_idx
- *   Optional input indices [in_val.nnz] associated with `in_val.values`.
- *   If `in_idx` is `std::nullopt`, it defaults to a contiguous array from 0 to len-1.
- * @param[out] out_val
- *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
- *   from each row of `in_val`.
- * @param[out] out_idx
- *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
- * @param[in] select_min
- *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
- * @param[in] sorted
- *   whether to make sure selected pairs are sorted by value
- * @param[in] algo
- *   the selection algorithm to use
- */
-template <typename T, typename IdxT>
-void select_k(raft::resources const& handle,
-              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
-              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
-              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
-              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
-              bool select_min,
-              bool sorted     = false,
-              SelectAlgo algo = SelectAlgo::kAuto)
-{
-  auto csr_view = in_val.structure_view();
-  auto nnz      = csr_view.get_nnz();
-
-  if (nnz == 0) return;
-
-  auto batch_size = csr_view.get_n_rows();
-  auto len        = csr_view.get_n_cols();
-  auto k          = IdxT(out_val.extent(1));
-
-  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
-               "output k must fit the int type.");
-
-  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
-  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
-
-  if (in_idx.has_value()) {
-    RAFT_EXPECTS(size_t(nnz) == in_idx->size(),
-                 "nnz of in_val must be equal to the length of in_idx");
-  }
-  RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
-
-  if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
-
-  auto indptr = csr_view.get_indptr().data();
-
-  switch (algo) {
-    case SelectAlgo::kRadix8bits:
-    case SelectAlgo::kRadix11bits:
-    case SelectAlgo::kRadix11bitsExtraPass: {
-      if (algo == SelectAlgo::kRadix8bits) {
-        detail::select::radix::select_k<T, IdxT, 8, 512, false>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          true,
-          indptr);
-      } else {
-        bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
-        detail::select::radix::select_k<T, IdxT, 11, 512, false>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          fused_last_filter,
-          indptr);
-      }
-
-      if (sorted) {
-        auto offsets = make_device_mdarray<IdxT, IdxT>(
-          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
-        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
-
-        auto keys =
-          raft::make_device_vector_view<T, IdxT>(out_val.data_handle(), (IdxT)(batch_size * k));
-        auto vals =
-          raft::make_device_vector_view<IdxT, IdxT>(out_idx.data_handle(), (IdxT)(batch_size * k));
-
-        segmented_sort_by_key<T, IdxT>(
-          handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min);
-      }
-
-      return;
-    }
-    case SelectAlgo::kWarpDistributed:
-      return detail::select::warpsort::
-        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          indptr);
-    case SelectAlgo::kWarpDistributedShm:
-      return detail::select::warpsort::
-        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_distributed_ext>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          indptr);
-    case SelectAlgo::kWarpAuto:
-      return detail::select::warpsort::select_k<T, IdxT>(
-        handle,
-        in_val.get_elements().data(),
-        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-        batch_size,
-        len,
-        k,
-        out_val.data_handle(),
-        out_idx.data_handle(),
-        select_min,
-        indptr);
-    case SelectAlgo::kWarpImmediate:
-      return detail::select::warpsort::
-        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_immediate>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          indptr);
-    case SelectAlgo::kWarpFiltered:
-      return detail::select::warpsort::
-        select_k_impl<T, IdxT, detail::select::warpsort::warp_sort_filtered>(
-          handle,
-          in_val.get_elements().data(),
-          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
-          batch_size,
-          len,
-          k,
-          out_val.data_handle(),
-          out_idx.data_handle(),
-          select_min,
-          indptr);
-    default: RAFT_FAIL("K-selection Algorithm not supported.");
-  }
-
-  return;
-}
-
 }  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
new file mode 100644
index 0000000000..08bdfa6f30
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/matrix/select_k_types.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
+#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
+
+#include <cuda_fp16.h>  // __half
+
+#include <cstdint>  // uint32_t
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+namespace raft::sparse::matrix::detail {
+// SelectAlgo is declared in raft::matrix and nothing in this header imports it: qualify it fully.
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted                   = false,
+              raft::matrix::SelectAlgo algo = raft::matrix::SelectAlgo::kAuto) RAFT_EXPLICIT;
+}  // namespace raft::sparse::matrix::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  extern template void raft::sparse::matrix::detail::select_k(        \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
new file mode 100644
index 0000000000..5f39affce6
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/matrix/detail/select_k-inl.cuh>
+#include <raft/matrix/select_k_types.hpp>
+
+#include <cub/cub.cuh>
+
+#include <type_traits>
+
+namespace raft::sparse::matrix::detail {
+
+using namespace raft::matrix::detail;
+using raft::matrix::SelectAlgo;
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices stored in `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [batch_size, k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [batch_size, k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value (only the radix paths consult it)
+ * @param[in] algo
+ *   the selection algorithm to use
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  auto csr_view = in_val.structure_view();
+  auto nnz      = csr_view.get_nnz();
+
+  if (nnz == 0) return;  // nothing to select; outputs stay untouched and nothing is validated
+
+  auto batch_size = csr_view.get_n_rows();
+  auto len        = csr_view.get_n_cols();
+  auto k          = IdxT(out_val.extent(1));
+
+  // IdxT varies per instantiation: cast each argument to the type its format specifier expects.
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "sparse::matrix::select_k(batch_size = %zu, len = %zu, k = %d)",
+    static_cast<size_t>(batch_size),
+    static_cast<size_t>(len),
+    static_cast<int>(k));
+
+  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
+               "output k must fit the int type.");
+
+  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
+  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
+
+  if (in_idx.has_value()) {
+    RAFT_EXPECTS(size_t(nnz) == in_idx->size(),
+                 "nnz of in_val must be equal to the length of in_idx");
+  }
+  RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
+
+  if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
+
+  auto indptr = csr_view.get_indptr().data();
+
+  switch (algo) {
+    case SelectAlgo::kRadix8bits:
+    case SelectAlgo::kRadix11bits:
+    case SelectAlgo::kRadix11bitsExtraPass: {
+      if (algo == SelectAlgo::kRadix8bits) {
+        select::radix::select_k<T, IdxT, 8, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          true,  // fused_last_filter
+          indptr);
+      } else {
+        bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
+        select::radix::select_k<T, IdxT, 11, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          fused_last_filter,
+          indptr);
+      }
+
+      if (sorted) {  // re-sort each row's k results; segment i spans [i * k, (i + 1) * k)
+        auto offsets = make_device_mdarray<IdxT, IdxT>(
+          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
+        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
+
+        auto keys =
+          raft::make_device_vector_view<T, IdxT>(out_val.data_handle(), (IdxT)(batch_size * k));
+        auto vals =
+          raft::make_device_vector_view<IdxT, IdxT>(out_idx.data_handle(), (IdxT)(batch_size * k));
+
+        segmented_sort_by_key<T, IdxT>(
+          handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min);
+      }
+
+      return;
+    }
+    case SelectAlgo::kWarpDistributed:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpDistributedShm:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed_ext>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpAuto:
+      return select::warpsort::select_k<T, IdxT>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpImmediate:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_immediate>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpFiltered:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_filtered>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    default: RAFT_FAIL("K-selection Algorithm not supported.");
+  }
+}
+
+}  // namespace raft::sparse::matrix::detail
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k.cuh b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
new file mode 100644
index 0000000000..711169984b
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
+#include "select_k-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "select_k-ext.cuh"
+#endif
diff --git a/cpp/include/raft/sparse/matrix/select_k.cuh b/cpp/include/raft/sparse/matrix/select_k.cuh
index 030b5a354f..3f97e60c99 100644
--- a/cpp/include/raft/sparse/matrix/select_k.cuh
+++ b/cpp/include/raft/sparse/matrix/select_k.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,8 +21,8 @@
 #include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
-#include <raft/matrix/detail/select_k.cuh>
 #include <raft/matrix/select_k_types.hpp>
+#include <raft/sparse/matrix/detail/select_k.cuh>
 
 #include <optional>
 
@@ -79,7 +79,7 @@ void select_k(raft::resources const& handle,
               bool sorted     = false,
               SelectAlgo algo = SelectAlgo::kAuto)
 {
-  return raft::matrix::detail::select_k<T, IdxT>(
+  return detail::select_k<T, IdxT>(
     handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
 }
 /** @} */  // end of group select_k
diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
index c17018efe0..bf234aacbf 100644
--- a/cpp/src/matrix/detail/select_k_double_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(double, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(double, int64_t);
-
-#undef instantiate_raft_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
index fcc3e5d5a7..7f0511a76a 100644
--- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -35,18 +35,3 @@
 instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(double, uint32_t);
-
-#undef instantiate_raft_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu
index 82041a9b2d..e68b1e32df 100644
--- a/cpp/src/matrix/detail/select_k_float_int32.cu
+++ b/cpp/src/matrix/detail/select_k_float_int32.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(float, int);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(float, int);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
index 4d381b417f..5aa40d8c9d 100644
--- a/cpp/src/matrix/detail/select_k_float_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(float, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(float, uint64_t);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
index 775807cfac..9aba147edf 100644
--- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(float, uint32_t);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
index cfd260326b..bc513e4aeb 100644
--- a/cpp/src/matrix/detail/select_k_half_int64_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(__half, int64_t);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
index c252337f97..e46c7d46bb 100644
--- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -33,18 +33,3 @@
 instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
-
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)              \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_matrix_detail_select_k(__half, uint32_t);
-
-#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
new file mode 100644
index 0000000000..c784b50dad
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
new file mode 100644
index 0000000000..98bab9a504
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#include <cstdint>  // uint32_t
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int32.cu b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
new file mode 100644
index 0000000000..bff213ae69
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Must name raft::sparse::matrix::detail so it matches the extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
new file mode 100644
index 0000000000..412b06e587
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
new file mode 100644
index 0000000000..8ba3f0e22b
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
new file mode 100644
index 0000000000..24c844f8c8
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
new file mode 100644
index 0000000000..d63dc64933
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+// Explicit instantiation definition; pairs with the matching extern template declaration.
+instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k  // helper macro stays local to this file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 17990700e6..4d17aacffd 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -275,12 +275,7 @@ if(BUILD_TESTS)
     EXPLICIT_INSTANTIATE_ONLY
   )
 
-  ConfigureTest(
-    NAME
-    MATRIX_SELECT_TEST
-    PATH test/matrix/select_k.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY)
+  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
 
   ConfigureTest(
     NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY