diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 9f23c44a5c..0c5521d447 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -137,6 +137,7 @@ if(BUILD_PRIMS_BENCH) PATH bench/prims/sparse/bitmap_to_csr.cu bench/prims/sparse/convert_csr.cu + bench/prims/sparse/select_k_csr.cu bench/prims/main.cpp ) diff --git a/cpp/bench/prims/sparse/select_k_csr.cu b/cpp/bench/prims/sparse/select_k_csr.cu new file mode 100644 index 0000000000..a91e6c8514 --- /dev/null +++ b/cpp/bench/prims/sparse/select_k_csr.cu @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace raft::bench::sparse { + +template +struct bench_param { + index_t n_rows; + index_t n_cols; + index_t top_k; + float sparsity; + bool select_min = true; + bool customized_indices = false; +}; + +template +inline auto operator<<(std::ostream& os, const bench_param& params) -> std::ostream& +{ + os << params.n_rows << "#" << params.n_cols << "#" << params.top_k << "#" << params.sparsity; + return os; +} + +template +struct SelectKCsrTest : public fixture { + SelectKCsrTest(const bench_param& p) + : fixture(true), + params(p), + handle(stream), + values_d(0, stream), + indptr_d(0, stream), + indices_d(0, stream), + customized_indices_d(0, stream), + dst_values_d(0, stream), + dst_indices_d(0, stream) + { + std::vector dense_values_h(params.n_rows * params.n_cols); + nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h); + + std::vector indices_h(nnz); + std::vector customized_indices_h(nnz); + std::vector indptr_h(params.n_rows + 1); + + convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h); + + std::vector dst_values_h(params.n_rows * params.top_k, static_cast(2.0f)); + std::vector dst_indices_h(params.n_rows * params.top_k, + static_cast(params.n_rows * params.n_cols * 100)); + + dst_values_d.resize(params.n_rows * params.top_k, stream); + dst_indices_d.resize(params.n_rows * params.top_k, stream); + values_d.resize(nnz, stream); + + if (nnz) { + auto blobs_values = raft::make_device_matrix(handle, 1, nnz); + auto labels = raft::make_device_vector(handle, 1); + + raft::random::make_blobs(blobs_values.data_handle(), + labels.data_handle(), + 1, + nnz, + 1, + stream, + false, + nullptr, + nullptr, + value_t(1.0), + false, + value_t(-10.0f), + value_t(10.0f), + uint64_t(2024)); + raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream); + resource::sync_stream(handle); + } + + indices_d.resize(nnz, stream); + indptr_d.resize(params.n_rows + 1, stream); + + update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream); + update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream); + + if 
(params.customized_indices) { + customized_indices_d.resize(nnz, stream); + update_device(customized_indices_d.data(), + customized_indices_h.data(), + customized_indices_h.size(), + stream); + } + } + + index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector& matrix) + { + index_t total_elements = static_cast(m * n); + index_t num_ones = static_cast((total_elements * 1.0f) * sparsity); + index_t res = num_ones; + + for (index_t i = 0; i < total_elements; ++i) { + matrix[i] = false; + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis_idx(0, total_elements - 1); + + while (num_ones > 0) { + size_t index = dis_idx(gen); + if (matrix[index] == false) { + matrix[index] = true; + num_ones--; + } + } + return res; + } + + void convert_to_csr(std::vector& matrix, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + if (matrix[i * cols + j]) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + template + std::optional get_opt_var(data_t x) + { + if (params.customized_indices) { + return x; + } else { + return std::nullopt; + } + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + auto in_val_structure = raft::make_device_compressed_structure_view( + indptr_d.data(), + indices_d.data(), + params.n_rows, + params.n_cols, + static_cast(indices_d.size())); + + auto in_val = + raft::make_device_csr_matrix_view(values_d.data(), in_val_structure); + + std::optional> in_idx; + + in_idx = get_opt_var( + raft::make_device_vector_view(customized_indices_d.data(), nnz)); + + auto out_val = raft::make_device_matrix_view( + dst_values_d.data(), params.n_rows, params.top_k); + auto out_idx = raft::make_device_matrix_view( + dst_indices_d.data(), params.n_rows, params.top_k); + + raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min); + resource::sync_stream(handle); + loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() { + raft::sparse::matrix::select_k( + handle, in_val, in_idx, out_val, out_idx, params.select_min, false); + resource::sync_stream(handle); + }); + } + + protected: + const raft::device_resources handle; + + bench_param params; + index_t nnz; + + rmm::device_uvector values_d; + rmm::device_uvector indptr_d; + rmm::device_uvector indices_d; + rmm::device_uvector customized_indices_d; + + rmm::device_uvector dst_values_d; + rmm::device_uvector dst_indices_d; +}; // struct SelectKCsrTest + +template +const std::vector> getInputs() +{ + std::vector> param_vec; + struct TestParams { + index_t m; + index_t n; + index_t k; + }; + + const std::vector params_group{ + {20000, 500, 1}, {20000, 500, 2}, {20000, 500, 4}, {20000, 500, 8}, + {20000, 500, 16}, {20000, 500, 32}, {20000, 500, 64}, {20000, 500, 128}, + {20000, 500, 256}, + + {1000, 10000, 1}, {1000, 10000, 2}, {1000, 10000, 4}, {1000, 10000, 8}, + {1000, 10000, 16}, {1000, 10000, 32}, {1000, 10000, 64}, {1000, 10000, 128}, + {1000, 10000, 256}, + + {100, 100000, 1}, {100, 100000, 2}, {100, 100000, 4}, {100, 100000, 8}, + {100, 100000, 16}, {100, 100000, 32}, {100, 100000, 64}, {100, 100000, 128}, + {100, 100000, 256}, + + {10, 
1000000, 1}, {10, 1000000, 2}, {10, 1000000, 4}, {10, 1000000, 8}, + {10, 1000000, 16}, {10, 1000000, 32}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 2}, {10, 1000000, 4}, {10, 1000000, 8}, + {10, 1000000, 16}, {10, 1000000, 32}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, {1000, 10000, 1}, {1000, 10000, 16}, {1000, 10000, 64}, + {1000, 10000, 128}, {1000, 10000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, {1000, 10000, 1}, {1000, 10000, 16}, {1000, 10000, 64}, + {1000, 10000, 128}, {1000, 10000, 256}}; + + param_vec.reserve(params_group.size()); + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.k, 0.1})); + } + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.k, 0.2})); + } + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.k, 0.5})); + } + return param_vec; +} + +RAFT_BENCH_REGISTER((SelectKCsrTest), "", getInputs()); + +} // namespace raft::bench::sparse diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index 36a346fda3..83d4845c31 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -442,14 +442,76 @@ _RAFT_DEVICE void last_filter(const T* in_buf, } } -template +template +_RAFT_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + char* bufs, + IdxT buf_len, + int pass, + const T*& in_buf, + const IdxT*& in_idx_buf, + T*& out_buf, + IdxT*& out_idx_buf) +{ + // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 + if (pass == 0) { + in_buf = in; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + in_idx_buf = in_idx; + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } else if (pass % 2 == 0) { + in_buf = reinterpret_cast(bufs); + in_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + out_buf = const_cast(in_buf + buf_len); + out_idx_buf = const_cast(in_idx_buf + buf_len); + } else { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + in_buf = out_buf + buf_len; + in_idx_buf = out_idx_buf + buf_len; + } +} + +template +_RAFT_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + char* bufs, + IdxT buf_len, + const int pass, + const T*& out_buf, + const IdxT*& out_idx_buf) +{ + // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 + if (pass == 0) { + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } else if (pass % 2 == 0) { + out_buf = const_cast(reinterpret_cast(bufs) + buf_len); + out_idx_buf = + const_cast(reinterpret_cast(bufs + sizeof(T) * 2 * buf_len) + buf_len); + } else { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } +} + +template RAFT_KERNEL last_filter_kernel(const T* in, const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, + char* bufs, + 
size_t offset, T* out, IdxT* out_idx, const IdxT len, + const IdxT* len_i, const IdxT k, Counter* counters, const bool select_min) @@ -458,22 +520,31 @@ RAFT_KERNEL last_filter_kernel(const T* in, Counter* counter = counters + batch_id; IdxT previous_len = counter->previous_len; + if (previous_len == 0) { return; } + + const IdxT l_len = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]); + const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id]; + const IdxT buf_len = calc_buf_len(len); - if (previous_len > buf_len || in_buf == in) { - in_buf = in + batch_id * len; - in_idx_buf = in_idx ? (in_idx + batch_id * len) : nullptr; - previous_len = len; - } else { - in_buf += batch_id * buf_len; - in_idx_buf += batch_id * buf_len; - } - out += batch_id * k; - out_idx += batch_id * k; + + const T* in_buf = nullptr; + const IdxT* in_idx_buf = nullptr; + bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); constexpr int pass = calc_num_passes() - 1; constexpr int start_bit = calc_start_bit(pass); + set_buf_pointers(in + l_offset, in_idx + l_offset, bufs, buf_len, pass, in_buf, in_idx_buf); + + if (previous_len > buf_len || in_buf == in + l_offset) { + in_buf = in + l_offset; + in_idx_buf = in_idx ? (in_idx + l_offset) : nullptr; + previous_len = l_len; + } + out += batch_id * k; + out_idx += batch_id * k; + const auto kth_value_bits = counter->kth_value_bits; const IdxT num_of_kth_needed = counter->k; IdxT* p_out_cnt = &counter->out_cnt; @@ -510,6 +581,29 @@ RAFT_KERNEL last_filter_kernel(const T* in, f); } +template +_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_val( + T* dest, const T* src, S len, IdxT k, const bool select_min) +{ + S idx = S(threadIdx.x); + S stride = S(blockDim.x); + const T default_val = select_min ? upper_bound() : lower_bound(); + for (S i = idx; i < k; i += stride) { + dest[i] = i < len ? src[i] : default_val; + } +} + +template +_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_idx(T* dest, const T* src, S len) +{ + S idx = S(threadIdx.x); + S stride = S(blockDim.x); + + for (S i = idx; i < len; i += stride) { + dest[i] = src ? src[i] : i; + } +} + /** * * It is expected to call this kernel multiple times (passes), in each pass we process a radix, @@ -545,13 +639,16 @@ RAFT_KERNEL last_filter_kernel(const T* in, * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and * their indices. */ -template +template RAFT_KERNEL radix_kernel(const T* in, const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, + char* bufs, + size_t offset, T* out, IdxT* out_idx, Counter* counters, @@ -567,21 +664,38 @@ RAFT_KERNEL radix_kernel(const T* in, IdxT current_k; IdxT previous_len; IdxT current_len; + + const IdxT l_len = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]); + const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id]; + if (pass == 0) { current_k = k; - previous_len = len; + previous_len = l_len; // Need to do this so setting counter->previous_len for the next pass is correct. // This value is meaningless for pass 0, but it's fine because pass 0 won't be the // last pass in this implementation so pass 0 won't hit the "if (pass == // num_passes - 1)" branch. 
// Maybe it's better to reload counter->previous_len and use it rather than // current_len in last_filter() - current_len = len; + current_len = l_len; } else { current_k = counter->k; current_len = counter->len; previous_len = counter->previous_len; } + if constexpr (!len_or_indptr) { + if (pass == 0 && l_len <= k) { + copy_in_val(out + batch_id * k, in + l_offset, l_len, k, select_min); + copy_in_idx(out_idx + batch_id * k, (in_idx ? (in_idx + l_offset) : nullptr), l_len); + if (threadIdx.x == 0) { + counter->previous_len = 0; + counter->len = 0; + } + __syncthreads(); + return; + } + } + if (current_len == 0) { return; } // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle @@ -590,20 +704,33 @@ RAFT_KERNEL radix_kernel(const T* in, const bool early_stop = (current_len == current_k); const IdxT buf_len = calc_buf_len(len); + const T* in_buf; + const IdxT* in_idx_buf; + T* out_buf; + IdxT* out_idx_buf; + bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); + + set_buf_pointers(in + l_offset, + (in_idx ? (in_idx + l_offset) : nullptr), + bufs, + buf_len, + pass, + in_buf, + in_idx_buf, + out_buf, + out_idx_buf); + // "previous_len > buf_len" means previous pass skips writing buffer if (pass == 0 || pass == 1 || previous_len > buf_len) { - in_buf = in + batch_id * len; - in_idx_buf = in_idx ? (in_idx + batch_id * len) : nullptr; - previous_len = len; - } else { - in_buf += batch_id * buf_len; - in_idx_buf += batch_id * buf_len; + in_buf = in + l_offset; + in_idx_buf = in_idx ? (in_idx + l_offset) : nullptr; + previous_len = l_len; } // in case we have individual len for each query defined we want to make sure // that we only iterate valid elements. if (len_i != nullptr) { - const IdxT max_len = max(len_i[batch_id], k); + const IdxT max_len = max(l_len, k); if (max_len < previous_len) previous_len = max_len; } @@ -611,9 +738,6 @@ RAFT_KERNEL radix_kernel(const T* in, if (pass == 0 || current_len > buf_len) { out_buf = nullptr; out_idx_buf = nullptr; - } else { - out_buf += batch_id * buf_len; - out_idx_buf += batch_id * buf_len; } out += batch_id * k; out_idx += batch_id * k; @@ -640,7 +764,6 @@ RAFT_KERNEL radix_kernel(const T* in, unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1); isLastBlock = (finished == (gridDim.x - 1)); } - if (__syncthreads_or(isLastBlock)) { if (early_stop) { if (threadIdx.x == 0) { @@ -676,7 +799,7 @@ RAFT_KERNEL radix_kernel(const T* in, out_idx_buf ? out_idx_buf : in_idx_buf, out, out_idx, - out_buf ? current_len : len, + out_buf ? 
current_len : l_len, k, counter, select_min, @@ -726,7 +849,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) int active_blocks; RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &active_blocks, radix_kernel, BlockSize, 0)); + &active_blocks, radix_kernel, BlockSize, 0)); active_blocks *= sm_cnt; IdxT best_num_blocks = 0; @@ -757,78 +880,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) return best_num_blocks; } -template -_RAFT_HOST void set_buf_pointers(const T* in, - const IdxT* in_idx, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2, - int pass, - const T*& in_buf, - const IdxT*& in_idx_buf, - T*& out_buf, - IdxT*& out_idx_buf) -{ - if (pass == 0) { - in_buf = in; - in_idx_buf = nullptr; - out_buf = nullptr; - out_idx_buf = nullptr; - } else if (pass == 1) { - in_buf = in; - in_idx_buf = in_idx; - out_buf = buf1; - out_idx_buf = idx_buf1; - } else if (pass % 2 == 0) { - in_buf = buf1; - in_idx_buf = idx_buf1; - out_buf = buf2; - out_idx_buf = idx_buf2; - } else { - in_buf = buf2; - in_idx_buf = idx_buf2; - out_buf = buf1; - out_idx_buf = idx_buf1; - } -} - -template -_RAFT_DEVICE void set_buf_pointers(const T* in, - const IdxT* in_idx, - char* bufs, - IdxT buf_len, - int pass, - const T*& in_buf, - const IdxT*& in_idx_buf, - T*& out_buf, - IdxT*& out_idx_buf) -{ - // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 - if (pass == 0) { - in_buf = in; - in_idx_buf = nullptr; - out_buf = nullptr; - out_idx_buf = nullptr; - } else if (pass == 1) { - in_buf = in; - in_idx_buf = in_idx; - out_buf = reinterpret_cast(bufs); - out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - } else if (pass % 2 == 0) { - in_buf = reinterpret_cast(bufs); - in_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - out_buf = const_cast(in_buf + buf_len); - out_idx_buf = const_cast(in_idx_buf + buf_len); - } else { - out_buf = reinterpret_cast(bufs); - out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - in_buf = out_buf + buf_len; - in_idx_buf = out_idx_buf + buf_len; - } -} - -template +template void radix_topk(const T* in, const IdxT* in_idx, int batch_size, @@ -850,7 +902,7 @@ void radix_topk(const T* in, if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - auto kernel = radix_kernel; + auto kernel = radix_kernel; const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, false); if (max_chunk_size != static_cast(batch_size)) { @@ -862,55 +914,33 @@ void radix_topk(const T* in, rmm::device_uvector> counters(max_chunk_size, stream, mr); rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); - rmm::device_uvector buf1(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector idx_buf1(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector buf2(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector idx_buf2(max_chunk_size * buf_len, stream, mr); + + rmm::device_uvector bufs( + max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); RAFT_CUDA_TRY( cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); - auto kernel = radix_kernel; + auto kernel = radix_kernel; - const T* chunk_in = in + offset * len; - const IdxT* chunk_in_idx = in_idx ? 
(in_idx + offset * len) : nullptr; - T* chunk_out = out + offset * k; - IdxT* chunk_out_idx = out_idx + offset * k; - const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; - - const T* in_buf = nullptr; - const IdxT* in_idx_buf = nullptr; - T* out_buf = nullptr; - IdxT* out_idx_buf = nullptr; + T* chunk_out = out + offset * k; + IdxT* chunk_out_idx = out_idx + offset * k; + const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; dim3 blocks(grid_dim, chunk_size); constexpr int num_passes = calc_num_passes(); for (int pass = 0; pass < num_passes; ++pass) { - set_buf_pointers(chunk_in, - chunk_in_idx, - buf1.data(), - idx_buf1.data(), - buf2.data(), - idx_buf2.data(), - pass, - in_buf, - in_idx_buf, - out_buf, - out_idx_buf); - if (fused_last_filter && pass == num_passes - 1) { - kernel = radix_kernel; + kernel = radix_kernel; } - kernel<<>>(chunk_in, - chunk_in_idx, - in_buf, - in_idx_buf, - out_buf, - out_idx_buf, + kernel<<>>(in, + in_idx, + bufs.data(), + offset, chunk_out, chunk_out_idx, counters.data(), @@ -924,16 +954,18 @@ void radix_topk(const T* in, } if (!fused_last_filter) { - last_filter_kernel<<>>(chunk_in, - chunk_in_idx, - out_buf, - out_idx_buf, - chunk_out, - chunk_out_idx, - len, - k, - counters.data(), - select_min); + last_filter_kernel + <<>>(in, + in_idx, + bufs.data(), + offset, + chunk_out, + chunk_out_idx, + len, + chunk_len_i, + k, + counters.data(), + select_min); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } @@ -1015,7 +1047,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, } } -template +template RAFT_KERNEL radix_topk_one_block_kernel(const T* in, const IdxT* in_idx, const IdxT len, @@ -1024,30 +1056,48 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, T* out, IdxT* out_idx, const bool select_min, - char* bufs) + char* bufs, + size_t offset) { constexpr int num_buckets = calc_num_buckets(); __shared__ Counter counter; __shared__ IdxT histogram[num_buckets]; + const size_t batch_id = blockIdx.x; // size_t to avoid multiplication overflow + + IdxT l_len = len; + IdxT l_offset = (offset + batch_id) * len; + if constexpr (!len_or_indptr) { + l_offset = len_i[batch_id]; + l_len = len_i[batch_id + 1] - l_offset; + } + if (threadIdx.x == 0) { counter.k = k; - counter.len = len; - counter.previous_len = len; + counter.len = l_len; + counter.previous_len = l_len; counter.kth_value_bits = 0; counter.out_cnt = 0; counter.out_back_cnt = 0; } __syncthreads(); - const size_t batch_id = blockIdx.x; // size_t to avoid multiplication overflow - in += batch_id * len; - if (in_idx) { in_idx += batch_id * len; } + in += l_offset; + if (in_idx) { in_idx += l_offset; } out += batch_id * k; out_idx += batch_id * k; const IdxT buf_len = calc_buf_len(len); bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); + if constexpr (!len_or_indptr) { + if (l_len <= k) { + copy_in_val(out, in, l_len, k, select_min); + copy_in_idx(out_idx, in_idx, l_len); + __syncthreads(); + return; + } + } + constexpr int num_passes = calc_num_passes(); for (int pass = 0; pass < num_passes; ++pass) { const T* in_buf; @@ -1073,7 +1123,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // in case we have individual len for each query defined we want to make sure // that we only iterate valid elements. if (len_i != nullptr) { - const IdxT max_len = max(len_i[batch_id], k); + const IdxT max_len = max(l_len, k); if (max_len < previous_len) previous_len = max_len; } @@ -1102,7 +1152,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, out_buf ? 
out_idx_buf : in_idx, out, out_idx, - out_buf ? current_len : len, + out_buf ? current_len : l_len, k, &counter, select_min, @@ -1117,7 +1167,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // counters and global histograms, can be kept in shared memory and cheap sync operations can be // used. It's used when len is relatively small or when the number of blocks per row calculated by // `calc_grid_dim()` is 1. -template +template void radix_topk_one_block(const T* in, const IdxT* in_idx, int batch_size, @@ -1133,7 +1183,7 @@ void radix_topk_one_block(const T* in, { static_assert(calc_num_passes() > 1); - auto kernel = radix_topk_one_block_kernel; + auto kernel = radix_topk_one_block_kernel; const IdxT buf_len = calc_buf_len(len); const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, true); @@ -1144,15 +1194,16 @@ void radix_topk_one_block(const T* in, for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; - kernel<<>>(in + offset * len, - in_idx ? (in_idx + offset * len) : nullptr, + kernel<<>>(in, + in_idx, len, chunk_len_i, k, out + offset * k, out_idx + offset * k, select_min, - bufs.data()); + bufs.data(), + offset); } } @@ -1182,6 +1233,10 @@ void radix_topk_one_block(const T* in, * it affects the number of passes and number of buckets. * @tparam BlockSize * Number of threads in a kernel thread block. + * @tparam len_or_indptr + * Flag to interpret `len_i` as either direct row lengths (true) or CSR format + * index pointers (false). When true, each `len_i` element denotes the length of a row. When + * false, `len_i` represents the index pointers for a CSR matrix with shape of `batch_size + 1`. * * @param[in] res container of reusable resources * @param[in] in @@ -1212,9 +1267,12 @@ void radix_topk_one_block(const T* in, * same. That is, when the value range of input data is narrow. In such case, there could be a * large number of inputs for the last filter, hence using multiple thread blocks is beneficial. * @param len_i - * optional array of size (batch_size) providing lengths for each individual row + * Optional array used differently based on `len_or_indptr`: + * When `len_or_indptr` is true, `len_i` presents the lengths of each row, which is `batch_size`. + * When `len_or_indptr` is false, `len_i` works like a indptr for a CSR matrix. The length of each + * row would be (`len_i[row_id + 1] - len_i[row_id]`). `len_i` size is `batch_size + 1`. 
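+ * For example, with `batch_size = 3`, `len_i = {0, 5, 5, 9}` describes rows of length
+ * 5, 0 and 4, whose elements start at offsets 0, 5 and 5 of `in` (and `in_idx`).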
*/ -template +template void select_k(raft::resources const& res, const T* in, const IdxT* in_idx, @@ -1227,9 +1285,12 @@ void select_k(raft::resources const& res, bool fused_last_filter, const IdxT* len_i) { + RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)), + "When `len_or_indptr` is false, `len_i` must not be nullptr!"); + auto stream = resource::get_cuda_stream(res); auto mr = resource::get_workspace_resource(res); - if (k == len) { + if (k == len && len_or_indptr) { RAFT_CUDA_TRY( cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); if (in_idx) { @@ -1248,29 +1309,29 @@ void select_k(raft::resources const& res, constexpr int items_per_thread = 32; if (len <= BlockSize * items_per_thread) { - impl::radix_topk_one_block( + impl::radix_topk_one_block( in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { unsigned grid_dim = impl::calc_grid_dim(batch_size, len, sm_cnt); if (grid_dim == 1) { - impl::radix_topk_one_block( + impl::radix_topk_one_block( in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { - impl::radix_topk(in, - in_idx, - batch_size, - len, - k, - out, - out_idx, - select_min, - fused_last_filter, - len_i, - grid_dim, - sm_cnt, - stream, - mr); + impl::radix_topk(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + fused_last_filter, + len_i, + grid_dim, + sm_cnt, + stream, + mr); } } } diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 572558153d..2cb32585d5 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -754,22 +754,32 @@ template
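A minimal usage sketch of the CSR top-k path exercised by the new benchmark: raw CSR buffers are wrapped in RAFT mdspan views and passed to raft::sparse::matrix::select_k, mirroring SelectKCsrTest::run_benchmark above. The include paths, the float/int64_t instantiation, and the exact overload shown here are assumptions for illustration rather than part of this patch.

// Sketch only: header paths and template types are assumed, not taken from the diff.
#include <raft/core/device_csr_matrix.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/sparse/matrix/select_k.cuh>  // assumed public header for the new CSR select_k

#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <optional>

// Select the k smallest values (and their column indices) from each row of a CSR matrix.
void select_k_csr_rows(raft::device_resources const& handle,
                       rmm::device_uvector<int64_t>& indptr,    // n_rows + 1 row offsets
                       rmm::device_uvector<int64_t>& indices,   // nnz column indices
                       rmm::device_uvector<float>& values,      // nnz values
                       int64_t n_rows,
                       int64_t n_cols,
                       int64_t k,
                       rmm::device_uvector<float>& out_values,  // n_rows * k outputs
                       rmm::device_uvector<int64_t>& out_indices)
{
  // Describe the CSR layout (row offsets + column indices), as the benchmark does.
  auto structure = raft::make_device_compressed_structure_view(
    indptr.data(), indices.data(), n_rows, n_cols, static_cast<int64_t>(indices.size()));

  // Wrap the values as a read-only CSR matrix view over that structure.
  auto in_val =
    raft::make_device_csr_matrix_view(static_cast<const float*>(values.data()), structure);

  // No customized per-element indices: the CSR column indices themselves are returned.
  std::optional<raft::device_vector_view<const int64_t, int64_t>> in_idx = std::nullopt;

  auto out_val = raft::make_device_matrix_view<float, int64_t, raft::row_major>(
    out_values.data(), n_rows, k);
  auto out_idx = raft::make_device_matrix_view<int64_t, int64_t, raft::row_major>(
    out_indices.data(), n_rows, k);

  // select_min = true keeps the k smallest entries of each row.
  raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, /*select_min=*/true);
}

Rows shorter than k are padded with the bound value for the selected direction, which is what the copy_in_val/copy_in_idx helpers added in select_radix.cuh implement for the indptr (len_or_indptr = false) code path.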