From 2cc477b9c3ff95b0b62264caa6bba0a6c66c50a1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Aug 2023 16:16:45 +0200 Subject: [PATCH 01/15] Replace GEMM backend: cublas.gemm -> cublaslt.matmul --- .../raft/core/resource/cublaslt_handle.hpp | 68 +++ .../raft/core/resource/resource_types.hpp | 1 + cpp/include/raft/linalg/detail/gemm.hpp | 543 ++++++++++++------ cpp/include/raft/linalg/gemm.cuh | 100 +++- cpp/include/raft/util/cache.hpp | 83 +++ cpp/include/raft/util/cuda_data_type.hpp | 82 +++ cpp/test/linalg/gemm_layout.cu | 2 +- 7 files changed, 669 insertions(+), 210 deletions(-) create mode 100644 cpp/include/raft/core/resource/cublaslt_handle.hpp create mode 100644 cpp/include/raft/util/cache.hpp create mode 100644 cpp/include/raft/util/cuda_data_type.hpp diff --git a/cpp/include/raft/core/resource/cublaslt_handle.hpp b/cpp/include/raft/core/resource/cublaslt_handle.hpp new file mode 100644 index 0000000000..0d83fae752 --- /dev/null +++ b/cpp/include/raft/core/resource/cublaslt_handle.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::resource { + +class cublaslt_resource : public resource { + public: + cublaslt_resource() { RAFT_CUBLAS_TRY(cublasLtCreate(&handle_)); } + ~cublaslt_resource() noexcept override { RAFT_CUBLAS_TRY_NO_THROW(cublasLtDestroy(handle_)); } + auto get_resource() -> void* override { return &handle_; } + + private: + cublasLtHandle_t handle_; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. */ +class cublaslt_resource_factory : public resource_factory { + public: + auto get_resource_type() -> resource_type override { return resource_type::CUBLASLT_HANDLE; } + auto make_resource() -> resource* override { return new cublaslt_resource(); } +}; + +/** + * @defgroup resource_cublas cuBLAS handle resource functions + * @{ + */ + +/** + * Load a cublasLt res_t from raft res if it exists, otherwise + * add it and return it. 
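+ * A minimal usage sketch (assuming a default-constructed raft::resources object):
+ * @code{.cpp}
+ *   raft::resources res;
+ *   cublasLtHandle_t lt_handle = raft::resource::get_cublaslt_handle(res);
+ *   // subsequent calls return the same handle cached inside `res`
+ * @endcode
+ *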
+ * @param[in] res the raft resources object + * @return cublasLt handle + */ +inline auto get_cublaslt_handle(resources const& res) -> cublasLtHandle_t +{ + if (!res.has_resource_factory(resource_type::CUBLASLT_HANDLE)) { + res.add_resource_factory(std::make_shared()); + } + auto ret = *res.get_resource(resource_type::CUBLASLT_HANDLE); + return ret; +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 2dc4eb1f9d..b32e09bb6b 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -41,6 +41,7 @@ enum resource_type { DEVICE_ID, // cuda device id THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource + CUBLASLT_HANDLE, // cublasLt handle LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index d82c821148..3966d7e1eb 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -13,27 +13,216 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include -#include "cublas_wrappers.hpp" +namespace raft::linalg::detail { -#include -#include +/** Get the cublas compute type for the combination of input types. */ +template +auto get_matmul_type() -> cublasComputeType_t +{ + static_assert(std::is_same_v && std::is_same_v && std::is_same_v && + std::is_same_v, + "Unsupported combination of input types. Consult cublas API for supported types."); + return CUBLAS_COMPUTE_32F; +} + +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_16F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_64F; +} + +/** Unique representation of a matrix multiplication (assuming fixed types). */ +struct matmul_key_t { + uint64_t m; + uint64_t n; + uint64_t k; + uint64_t lda; + uint64_t ldb; + uint64_t ldc; + bool trans_a; + bool trans_b; +}; + +inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool +{ + return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && + a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; +} -namespace raft { -namespace linalg { -namespace detail { +struct matmul_key_hash { + inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t + { + return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; + } +}; + +/** Descriptor for a column-major cublasLt matrix. 
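+ * Owning RAII wrapper: the cublasLtMatrixLayout_t is created in the constructor and
+ * destroyed in the destructor; the type is move-only and implicitly convertible to the
+ * underlying handle.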
*/ +struct cublastlt_matrix_layout { + cublasLtMatrixLayout_t res{nullptr}; + inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) + { + RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); + } + inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; + inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; + inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; + inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; + + inline ~cublastlt_matrix_layout() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatrixLayout_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) + -> cublastlt_matrix_layout + { + return cublastlt_matrix_layout{ + get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; + } +}; + +/** Descriptor for a cublasLt matmul function. */ +struct cublastlt_matmul_desc { + cublasLtMatmulDesc_t res{nullptr}; + inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) + { + RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); + } + inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; + inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; + inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; + inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; + + inline ~cublastlt_matmul_desc() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatmulDesc_t() const noexcept { return res; } + + template + static inline auto for_matmult(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc + { + auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; + if constexpr (DevicePointerMode) { + const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); + } + const cublasOperation_t trans_op = CUBLAS_OP_T; + if (transpose_a) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); + } + if (transpose_b) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); + } + return desc; + } +}; + +/** Full description of matmul. 
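+ * Bundles the matmul descriptor, the three matrix layouts and the heuristic returned by
+ * cublasLtMatmulAlgoGetHeuristic; instances are cached keyed by matmul_key_t, so the
+ * heuristic query runs only once per distinct problem shape.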
*/ +struct matmul_desc { + cublastlt_matmul_desc desc; + cublastlt_matrix_layout a; + cublastlt_matrix_layout b; + cublastlt_matrix_layout c; + cublasLtMatmulHeuristicResult_t heuristics; + + template + static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc + { + matmul_desc r{ + cublastlt_matmul_desc::for_matmult(args.trans_a, args.trans_b), + cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), + cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), + cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; + int algo_count; + cublasLtMatmulPreference_t preference; + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); + RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), + r.desc, + r.a, + r.b, + r.c, + r.c, + preference, + 1, + &r.heuristics, + &algo_count)); + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); + return r; + } +}; + +/** Number of matmul invocations to cache. */ +static constexpr size_t kLRUSize = 100; /** - * @brief the wrapper of cublas gemm function + * @brief the wrapper of cublasLt matmul function * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C * - * @tparam math_t the element type * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources * @param [in] trans_a cublas transpose op for A * @param [in] trans_b cublas transpose op for B * @param [in] m number of rows of C @@ -49,195 +238,173 @@ namespace detail { * @param [in] ldc leading dimension of C * @param [in] stream */ -template -void gemm(raft::resources const& handle, - const bool trans_a, - const bool trans_b, - const int m, - const int n, - const int k, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* B, - const int ldb, - const math_t* beta, - math_t* C, - const int ldc, - cudaStream_t stream) +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc, + cudaStream_t stream) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - cublas_device_pointer_mode pmode(cublas_h); - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, - trans_b ? CUBLAS_OP_T : CUBLAS_OP_N, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc, - stream)); + common::nvtx::range batch_scope( + "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); + std::shared_ptr mm_desc{nullptr}; + matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; + static thread_local cache:: + lru, std::shared_ptr> + cache{kLRUSize}; + if (!cache.get(mm_key, &mm_desc)) { + mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); + cache.set(mm_key, mm_desc); + } + RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), + mm_desc->desc, + alpha, + a_ptr, + mm_desc->a, + b_ptr, + mm_desc->b, + beta, + c_ptr, + mm_desc->c, + c_ptr, + mm_desc->c, + &(mm_desc->heuristics.algo), + nullptr, + 0, + stream)); } -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . 
opA(A) * opB(B) + beta . C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param alpha scalar - * @param beta scalar - * @param stream cuda stream - */ -template -void gemm(raft::resources const& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - math_t alpha, - math_t beta, - cudaStream_t stream) +template +void legacy_gemm(raft::resources const& res, + const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const T* alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T* beta, + T* C, + const int ldc, + cudaStream_t stream) +{ + return matmul(res, + trans_a, + trans_b, + static_cast(m), + static_cast(n), + static_cast(k), + alpha, + A, + static_cast(lda), + B, + static_cast(ldb), + beta, + C, + static_cast(ldc), + stream); +} + +template +void legacy_gemm(raft::resources const& res, + const T* a, + int n_rows_a, + int n_cols_a, + const T* b, + T* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + T alpha, + T beta, + cudaStream_t stream) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; - int lda = trans_a == CUBLAS_OP_T ? k : m; - int ldb = trans_b == CUBLAS_OP_T ? n : k; - int ldc = m; - RAFT_CUBLAS_TRY( - cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); + int m = n_rows_c; + int n = n_cols_c; + auto k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + return matmul(res, + trans_a == CUBLAS_OP_T, + trans_b == CUBLAS_OP_T, + static_cast(n_rows_c), + static_cast(n_cols_c), + static_cast(k), + &alpha, + a, + static_cast(trans_a == CUBLAS_OP_T ? k : m), + b, + static_cast(trans_b == CUBLAS_OP_T ? 
n : k), + &beta, + c, + static_cast(m), + stream); } -template -void gemm(raft::resources const& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) +template +void legacy_gemm(raft::resources const& res, + const T* a, + int n_rows_a, + int n_cols_a, + const T* b, + T* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); + return legacy_gemm( + res, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, T{1}, T{0}, stream); } template -void gemm(raft::resources const& handle, - T* z, - T* x, - T* y, - int _M, - int _N, - int _K, - bool isZColMajor, - bool isXColMajor, - bool isYColMajor, - cudaStream_t stream, - T* alpha, - T* beta) +void legacy_gemm(raft::resources const& res, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + const T* alpha, + const T* beta) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - cublas_device_pointer_mode pmode(cublas_h); - - cublasOperation_t trans_a, trans_b; - T *a, *b, *c; - int lda, ldb, ldc; - int M, N, K; - // This function performs c = a * b. Based on the required output layout, - // either a = x, b = y or a = y, b = x. In either case c = z. - if (isZColMajor == true) { - // Result c is required in column major layout. Thus we perform, - // z = x * y - // Using BLAS call c = a * b. Therefore a = x, b = y and c = z - - a = x; - // If x is in row major layout, cublas needs to transpose x first, - // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major - // layout, trans_b needs to be CUBLAS_OP_N. - trans_a = isXColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - // Set leading dimension appropriately - lda = isXColMajor == true ? _M : _K; - - b = y; - // If y is in row major layout, cublas needs to transpose y first, - // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major - // layout, trans_b needs to be CUBLAS_OP_N. - trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; - - c = z; - ldc = _M; - M = _M; - N = _N; - K = _K; + if (isZColMajor) { + return matmul(res, + !isXColMajor, + !isYColMajor, + static_cast(_M), + static_cast(_N), + static_cast(_K), + alpha, + x, + static_cast(isXColMajor ? _M : _K), + y, + static_cast(isYColMajor ? _K : _N), + beta, + z, + static_cast(_M), + stream); } else { - // Result c is required in row major layout Thus we pick - // a = y, b = x and c = a * b = y * x - // cublas produces output matrix only in column major layout. To get output - // matrix on row major layout, we need to produce transpose of output - // in column major layout. Therefore we perform, - // tr(z) = tr(y) * tr(x) - // we model this using cublas call for c = a * b - // therefore a = tr(y), b = tr(x) and c = tr(z) - - a = y; - // If y is in row major layout, it can be/ interpreted as tr(y) on column - // major layout. Therefore we can pass trans_a as CUBLAS_OP_N. If y is in - // column major layout, cublas needs to transpose y first, therefore - // trans_a needs to be CUBLAS_OP_T - trans_a = isYColMajor == true ? 
CUBLAS_OP_T : CUBLAS_OP_N; - // Set leading dimension appropriately - lda = isYColMajor == true ? _K : _N; - - b = x; - // If x is in row major layout, it can be interpreted as tr(x) on column - // major layout. Therefore we can pass trans_b as CUBLAS_OP_N. If x is in - // column major layout, cublas needs to trasponse x first, therefore - // trans_b needs to be CUBLAS_OP_T - trans_b = isXColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; - // Set leading dimension appropriately - ldb = isXColMajor == true ? _M : _K; - - c = z; - ldc = _N; - - M = _N; - N = _M; - K = _K; + return legacy_gemm( + res, z, y, x, _N, _M, _K, true, !isYColMajor, !isXColMajor, stream, alpha, beta); } - // Actual cuBLAS call - RAFT_CUBLAS_TRY( - cublasgemm(cublas_h, trans_a, trans_b, M, N, K, alpha, a, lda, b, ldb, beta, c, ldc, stream)); } -} // namespace detail -} // namespace linalg -} // namespace raft +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index aea9d52673..35b877b20c 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -19,6 +19,7 @@ #pragma once #include "detail/gemm.hpp" + #include #include #include @@ -27,8 +28,65 @@ #include #include -namespace raft { -namespace linalg { +namespace raft::linalg { + +/** + * @brief the wrapper of cublasLt matmul function + * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C + * + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources + * @param [in] trans_a cublas transpose op for A + * @param [in] trans_b cublas transpose op for B + * @param [in] m number of rows of C + * @param [in] n number of columns of C + * @param [in] k number of rows of opB(B) / number of columns of opA(A) + * @param [in] alpha host or device scalar + * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] lda leading dimension of A + * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] ldb leading dimension of B + * @param [in] beta host or device scalar + * @param [inout] C column-major matrix of size [m, n] + * @param [in] ldc leading dimension of C + */ +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc) +{ + return detail::matmul(res, + trans_a, + trans_b, + m, + n, + k, + alpha, + a_ptr, + lda, + b_ptr, + ldb, + beta, + c_ptr, + ldc, + resource::get_cuda_stream(res)); +} /** * @brief the wrapper of cublas gemm function @@ -69,7 +127,7 @@ void gemm(raft::resources const& handle, const int ldc, cudaStream_t stream) { - detail::gemm( + return detail::legacy_gemm( handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream); } @@ -106,7 +164,7 @@ void gemm(raft::resources const& handle, math_t beta, cudaStream_t stream) { - detail::gemm( + detail::legacy_gemm( handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } @@ -139,7 +197,8 @@ void gemm(raft::resources const& handle, cublasOperation_t trans_b, cudaStream_t stream) { - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, 
n_rows_c, n_cols_c, trans_a, trans_b, stream); + detail::legacy_gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); } /** @@ -176,7 +235,7 @@ void gemm(raft::resources const& handle, T alpha = T(1.0), T beta = T(0.0)) { - detail::gemm( + return detail::legacy_gemm( handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, &alpha, &beta); } @@ -256,24 +315,23 @@ void gemm(raft::resources const& handle, if (!beta) { beta = beta_host.view(); } } - detail::gemm(handle, - z.data_handle(), - x.data_handle(), - y.data_handle(), - x.extent(0), - y.extent(1), - x.extent(1), - is_z_col_major, - is_x_col_major, - is_y_col_major, - resource::get_cuda_stream(handle), - alpha.value().data_handle(), - beta.value().data_handle()); + return detail::legacy_gemm(handle, + z.data_handle(), + x.data_handle(), + y.data_handle(), + x.extent(0), + y.extent(1), + x.extent(1), + is_z_col_major, + is_x_col_major, + is_y_col_major, + resource::get_cuda_stream(handle), + alpha.value().data_handle(), + beta.value().data_handle()); } /** @} */ // end of gemm -} // end namespace linalg -} // end namespace raft +} // namespace raft::linalg #endif diff --git a/cpp/include/raft/util/cache.hpp b/cpp/include/raft/util/cache.hpp new file mode 100644 index 0000000000..ee1ad1cb19 --- /dev/null +++ b/cpp/include/raft/util/cache.hpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace raft::cache { + +/** Associative cache with least recently used replacement policy. */ +template , + typename EqK = std::equal_to, + typename... Values> +class lru { + public: + explicit lru(size_t size) : size_(size) + { + RAFT_EXPECTS(size >= 1, "The cache must fit at least one record."); + } + + void set(const K& key, const Values&... values) + { + auto pos = map_.find(key); + if (pos == map_.end()) { + if (map_.size() >= size_) { + map_.erase(queue_.back()); + queue_.pop_back(); + } + } else { + queue_.erase(std::get<0>(pos->second)); + } + queue_.push_front(key); + map_[key] = std::make_tuple(queue_.begin(), values...); + } + + auto get(const K& key, Values*... values) -> bool + { + auto pos = map_.find(key); + if (pos == map_.end()) { return false; } + auto& map_val = pos->second; + queue_.erase(std::get<0>(map_val)); + queue_.push_front(key); + std::get<0>(map_val) = queue_.begin(); + set_values(map_val, values..., std::index_sequence_for()); + return true; + } + + private: + using queue_iterator = typename std::list::iterator; + std::list queue_{}; + std::unordered_map, HashK, EqK> map_{}; + size_t size_; + + template + static void set_values(const std::tuple& tup, + Values*... 
vals, + std::index_sequence) + { + ((*vals = std::get(tup)), ...); + } +}; + +}; // namespace raft::cache diff --git a/cpp/include/raft/util/cuda_data_type.hpp b/cpp/include/raft/util/cuda_data_type.hpp new file mode 100644 index 0000000000..cf83fc2fd0 --- /dev/null +++ b/cpp/include/raft/util/cuda_data_type.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft { + +template +constexpr auto get_cuda_data_type() -> cudaDataType_t; + +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_8I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_8U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16F; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32F; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64F; +} +} // namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index 898c8ad5aa..2d591eb942 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -162,7 +162,7 @@ const std::vector> inputsd = { typedef GemmLayoutTest GemmLayoutTestF; TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(2e-4))); } typedef GemmLayoutTest GemmLayoutTestD; From dc7a9a42824a5e4e03f2365f4bf3a3fcf0d18ab5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Aug 2023 19:25:38 +0200 Subject: [PATCH 02/15] Replace broken (due to missing direct includes) direct uses of cublasgemm --- cpp/include/raft/stats/detail/cov.cuh | 26 ++------- cpp/test/random/multi_variable_gaussian.cu | 63 +++++++++++----------- 2 files changed, 34 insertions(+), 55 deletions(-) diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh index 0f740c8ed9..b0f83b9bc4 100644 --- a/cpp/include/raft/stats/detail/cov.cuh +++ b/cpp/include/raft/stats/detail/cov.cuh @@ -57,34 +57,14 @@ void cov(raft::resources const& handle, cudaStream_t stream) { if (stable) { - cublasHandle_t cublas_h = resource::get_cublas_handle(handle); - // since 
mean operation is assumed to be along a given column, broadcast // must be along rows! raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream); Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N)); Type beta = Type(0); - if (rowMajor) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_T, - D, - D, - N, - &alpha, - data, - D, - data, - D, - &beta, - covar, - D, - stream)); - } else { - raft::linalg::gemm( - handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - } + auto ldd = rowMajor ? D : N; + linalg::gemm( + handle, !rowMajor, rowMajor, D, D, N, &alpha, data, ldd, data, ldd, &beta, covar, D, stream); } else { ///@todo: implement this using cutlass + customized epilogue! ASSERT(false, "cov: Implement stable=false case!"); diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu index e35d49e453..e5ed3429b6 100644 --- a/cpp/test/random/multi_variable_gaussian.cu +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -107,7 +108,6 @@ class MVGTest : public ::testing::TestWithParam> { corr = params.corr; tolerance = params.tolerance; - auto cublasH = resource::get_cublas_handle(handle); auto cusolverH = resource::get_cusolver_dn_handle(handle); auto stream = resource::get_cuda_stream(handle); @@ -175,21 +175,21 @@ class MVGTest : public ::testing::TestWithParam> { // finding the cov matrix, placing in Rand_cov T alfa = 1.0 / (nPoints - 1), beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_T, - dim, - dim, - nPoints, - &alfa, - X_d.data(), - dim, - X_d.data(), - dim, - &beta, - Rand_cov.data(), - dim, - stream)); + linalg::gemm(handle, + false, + true, + dim, + dim, + nPoints, + &alfa, + X_d.data(), + dim, + X_d.data(), + dim, + &beta, + Rand_cov.data(), + dim, + stream); // restoring cov provided into P_d raft::update_device(P_d.data(), P.data(), dim * dim, stream); @@ -247,7 +247,6 @@ class MVGMdspanTest : public ::testing::TestWithParam> { corr = params.corr; tolerance = params.tolerance; - auto cublasH = resource::get_cublas_handle(handle); auto cusolverH = resource::get_cusolver_dn_handle(handle); auto stream = resource::get_cuda_stream(handle); @@ -309,21 +308,21 @@ class MVGMdspanTest : public ::testing::TestWithParam> { // finding the cov matrix, placing in Rand_cov T alfa = 1.0 / (nPoints - 1), beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_T, - dim, - dim, - nPoints, - &alfa, - X_d.data(), - dim, - X_d.data(), - dim, - &beta, - Rand_cov.data(), - dim, - stream)); + linalg::gemm(handle, + false, + true, + dim, + dim, + nPoints, + &alfa, + X_d.data(), + dim, + X_d.data(), + dim, + &beta, + Rand_cov.data(), + dim, + stream); // restoring cov provided into P_d raft::update_device(P_d.data(), P.data(), dim * dim, stream); From 71c03c0dee987f53714f21f70fd7ce6d66745e53 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 15 Aug 2023 07:33:05 +0200 Subject: [PATCH 03/15] Fix docs --- cpp/include/raft/linalg/detail/gemm.hpp | 6 +++--- cpp/include/raft/linalg/gemm.cuh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 3966d7e1eb..1460641dcf 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ 
-229,12 +229,12 @@ static constexpr size_t kLRUSize = 100; * @param [in] n number of columns of C * @param [in] k number of rows of opB(B) / number of columns of opA(A) * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] * @param [in] ldb leading dimension of B * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] + * @param [inout] c_ptr column-major matrix of size [m, n] * @param [in] ldc leading dimension of C * @param [in] stream */ diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 35b877b20c..3057a4712d 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -47,12 +47,12 @@ namespace raft::linalg { * @param [in] n number of columns of C * @param [in] k number of rows of opB(B) / number of columns of opA(A) * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] * @param [in] ldb leading dimension of B * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] + * @param [inout] c_ptr column-major matrix of size [m, n] * @param [in] ldc leading dimension of C */ template From a2fb088050ce5ea835288febf821e0525eb89dde Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 16 Aug 2023 13:41:31 +0200 Subject: [PATCH 04/15] Replace cublasgemm where it makes sense --- .../raft/random/detail/make_regression.cuh | 98 +++++++------- .../random/detail/multi_variable_gaussian.cuh | 37 +---- cpp/test/random/make_regression.cu | 128 +++++++++--------- 3 files changed, 117 insertions(+), 146 deletions(-) diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh index aec1a15f84..e8fba083a7 100644 --- a/cpp/include/raft/random/detail/make_regression.cuh +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -22,10 +22,9 @@ #include -#include #include #include -#include +#include #include #include #include @@ -62,8 +61,6 @@ static void _make_low_rank_matrix(raft::resources const& handle, raft::random::RngState& r, cudaStream_t stream) { - cublasHandle_t cublas_handle = resource::get_cublas_handle(handle); - IdxT n = std::min(n_rows, n_cols); // Generate random (ortho normal) vectors with QR decomposition @@ -92,36 +89,36 @@ static void _make_low_rank_matrix(raft::resources const& handle, rmm::device_uvector temp_q0s(n_rows * n, stream); rmm::device_uvector temp_out(n_rows * n_cols, stream); DataT alpha = 1.0, beta = 0.0; - raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n_rows, - n, - n, - &alpha, - q0.data(), - n_rows, - singular_mat.data(), - n, - &beta, - temp_q0s.data(), - n_rows, - stream); - raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n_rows, - n_cols, - n, - &alpha, - temp_q0s.data(), - n_rows, - q1.data(), - n_cols, - 
&beta, - temp_out.data(), - n_rows, - stream); + raft::linalg::gemm(handle, + false, + false, + n_rows, + n, + n, + &alpha, + q0.data(), + n_rows, + singular_mat.data(), + n, + &beta, + temp_q0s.data(), + n_rows, + stream); + raft::linalg::gemm(handle, + false, + true, + n_rows, + n_cols, + n, + &alpha, + temp_q0s.data(), + n_rows, + q1.data(), + n_cols, + &beta, + temp_out.data(), + n_rows, + stream); // Transpose from column-major to row-major raft::linalg::transpose(handle, temp_out.data(), out, n_rows, n_cols, stream); @@ -165,9 +162,6 @@ void make_regression_caller(raft::resources const& handle, { n_informative = std::min(n_informative, n_cols); - cublasHandle_t cublas_handle = resource::get_cublas_handle(handle); - - cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST); raft::random::RngState r(seed, type); if (effective_rank < 0) { @@ -219,21 +213,21 @@ void make_regression_caller(raft::resources const& handle, // Compute the output values DataT alpha = (DataT)1.0, beta = (DataT)0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_T, - n_rows, - n_targets, - n_informative, - &alpha, - out, - n_cols, - _coef, - n_targets, - &beta, - _values_col, - n_rows, - stream)); + raft::linalg::gemm(handle, + true, + true, + n_rows, + n_targets, + n_informative, + &alpha, + out, + n_cols, + _coef, + n_targets, + &beta, + _values_col, + n_rows, + stream); // Transpose the values from column-major to row-major if needed if (n_targets > 1) { diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index 68934ac1ff..59cf187915 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -20,12 +20,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -193,7 +193,6 @@ class multi_variable_gaussian_impl { void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) { auto cusolverHandle = resource::get_cusolver_dn_handle(handle); - auto cublasHandle = resource::get_cublas_handle(handle); auto cudaStream = resource::get_cuda_stream(handle); if (method == chol_decomp) { // lower part will contains chol_decomp @@ -233,21 +232,8 @@ class multi_variable_gaussian_impl { RAFT_CUDA_TRY(cudaPeekAtLastError()); // P is lower triangular chol decomp mtrx - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_N, - dim, - nPoints, - dim, - &alfa, - P, - dim, - X, - dim, - &beta, - X, - dim, - cudaStream)); + raft::linalg::gemm( + handle, false, false, dim, nPoints, dim, &alfa, P, dim, X, dim, &beta, X, dim, cudaStream); } else { epsilonToZero(eig, epsilon, dim, cudaStream); dim3 block(64); @@ -263,21 +249,8 @@ class multi_variable_gaussian_impl { ASSERT(info_h == 0, "mvg: Cov matrix has %dth Eigenval negative", info_h); // Got Q = eigvect*eigvals.sqrt in P, Q*X in X below - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_N, - dim, - nPoints, - dim, - &alfa, - P, - dim, - X, - dim, - &beta, - X, - dim, - cudaStream)); + raft::linalg::gemm( + handle, false, false, dim, nPoints, dim, &alfa, P, dim, X, dim, &beta, X, dim, cudaStream); } // working to make mean not 0 // since we are working with column-major, nPoints and dim are swapped diff --git a/cpp/test/random/make_regression.cu b/cpp/test/random/make_regression.cu index 0df3b2e7b0..9db03867bd 100644 --- 
a/cpp/test/random/make_regression.cu +++ b/cpp/test/random/make_regression.cu @@ -14,23 +14,24 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include - #include "../test_utils.cuh" + +#include +#include #include -#include +#include +#include #include - #include #include #include #include +#include +#include + +#include + namespace raft::random { template @@ -45,15 +46,6 @@ struct MakeRegressionInputs { template class MakeRegressionTest : public ::testing::TestWithParam> { - public: - MakeRegressionTest() - : params(::testing::TestWithParam>::GetParam()), - stream(resource::get_cuda_stream(handle)), - values_ret(params.n_samples * params.n_targets, stream), - values_prod(params.n_samples * params.n_targets, stream) - { - } - protected: void SetUp() override { @@ -88,21 +80,21 @@ class MakeRegressionTest : public ::testing::TestWithParam __coef = thrust::device_pointer_cast(coef.data()); - zero_count = thrust::count(__coef, __coef + params.n_features * params.n_targets, (T)0.0); + rmm::device_scalar zc_device(stream); + raft::linalg::mapReduce(zc_device.data(), + coef.size(), + 0, + raft::compose_op{raft::cast_op{}, raft::equal_const_op{0}}, + raft::add_op{}, + stream, + coef.data()); + zero_count = zc_device.value(stream); } protected: + MakeRegressionInputs params{::testing::TestWithParam>::GetParam()}; raft::resources handle; - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{resource::get_cuda_stream(handle)}; + rmm::device_uvector values_ret{size_t(params.n_samples) * size_t(params.n_targets), stream}; + rmm::device_uvector values_prod{size_t(params.n_samples) * size_t(params.n_targets), stream}; - MakeRegressionInputs params; - rmm::device_uvector values_ret, values_prod; int zero_count; }; @@ -183,8 +183,6 @@ class MakeRegressionMdspanTest : public ::testing::TestWithParam __coef = thrust::device_pointer_cast(coef.data()); - constexpr T ZERO{}; - zero_count = thrust::count(__coef, __coef + params.n_features * params.n_targets, ZERO); + rmm::device_scalar zc_device(stream); + raft::linalg::mapReduce(zc_device.data(), + coef.size(), + 0, + raft::compose_op{raft::cast_op{}, raft::equal_const_op{0}}, + raft::add_op{}, + stream, + coef.data()); + zero_count = zc_device.value(stream); } private: MakeRegressionInputs params{::testing::TestWithParam>::GetParam()}; raft::resources handle; - rmm::device_uvector values_ret{params.n_samples * params.n_targets, - resource::get_cuda_stream(handle)}; - rmm::device_uvector values_prod{params.n_samples * params.n_targets, - resource::get_cuda_stream(handle)}; + rmm::cuda_stream_view stream{resource::get_cuda_stream(handle)}; + rmm::device_uvector values_ret{size_t(params.n_samples) * size_t(params.n_targets), stream}; + rmm::device_uvector values_prod{size_t(params.n_samples) * size_t(params.n_targets), stream}; + int zero_count = -1; }; From 699de0c046b7ad93690c29efd34351e7729d8085 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 16 Aug 2023 13:42:00 +0200 Subject: [PATCH 05/15] Fix a typo --- cpp/include/raft/linalg/detail/gemm.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 1460641dcf..a30c08efad 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -154,7 +154,7 @@ struct cublastlt_matmul_desc { inline operator cublasLtMatmulDesc_t() const noexcept { return res; } template - static inline auto for_matmult(bool transpose_a, bool 
transpose_b) -> cublastlt_matmul_desc + static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc { auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; if constexpr (DevicePointerMode) { @@ -187,7 +187,7 @@ struct matmul_desc { static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc { matmul_desc r{ - cublastlt_matmul_desc::for_matmult(args.trans_a, args.trans_b), + cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; From f4d634aa1c9f007f00409d367cda1892517761b1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 21 Aug 2023 15:54:06 +0200 Subject: [PATCH 06/15] Put the cache into the resource handle as a user-define resource --- .../raft/core/resource/resource_types.hpp | 1 + .../raft/core/resource/user_resource.hpp | 71 +++++++++++++++++++ cpp/include/raft/linalg/detail/gemm.hpp | 16 +++-- cpp/include/raft/util/cache.hpp | 9 ++- 4 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 cpp/include/raft/core/resource/user_resource.hpp diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index b32e09bb6b..2910b32b12 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -42,6 +42,7 @@ enum resource_type { THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource CUBLASLT_HANDLE, // cublasLt handle + USER_DEFINED, // user-defined default-constructible resource LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/core/resource/user_resource.hpp b/cpp/include/raft/core/resource/user_resource.hpp new file mode 100644 index 0000000000..5da23f75b3 --- /dev/null +++ b/cpp/include/raft/core/resource/user_resource.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace raft::resource { + +class user_resource : public resource { + public: + user_resource() = default; + ~user_resource() noexcept override = default; + auto get_resource() -> void* override { return this; } + + template + auto load() -> Store* + { + std::lock_guard _(lock_); + auto key = std::type_index{typeid(Store)}; + auto pos = map_.find(key); + if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } + auto store_ptr = new Store{}; + map_[key] = + std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); + return store_ptr; + } + + private: + std::unordered_map> map_{}; + std::mutex lock_{}; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. 
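+ * The created user_resource hands out lazily default-constructed objects keyed by their
+ * std::type_index (see user_resource::load and get_user_resource below).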
*/ +class user_resource_factory : public resource_factory { + public: + auto get_resource_type() -> resource_type override { return resource_type::USER_DEFINED; } + auto make_resource() -> resource* override { return new user_resource(); } +}; + +/** + * Get the user-defined default-constructible resource if it exists, create it otherwise. + * @param[in] res the raft resources object + * @return a pointer to the user-defined resource. + */ +template +auto get_user_resource(resources const& res) -> Store* +{ + if (!res.has_resource_factory(resource_type::USER_DEFINED)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::USER_DEFINED)->load(); +}; + +} // namespace raft::resource diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index a30c08efad..462d758be6 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -209,8 +210,14 @@ struct matmul_desc { } }; -/** Number of matmul invocations to cache. */ -static constexpr size_t kLRUSize = 100; +/** Cache with the default constructor; tagged with input types to use separate caches. */ +template +struct matmul_cache { + /** Number of matmul invocations to cache. */ + static constexpr size_t kDefaultSize = 100; + cache::lru, std::shared_ptr> value{ + kDefaultSize}; +}; /** * @brief the wrapper of cublasLt matmul function @@ -259,9 +266,8 @@ void matmul(raft::resources const& res, "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); std::shared_ptr mm_desc{nullptr}; matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; - static thread_local cache:: - lru, std::shared_ptr> - cache{kLRUSize}; + auto& cache = + resource::get_user_resource>(res)->value; if (!cache.get(mm_key, &mm_desc)) { mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); cache.set(mm_key, mm_desc); diff --git a/cpp/include/raft/util/cache.hpp b/cpp/include/raft/util/cache.hpp index ee1ad1cb19..c174aa489d 100644 --- a/cpp/include/raft/util/cache.hpp +++ b/cpp/include/raft/util/cache.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -33,13 +34,17 @@ template class lru { public: - explicit lru(size_t size) : size_(size) + /** Default cache size. */ + static constexpr size_t kDefaultSize = 100; + + explicit lru(size_t size = kDefaultSize) : size_(size) { RAFT_EXPECTS(size >= 1, "The cache must fit at least one record."); } void set(const K& key, const Values&... values) { + std::lock_guard guard(lock_); auto pos = map_.find(key); if (pos == map_.end()) { if (map_.size() >= size_) { @@ -55,6 +60,7 @@ class lru { auto get(const K& key, Values*... 
values) -> bool { + std::lock_guard guard(lock_); auto pos = map_.find(key); if (pos == map_.end()) { return false; } auto& map_val = pos->second; @@ -69,6 +75,7 @@ class lru { using queue_iterator = typename std::list::iterator; std::list queue_{}; std::unordered_map, HashK, EqK> map_{}; + std::mutex lock_{}; size_t size_; template From e57eebf481063357344f58ed2c7350d9452f365f Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 22 Aug 2023 10:07:54 +0200 Subject: [PATCH 07/15] Move matmul into a separate file --- cpp/include/raft/linalg/detail/gemm.hpp | 273 +------------------- cpp/include/raft/linalg/detail/matmul.hpp | 293 ++++++++++++++++++++++ cpp/include/raft/linalg/gemm.cuh | 58 ----- 3 files changed, 296 insertions(+), 328 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/matmul.hpp diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 462d758be6..1d643566e1 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -15,281 +15,14 @@ */ #pragma once -#include -#include -#include -#include -#include +#include "matmul.hpp" + #include -#include -#include -#include -#include +#include // cublasOperation_t namespace raft::linalg::detail { -/** Get the cublas compute type for the combination of input types. */ -template -auto get_matmul_type() -> cublasComputeType_t -{ - static_assert(std::is_same_v && std::is_same_v && std::is_same_v && - std::is_same_v, - "Unsupported combination of input types. Consult cublas API for supported types."); - return CUBLAS_COMPUTE_32F; -} - -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_16F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32I; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32I; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_64F; -} - -/** Unique representation of a matrix multiplication (assuming fixed types). */ -struct matmul_key_t { - uint64_t m; - uint64_t n; - uint64_t k; - uint64_t lda; - uint64_t ldb; - uint64_t ldc; - bool trans_a; - bool trans_b; -}; - -inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool -{ - return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && - a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; -} - -struct matmul_key_hash { - inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t - { - return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; - } -}; - -/** Descriptor for a column-major cublasLt matrix. 
*/ -struct cublastlt_matrix_layout { - cublasLtMatrixLayout_t res{nullptr}; - inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) - { - RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); - } - inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; - inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; - inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; - inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; - - inline ~cublastlt_matrix_layout() noexcept - { - RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); - } - - // NOLINTNEXTLINE - inline operator cublasLtMatrixLayout_t() const noexcept { return res; } - - template - static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) - -> cublastlt_matrix_layout - { - return cublastlt_matrix_layout{ - get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; - } -}; - -/** Descriptor for a cublasLt matmul function. */ -struct cublastlt_matmul_desc { - cublasLtMatmulDesc_t res{nullptr}; - inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) - { - RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); - } - inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; - inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; - inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; - inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; - - inline ~cublastlt_matmul_desc() noexcept - { - RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); - } - - // NOLINTNEXTLINE - inline operator cublasLtMatmulDesc_t() const noexcept { return res; } - - template - static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc - { - auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; - if constexpr (DevicePointerMode) { - const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); - } - const cublasOperation_t trans_op = CUBLAS_OP_T; - if (transpose_a) { - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); - } - if (transpose_b) { - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); - } - return desc; - } -}; - -/** Full description of matmul. 
*/ -struct matmul_desc { - cublastlt_matmul_desc desc; - cublastlt_matrix_layout a; - cublastlt_matrix_layout b; - cublastlt_matrix_layout c; - cublasLtMatmulHeuristicResult_t heuristics; - - template - static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc - { - matmul_desc r{ - cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), - cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), - cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), - cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; - int algo_count; - cublasLtMatmulPreference_t preference; - RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); - RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), - r.desc, - r.a, - r.b, - r.c, - r.c, - preference, - 1, - &r.heuristics, - &algo_count)); - RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); - return r; - } -}; - -/** Cache with the default constructor; tagged with input types to use separate caches. */ -template -struct matmul_cache { - /** Number of matmul invocations to cache. */ - static constexpr size_t kDefaultSize = 100; - cache::lru, std::shared_ptr> value{ - kDefaultSize}; -}; - -/** - * @brief the wrapper of cublasLt matmul function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @tparam S the type of scale parameters alpha, beta - * @tparam A the element type of matrix A - * @tparam B the element type of matrix B - * @tparam C the element type of matrix C - * - * @param [in] res raft resources - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] c_ptr column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - * @param [in] stream - */ -template -void matmul(raft::resources const& res, - bool trans_a, - bool trans_b, - uint64_t m, - uint64_t n, - uint64_t k, - const S* alpha, - const A* a_ptr, - uint64_t lda, - const B* b_ptr, - uint64_t ldb, - const S* beta, - C* c_ptr, - uint64_t ldc, - cudaStream_t stream) -{ - common::nvtx::range batch_scope( - "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); - std::shared_ptr mm_desc{nullptr}; - matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; - auto& cache = - resource::get_user_resource>(res)->value; - if (!cache.get(mm_key, &mm_desc)) { - mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); - cache.set(mm_key, mm_desc); - } - RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), - mm_desc->desc, - alpha, - a_ptr, - mm_desc->a, - b_ptr, - mm_desc->b, - beta, - c_ptr, - mm_desc->c, - c_ptr, - mm_desc->c, - &(mm_desc->heuristics.algo), - nullptr, - 0, - stream)); -} - template void legacy_gemm(raft::resources const& res, const bool trans_a, diff --git a/cpp/include/raft/linalg/detail/matmul.hpp 
b/cpp/include/raft/linalg/detail/matmul.hpp new file mode 100644 index 0000000000..6aa6a32bb8 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matmul.hpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::linalg::detail { + +/** Get the cublas compute type for the combination of input types. */ +template +auto get_matmul_type() -> cublasComputeType_t +{ + static_assert(std::is_same_v && std::is_same_v && std::is_same_v && + std::is_same_v, + "Unsupported combination of input types. Consult cublas API for supported types."); + return CUBLAS_COMPUTE_32F; +} + +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_16F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_64F; +} + +/** Unique representation of a matrix multiplication (assuming fixed types). */ +struct matmul_key_t { + uint64_t m; + uint64_t n; + uint64_t k; + uint64_t lda; + uint64_t ldb; + uint64_t ldc; + bool trans_a; + bool trans_b; +}; + +inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool +{ + return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && + a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; +} + +struct matmul_key_hash { + inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t + { + return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; + } +}; + +/** Descriptor for a column-major cublasLt matrix. 
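+ * Owns the underlying `cublasLtMatrixLayout_t`: the handle is created in the constructor,
+ * destroyed in the destructor, and cannot be copied.
+ * `for_matmul` swaps the rows and columns of the layout when `col_major` is false.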
*/ +struct cublastlt_matrix_layout { + cublasLtMatrixLayout_t res{nullptr}; + inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) + { + RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); + } + inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; + inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; + inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; + inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; + + inline ~cublastlt_matrix_layout() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatrixLayout_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) + -> cublastlt_matrix_layout + { + return cublastlt_matrix_layout{ + get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; + } +}; + +/** Descriptor for a cublasLt matmul function. */ +struct cublastlt_matmul_desc { + cublasLtMatmulDesc_t res{nullptr}; + inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) + { + RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); + } + inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; + inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; + inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; + inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; + + inline ~cublastlt_matmul_desc() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatmulDesc_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc + { + auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; + if constexpr (DevicePointerMode) { + const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); + } + const cublasOperation_t trans_op = CUBLAS_OP_T; + if (transpose_a) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); + } + if (transpose_b) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); + } + return desc; + } +}; + +/** Full description of matmul. 
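+ * Bundles the cublasLt operation descriptor, the layouts of the A, B, and C matrices,
+ * and the heuristics result selected for this problem shape; instances are cached by
+ * `matmul_key_t`, so repeated calls with the same shape skip the heuristics query.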
*/ +struct matmul_desc { + cublastlt_matmul_desc desc; + cublastlt_matrix_layout a; + cublastlt_matrix_layout b; + cublastlt_matrix_layout c; + cublasLtMatmulHeuristicResult_t heuristics; + + template + static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc + { + matmul_desc r{ + cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), + cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), + cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), + cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; + int algo_count; + cublasLtMatmulPreference_t preference; + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); + RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), + r.desc, + r.a, + r.b, + r.c, + r.c, + preference, + 1, + &r.heuristics, + &algo_count)); + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); + return r; + } +}; + +/** Cache with the default constructor; tagged with input types to use separate caches. */ +template +struct matmul_cache { + /** Number of matmul invocations to cache. */ + static constexpr size_t kDefaultSize = 100; + cache::lru, std::shared_ptr> value{ + kDefaultSize}; +}; + +/** + * @brief the wrapper of cublasLt matmul function + * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C + * + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources + * @param [in] trans_a cublas transpose op for A + * @param [in] trans_b cublas transpose op for B + * @param [in] m number of rows of C + * @param [in] n number of columns of C + * @param [in] k number of rows of opB(B) / number of columns of opA(A) + * @param [in] alpha host or device scalar + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] lda leading dimension of A + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] ldb leading dimension of B + * @param [in] beta host or device scalar + * @param [inout] c_ptr column-major matrix of size [m, n] + * @param [in] ldc leading dimension of C + * @param [in] stream + */ +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc, + cudaStream_t stream) +{ + common::nvtx::range batch_scope( + "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); + std::shared_ptr mm_desc{nullptr}; + matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; + auto& cache = + resource::get_user_resource>(res)->value; + if (!cache.get(mm_key, &mm_desc)) { + mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); + cache.set(mm_key, mm_desc); + } + RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), + mm_desc->desc, + alpha, + a_ptr, + mm_desc->a, + b_ptr, + mm_desc->b, + beta, + c_ptr, + mm_desc->c, + c_ptr, + mm_desc->c, + &(mm_desc->heuristics.algo), + nullptr, + 0, + stream)); +} + +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 3057a4712d..56e91aaa0b 100644 --- 
a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -30,64 +30,6 @@ namespace raft::linalg { -/** - * @brief the wrapper of cublasLt matmul function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @tparam S the type of scale parameters alpha, beta - * @tparam A the element type of matrix A - * @tparam B the element type of matrix B - * @tparam C the element type of matrix C - * - * @param [in] res raft resources - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] c_ptr column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - */ -template -void matmul(raft::resources const& res, - bool trans_a, - bool trans_b, - uint64_t m, - uint64_t n, - uint64_t k, - const S* alpha, - const A* a_ptr, - uint64_t lda, - const B* b_ptr, - uint64_t ldb, - const S* beta, - C* c_ptr, - uint64_t ldc) -{ - return detail::matmul(res, - trans_a, - trans_b, - m, - n, - k, - alpha, - a_ptr, - lda, - b_ptr, - ldb, - beta, - c_ptr, - ldc, - resource::get_cuda_stream(res)); -} - /** * @brief the wrapper of cublas gemm function * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C From d44bf2048ec46ec9c93e7e0891e39c7cc6101aa6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 22 Aug 2023 11:04:15 +0200 Subject: [PATCH 08/15] Complete the docs --- .../raft/core/resource/cublas_handle.hpp | 4 +-- .../raft/core/resource/cublaslt_handle.hpp | 6 ++-- .../raft/core/resource/user_resource.hpp | 30 +++++++++++----- docs/source/cpp_api/core_resources.rst | 35 +++++++++++++++++-- 4 files changed, 59 insertions(+), 16 deletions(-) diff --git a/cpp/include/raft/core/resource/cublas_handle.hpp b/cpp/include/raft/core/resource/cublas_handle.hpp index c8d8ee4c02..33dde49135 100644 --- a/cpp/include/raft/core/resource/cublas_handle.hpp +++ b/cpp/include/raft/core/resource/cublas_handle.hpp @@ -60,8 +60,8 @@ class cublas_resource_factory : public resource_factory { */ /** - * Load a cublasres_t from raft res if it exists, otherwise - * add it and return it. + * Load a `cublasHandle_t` from raft res if it exists, otherwise add it and return it. + * * @param[in] res the raft resources object * @return cublas handle */ diff --git a/cpp/include/raft/core/resource/cublaslt_handle.hpp b/cpp/include/raft/core/resource/cublaslt_handle.hpp index 0d83fae752..16f150f268 100644 --- a/cpp/include/raft/core/resource/cublaslt_handle.hpp +++ b/cpp/include/raft/core/resource/cublaslt_handle.hpp @@ -42,13 +42,13 @@ class cublaslt_resource_factory : public resource_factory { }; /** - * @defgroup resource_cublas cuBLAS handle resource functions + * @defgroup resource_cublaslt cuBLASLt handle resource functions * @{ */ /** - * Load a cublasLt res_t from raft res if it exists, otherwise - * add it and return it. + * Load a `cublasLtHandle_t` from raft res if it exists, otherwise add it and return it. 
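+ * The handle is created lazily on the first access and is owned by the `resources`
+ * object; the caller must not destroy it.
+ *
+ * Usage sketch (assuming a default-constructed `raft::device_resources`):
+ * @code{.cpp}
+ *   raft::device_resources res;
+ *   cublasLtHandle_t lt_handle = raft::resource::get_cublaslt_handle(res);
+ *   // lt_handle can now be passed to cublasLt calls such as cublasLtMatmul.
+ * @endcode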
+ * * @param[in] res the raft resources object * @return cublasLt handle */ diff --git a/cpp/include/raft/core/resource/user_resource.hpp b/cpp/include/raft/core/resource/user_resource.hpp index 5da23f75b3..21877c4133 100644 --- a/cpp/include/raft/core/resource/user_resource.hpp +++ b/cpp/include/raft/core/resource/user_resource.hpp @@ -29,16 +29,16 @@ class user_resource : public resource { ~user_resource() noexcept override = default; auto get_resource() -> void* override { return this; } - template - auto load() -> Store* + template + auto load() -> ResourceT* { std::lock_guard _(lock_); - auto key = std::type_index{typeid(Store)}; + auto key = std::type_index{typeid(ResourceT)}; auto pos = map_.find(key); - if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } - auto store_ptr = new Store{}; + if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } + auto store_ptr = new ResourceT{}; map_[key] = - std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); + std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); return store_ptr; } @@ -54,18 +54,30 @@ class user_resource_factory : public resource_factory { auto make_resource() -> resource* override { return new user_resource(); } }; +/** + * @defgroup resource_user_defined user-defined resource functions + * @{ + */ + /** * Get the user-defined default-constructible resource if it exists, create it otherwise. + * + * @tparam ResourceT the type of the resource; it must be complete and default-constructible. + * * @param[in] res the raft resources object * @return a pointer to the user-defined resource. */ -template -auto get_user_resource(resources const& res) -> Store* +template +auto get_user_resource(resources const& res) -> ResourceT* { if (!res.has_resource_factory(resource_type::USER_DEFINED)) { res.add_resource_factory(std::make_shared()); } - return res.get_resource(resource_type::USER_DEFINED)->load(); + return res.get_resource(resource_type::USER_DEFINED)->load(); }; +/** + * @} + */ + } // namespace raft::resource diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst index 85c454b355..af26af7bbb 100644 --- a/docs/source/cpp_api/core_resources.rst +++ b/docs/source/cpp_api/core_resources.rst @@ -25,7 +25,7 @@ namespace *raft::resource* Device Resources ---------------- -`raft::device_resources` is a convenience over using `raft::resources` directly. It provides accessor methods to retrieve resources such as the CUDA stream, stream pool, and handles to the various CUDA math libraries like cuBLAS and cuSOLVER. +`raft::device_resources` is a convenience over using `raft::resources` directly. It provides accessor methods to retrieve resources such as the CUDA stream, stream pool, and handles to the various CUDA math libraries like cuBLAS and cuSOLVER. ``#include `` @@ -73,7 +73,7 @@ namespace *raft::resource* cuBLAS Handle ~~~~~~~~~~~~~ -``#include `` +``#include `` namespace *raft::resource* @@ -82,6 +82,18 @@ namespace *raft::resource* :members: :content-only: +cuBLASLt Handle +~~~~~~~~~~~~~~~ + +``#include `` + +namespace *raft::resource* + + .. doxygengroup:: resource_cublaslt + :project: RAFT + :members: + :content-only: + CUDA Stream ~~~~~~~~~~~ @@ -202,3 +214,22 @@ namespace *raft::resource* :project: RAFT :members: :content-only: + +User-defined resources +~~~~~~~~~~~~~~~~~~~~~~ + +A user-defined resource is an arbitrary default-constructible C++ class. 
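+It is looked up by its `std::type_index`, so the type does not need to be known to RAFT in advance.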
+The consumer of the API can keep such a resource in the `raft::resources` handle. +For example, consider a function that is expected to be called repeatedly and +involves a costly kernel configuration. One can cache the kernel configuration in +a user-defined resource. +The cost of accessing it is one hashmap lookup. + +``#include `` + +namespace *raft::resource* + + .. doxygengroup:: resource_user_defined + :project: RAFT + :members: + :content-only: From de2958058c9c7693f3e33a660f0cdca042e842e0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 31 Aug 2023 17:14:59 +0200 Subject: [PATCH 09/15] move matmul.hpp to cublaslt_wrappers.hpp --- .../raft/linalg/detail/{matmul.hpp => cublaslt_wrappers.hpp} | 1 + cpp/include/raft/linalg/detail/gemm.hpp | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) rename cpp/include/raft/linalg/detail/{matmul.hpp => cublaslt_wrappers.hpp} (99%) diff --git a/cpp/include/raft/linalg/detail/matmul.hpp b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp similarity index 99% rename from cpp/include/raft/linalg/detail/matmul.hpp rename to cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp index 6aa6a32bb8..1c025282ed 100644 --- a/cpp/include/raft/linalg/detail/matmul.hpp +++ b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 1d643566e1..97f85fdae4 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -15,12 +15,10 @@ */ #pragma once -#include "matmul.hpp" +#include "cublaslt_wrappers.hpp" #include -#include // cublasOperation_t - namespace raft::linalg::detail { template From 090141a2f77780b3ef0de70423bac92262b04f65 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 30 Aug 2023 08:55:29 +0200 Subject: [PATCH 10/15] Cache IVF-PQ and select-warpsort kernel launch parameters to reduce latency --- .../raft/matrix/detail/select_k-inl.cuh | 4 +- .../raft/matrix/detail/select_warpsort.cuh | 93 ++++++++++++------ .../raft/neighbors/detail/ivf_pq_search.cuh | 95 ++++++++++++++++--- .../raft_internal/matrix/select_k.cuh | 10 +- 4 files changed, 153 insertions(+), 49 deletions(-) diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index 20fe1963fc..ba3138d918 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -286,11 +286,11 @@ void select_k(raft::resources const& handle, case Algo::kWarpDistributedShm: return detail::select::warpsort:: select_k_impl( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); + handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); case Algo::kWarpImmediate: return detail::select::warpsort:: select_k_impl( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); + handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); default: RAFT_FAIL("K-selection Algorithm not supported."); } } diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 0ee87de4f7..3fe5f52d12 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -18,7 +18,9 @@ #include #include +#include #include +#include #include #include #include @@ -773,6 +775,11 @@ __launch_bounds__(256) 
RAFT_KERNEL queue.store(out + block_id * k, out_idx + block_id * k); } +struct launch_params { + int block_size = 0; + int min_grid_size = 0; +}; + template