From 2cc477b9c3ff95b0b62264caa6bba0a6c66c50a1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Aug 2023 16:16:45 +0200 Subject: [PATCH 01/15] Replace GEMM backend: cublas.gemm -> cublaslt.matmul --- .../raft/core/resource/cublaslt_handle.hpp | 68 +++ .../raft/core/resource/resource_types.hpp | 1 + cpp/include/raft/linalg/detail/gemm.hpp | 543 ++++++++++++------ cpp/include/raft/linalg/gemm.cuh | 100 +++- cpp/include/raft/util/cache.hpp | 83 +++ cpp/include/raft/util/cuda_data_type.hpp | 82 +++ cpp/test/linalg/gemm_layout.cu | 2 +- 7 files changed, 669 insertions(+), 210 deletions(-) create mode 100644 cpp/include/raft/core/resource/cublaslt_handle.hpp create mode 100644 cpp/include/raft/util/cache.hpp create mode 100644 cpp/include/raft/util/cuda_data_type.hpp diff --git a/cpp/include/raft/core/resource/cublaslt_handle.hpp b/cpp/include/raft/core/resource/cublaslt_handle.hpp new file mode 100644 index 0000000000..0d83fae752 --- /dev/null +++ b/cpp/include/raft/core/resource/cublaslt_handle.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::resource { + +class cublaslt_resource : public resource { + public: + cublaslt_resource() { RAFT_CUBLAS_TRY(cublasLtCreate(&handle_)); } + ~cublaslt_resource() noexcept override { RAFT_CUBLAS_TRY_NO_THROW(cublasLtDestroy(handle_)); } + auto get_resource() -> void* override { return &handle_; } + + private: + cublasLtHandle_t handle_; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. */ +class cublaslt_resource_factory : public resource_factory { + public: + auto get_resource_type() -> resource_type override { return resource_type::CUBLASLT_HANDLE; } + auto make_resource() -> resource* override { return new cublaslt_resource(); } +}; + +/** + * @defgroup resource_cublas cuBLAS handle resource functions + * @{ + */ + +/** + * Load a cublasLt res_t from raft res if it exists, otherwise + * add it and return it. 
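+ * A minimal usage sketch (assuming a default-constructed raft::resources object):
+ * @code{.cpp}
+ *   raft::resources res;
+ *   cublasLtHandle_t lt_handle = raft::resource::get_cublaslt_handle(res);
+ *   // subsequent calls return the same handle cached inside `res`
+ * @endcode
+ *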
+ * @param[in] res the raft resources object + * @return cublasLt handle + */ +inline auto get_cublaslt_handle(resources const& res) -> cublasLtHandle_t +{ + if (!res.has_resource_factory(resource_type::CUBLASLT_HANDLE)) { + res.add_resource_factory(std::make_shared()); + } + auto ret = *res.get_resource(resource_type::CUBLASLT_HANDLE); + return ret; +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 2dc4eb1f9d..b32e09bb6b 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -41,6 +41,7 @@ enum resource_type { DEVICE_ID, // cuda device id THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource + CUBLASLT_HANDLE, // cublasLt handle LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index d82c821148..3966d7e1eb 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -13,27 +13,216 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include -#include "cublas_wrappers.hpp" +namespace raft::linalg::detail { -#include -#include +/** Get the cublas compute type for the combination of input types. */ +template +auto get_matmul_type() -> cublasComputeType_t +{ + static_assert(std::is_same_v && std::is_same_v && std::is_same_v && + std::is_same_v, + "Unsupported combination of input types. Consult cublas API for supported types."); + return CUBLAS_COMPUTE_32F; +} + +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_16F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_64F; +} + +/** Unique representation of a matrix multiplication (assuming fixed types). */ +struct matmul_key_t { + uint64_t m; + uint64_t n; + uint64_t k; + uint64_t lda; + uint64_t ldb; + uint64_t ldc; + bool trans_a; + bool trans_b; +}; + +inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool +{ + return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && + a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; +} -namespace raft { -namespace linalg { -namespace detail { +struct matmul_key_hash { + inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t + { + return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; + } +}; + +/** Descriptor for a column-major cublasLt matrix. 
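+ * Owning RAII wrapper: the cublasLtMatrixLayout_t is created in the constructor and
+ * destroyed in the destructor; the type is move-only and implicitly convertible to the
+ * underlying handle.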
*/ +struct cublastlt_matrix_layout { + cublasLtMatrixLayout_t res{nullptr}; + inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) + { + RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); + } + inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; + inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; + inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; + inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; + + inline ~cublastlt_matrix_layout() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatrixLayout_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) + -> cublastlt_matrix_layout + { + return cublastlt_matrix_layout{ + get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; + } +}; + +/** Descriptor for a cublasLt matmul function. */ +struct cublastlt_matmul_desc { + cublasLtMatmulDesc_t res{nullptr}; + inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) + { + RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); + } + inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; + inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; + inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; + inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; + + inline ~cublastlt_matmul_desc() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatmulDesc_t() const noexcept { return res; } + + template + static inline auto for_matmult(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc + { + auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; + if constexpr (DevicePointerMode) { + const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); + } + const cublasOperation_t trans_op = CUBLAS_OP_T; + if (transpose_a) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); + } + if (transpose_b) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); + } + return desc; + } +}; + +/** Full description of matmul. 
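+ * Bundles the matmul descriptor, the three matrix layouts and the heuristic returned by
+ * cublasLtMatmulAlgoGetHeuristic; instances are cached keyed by matmul_key_t, so the
+ * heuristic query runs only once per distinct problem shape.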
*/ +struct matmul_desc { + cublastlt_matmul_desc desc; + cublastlt_matrix_layout a; + cublastlt_matrix_layout b; + cublastlt_matrix_layout c; + cublasLtMatmulHeuristicResult_t heuristics; + + template + static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc + { + matmul_desc r{ + cublastlt_matmul_desc::for_matmult(args.trans_a, args.trans_b), + cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), + cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), + cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; + int algo_count; + cublasLtMatmulPreference_t preference; + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); + RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), + r.desc, + r.a, + r.b, + r.c, + r.c, + preference, + 1, + &r.heuristics, + &algo_count)); + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); + return r; + } +}; + +/** Number of matmul invocations to cache. */ +static constexpr size_t kLRUSize = 100; /** - * @brief the wrapper of cublas gemm function + * @brief the wrapper of cublasLt matmul function * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C * - * @tparam math_t the element type * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @param [in] handle raft handle + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources * @param [in] trans_a cublas transpose op for A * @param [in] trans_b cublas transpose op for B * @param [in] m number of rows of C @@ -49,195 +238,173 @@ namespace detail { * @param [in] ldc leading dimension of C * @param [in] stream */ -template -void gemm(raft::resources const& handle, - const bool trans_a, - const bool trans_b, - const int m, - const int n, - const int k, - const math_t* alpha, - const math_t* A, - const int lda, - const math_t* B, - const int ldb, - const math_t* beta, - math_t* C, - const int ldc, - cudaStream_t stream) +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc, + cudaStream_t stream) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - cublas_device_pointer_mode pmode(cublas_h); - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, - trans_b ? CUBLAS_OP_T : CUBLAS_OP_N, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc, - stream)); + common::nvtx::range batch_scope( + "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); + std::shared_ptr mm_desc{nullptr}; + matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; + static thread_local cache:: + lru, std::shared_ptr> + cache{kLRUSize}; + if (!cache.get(mm_key, &mm_desc)) { + mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); + cache.set(mm_key, mm_desc); + } + RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), + mm_desc->desc, + alpha, + a_ptr, + mm_desc->a, + b_ptr, + mm_desc->b, + beta, + c_ptr, + mm_desc->c, + c_ptr, + mm_desc->c, + &(mm_desc->heuristics.algo), + nullptr, + 0, + stream)); } -/** - * @brief the wrapper of cublas gemm function - * It computes the following equation: D = alpha . 
opA(A) * opB(B) + beta . C - * @tparam math_t the type of input/output matrices - * @param handle raft handle - * @param a input matrix - * @param n_rows_a number of rows of A - * @param n_cols_a number of columns of A - * @param b input matrix - * @param c output matrix - * @param n_rows_c number of rows of C - * @param n_cols_c number of columns of C - * @param trans_a cublas transpose op for A - * @param trans_b cublas transpose op for B - * @param alpha scalar - * @param beta scalar - * @param stream cuda stream - */ -template -void gemm(raft::resources const& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - math_t alpha, - math_t beta, - cudaStream_t stream) +template +void legacy_gemm(raft::resources const& res, + const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const T* alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T* beta, + T* C, + const int ldc, + cudaStream_t stream) +{ + return matmul(res, + trans_a, + trans_b, + static_cast(m), + static_cast(n), + static_cast(k), + alpha, + A, + static_cast(lda), + B, + static_cast(ldb), + beta, + C, + static_cast(ldc), + stream); +} + +template +void legacy_gemm(raft::resources const& res, + const T* a, + int n_rows_a, + int n_cols_a, + const T* b, + T* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + T alpha, + T beta, + cudaStream_t stream) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; - int lda = trans_a == CUBLAS_OP_T ? k : m; - int ldb = trans_b == CUBLAS_OP_T ? n : k; - int ldc = m; - RAFT_CUBLAS_TRY( - cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); + int m = n_rows_c; + int n = n_cols_c; + auto k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + return matmul(res, + trans_a == CUBLAS_OP_T, + trans_b == CUBLAS_OP_T, + static_cast(n_rows_c), + static_cast(n_cols_c), + static_cast(k), + &alpha, + a, + static_cast(trans_a == CUBLAS_OP_T ? k : m), + b, + static_cast(trans_b == CUBLAS_OP_T ? 
n : k), + &beta, + c, + static_cast(m), + stream); } -template -void gemm(raft::resources const& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) +template +void legacy_gemm(raft::resources const& res, + const T* a, + int n_rows_a, + int n_cols_a, + const T* b, + T* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); + return legacy_gemm( + res, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, T{1}, T{0}, stream); } template -void gemm(raft::resources const& handle, - T* z, - T* x, - T* y, - int _M, - int _N, - int _K, - bool isZColMajor, - bool isXColMajor, - bool isYColMajor, - cudaStream_t stream, - T* alpha, - T* beta) +void legacy_gemm(raft::resources const& res, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + const T* alpha, + const T* beta) { - auto cublas_h = raft::resource::get_cublas_handle(handle); - cublas_device_pointer_mode pmode(cublas_h); - - cublasOperation_t trans_a, trans_b; - T *a, *b, *c; - int lda, ldb, ldc; - int M, N, K; - // This function performs c = a * b. Based on the required output layout, - // either a = x, b = y or a = y, b = x. In either case c = z. - if (isZColMajor == true) { - // Result c is required in column major layout. Thus we perform, - // z = x * y - // Using BLAS call c = a * b. Therefore a = x, b = y and c = z - - a = x; - // If x is in row major layout, cublas needs to transpose x first, - // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major - // layout, trans_b needs to be CUBLAS_OP_N. - trans_a = isXColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - // Set leading dimension appropriately - lda = isXColMajor == true ? _M : _K; - - b = y; - // If y is in row major layout, cublas needs to transpose y first, - // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major - // layout, trans_b needs to be CUBLAS_OP_N. - trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; - - c = z; - ldc = _M; - M = _M; - N = _N; - K = _K; + if (isZColMajor) { + return matmul(res, + !isXColMajor, + !isYColMajor, + static_cast(_M), + static_cast(_N), + static_cast(_K), + alpha, + x, + static_cast(isXColMajor ? _M : _K), + y, + static_cast(isYColMajor ? _K : _N), + beta, + z, + static_cast(_M), + stream); } else { - // Result c is required in row major layout Thus we pick - // a = y, b = x and c = a * b = y * x - // cublas produces output matrix only in column major layout. To get output - // matrix on row major layout, we need to produce transpose of output - // in column major layout. Therefore we perform, - // tr(z) = tr(y) * tr(x) - // we model this using cublas call for c = a * b - // therefore a = tr(y), b = tr(x) and c = tr(z) - - a = y; - // If y is in row major layout, it can be/ interpreted as tr(y) on column - // major layout. Therefore we can pass trans_a as CUBLAS_OP_N. If y is in - // column major layout, cublas needs to transpose y first, therefore - // trans_a needs to be CUBLAS_OP_T - trans_a = isYColMajor == true ? 
CUBLAS_OP_T : CUBLAS_OP_N; - // Set leading dimension appropriately - lda = isYColMajor == true ? _K : _N; - - b = x; - // If x is in row major layout, it can be interpreted as tr(x) on column - // major layout. Therefore we can pass trans_b as CUBLAS_OP_N. If x is in - // column major layout, cublas needs to trasponse x first, therefore - // trans_b needs to be CUBLAS_OP_T - trans_b = isXColMajor == true ? CUBLAS_OP_T : CUBLAS_OP_N; - // Set leading dimension appropriately - ldb = isXColMajor == true ? _M : _K; - - c = z; - ldc = _N; - - M = _N; - N = _M; - K = _K; + return legacy_gemm( + res, z, y, x, _N, _M, _K, true, !isYColMajor, !isXColMajor, stream, alpha, beta); } - // Actual cuBLAS call - RAFT_CUBLAS_TRY( - cublasgemm(cublas_h, trans_a, trans_b, M, N, K, alpha, a, lda, b, ldb, beta, c, ldc, stream)); } -} // namespace detail -} // namespace linalg -} // namespace raft +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index aea9d52673..35b877b20c 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -19,6 +19,7 @@ #pragma once #include "detail/gemm.hpp" + #include #include #include @@ -27,8 +28,65 @@ #include #include -namespace raft { -namespace linalg { +namespace raft::linalg { + +/** + * @brief the wrapper of cublasLt matmul function + * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C + * + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources + * @param [in] trans_a cublas transpose op for A + * @param [in] trans_b cublas transpose op for B + * @param [in] m number of rows of C + * @param [in] n number of columns of C + * @param [in] k number of rows of opB(B) / number of columns of opA(A) + * @param [in] alpha host or device scalar + * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] lda leading dimension of A + * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] ldb leading dimension of B + * @param [in] beta host or device scalar + * @param [inout] C column-major matrix of size [m, n] + * @param [in] ldc leading dimension of C + */ +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc) +{ + return detail::matmul(res, + trans_a, + trans_b, + m, + n, + k, + alpha, + a_ptr, + lda, + b_ptr, + ldb, + beta, + c_ptr, + ldc, + resource::get_cuda_stream(res)); +} /** * @brief the wrapper of cublas gemm function @@ -69,7 +127,7 @@ void gemm(raft::resources const& handle, const int ldc, cudaStream_t stream) { - detail::gemm( + return detail::legacy_gemm( handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream); } @@ -106,7 +164,7 @@ void gemm(raft::resources const& handle, math_t beta, cudaStream_t stream) { - detail::gemm( + detail::legacy_gemm( handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } @@ -139,7 +197,8 @@ void gemm(raft::resources const& handle, cublasOperation_t trans_b, cudaStream_t stream) { - detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, 
n_rows_c, n_cols_c, trans_a, trans_b, stream); + detail::legacy_gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); } /** @@ -176,7 +235,7 @@ void gemm(raft::resources const& handle, T alpha = T(1.0), T beta = T(0.0)) { - detail::gemm( + return detail::legacy_gemm( handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, &alpha, &beta); } @@ -256,24 +315,23 @@ void gemm(raft::resources const& handle, if (!beta) { beta = beta_host.view(); } } - detail::gemm(handle, - z.data_handle(), - x.data_handle(), - y.data_handle(), - x.extent(0), - y.extent(1), - x.extent(1), - is_z_col_major, - is_x_col_major, - is_y_col_major, - resource::get_cuda_stream(handle), - alpha.value().data_handle(), - beta.value().data_handle()); + return detail::legacy_gemm(handle, + z.data_handle(), + x.data_handle(), + y.data_handle(), + x.extent(0), + y.extent(1), + x.extent(1), + is_z_col_major, + is_x_col_major, + is_y_col_major, + resource::get_cuda_stream(handle), + alpha.value().data_handle(), + beta.value().data_handle()); } /** @} */ // end of gemm -} // end namespace linalg -} // end namespace raft +} // namespace raft::linalg #endif diff --git a/cpp/include/raft/util/cache.hpp b/cpp/include/raft/util/cache.hpp new file mode 100644 index 0000000000..ee1ad1cb19 --- /dev/null +++ b/cpp/include/raft/util/cache.hpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace raft::cache { + +/** Associative cache with least recently used replacement policy. */ +template , + typename EqK = std::equal_to, + typename... Values> +class lru { + public: + explicit lru(size_t size) : size_(size) + { + RAFT_EXPECTS(size >= 1, "The cache must fit at least one record."); + } + + void set(const K& key, const Values&... values) + { + auto pos = map_.find(key); + if (pos == map_.end()) { + if (map_.size() >= size_) { + map_.erase(queue_.back()); + queue_.pop_back(); + } + } else { + queue_.erase(std::get<0>(pos->second)); + } + queue_.push_front(key); + map_[key] = std::make_tuple(queue_.begin(), values...); + } + + auto get(const K& key, Values*... values) -> bool + { + auto pos = map_.find(key); + if (pos == map_.end()) { return false; } + auto& map_val = pos->second; + queue_.erase(std::get<0>(map_val)); + queue_.push_front(key); + std::get<0>(map_val) = queue_.begin(); + set_values(map_val, values..., std::index_sequence_for()); + return true; + } + + private: + using queue_iterator = typename std::list::iterator; + std::list queue_{}; + std::unordered_map, HashK, EqK> map_{}; + size_t size_; + + template + static void set_values(const std::tuple& tup, + Values*... 
vals, + std::index_sequence) + { + ((*vals = std::get(tup)), ...); + } +}; + +}; // namespace raft::cache diff --git a/cpp/include/raft/util/cuda_data_type.hpp b/cpp/include/raft/util/cuda_data_type.hpp new file mode 100644 index 0000000000..cf83fc2fd0 --- /dev/null +++ b/cpp/include/raft/util/cuda_data_type.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft { + +template +constexpr auto get_cuda_data_type() -> cudaDataType_t; + +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_8I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_8U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64I; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64U; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_16F; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_32F; +} +template <> +inline constexpr auto get_cuda_data_type() -> cudaDataType_t +{ + return CUDA_R_64F; +} +} // namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index 898c8ad5aa..2d591eb942 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -162,7 +162,7 @@ const std::vector> inputsd = { typedef GemmLayoutTest GemmLayoutTestF; TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(2e-4))); } typedef GemmLayoutTest GemmLayoutTestD; From dc7a9a42824a5e4e03f2365f4bf3a3fcf0d18ab5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 14 Aug 2023 19:25:38 +0200 Subject: [PATCH 02/15] Replace broken (due to missing direct includes) direct uses of cublasgemm --- cpp/include/raft/stats/detail/cov.cuh | 26 ++------- cpp/test/random/multi_variable_gaussian.cu | 63 +++++++++++----------- 2 files changed, 34 insertions(+), 55 deletions(-) diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh index 0f740c8ed9..b0f83b9bc4 100644 --- a/cpp/include/raft/stats/detail/cov.cuh +++ b/cpp/include/raft/stats/detail/cov.cuh @@ -57,34 +57,14 @@ void cov(raft::resources const& handle, cudaStream_t stream) { if (stable) { - cublasHandle_t cublas_h = resource::get_cublas_handle(handle); - // since 
mean operation is assumed to be along a given column, broadcast // must be along rows! raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream); Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N)); Type beta = Type(0); - if (rowMajor) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_T, - D, - D, - N, - &alpha, - data, - D, - data, - D, - &beta, - covar, - D, - stream)); - } else { - raft::linalg::gemm( - handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - } + auto ldd = rowMajor ? D : N; + linalg::gemm( + handle, !rowMajor, rowMajor, D, D, N, &alpha, data, ldd, data, ldd, &beta, covar, D, stream); } else { ///@todo: implement this using cutlass + customized epilogue! ASSERT(false, "cov: Implement stable=false case!"); diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu index e35d49e453..e5ed3429b6 100644 --- a/cpp/test/random/multi_variable_gaussian.cu +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -107,7 +108,6 @@ class MVGTest : public ::testing::TestWithParam> { corr = params.corr; tolerance = params.tolerance; - auto cublasH = resource::get_cublas_handle(handle); auto cusolverH = resource::get_cusolver_dn_handle(handle); auto stream = resource::get_cuda_stream(handle); @@ -175,21 +175,21 @@ class MVGTest : public ::testing::TestWithParam> { // finding the cov matrix, placing in Rand_cov T alfa = 1.0 / (nPoints - 1), beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_T, - dim, - dim, - nPoints, - &alfa, - X_d.data(), - dim, - X_d.data(), - dim, - &beta, - Rand_cov.data(), - dim, - stream)); + linalg::gemm(handle, + false, + true, + dim, + dim, + nPoints, + &alfa, + X_d.data(), + dim, + X_d.data(), + dim, + &beta, + Rand_cov.data(), + dim, + stream); // restoring cov provided into P_d raft::update_device(P_d.data(), P.data(), dim * dim, stream); @@ -247,7 +247,6 @@ class MVGMdspanTest : public ::testing::TestWithParam> { corr = params.corr; tolerance = params.tolerance; - auto cublasH = resource::get_cublas_handle(handle); auto cusolverH = resource::get_cusolver_dn_handle(handle); auto stream = resource::get_cuda_stream(handle); @@ -309,21 +308,21 @@ class MVGMdspanTest : public ::testing::TestWithParam> { // finding the cov matrix, placing in Rand_cov T alfa = 1.0 / (nPoints - 1), beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_T, - dim, - dim, - nPoints, - &alfa, - X_d.data(), - dim, - X_d.data(), - dim, - &beta, - Rand_cov.data(), - dim, - stream)); + linalg::gemm(handle, + false, + true, + dim, + dim, + nPoints, + &alfa, + X_d.data(), + dim, + X_d.data(), + dim, + &beta, + Rand_cov.data(), + dim, + stream); // restoring cov provided into P_d raft::update_device(P_d.data(), P.data(), dim * dim, stream); From 71c03c0dee987f53714f21f70fd7ce6d66745e53 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 15 Aug 2023 07:33:05 +0200 Subject: [PATCH 03/15] Fix docs --- cpp/include/raft/linalg/detail/gemm.hpp | 6 +++--- cpp/include/raft/linalg/gemm.cuh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 3966d7e1eb..1460641dcf 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ 
-229,12 +229,12 @@ static constexpr size_t kLRUSize = 100; * @param [in] n number of columns of C * @param [in] k number of rows of opB(B) / number of columns of opA(A) * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] * @param [in] ldb leading dimension of B * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] + * @param [inout] c_ptr column-major matrix of size [m, n] * @param [in] ldc leading dimension of C * @param [in] stream */ diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 35b877b20c..3057a4712d 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -47,12 +47,12 @@ namespace raft::linalg { * @param [in] n number of columns of C * @param [in] k number of rows of opB(B) / number of columns of opA(A) * @param [in] alpha host or device scalar - * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] * @param [in] lda leading dimension of A - * @param [in] B such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] * @param [in] ldb leading dimension of B * @param [in] beta host or device scalar - * @param [inout] C column-major matrix of size [m, n] + * @param [inout] c_ptr column-major matrix of size [m, n] * @param [in] ldc leading dimension of C */ template From a2fb088050ce5ea835288febf821e0525eb89dde Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 16 Aug 2023 13:41:31 +0200 Subject: [PATCH 04/15] Replace cublasgemm where it makes sense --- .../raft/random/detail/make_regression.cuh | 98 +++++++------- .../random/detail/multi_variable_gaussian.cuh | 37 +---- cpp/test/random/make_regression.cu | 128 +++++++++--------- 3 files changed, 117 insertions(+), 146 deletions(-) diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh index aec1a15f84..e8fba083a7 100644 --- a/cpp/include/raft/random/detail/make_regression.cuh +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -22,10 +22,9 @@ #include -#include #include #include -#include +#include #include #include #include @@ -62,8 +61,6 @@ static void _make_low_rank_matrix(raft::resources const& handle, raft::random::RngState& r, cudaStream_t stream) { - cublasHandle_t cublas_handle = resource::get_cublas_handle(handle); - IdxT n = std::min(n_rows, n_cols); // Generate random (ortho normal) vectors with QR decomposition @@ -92,36 +89,36 @@ static void _make_low_rank_matrix(raft::resources const& handle, rmm::device_uvector temp_q0s(n_rows * n, stream); rmm::device_uvector temp_out(n_rows * n_cols, stream); DataT alpha = 1.0, beta = 0.0; - raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n_rows, - n, - n, - &alpha, - q0.data(), - n_rows, - singular_mat.data(), - n, - &beta, - temp_q0s.data(), - n_rows, - stream); - raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n_rows, - n_cols, - n, - &alpha, - temp_q0s.data(), - n_rows, - q1.data(), - n_cols, - 
&beta, - temp_out.data(), - n_rows, - stream); + raft::linalg::gemm(handle, + false, + false, + n_rows, + n, + n, + &alpha, + q0.data(), + n_rows, + singular_mat.data(), + n, + &beta, + temp_q0s.data(), + n_rows, + stream); + raft::linalg::gemm(handle, + false, + true, + n_rows, + n_cols, + n, + &alpha, + temp_q0s.data(), + n_rows, + q1.data(), + n_cols, + &beta, + temp_out.data(), + n_rows, + stream); // Transpose from column-major to row-major raft::linalg::transpose(handle, temp_out.data(), out, n_rows, n_cols, stream); @@ -165,9 +162,6 @@ void make_regression_caller(raft::resources const& handle, { n_informative = std::min(n_informative, n_cols); - cublasHandle_t cublas_handle = resource::get_cublas_handle(handle); - - cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST); raft::random::RngState r(seed, type); if (effective_rank < 0) { @@ -219,21 +213,21 @@ void make_regression_caller(raft::resources const& handle, // Compute the output values DataT alpha = (DataT)1.0, beta = (DataT)0.0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_T, - n_rows, - n_targets, - n_informative, - &alpha, - out, - n_cols, - _coef, - n_targets, - &beta, - _values_col, - n_rows, - stream)); + raft::linalg::gemm(handle, + true, + true, + n_rows, + n_targets, + n_informative, + &alpha, + out, + n_cols, + _coef, + n_targets, + &beta, + _values_col, + n_rows, + stream); // Transpose the values from column-major to row-major if needed if (n_targets > 1) { diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index 68934ac1ff..59cf187915 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -20,12 +20,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -193,7 +193,6 @@ class multi_variable_gaussian_impl { void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) { auto cusolverHandle = resource::get_cusolver_dn_handle(handle); - auto cublasHandle = resource::get_cublas_handle(handle); auto cudaStream = resource::get_cuda_stream(handle); if (method == chol_decomp) { // lower part will contains chol_decomp @@ -233,21 +232,8 @@ class multi_variable_gaussian_impl { RAFT_CUDA_TRY(cudaPeekAtLastError()); // P is lower triangular chol decomp mtrx - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_N, - dim, - nPoints, - dim, - &alfa, - P, - dim, - X, - dim, - &beta, - X, - dim, - cudaStream)); + raft::linalg::gemm( + handle, false, false, dim, nPoints, dim, &alfa, P, dim, X, dim, &beta, X, dim, cudaStream); } else { epsilonToZero(eig, epsilon, dim, cudaStream); dim3 block(64); @@ -263,21 +249,8 @@ class multi_variable_gaussian_impl { ASSERT(info_h == 0, "mvg: Cov matrix has %dth Eigenval negative", info_h); // Got Q = eigvect*eigvals.sqrt in P, Q*X in X below - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle, - CUBLAS_OP_N, - CUBLAS_OP_N, - dim, - nPoints, - dim, - &alfa, - P, - dim, - X, - dim, - &beta, - X, - dim, - cudaStream)); + raft::linalg::gemm( + handle, false, false, dim, nPoints, dim, &alfa, P, dim, X, dim, &beta, X, dim, cudaStream); } // working to make mean not 0 // since we are working with column-major, nPoints and dim are swapped diff --git a/cpp/test/random/make_regression.cu b/cpp/test/random/make_regression.cu index 0df3b2e7b0..9db03867bd 100644 --- 
a/cpp/test/random/make_regression.cu +++ b/cpp/test/random/make_regression.cu @@ -14,23 +14,24 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include - #include "../test_utils.cuh" + +#include +#include #include -#include +#include +#include #include - #include #include #include #include +#include +#include + +#include + namespace raft::random { template @@ -45,15 +46,6 @@ struct MakeRegressionInputs { template class MakeRegressionTest : public ::testing::TestWithParam> { - public: - MakeRegressionTest() - : params(::testing::TestWithParam>::GetParam()), - stream(resource::get_cuda_stream(handle)), - values_ret(params.n_samples * params.n_targets, stream), - values_prod(params.n_samples * params.n_targets, stream) - { - } - protected: void SetUp() override { @@ -88,21 +80,21 @@ class MakeRegressionTest : public ::testing::TestWithParam __coef = thrust::device_pointer_cast(coef.data()); - zero_count = thrust::count(__coef, __coef + params.n_features * params.n_targets, (T)0.0); + rmm::device_scalar zc_device(stream); + raft::linalg::mapReduce(zc_device.data(), + coef.size(), + 0, + raft::compose_op{raft::cast_op{}, raft::equal_const_op{0}}, + raft::add_op{}, + stream, + coef.data()); + zero_count = zc_device.value(stream); } protected: + MakeRegressionInputs params{::testing::TestWithParam>::GetParam()}; raft::resources handle; - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{resource::get_cuda_stream(handle)}; + rmm::device_uvector values_ret{size_t(params.n_samples) * size_t(params.n_targets), stream}; + rmm::device_uvector values_prod{size_t(params.n_samples) * size_t(params.n_targets), stream}; - MakeRegressionInputs params; - rmm::device_uvector values_ret, values_prod; int zero_count; }; @@ -183,8 +183,6 @@ class MakeRegressionMdspanTest : public ::testing::TestWithParam __coef = thrust::device_pointer_cast(coef.data()); - constexpr T ZERO{}; - zero_count = thrust::count(__coef, __coef + params.n_features * params.n_targets, ZERO); + rmm::device_scalar zc_device(stream); + raft::linalg::mapReduce(zc_device.data(), + coef.size(), + 0, + raft::compose_op{raft::cast_op{}, raft::equal_const_op{0}}, + raft::add_op{}, + stream, + coef.data()); + zero_count = zc_device.value(stream); } private: MakeRegressionInputs params{::testing::TestWithParam>::GetParam()}; raft::resources handle; - rmm::device_uvector values_ret{params.n_samples * params.n_targets, - resource::get_cuda_stream(handle)}; - rmm::device_uvector values_prod{params.n_samples * params.n_targets, - resource::get_cuda_stream(handle)}; + rmm::cuda_stream_view stream{resource::get_cuda_stream(handle)}; + rmm::device_uvector values_ret{size_t(params.n_samples) * size_t(params.n_targets), stream}; + rmm::device_uvector values_prod{size_t(params.n_samples) * size_t(params.n_targets), stream}; + int zero_count = -1; }; From 699de0c046b7ad93690c29efd34351e7729d8085 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 16 Aug 2023 13:42:00 +0200 Subject: [PATCH 05/15] Fix a typo --- cpp/include/raft/linalg/detail/gemm.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 1460641dcf..a30c08efad 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -154,7 +154,7 @@ struct cublastlt_matmul_desc { inline operator cublasLtMatmulDesc_t() const noexcept { return res; } template - static inline auto for_matmult(bool transpose_a, bool 
transpose_b) -> cublastlt_matmul_desc + static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc { auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; if constexpr (DevicePointerMode) { @@ -187,7 +187,7 @@ struct matmul_desc { static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc { matmul_desc r{ - cublastlt_matmul_desc::for_matmult(args.trans_a, args.trans_b), + cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; From f4d634aa1c9f007f00409d367cda1892517761b1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 21 Aug 2023 15:54:06 +0200 Subject: [PATCH 06/15] Put the cache into the resource handle as a user-define resource --- .../raft/core/resource/resource_types.hpp | 1 + .../raft/core/resource/user_resource.hpp | 71 +++++++++++++++++++ cpp/include/raft/linalg/detail/gemm.hpp | 16 +++-- cpp/include/raft/util/cache.hpp | 9 ++- 4 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 cpp/include/raft/core/resource/user_resource.hpp diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index b32e09bb6b..2910b32b12 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -42,6 +42,7 @@ enum resource_type { THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource CUBLASLT_HANDLE, // cublasLt handle + USER_DEFINED, // user-defined default-constructible resource LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/core/resource/user_resource.hpp b/cpp/include/raft/core/resource/user_resource.hpp new file mode 100644 index 0000000000..5da23f75b3 --- /dev/null +++ b/cpp/include/raft/core/resource/user_resource.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace raft::resource { + +class user_resource : public resource { + public: + user_resource() = default; + ~user_resource() noexcept override = default; + auto get_resource() -> void* override { return this; } + + template + auto load() -> Store* + { + std::lock_guard _(lock_); + auto key = std::type_index{typeid(Store)}; + auto pos = map_.find(key); + if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } + auto store_ptr = new Store{}; + map_[key] = + std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); + return store_ptr; + } + + private: + std::unordered_map> map_{}; + std::mutex lock_{}; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. 
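+ * The created user_resource hands out lazily default-constructed objects keyed by their
+ * std::type_index (see user_resource::load and get_user_resource below).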
*/ +class user_resource_factory : public resource_factory { + public: + auto get_resource_type() -> resource_type override { return resource_type::USER_DEFINED; } + auto make_resource() -> resource* override { return new user_resource(); } +}; + +/** + * Get the user-defined default-constructible resource if it exists, create it otherwise. + * @param[in] res the raft resources object + * @return a pointer to the user-defined resource. + */ +template +auto get_user_resource(resources const& res) -> Store* +{ + if (!res.has_resource_factory(resource_type::USER_DEFINED)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::USER_DEFINED)->load(); +}; + +} // namespace raft::resource diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index a30c08efad..462d758be6 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -209,8 +210,14 @@ struct matmul_desc { } }; -/** Number of matmul invocations to cache. */ -static constexpr size_t kLRUSize = 100; +/** Cache with the default constructor; tagged with input types to use separate caches. */ +template +struct matmul_cache { + /** Number of matmul invocations to cache. */ + static constexpr size_t kDefaultSize = 100; + cache::lru, std::shared_ptr> value{ + kDefaultSize}; +}; /** * @brief the wrapper of cublasLt matmul function @@ -259,9 +266,8 @@ void matmul(raft::resources const& res, "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); std::shared_ptr mm_desc{nullptr}; matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; - static thread_local cache:: - lru, std::shared_ptr> - cache{kLRUSize}; + auto& cache = + resource::get_user_resource>(res)->value; if (!cache.get(mm_key, &mm_desc)) { mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); cache.set(mm_key, mm_desc); diff --git a/cpp/include/raft/util/cache.hpp b/cpp/include/raft/util/cache.hpp index ee1ad1cb19..c174aa489d 100644 --- a/cpp/include/raft/util/cache.hpp +++ b/cpp/include/raft/util/cache.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -33,13 +34,17 @@ template class lru { public: - explicit lru(size_t size) : size_(size) + /** Default cache size. */ + static constexpr size_t kDefaultSize = 100; + + explicit lru(size_t size = kDefaultSize) : size_(size) { RAFT_EXPECTS(size >= 1, "The cache must fit at least one record."); } void set(const K& key, const Values&... values) { + std::lock_guard guard(lock_); auto pos = map_.find(key); if (pos == map_.end()) { if (map_.size() >= size_) { @@ -55,6 +60,7 @@ class lru { auto get(const K& key, Values*... 
values) -> bool { + std::lock_guard guard(lock_); auto pos = map_.find(key); if (pos == map_.end()) { return false; } auto& map_val = pos->second; @@ -69,6 +75,7 @@ class lru { using queue_iterator = typename std::list::iterator; std::list queue_{}; std::unordered_map, HashK, EqK> map_{}; + std::mutex lock_{}; size_t size_; template From e57eebf481063357344f58ed2c7350d9452f365f Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 22 Aug 2023 10:07:54 +0200 Subject: [PATCH 07/15] Move matmul into a separate file --- cpp/include/raft/linalg/detail/gemm.hpp | 273 +------------------- cpp/include/raft/linalg/detail/matmul.hpp | 293 ++++++++++++++++++++++ cpp/include/raft/linalg/gemm.cuh | 58 ----- 3 files changed, 296 insertions(+), 328 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/matmul.hpp diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 462d758be6..1d643566e1 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -15,281 +15,14 @@ */ #pragma once -#include -#include -#include -#include -#include +#include "matmul.hpp" + #include -#include -#include -#include -#include +#include // cublasOperation_t namespace raft::linalg::detail { -/** Get the cublas compute type for the combination of input types. */ -template -auto get_matmul_type() -> cublasComputeType_t -{ - static_assert(std::is_same_v && std::is_same_v && std::is_same_v && - std::is_same_v, - "Unsupported combination of input types. Consult cublas API for supported types."); - return CUBLAS_COMPUTE_32F; -} - -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_16F; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32I; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_32I; -} -template <> -inline auto get_matmul_type() -> cublasComputeType_t -{ - return CUBLAS_COMPUTE_64F; -} - -/** Unique representation of a matrix multiplication (assuming fixed types). */ -struct matmul_key_t { - uint64_t m; - uint64_t n; - uint64_t k; - uint64_t lda; - uint64_t ldb; - uint64_t ldc; - bool trans_a; - bool trans_b; -}; - -inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool -{ - return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && - a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; -} - -struct matmul_key_hash { - inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t - { - return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; - } -}; - -/** Descriptor for a column-major cublasLt matrix. 
*/ -struct cublastlt_matrix_layout { - cublasLtMatrixLayout_t res{nullptr}; - inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) - { - RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); - } - inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; - inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; - inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; - inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; - - inline ~cublastlt_matrix_layout() noexcept - { - RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); - } - - // NOLINTNEXTLINE - inline operator cublasLtMatrixLayout_t() const noexcept { return res; } - - template - static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) - -> cublastlt_matrix_layout - { - return cublastlt_matrix_layout{ - get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; - } -}; - -/** Descriptor for a cublasLt matmul function. */ -struct cublastlt_matmul_desc { - cublasLtMatmulDesc_t res{nullptr}; - inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) - { - RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); - } - inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; - inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; - inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; - inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; - - inline ~cublastlt_matmul_desc() noexcept - { - RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); - } - - // NOLINTNEXTLINE - inline operator cublasLtMatmulDesc_t() const noexcept { return res; } - - template - static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc - { - auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; - if constexpr (DevicePointerMode) { - const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); - } - const cublasOperation_t trans_op = CUBLAS_OP_T; - if (transpose_a) { - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); - } - if (transpose_b) { - RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); - } - return desc; - } -}; - -/** Full description of matmul. 
*/ -struct matmul_desc { - cublastlt_matmul_desc desc; - cublastlt_matrix_layout a; - cublastlt_matrix_layout b; - cublastlt_matrix_layout c; - cublasLtMatmulHeuristicResult_t heuristics; - - template - static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc - { - matmul_desc r{ - cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), - cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), - cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), - cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; - int algo_count; - cublasLtMatmulPreference_t preference; - RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); - RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), - r.desc, - r.a, - r.b, - r.c, - r.c, - preference, - 1, - &r.heuristics, - &algo_count)); - RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); - return r; - } -}; - -/** Cache with the default constructor; tagged with input types to use separate caches. */ -template -struct matmul_cache { - /** Number of matmul invocations to cache. */ - static constexpr size_t kDefaultSize = 100; - cache::lru, std::shared_ptr> value{ - kDefaultSize}; -}; - -/** - * @brief the wrapper of cublasLt matmul function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @tparam S the type of scale parameters alpha, beta - * @tparam A the element type of matrix A - * @tparam B the element type of matrix B - * @tparam C the element type of matrix C - * - * @param [in] res raft resources - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] c_ptr column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - * @param [in] stream - */ -template -void matmul(raft::resources const& res, - bool trans_a, - bool trans_b, - uint64_t m, - uint64_t n, - uint64_t k, - const S* alpha, - const A* a_ptr, - uint64_t lda, - const B* b_ptr, - uint64_t ldb, - const S* beta, - C* c_ptr, - uint64_t ldc, - cudaStream_t stream) -{ - common::nvtx::range batch_scope( - "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); - std::shared_ptr mm_desc{nullptr}; - matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; - auto& cache = - resource::get_user_resource>(res)->value; - if (!cache.get(mm_key, &mm_desc)) { - mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); - cache.set(mm_key, mm_desc); - } - RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), - mm_desc->desc, - alpha, - a_ptr, - mm_desc->a, - b_ptr, - mm_desc->b, - beta, - c_ptr, - mm_desc->c, - c_ptr, - mm_desc->c, - &(mm_desc->heuristics.algo), - nullptr, - 0, - stream)); -} - template void legacy_gemm(raft::resources const& res, const bool trans_a, diff --git a/cpp/include/raft/linalg/detail/matmul.hpp 
b/cpp/include/raft/linalg/detail/matmul.hpp new file mode 100644 index 0000000000..6aa6a32bb8 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matmul.hpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::linalg::detail { + +/** Get the cublas compute type for the combination of input types. */ +template +auto get_matmul_type() -> cublasComputeType_t +{ + static_assert(std::is_same_v && std::is_same_v && std::is_same_v && + std::is_same_v, + "Unsupported combination of input types. Consult cublas API for supported types."); + return CUBLAS_COMPUTE_32F; +} + +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_16F; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_32I; +} +template <> +inline auto get_matmul_type() -> cublasComputeType_t +{ + return CUBLAS_COMPUTE_64F; +} + +/** Unique representation of a matrix multiplication (assuming fixed types). */ +struct matmul_key_t { + uint64_t m; + uint64_t n; + uint64_t k; + uint64_t lda; + uint64_t ldb; + uint64_t ldc; + bool trans_a; + bool trans_b; +}; + +inline auto operator==(const matmul_key_t& a, const matmul_key_t& b) -> bool +{ + return a.m == b.m && a.n == b.n && a.k == b.k && a.lda == b.lda && a.ldb == b.ldb && + a.ldc == b.ldc && a.trans_a == b.trans_a && a.trans_b == b.trans_b; +} + +struct matmul_key_hash { + inline auto operator()(const matmul_key_t& x) const noexcept -> std::size_t + { + return x.m * x.n * x.k + x.lda * x.ldb * x.ldc + size_t{x.trans_a} + size_t{x.trans_b} * 2; + } +}; + +/** Descriptor for a column-major cublasLt matrix. 
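+ * Owns the underlying `cublasLtMatrixLayout_t`: the handle is created in the constructor,
+ * destroyed in the destructor, and cannot be copied.
+ * `for_matmul` swaps the rows and columns of the layout when `col_major` is false.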
*/ +struct cublastlt_matrix_layout { + cublasLtMatrixLayout_t res{nullptr}; + inline cublastlt_matrix_layout(cudaDataType dtype, uint64_t rows, uint64_t cols, uint64_t ld) + { + RAFT_CUBLAS_TRY(cublasLtMatrixLayoutCreate(&res, dtype, rows, cols, ld)); + } + inline cublastlt_matrix_layout(const cublastlt_matrix_layout&) = delete; + inline auto operator=(const cublastlt_matrix_layout&) -> cublastlt_matrix_layout& = delete; + inline cublastlt_matrix_layout(cublastlt_matrix_layout&&) = default; + inline auto operator=(cublastlt_matrix_layout&&) -> cublastlt_matrix_layout& = default; + + inline ~cublastlt_matrix_layout() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatrixLayoutDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatrixLayout_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool col_major, uint64_t rows, uint64_t cols, uint64_t ld) + -> cublastlt_matrix_layout + { + return cublastlt_matrix_layout{ + get_cuda_data_type(), col_major ? rows : cols, col_major ? cols : rows, ld}; + } +}; + +/** Descriptor for a cublasLt matmul function. */ +struct cublastlt_matmul_desc { + cublasLtMatmulDesc_t res{nullptr}; + inline cublastlt_matmul_desc(cublasComputeType_t compute_type, cudaDataType scale_type) + { + RAFT_CUBLAS_TRY(cublasLtMatmulDescCreate(&res, compute_type, scale_type)); + } + inline cublastlt_matmul_desc(const cublastlt_matmul_desc&) = delete; + inline auto operator=(const cublastlt_matmul_desc&) -> cublastlt_matmul_desc& = delete; + inline cublastlt_matmul_desc(cublastlt_matmul_desc&&) = default; + inline auto operator=(cublastlt_matmul_desc&&) -> cublastlt_matmul_desc& = default; + + inline ~cublastlt_matmul_desc() noexcept + { + RAFT_CUBLAS_TRY_NO_THROW(cublasLtMatmulDescDestroy(res)); + } + + // NOLINTNEXTLINE + inline operator cublasLtMatmulDesc_t() const noexcept { return res; } + + template + static inline auto for_matmul(bool transpose_a, bool transpose_b) -> cublastlt_matmul_desc + { + auto desc = cublastlt_matmul_desc{get_matmul_type(), get_cuda_data_type()}; + if constexpr (DevicePointerMode) { + const cublasPointerMode_t mode = CUBLAS_POINTER_MODE_DEVICE; + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &mode, sizeof(mode))); + } + const cublasOperation_t trans_op = CUBLAS_OP_T; + if (transpose_a) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_op, sizeof(trans_op))); + } + if (transpose_b) { + RAFT_CUBLAS_TRY(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_op, sizeof(trans_op))); + } + return desc; + } +}; + +/** Full description of matmul. 
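+ * Bundles the cublasLt operation descriptor, the layouts of the A, B, and C matrices,
+ * and the heuristics result selected for this problem shape; instances are cached by
+ * `matmul_key_t`, so repeated calls with the same shape skip the heuristics query.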
*/ +struct matmul_desc { + cublastlt_matmul_desc desc; + cublastlt_matrix_layout a; + cublastlt_matrix_layout b; + cublastlt_matrix_layout c; + cublasLtMatmulHeuristicResult_t heuristics; + + template + static inline auto create(raft::resources const& res, const matmul_key_t& args) -> matmul_desc + { + matmul_desc r{ + cublastlt_matmul_desc::for_matmul(args.trans_a, args.trans_b), + cublastlt_matrix_layout::for_matmul(!(args.trans_a), args.m, args.k, args.lda), + cublastlt_matrix_layout::for_matmul(!(args.trans_b), args.k, args.n, args.ldb), + cublastlt_matrix_layout::for_matmul(true, args.m, args.n, args.ldc)}; + int algo_count; + cublasLtMatmulPreference_t preference; + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceCreate(&preference)); + RAFT_CUBLAS_TRY(cublasLtMatmulAlgoGetHeuristic(resource::get_cublaslt_handle(res), + r.desc, + r.a, + r.b, + r.c, + r.c, + preference, + 1, + &r.heuristics, + &algo_count)); + RAFT_CUBLAS_TRY(cublasLtMatmulPreferenceDestroy(preference)); + return r; + } +}; + +/** Cache with the default constructor; tagged with input types to use separate caches. */ +template +struct matmul_cache { + /** Number of matmul invocations to cache. */ + static constexpr size_t kDefaultSize = 100; + cache::lru, std::shared_ptr> value{ + kDefaultSize}; +}; + +/** + * @brief the wrapper of cublasLt matmul function + * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C + * + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @tparam S the type of scale parameters alpha, beta + * @tparam A the element type of matrix A + * @tparam B the element type of matrix B + * @tparam C the element type of matrix C + * + * @param [in] res raft resources + * @param [in] trans_a cublas transpose op for A + * @param [in] trans_b cublas transpose op for B + * @param [in] m number of rows of C + * @param [in] n number of columns of C + * @param [in] k number of rows of opB(B) / number of columns of opA(A) + * @param [in] alpha host or device scalar + * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] + * @param [in] lda leading dimension of A + * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] + * @param [in] ldb leading dimension of B + * @param [in] beta host or device scalar + * @param [inout] c_ptr column-major matrix of size [m, n] + * @param [in] ldc leading dimension of C + * @param [in] stream + */ +template +void matmul(raft::resources const& res, + bool trans_a, + bool trans_b, + uint64_t m, + uint64_t n, + uint64_t k, + const S* alpha, + const A* a_ptr, + uint64_t lda, + const B* b_ptr, + uint64_t ldb, + const S* beta, + C* c_ptr, + uint64_t ldc, + cudaStream_t stream) +{ + common::nvtx::range batch_scope( + "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); + std::shared_ptr mm_desc{nullptr}; + matmul_key_t mm_key{m, n, k, lda, ldb, ldc, trans_a, trans_b}; + auto& cache = + resource::get_user_resource>(res)->value; + if (!cache.get(mm_key, &mm_desc)) { + mm_desc.reset(new matmul_desc{matmul_desc::create(res, mm_key)}); + cache.set(mm_key, mm_desc); + } + RAFT_CUBLAS_TRY(cublasLtMatmul(resource::get_cublaslt_handle(res), + mm_desc->desc, + alpha, + a_ptr, + mm_desc->a, + b_ptr, + mm_desc->b, + beta, + c_ptr, + mm_desc->c, + c_ptr, + mm_desc->c, + &(mm_desc->heuristics.algo), + nullptr, + 0, + stream)); +} + +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 3057a4712d..56e91aaa0b 100644 --- 
a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -30,64 +30,6 @@ namespace raft::linalg { -/** - * @brief the wrapper of cublasLt matmul function - * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C - * - * @tparam DevicePointerMode whether pointers alpha, beta point to device memory - * @tparam S the type of scale parameters alpha, beta - * @tparam A the element type of matrix A - * @tparam B the element type of matrix B - * @tparam C the element type of matrix C - * - * @param [in] res raft resources - * @param [in] trans_a cublas transpose op for A - * @param [in] trans_b cublas transpose op for B - * @param [in] m number of rows of C - * @param [in] n number of columns of C - * @param [in] k number of rows of opB(B) / number of columns of opA(A) - * @param [in] alpha host or device scalar - * @param [in] a_ptr such a matrix that the shape of column-major opA(A) is [m, k] - * @param [in] lda leading dimension of A - * @param [in] b_ptr such a matrix that the shape of column-major opA(B) is [k, n] - * @param [in] ldb leading dimension of B - * @param [in] beta host or device scalar - * @param [inout] c_ptr column-major matrix of size [m, n] - * @param [in] ldc leading dimension of C - */ -template -void matmul(raft::resources const& res, - bool trans_a, - bool trans_b, - uint64_t m, - uint64_t n, - uint64_t k, - const S* alpha, - const A* a_ptr, - uint64_t lda, - const B* b_ptr, - uint64_t ldb, - const S* beta, - C* c_ptr, - uint64_t ldc) -{ - return detail::matmul(res, - trans_a, - trans_b, - m, - n, - k, - alpha, - a_ptr, - lda, - b_ptr, - ldb, - beta, - c_ptr, - ldc, - resource::get_cuda_stream(res)); -} - /** * @brief the wrapper of cublas gemm function * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C From d44bf2048ec46ec9c93e7e0891e39c7cc6101aa6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 22 Aug 2023 11:04:15 +0200 Subject: [PATCH 08/15] Complete the docs --- .../raft/core/resource/cublas_handle.hpp | 4 +-- .../raft/core/resource/cublaslt_handle.hpp | 6 ++-- .../raft/core/resource/user_resource.hpp | 30 +++++++++++----- docs/source/cpp_api/core_resources.rst | 35 +++++++++++++++++-- 4 files changed, 59 insertions(+), 16 deletions(-) diff --git a/cpp/include/raft/core/resource/cublas_handle.hpp b/cpp/include/raft/core/resource/cublas_handle.hpp index c8d8ee4c02..33dde49135 100644 --- a/cpp/include/raft/core/resource/cublas_handle.hpp +++ b/cpp/include/raft/core/resource/cublas_handle.hpp @@ -60,8 +60,8 @@ class cublas_resource_factory : public resource_factory { */ /** - * Load a cublasres_t from raft res if it exists, otherwise - * add it and return it. + * Load a `cublasHandle_t` from raft res if it exists, otherwise add it and return it. + * * @param[in] res the raft resources object * @return cublas handle */ diff --git a/cpp/include/raft/core/resource/cublaslt_handle.hpp b/cpp/include/raft/core/resource/cublaslt_handle.hpp index 0d83fae752..16f150f268 100644 --- a/cpp/include/raft/core/resource/cublaslt_handle.hpp +++ b/cpp/include/raft/core/resource/cublaslt_handle.hpp @@ -42,13 +42,13 @@ class cublaslt_resource_factory : public resource_factory { }; /** - * @defgroup resource_cublas cuBLAS handle resource functions + * @defgroup resource_cublaslt cuBLASLt handle resource functions * @{ */ /** - * Load a cublasLt res_t from raft res if it exists, otherwise - * add it and return it. + * Load a `cublasLtHandle_t` from raft res if it exists, otherwise add it and return it. 
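+ * The handle is created lazily on the first access and is owned by the `resources`
+ * object; the caller must not destroy it.
+ *
+ * Usage sketch (assuming a default-constructed `raft::device_resources`):
+ * @code{.cpp}
+ *   raft::device_resources res;
+ *   cublasLtHandle_t lt_handle = raft::resource::get_cublaslt_handle(res);
+ *   // lt_handle can now be passed to cublasLt calls such as cublasLtMatmul.
+ * @endcode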
+ * * @param[in] res the raft resources object * @return cublasLt handle */ diff --git a/cpp/include/raft/core/resource/user_resource.hpp b/cpp/include/raft/core/resource/user_resource.hpp index 5da23f75b3..21877c4133 100644 --- a/cpp/include/raft/core/resource/user_resource.hpp +++ b/cpp/include/raft/core/resource/user_resource.hpp @@ -29,16 +29,16 @@ class user_resource : public resource { ~user_resource() noexcept override = default; auto get_resource() -> void* override { return this; } - template - auto load() -> Store* + template + auto load() -> ResourceT* { std::lock_guard _(lock_); - auto key = std::type_index{typeid(Store)}; + auto key = std::type_index{typeid(ResourceT)}; auto pos = map_.find(key); - if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } - auto store_ptr = new Store{}; + if (pos != map_.end()) { return reinterpret_cast(pos->second.get()); } + auto store_ptr = new ResourceT{}; map_[key] = - std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); + std::shared_ptr(store_ptr, [](void* ptr) { delete reinterpret_cast(ptr); }); return store_ptr; } @@ -54,18 +54,30 @@ class user_resource_factory : public resource_factory { auto make_resource() -> resource* override { return new user_resource(); } }; +/** + * @defgroup resource_user_defined user-defined resource functions + * @{ + */ + /** * Get the user-defined default-constructible resource if it exists, create it otherwise. + * + * @tparam ResourceT the type of the resource; it must be complete and default-constructible. + * * @param[in] res the raft resources object * @return a pointer to the user-defined resource. */ -template -auto get_user_resource(resources const& res) -> Store* +template +auto get_user_resource(resources const& res) -> ResourceT* { if (!res.has_resource_factory(resource_type::USER_DEFINED)) { res.add_resource_factory(std::make_shared()); } - return res.get_resource(resource_type::USER_DEFINED)->load(); + return res.get_resource(resource_type::USER_DEFINED)->load(); }; +/** + * @} + */ + } // namespace raft::resource diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst index 85c454b355..af26af7bbb 100644 --- a/docs/source/cpp_api/core_resources.rst +++ b/docs/source/cpp_api/core_resources.rst @@ -25,7 +25,7 @@ namespace *raft::resource* Device Resources ---------------- -`raft::device_resources` is a convenience over using `raft::resources` directly. It provides accessor methods to retrieve resources such as the CUDA stream, stream pool, and handles to the various CUDA math libraries like cuBLAS and cuSOLVER. +`raft::device_resources` is a convenience over using `raft::resources` directly. It provides accessor methods to retrieve resources such as the CUDA stream, stream pool, and handles to the various CUDA math libraries like cuBLAS and cuSOLVER. ``#include `` @@ -73,7 +73,7 @@ namespace *raft::resource* cuBLAS Handle ~~~~~~~~~~~~~ -``#include `` +``#include `` namespace *raft::resource* @@ -82,6 +82,18 @@ namespace *raft::resource* :members: :content-only: +cuBLASLt Handle +~~~~~~~~~~~~~~~ + +``#include `` + +namespace *raft::resource* + + .. doxygengroup:: resource_cublaslt + :project: RAFT + :members: + :content-only: + CUDA Stream ~~~~~~~~~~~ @@ -202,3 +214,22 @@ namespace *raft::resource* :project: RAFT :members: :content-only: + +User-defined resources +~~~~~~~~~~~~~~~~~~~~~~ + +A user-defined resource is an arbitrary default-constructible C++ class. 
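+It is looked up by its `std::type_index`, so the type does not need to be known to RAFT in advance.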
+The consumer of the API can keep such a resource in the `raft::resources` handle. +For example, consider a function that is expected to be called repeatedly and +involves a costly kernel configuration. One can cache the kernel configuration in +a user-defined resource. +The cost of accessing it is one hashmap lookup. + +``#include `` + +namespace *raft::resource* + + .. doxygengroup:: resource_user_defined + :project: RAFT + :members: + :content-only: From de2958058c9c7693f3e33a660f0cdca042e842e0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 31 Aug 2023 17:14:59 +0200 Subject: [PATCH 09/15] move matmul.hpp to cublaslt_wrappers.hpp --- .../raft/linalg/detail/{matmul.hpp => cublaslt_wrappers.hpp} | 1 + cpp/include/raft/linalg/detail/gemm.hpp | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) rename cpp/include/raft/linalg/detail/{matmul.hpp => cublaslt_wrappers.hpp} (99%) diff --git a/cpp/include/raft/linalg/detail/matmul.hpp b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp similarity index 99% rename from cpp/include/raft/linalg/detail/matmul.hpp rename to cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp index 6aa6a32bb8..1c025282ed 100644 --- a/cpp/include/raft/linalg/detail/matmul.hpp +++ b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 1d643566e1..97f85fdae4 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -15,12 +15,10 @@ */ #pragma once -#include "matmul.hpp" +#include "cublaslt_wrappers.hpp" #include -#include // cublasOperation_t - namespace raft::linalg::detail { template From 090141a2f77780b3ef0de70423bac92262b04f65 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 30 Aug 2023 08:55:29 +0200 Subject: [PATCH 10/15] Cache IVF-PQ and select-warpsort kernel launch parameters to reduce latency --- .../raft/matrix/detail/select_k-inl.cuh | 4 +- .../raft/matrix/detail/select_warpsort.cuh | 93 ++++++++++++------ .../raft/neighbors/detail/ivf_pq_search.cuh | 95 ++++++++++++++++--- .../raft_internal/matrix/select_k.cuh | 10 +- 4 files changed, 153 insertions(+), 49 deletions(-) diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index 20fe1963fc..ba3138d918 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -286,11 +286,11 @@ void select_k(raft::resources const& handle, case Algo::kWarpDistributedShm: return detail::select::warpsort:: select_k_impl( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); + handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); case Algo::kWarpImmediate: return detail::select::warpsort:: select_k_impl( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); + handle, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); default: RAFT_FAIL("K-selection Algorithm not supported."); } } diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 0ee87de4f7..3fe5f52d12 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -18,7 +18,9 @@ #include #include +#include #include +#include #include #include #include @@ -773,6 +775,11 @@ __launch_bounds__(256) 
RAFT_KERNEL queue.store(out + block_id * k, out_idx + block_id * k); } +struct launch_params { + int block_size = 0; + int min_grid_size = 0; +}; + template