From c735ecb65abf477fb30a1c1a4015161cd7cbb294 Mon Sep 17 00:00:00 2001
From: William Hicks
Date: Fri, 6 Oct 2023 11:07:53 -0400
Subject: [PATCH] Provide a raft::copy overload for mdspan-to-mdspan copies
 (#1818)

# Purpose

This PR provides a utility for copying between generic mdspans. This includes copies between host and device, between mdspans of different layouts, and between mdspans of different (convertible) data types.

## API

`raft::copy(raft_resources, dest_mdspan, src_mdspan);`

# Limitations

- Does not currently support copies between mdspans on two different GPUs.
- Not currently performant for generic host-to-host copies (these would be much easier to optimize with submdspan support for padded layouts).
- Submdspan with padded layouts would also make it easier to improve the performance of some device-to-device copies, though performance should already be quite good for most device-to-device copies.

# Design

- Includes an optional `RAFT_DISABLE_CUDA` build definition so that this utility can be used in CUDA-free builds (important for use in the FIL backend for Triton).
- Includes a new `raft::stream_view` object, a thin wrapper around `rmm::cuda_stream_view`. Its sole purpose is to provide a symbol that is defined in CUDA-free builds and that throws exceptions or logs error messages if someone tries to use a CUDA stream in a CUDA-free build. This avoids a proliferation of ifdefs that would otherwise spread through the whole codebase.
- Uses (roughly in order of preference) `cudaMemcpyAsync`, `std::copy`, cuBLAS, a custom device kernel, or custom host-to-host transfer logic for the underlying copy.
- Provides two different headers: `raft/core/copy.hpp` and `raft/core/copy.cuh`. This is to accommodate the custom kernel necessary for handling completely generic device-to-device copies. See below for more details.

## Details on the header split

For many instantiations, even those which involve the device, we do not require nvcc compilation. If, however, we determine at compile time that we must use a custom kernel for the copy, then we must invoke nvcc. We do not wish to advertise a public header file as a C++ header when it is actually a CUDA header or vice versa, so we split the definitions into separate `hpp` and `cuh` files, with all template instantiations requiring the custom kernel enable-if'd out of the `hpp` file.

Thus, the `cuh` header can be used for _any_ mdspan-to-mdspan copy, but the `hpp` header will not compile for those specific instantiations that require a custom kernel. The recommended workflow is that if a `cpp` file requires an mdspan-to-mdspan copy, first try the `hpp` header. If that fails to compile, convert the `cpp` file to a `cu` file and use the `cuh` header. Source files that are already compiled with nvcc (i.e. `.cu` files) might as well use the `cuh` header; doing so incurs no additional compile-time penalty.

# Remaining tasks to leave WIP status

- [x] Add benchmarks for copies
- [x] Ensure that the new function is correctly added to docs

# Follow-up items

- Optimize host-to-host transfers using a cache-oblivious approach with SIMD-accelerated transposes for contiguous memory
- Test cache-oblivious device-to-device transfers and compare performance
- Provide transparent support for copies between devices

## Relationship to mdbuffer

This utility encapsulates a substantial chunk of the core logic required for the mdbuffer implementation. It is being split into its own PR both because it is useful on its own and because the mdbuffer work has been delayed by higher-priority tasks.
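## Usage sketch

The following is a minimal sketch of the new overload (not code from this PR; the function name, shapes, and the use of the standard RAFT factories `make_host_matrix`/`make_device_matrix` are illustrative assumptions):

```cpp
#include <raft/core/copy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

void copy_host_row_major_to_device_col_major(raft::device_resources& res)
{
  auto constexpr rows = 3;
  auto constexpr cols = 4;

  // Row-major float source on the host.
  auto src = raft::make_host_matrix<float, int>(res, rows, cols);
  for (auto i = 0; i < rows; ++i) {
    for (auto j = 0; j < cols; ++j) {
      src(i, j) = static_cast<float>(i * cols + j);
    }
  }

  // Column-major float destination on the device.
  auto dst = raft::make_device_matrix<float, int, raft::col_major>(res, rows, cols);

  // Extents must match; layout (and, where convertible, dtype) may differ.
  // The copy is ordered on the stream provided by `res`.
  raft::copy(res, dst.view(), src.view());
  res.sync_stream();
}
```

Per the design above, this particular combination (same dtype, different layout) should be handled without the custom kernel, roughly as a `cudaMemcpyAsync` into a device intermediate followed by a cuBLAS transpose, so the pure C++ `raft/core/copy.hpp` header is sufficient. A copy that changed both dtype and layout on device would instead need `raft/core/copy.cuh` from a `.cu` file.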
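## stream_view sketch

And a small sketch of the CUDA-free-friendly stream handling, based on the `raft/core/stream_view.hpp` and `raft/core/resource/stream_view.hpp` headers added in this diff (the function name is illustrative):

```cpp
#include <raft/core/resource/stream_view.hpp>
#include <raft/core/resources.hpp>

void sync_if_possible(raft::resources const& res)
{
  // In a CUDA build this wraps an rmm::cuda_stream_view; in a
  // RAFT_DISABLE_CUDA build it is a placeholder that throws (or logs) if a
  // real stream operation is attempted.
  auto stream = raft::resource::get_stream_view(res);

  // Compiles in both configurations: synchronizes the underlying CUDA stream
  // in CUDA builds and is a no-op when CUDA is disabled.
  stream.synchronize_if_cuda_enabled();
}
```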
Close #1779 Authors: - William Hicks (https://github.com/wphicks) - Tarang Jain (https://github.com/tarang-jain) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1818 --- cpp/bench/prims/CMakeLists.txt | 6 +- cpp/bench/prims/core/copy.cu | 401 +++++++++++++ cpp/include/raft/core/copy.cuh | 74 +++ cpp/include/raft/core/copy.hpp | 69 +++ cpp/include/raft/core/cuda_support.hpp | 23 + cpp/include/raft/core/detail/copy.hpp | 541 ++++++++++++++++++ cpp/include/raft/core/error.hpp | 12 +- .../raft/core/resource/resource_types.hpp | 2 + .../raft/core/resource/stream_view.hpp | 101 ++++ cpp/include/raft/core/stream_view.hpp | 108 ++++ cpp/test/CMakeLists.txt | 19 +- cpp/test/core/mdspan_copy.cpp | 301 ++++++++++ cpp/test/core/mdspan_copy.cu | 433 ++++++++++++++ cpp/test/core/stream_view.cpp | 43 ++ 14 files changed, 2129 insertions(+), 4 deletions(-) create mode 100644 cpp/bench/prims/core/copy.cu create mode 100644 cpp/include/raft/core/copy.cuh create mode 100644 cpp/include/raft/core/copy.hpp create mode 100644 cpp/include/raft/core/cuda_support.hpp create mode 100644 cpp/include/raft/core/detail/copy.hpp create mode 100644 cpp/include/raft/core/resource/stream_view.hpp create mode 100644 cpp/include/raft/core/stream_view.hpp create mode 100644 cpp/test/core/mdspan_copy.cpp create mode 100644 cpp/test/core/mdspan_copy.cu create mode 100644 cpp/test/core/stream_view.cpp diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; 
+using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f256f9ea0f --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. + * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..0a16b742a2 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. For arbitrary copies of this + * kind, `#include ` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy. Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp new file mode 100644 index 0000000000..07fb95a921 --- /dev/null +++ b/cpp/include/raft/core/cuda_support.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +namespace raft { +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; +#else +auto constexpr static const CUDA_ENABLED = false; +#endif +} // namespace raft diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp new file mode 100644 index 0000000000..b23660fefe --- /dev/null +++ b/cpp/include/raft/core/detail/copy.hpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + 
{ + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif + + } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA + // Perform necessary changes in layout on device, then copy to final + // destination on host + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::custom_kernel_allowed) { +#ifdef __CUDACC__ + config::check_for_unique_dst(dst); + auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_copy_kernel<<>>(dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. 
Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. + auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::STREAM_VIEW); +}; + +/** + * Load a raft::stream__view from a resources instance (and populate it on the res + * if needed). + * @param[in] res raft resources object for managing resources + * @param[in] view raft stream view + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..f7e7934dbf --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; + } + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { + } + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { +#ifndef RAFT_DISABLE_CUDA + interruptible::synchronize(base_view_); +#else + synchronize(); +#endif + } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } + } + + private: + underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } +}; + +auto static const stream_view_per_thread = stream_view{}; + +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0651ccac86..8da5e6986c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_test_init() function(ConfigureTest) - set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA) set(oneValueArgs NAME GPUS 
PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -37,7 +37,11 @@ function(ConfigureTest) set(_RAFT_TEST_PERCENT 100) endif() - set(TEST_NAME ${_RAFT_TEST_NAME}) + if(_RAFT_TEST_NOCUDA) + set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA") + else() + set(TEST_NAME ${_RAFT_TEST_NAME}) + endif() add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( @@ -68,6 +72,9 @@ function(ConfigureTest) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() + if(_RAFT_TEST_NOCUDA) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") + endif() target_include_directories(${TEST_NAME} PUBLIC "$") @@ -117,6 +124,8 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp @@ -124,12 +133,18 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest( + NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB + EXPLICIT_INSTANTIATE_ONLY NOCUDA + ) + ConfigureTest( NAME DISTANCE_TEST diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp new file mode 100644 index 0000000000..2f938e3035 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} +#endif + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = resources{}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; 
i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + 
static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +#endif + +} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..95d7d3befd --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype 
conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_host_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + 
in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + 
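+  // Note (editorial, not part of the original patch): the device copies above
+  // are enqueued on the stream owned by `res`, so the res.sync_stream() calls
+  // before each verification loop ensure the preceding copy has completed
+  // before its results are checked.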
res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +} // namespace raft diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp new file mode 100644 index 0000000000..715c53fe21 --- /dev/null +++ b/cpp/test/core/stream_view.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/core/cuda_support.hpp>
+#include <raft/core/stream_view.hpp>
+#ifndef RAFT_DISABLE_CUDA
+#include <rmm/cuda_stream_view.hpp>
+#endif
+namespace raft {
+TEST(StreamView, Default)
+{
+  auto stream = stream_view_per_thread;
+  ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED);
+  ASSERT_FALSE(stream.is_default());
+  if (raft::CUDA_ENABLED) {
+    EXPECT_NO_THROW(stream.synchronize());
+    EXPECT_NO_THROW(stream.interruptible_synchronize());
+  } else {
+    EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error);
+    EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error);
+  }
+  EXPECT_NO_THROW(stream.synchronize_no_throw());
+  EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled());
+#ifndef RAFT_DISABLE_CUDA
+  static_assert(std::is_same_v<decltype(stream.underlying()), rmm::cuda_stream_view>,
+                "underlying should return rmm::cuda_stream_view");
+#endif
+}
+}  // namespace raft
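
// Editorial addendum, not part of the patch: a minimal usage sketch of the new
// raft::copy overload, mirroring the calls exercised by the tests above. The
// header choice and helper names (make_host_mdarray, make_device_mdarray,
// device_resources) follow those tests; copy.cuh is used because a
// layout-changing host-to-device copy may require the custom kernel, so this
// code would live in a .cu file.
#include <raft/core/copy.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

#include <cstdint>

void copy_example()
{
  auto res = raft::device_resources{};

  // 3 x 4 host matrix in row-major (layout_right) order.
  auto src = raft::make_host_mdarray<float, std::uint32_t, raft::layout_c_contiguous>(
    res, raft::extents<std::uint32_t, 3, 4>{});
  for (auto i = std::uint32_t{}; i < 3; ++i) {
    for (auto j = std::uint32_t{}; j < 4; ++j) {
      src(i, j) = static_cast<float>(i * 4 + j);
    }
  }

  // Column-major (layout_left) destination on device with a different value type;
  // a single raft::copy call handles the transfer, the transpose, and the conversion.
  auto dst = raft::make_device_mdarray<double, std::uint32_t, raft::layout_f_contiguous>(
    res, raft::extents<std::uint32_t, 3, 4>{});
  raft::copy(res, dst.view(), src.view());
  res.sync_stream();
}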