From c735ecb65abf477fb30a1c1a4015161cd7cbb294 Mon Sep 17 00:00:00 2001
From: William Hicks
Date: Fri, 6 Oct 2023 11:07:53 -0400
Subject: [PATCH] Provide a raft::copy overload for mdspan-to-mdspan copies
 (#1818)

# Purpose

This PR provides a utility for copying between generic mdspans. This includes copies between host and device, between mdspans of different layouts, and between mdspans of different (convertible) data types.

## API

`raft::copy(raft_resources, dest_mdspan, src_mdspan);`

# Limitations

- Does not currently support copies between mdspans on two different GPUs.
- Not currently performant for generic host-to-host copies (these would be much easier to optimize with submdspan support for padded layouts).
- Submdspan with padded layouts would also make it easier to improve the performance of some device-to-device copies, though performance should already be quite good for most device-to-device copies.

# Design

- Includes an optional `RAFT_DISABLE_CUDA` build definition so that this utility can be used in CUDA-free builds (important for use in the FIL backend for Triton).
- Includes a new `raft::stream_view` object, a thin wrapper around `rmm::cuda_stream_view`. Its sole purpose is to provide a symbol that is defined in CUDA-free builds and that throws exceptions or logs error messages if someone tries to use a CUDA stream in a CUDA-free build. This avoids a proliferation of ifdefs that would otherwise spread through the whole codebase.
- Uses (roughly in order of preference) `cudaMemcpyAsync`, `std::copy`, cuBLAS, a custom device kernel, or custom host-to-host transfer logic for the underlying copy.
- Provides two different headers: `raft/core/copy.hpp` and `raft/core/copy.cuh`. This is to accommodate the custom kernel necessary for handling completely generic device-to-device copies. See below for more details.

## Details on the header split

For many instantiations, even those which involve the device, we do not require nvcc compilation. If, however, we determine at compile time that we must use a custom kernel for the copy, then we must invoke nvcc. We do not wish to advertise a public header file as a C++ header when it is actually a CUDA header or vice versa, so we split the definitions into separate `hpp` and `cuh` files, with all template instantiations requiring the custom kernel enable-if'd out of the `hpp` file.

Thus, the `cuh` header can be used for _any_ mdspan-to-mdspan copy, but the `hpp` header will not compile for those specific instantiations that require a custom kernel. The recommended workflow is that if a `cpp` file requires an mdspan-to-mdspan copy, first try the `hpp` header. If that fails to compile, convert the `cpp` file to a `cu` file and use the `cuh` header. Source files that are already compiled with nvcc (i.e. `.cu` files) might as well use the `cuh` header; doing so incurs no additional compile-time penalty.

# Remaining tasks to leave WIP status

- [x] Add benchmarks for copies
- [x] Ensure that the new function is correctly added to docs

# Follow-up items

- Optimize host-to-host transfers using a cache-oblivious approach with SIMD-accelerated transposes for contiguous memory
- Test cache-oblivious device-to-device transfers and compare performance
- Provide transparent support for copies between devices

## Relationship to mdbuffer

This utility encapsulates a substantial chunk of the core logic required for the mdbuffer implementation. It is being split into its own PR both because it is useful on its own and because the mdbuffer work has been delayed by higher-priority tasks.
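## Usage sketch

The following is a minimal sketch of the new overload (not code from this PR; the function name, shapes, and the use of the standard RAFT factories `make_host_matrix`/`make_device_matrix` are illustrative assumptions):

```cpp
#include <raft/core/copy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

void copy_host_row_major_to_device_col_major(raft::device_resources& res)
{
  auto constexpr rows = 3;
  auto constexpr cols = 4;

  // Row-major float source on the host.
  auto src = raft::make_host_matrix<float, int>(res, rows, cols);
  for (auto i = 0; i < rows; ++i) {
    for (auto j = 0; j < cols; ++j) {
      src(i, j) = static_cast<float>(i * cols + j);
    }
  }

  // Column-major float destination on the device.
  auto dst = raft::make_device_matrix<float, int, raft::col_major>(res, rows, cols);

  // Extents must match; layout (and, where convertible, dtype) may differ.
  // The copy is ordered on the stream provided by `res`.
  raft::copy(res, dst.view(), src.view());
  res.sync_stream();
}
```

Per the design above, this particular combination (same dtype, different layout) should be handled without the custom kernel, roughly as a `cudaMemcpyAsync` into a device intermediate followed by a cuBLAS transpose, so the pure C++ `raft/core/copy.hpp` header is sufficient. A copy that changed both dtype and layout on device would instead need `raft/core/copy.cuh` from a `.cu` file.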
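## stream_view sketch

And a small sketch of the CUDA-free-friendly stream handling, based on the `raft/core/stream_view.hpp` and `raft/core/resource/stream_view.hpp` headers added in this diff (the function name is illustrative):

```cpp
#include <raft/core/resource/stream_view.hpp>
#include <raft/core/resources.hpp>

void sync_if_possible(raft::resources const& res)
{
  // In a CUDA build this wraps an rmm::cuda_stream_view; in a
  // RAFT_DISABLE_CUDA build it is a placeholder that throws (or logs) if a
  // real stream operation is attempted.
  auto stream = raft::resource::get_stream_view(res);

  // Compiles in both configurations: synchronizes the underlying CUDA stream
  // in CUDA builds and is a no-op when CUDA is disabled.
  stream.synchronize_if_cuda_enabled();
}
```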
Close #1779 Authors: - William Hicks (https://github.com/wphicks) - Tarang Jain (https://github.com/tarang-jain) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1818 --- cpp/bench/prims/CMakeLists.txt | 6 +- cpp/bench/prims/core/copy.cu | 401 +++++++++++++ cpp/include/raft/core/copy.cuh | 74 +++ cpp/include/raft/core/copy.hpp | 69 +++ cpp/include/raft/core/cuda_support.hpp | 23 + cpp/include/raft/core/detail/copy.hpp | 541 ++++++++++++++++++ cpp/include/raft/core/error.hpp | 12 +- .../raft/core/resource/resource_types.hpp | 2 + .../raft/core/resource/stream_view.hpp | 101 ++++ cpp/include/raft/core/stream_view.hpp | 108 ++++ cpp/test/CMakeLists.txt | 19 +- cpp/test/core/mdspan_copy.cpp | 301 ++++++++++ cpp/test/core/mdspan_copy.cu | 433 ++++++++++++++ cpp/test/core/stream_view.cpp | 43 ++ 14 files changed, 2129 insertions(+), 4 deletions(-) create mode 100644 cpp/bench/prims/core/copy.cu create mode 100644 cpp/include/raft/core/copy.cuh create mode 100644 cpp/include/raft/core/copy.hpp create mode 100644 cpp/include/raft/core/cuda_support.hpp create mode 100644 cpp/include/raft/core/detail/copy.hpp create mode 100644 cpp/include/raft/core/resource/stream_view.hpp create mode 100644 cpp/include/raft/core/stream_view.hpp create mode 100644 cpp/test/core/mdspan_copy.cpp create mode 100644 cpp/test/core/mdspan_copy.cu create mode 100644 cpp/test/core/stream_view.cpp diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; 
+using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f256f9ea0f --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. + * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..0a16b742a2 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. For arbitrary copies of this + * kind, `#include ` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy. Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp new file mode 100644 index 0000000000..07fb95a921 --- /dev/null +++ b/cpp/include/raft/core/cuda_support.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +namespace raft { +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; +#else +auto constexpr static const CUDA_ENABLED = false; +#endif +} // namespace raft diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp new file mode 100644 index 0000000000..b23660fefe --- /dev/null +++ b/cpp/include/raft/core/detail/copy.hpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + 
{ + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif + + } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA + // Perform necessary changes in layout on device, then copy to final + // destination on host + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::custom_kernel_allowed) { +#ifdef __CUDACC__ + config::check_for_unique_dst(dst); + auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_copy_kernel<<>>(dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. 
Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. + auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::STREAM_VIEW); +}; + +/** + * Load a raft::stream__view from a resources instance (and populate it on the res + * if needed). + * @param[in] res raft resources object for managing resources + * @param[in] view raft stream view + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..f7e7934dbf --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; + } + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { + } + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { +#ifndef RAFT_DISABLE_CUDA + interruptible::synchronize(base_view_); +#else + synchronize(); +#endif + } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } + } + + private: + underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } +}; + +auto static const stream_view_per_thread = stream_view{}; + +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0651ccac86..8da5e6986c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_test_init() function(ConfigureTest) - set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA) set(oneValueArgs NAME GPUS 
PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -37,7 +37,11 @@ function(ConfigureTest) set(_RAFT_TEST_PERCENT 100) endif() - set(TEST_NAME ${_RAFT_TEST_NAME}) + if(_RAFT_TEST_NOCUDA) + set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA") + else() + set(TEST_NAME ${_RAFT_TEST_NAME}) + endif() add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( @@ -68,6 +72,9 @@ function(ConfigureTest) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() + if(_RAFT_TEST_NOCUDA) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") + endif() target_include_directories(${TEST_NAME} PUBLIC "$") @@ -117,6 +124,8 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp @@ -124,12 +133,18 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest( + NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB + EXPLICIT_INSTANTIATE_ONLY NOCUDA + ) + ConfigureTest( NAME DISTANCE_TEST diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp new file mode 100644 index 0000000000..2f938e3035 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} +#endif + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = resources{}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; 
i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + 
static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +#endif + +} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..95d7d3befd --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype 
conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_host_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + 
in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + 
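+  // Note (editorial, not part of the original patch): the device copies above
+  // are enqueued on the stream owned by `res`, so the res.sync_stream() calls
+  // before each verification loop ensure the preceding copy has completed
+  // before its results are checked.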
res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +} // namespace raft diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp new file mode 100644 index 0000000000..715c53fe21 --- /dev/null +++ b/cpp/test/core/stream_view.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <raft/core/cuda_support.hpp>
+#include <raft/core/stream_view.hpp>
+#ifndef RAFT_DISABLE_CUDA
+#include <rmm/cuda_stream_view.hpp>
+#endif
+namespace raft {
+TEST(StreamView, Default)
+{
+  auto stream = stream_view_per_thread;
+  ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED);
+  ASSERT_FALSE(stream.is_default());
+  if (raft::CUDA_ENABLED) {
+    EXPECT_NO_THROW(stream.synchronize());
+    EXPECT_NO_THROW(stream.interruptible_synchronize());
+  } else {
+    EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error);
+    EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error);
+  }
+  EXPECT_NO_THROW(stream.synchronize_no_throw());
+  EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled());
+#ifndef RAFT_DISABLE_CUDA
+  static_assert(std::is_same_v<decltype(stream.underlying()), rmm::cuda_stream_view>,
+                "underlying should return rmm::cuda_stream_view");
+#endif
+}
+}  // namespace raft
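
// Editorial addendum, not part of the patch: a minimal usage sketch of the new
// raft::copy overload, mirroring the calls exercised by the tests above. The
// header choice and helper names (make_host_mdarray, make_device_mdarray,
// device_resources) follow those tests; copy.cuh is used because a
// layout-changing host-to-device copy may require the custom kernel, so this
// code would live in a .cu file.
#include <raft/core/copy.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

#include <cstdint>

void copy_example()
{
  auto res = raft::device_resources{};

  // 3 x 4 host matrix in row-major (layout_right) order.
  auto src = raft::make_host_mdarray<float, std::uint32_t, raft::layout_c_contiguous>(
    res, raft::extents<std::uint32_t, 3, 4>{});
  for (auto i = std::uint32_t{}; i < 3; ++i) {
    for (auto j = std::uint32_t{}; j < 4; ++j) {
      src(i, j) = static_cast<float>(i * 4 + j);
    }
  }

  // Column-major (layout_left) destination on device with a different value type;
  // a single raft::copy call handles the transfer, the transpose, and the conversion.
  auto dst = raft::make_device_mdarray<double, std::uint32_t, raft::layout_f_contiguous>(
    res, raft::extents<std::uint32_t, 3, 4>{});
  raft::copy(res, dst.view(), src.view());
  res.sync_stream();
}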