From 6acae3c2f41638115a518bf7491c004d8f642489 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Thu, 16 Nov 2023 16:23:49 -0800
Subject: [PATCH] Use `cuda::mr::memory_resource` instead of raw
 `device_memory_resource` (#1095)

This introduces `cuda::mr::{async_}resource_ref` as a type-erased, safe resource wrapper that is meant to replace uses of `{host, device}_memory_resource`. We provide both async and classic allocate functions that delegate back to the original resource used to construct the `cuda::mr::{async_}resource_ref`.

In comparison to `{host, device}_memory_resource`, the new feature provides additional compile-time checks that will help users avoid common pitfalls with heterogeneous memory allocations. As a first step we provide the properties `cuda::mr::host_accessible` and `cuda::mr::device_accessible`. These properties can be added to an internal or even an external type through a free function `get_property`:

```cpp
// For a user-defined resource
struct my_resource {
  friend void get_property(my_resource const&, cuda::mr::device_accessible) noexcept {}
};

// For an external resource
void get_property(some_external_resource const&, cuda::mr::device_accessible) noexcept {}
```

The advantage is that we can constrain interfaces based on these properties:

```cpp
void do_some_computation_on_device(cuda::mr::async_resource_ref<cuda::mr::device_accessible> mr, ...) { ... }
```

This function will fail to compile if it is passed any resource that does not support async allocations or is not tagged as providing device-accessible memory. In the same way, the following function will only compile if the provided resource supports the classic allocate/deallocate interface and is tagged as providing host-accessible memory:

```cpp
void do_some_computation_on_host(cuda::mr::resource_ref<cuda::mr::host_accessible> mr, ...) { ... }
```

The property system is highly flexible: users can easily provide their own properties as needed. This gives it both the flexibility of an inheritance-based implementation and the safety of a strictly type-checked interface.
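To make the moving parts concrete, here is a minimal, self-contained sketch (not part of this diff; `my_device_resource` and `fill_on_device` are hypothetical names) of a resource that opts in to `cuda::mr::device_accessible` and is then passed through a constrained interface. It assumes libcudacxx's experimental `<cuda/memory_resource>` header, which this change enables via `LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE`:

```cpp
#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
#include <cuda/memory_resource>

#include <cuda_runtime_api.h>

#include <cstddef>

// Hypothetical resource: allocates device memory with cudaMalloc and opts in
// to the device_accessible property via the get_property friend function.
struct my_device_resource {
  void* allocate(std::size_t bytes, std::size_t /*alignment*/)
  {
    void* ptr{nullptr};
    cudaMalloc(&ptr, bytes);
    return ptr;
  }
  void deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) noexcept
  {
    cudaFree(ptr);
  }
  bool operator==(my_device_resource const&) const noexcept { return true; }
  bool operator!=(my_device_resource const&) const noexcept { return false; }
  friend void get_property(my_device_resource const&, cuda::mr::device_accessible) noexcept {}
};

// Accepts only resources that satisfy cuda::mr::resource *and* are tagged
// device accessible; passing a host-only resource is a compile error.
void fill_on_device(cuda::mr::resource_ref<cuda::mr::device_accessible> mr, std::size_t bytes)
{
  void* ptr = mr.allocate(bytes, 256);
  cudaMemset(ptr, 0, bytes);
  mr.deallocate(ptr, bytes, 256);
}

int main()
{
  my_device_resource mr{};
  fill_on_device(mr, 1024);  // resource_ref binds to mr; no inheritance needed
  return 0;
}
```

Note that `fill_on_device` accepts `my_device_resource` without the resource deriving from `device_memory_resource`; the `resource_ref` performs the type erasure.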
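Similarly, a sketch of container usage under the new interface (again illustrative, not part of the diff): since `device_buffer` and `device_uvector` now take a `cuda::mr::async_resource_ref<cuda::mr::device_accessible>`, existing code that passes a `device_memory_resource*` keeps working through the implicit conversion used for the default argument below:

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  // A pool on top of the plain CUDA resource; pool sizes use the defaults.
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
  rmm::cuda_stream stream{};
  // The raw pointer converts implicitly to async_resource_ref, so existing
  // call sites do not change; any non-RMM resource satisfying the concept
  // and tagged device accessible also works here.
  rmm::device_buffer buf{1024, stream.view(), &pool_mr};
  return 0;
}
```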
Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/rmm/pull/1095
---
 CMakeLists.txt                                |   3 +
 cmake/thirdparty/get_libcudacxx.cmake         |  23 +
 include/rmm/cuda_stream_view.hpp              |  16 +
 include/rmm/device_buffer.hpp                 |  26 +-
 include/rmm/device_uvector.hpp                |  21 +-
 .../mr/device/callback_memory_resource.hpp    |   4 +-
 .../rmm/mr/device/device_memory_resource.hpp  | 146 ++++++-
 .../rmm/mr/device/pool_memory_resource.hpp    |  71 ++-
 .../mr/device/thrust_allocator_adaptor.hpp    |  26 +-
 include/rmm/mr/host/host_memory_resource.hpp  |  35 ++
 .../rmm/mr/host/pinned_memory_resource.hpp    |  81 ++++
 python/docs/conf.py                           |  12 +
 python/rmm/tests/test_rmm.py                  |  32 +-
 tests/CMakeLists.txt                          |  10 +
 tests/device_buffer_tests.cu                  |  47 +-
 tests/device_uvector_tests.cpp                |   5 +-
 tests/mr/device/adaptor_tests.cpp             |  19 +
 tests/mr/device/cuda_async_mr_tests.cpp       |   2 +
 tests/mr/device/cuda_async_view_mr_tests.cpp  |   4 +
 .../mr/device/mr_ref_multithreaded_tests.cpp  | 232 ++++++++++
 tests/mr/device/mr_ref_test.hpp               | 408 ++++++++++++++++++
 tests/mr/device/mr_ref_tests.cpp              | 109 +++++
 tests/mr/device/mr_test.hpp                   |   4 -
 tests/mr/device/pool_mr_tests.cpp             |  51 +++
 tests/mr/device/thrust_allocator_tests.cu     |   4 +-
 tests/mr/host/mr_ref_tests.cpp                | 258 +++++++++++
 tests/mr/host/mr_tests.cpp                    |   8 +-
 tests/mr/host/pinned_pool_mr_tests.cpp        |  96 +++++
 28 files changed, 1664 insertions(+), 89 deletions(-)
 create mode 100644 cmake/thirdparty/get_libcudacxx.cmake
 create mode 100644 tests/mr/device/mr_ref_multithreaded_tests.cpp
 create mode 100644 tests/mr/device/mr_ref_test.hpp
 create mode 100644 tests/mr/device/mr_ref_tests.cpp
 create mode 100644 tests/mr/host/mr_ref_tests.cpp
 create mode 100644 tests/mr/host/pinned_pool_mr_tests.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c883bc4c..93fcdabe9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,6 +70,7 @@ rapids_cpm_init()

 include(cmake/thirdparty/get_fmt.cmake)
 include(cmake/thirdparty/get_spdlog.cmake)
+include(cmake/thirdparty/get_libcudacxx.cmake)
 include(cmake/thirdparty/get_thrust.cmake)

 # ##################################################################################################
@@ -89,11 +90,13 @@ else()
   target_link_libraries(rmm INTERFACE CUDA::cudart)
 endif()

+target_link_libraries(rmm INTERFACE libcudacxx::libcudacxx)
 target_link_libraries(rmm INTERFACE rmm::Thrust)
 target_link_libraries(rmm INTERFACE fmt::fmt-header-only)
 target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only)
 target_link_libraries(rmm INTERFACE dl)
 target_compile_features(rmm INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
+target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)

 # ##################################################################################################
 # * tests and benchmarks ---------------------------------------------------------------------------
diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake
new file mode 100644
index 000000000..14b0d492f
--- /dev/null
+++ b/cmake/thirdparty/get_libcudacxx.cmake
@@ -0,0 +1,23 @@
+# =============================================================================
+# Copyright (c) 2023, NVIDIA
CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone libcudacxx +function(find_and_configure_libcudacxx) + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +endfunction() + +find_and_configure_libcudacxx() diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index 7809140fb..f8564b16b 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -58,6 +60,13 @@ class cuda_stream_view { */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} + /** + * @brief Implicit conversion from stream_ref. + * + * @param stream The underlying stream for this view + */ + constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {} + /** * @brief Get the wrapped stream. * @@ -72,6 +81,13 @@ class cuda_stream_view { */ constexpr operator cudaStream_t() const noexcept { return value(); } + /** + * @brief Implicit conversion to stream_ref. + * + * @return stream_ref The underlying stream referenced by this cuda_stream_view + */ + constexpr operator cuda::stream_ref() const noexcept { return value(); } + /** * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 20fa4f36e..4a780018e 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -28,6 +28,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -80,6 +82,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -107,7 +111,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -136,7 +140,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -167,7 +171,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -245,7 +249,6 @@ class device_buffer { { cuda_set_device_raii dev{_device}; 
deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -407,18 +410,19 @@ class device_buffer { void set_stream(cuda_stream_view stream) noexcept { _stream = stream; } /** - * @briefreturn{Pointer to the memory resource used to allocate and deallocate} + * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + + async_resource_ref _mr{ + rmm::mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory cuda_device_id _device{get_current_cuda_device()}; /** @@ -434,7 +438,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -448,7 +452,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 982d2095d..3f77f59f7 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -524,9 +525,9 @@ class device_uvector { [[nodiscard]] bool is_empty() const noexcept { return size() == 0; } /** - * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} + * @briefreturn{The async_resource_ref used to allocate and 
deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _storage.memory_resource(); } diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c6519ed5c..36802c83a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource { throw std::runtime_error("cannot get free / total memory"); } - [[nodiscard]] virtual bool supports_streams() const noexcept { return false; } - [[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; } + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } allocate_callback_t allocate_callback_; deallocate_callback_t deallocate_callback_; diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index bda52ac67..63e5f39a4 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -119,7 +121,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -155,6 +157,140 @@ class device_memory_resource { return do_is_equal(other); } + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate(std::size_t bytes, std::size_t alignment) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on + * the specified `stream`. 
+ * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, cuda_stream_view stream) + { + return do_allocate(bytes, stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda_stream_view stream) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + do_deallocate(ptr, bytes, stream); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + /** * @brief Query whether the resource supports use of non-null CUDA streams for * allocation/deallocation. 
@@ -183,6 +319,13 @@ class device_memory_resource {
     return do_get_mem_info(stream);
   }

+  /**
+   * @brief Enables the `cuda::mr::device_accessible` property
+   *
+   * This property declares that a `device_memory_resource` provides device accessible memory
+   */
+  friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {}
+
  private:
   /**
    * @brief Allocates memory of size at least \p bytes.
@@ -241,5 +384,6 @@ class device_memory_resource {
   [[nodiscard]] virtual std::pair<std::size_t, std::size_t> do_get_mem_info(
     cuda_stream_view stream) const = 0;
 };
+static_assert(cuda::mr::async_resource_with<device_memory_resource, cuda::mr::device_accessible>);
 /** @} */  // end of group
 }  // namespace rmm::mr
diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index f6d3710e9..c85408359 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -51,6 +51,36 @@ namespace rmm::mr {
  * @{
  * @file
  */
+namespace detail {
+/**
+ * @brief A helper class to remove the device_accessible property
+ *
+ * We want to be able to use the pool_memory_resource with an upstream that may not
+ * be device accessible. To avoid rewriting the world, we allow conditionally removing
+ * the cuda::mr::device_accessible property.
+ *
+ * @tparam PoolResource the pool_memory_resource class
+ * @tparam Upstream memory_resource to use for allocating the pool.
+ * @tparam Property The property we want to potentially remove.
+ */
+template <class PoolResource, class Upstream, class Property, class = void>
+struct maybe_remove_property {};
+
+/**
+ * @brief Specialization of maybe_remove_property to not propagate nonexistent properties
+ */
+template <class PoolResource, class Upstream, class Property>
+struct maybe_remove_property<PoolResource,
+                             Upstream,
+                             Property,
+                             cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {
+  /**
+   * @brief Explicit removal of the friend function so we do not pretend to provide device
+   * accessible memory
+   */
+  friend void get_property(const PoolResource&, Property) = delete;
+};
+}  // namespace detail

 /**
  * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from
@@ -64,8 +94,11 @@
  */
 template <typename Upstream>
 class pool_memory_resource final
-  : public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
-                                                  detail::coalescing_free_list> {
+  : public detail::
+      maybe_remove_property<pool_memory_resource<Upstream>, Upstream, cuda::mr::device_accessible>,
+    public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
+                                                  detail::coalescing_free_list>,
+    public cuda::forward_property<pool_memory_resource<Upstream>, Upstream> {
  public:
   friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
                                                       detail::coalescing_free_list>;
@@ -104,6 +137,31 @@ class pool_memory_resource final
     initialize_pool(initial_pool_size, maximum_pool_size);
   }

+  /**
+   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
+   * `upstream_mr`.
+   *
+   * @throws rmm::logic_error if `upstream_mr == nullptr`
+   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   *
+   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
+   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to half of the
+   * available memory on the current device.
+   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
+   * of the available memory on the current device.
+   */
+  template <typename Upstream2                                       = Upstream,
+            cuda::std::enable_if_t<cuda::mr::async_resource<Upstream2>, int> = 0>
+  explicit pool_memory_resource(Upstream2& upstream_mr,
+                                thrust::optional<std::size_t> initial_pool_size = thrust::nullopt,
+                                thrust::optional<std::size_t> maximum_pool_size = thrust::nullopt)
+    : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
+  {
+  }
+
   /**
    * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using
    * the upstream resource.
@@ -131,6 +189,13 @@
    */
   [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }

+  /**
+   * @brief Get the upstream memory_resource object.
+   *
+   * @return const reference to the upstream memory resource.
+   */
+  [[nodiscard]] const Upstream& upstream_resource() const noexcept { return *upstream_mr_; }
+
   /**
    * @brief Get the upstream memory_resource object.
    *
@@ -296,7 +361,7 @@
     if (size == 0) { return {}; }

     try {
-      void* ptr = get_upstream()->allocate(size, stream);
+      void* ptr = get_upstream()->allocate_async(size, stream);
       return thrust::optional<block_type>{
         *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
     } catch (std::exception const& e) {
diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
index a1386a842..562a0d79e 100644
--- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp
+++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -24,6 +24,8 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_ptr.h>

+#include <cuda/memory_resource>
+
 namespace rmm::mr {
 /**
  * @addtogroup device_resource_adaptors
@@ -42,6 +44,8 @@ namespace rmm::mr {
  */
 template <typename T>
 class thrust_allocator : public thrust::device_malloc_allocator<T> {
+  using async_resource_ref = cuda::mr::async_resource_ref<cuda::mr::device_accessible>;
+
  public:
   using Base = thrust::device_malloc_allocator<T>;  ///< The base type of this allocator
   using pointer = typename Base::pointer;           ///< The pointer type
@@ -79,9 +83,7 @@
    * @param mr The resource to be used for device memory allocation
    * @param stream The stream to be used for device memory (de)allocation
    */
-  thrust_allocator(cuda_stream_view stream, device_memory_resource* mr) : _stream{stream}, _mr(mr)
-  {
-  }
+  thrust_allocator(cuda_stream_view stream, async_resource_ref mr) : _stream{stream}, _mr(mr) {}

   /**
    * @brief Copy constructor. Copies the resource pointer and stream.
@@ -102,7 +104,8 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ pointer allocate(size_type num) { - return thrust::device_pointer_cast(static_cast(_mr->allocate(num * sizeof(T), _stream))); + return thrust::device_pointer_cast( + static_cast(_mr.allocate_async(num * sizeof(T), _stream))); } /** @@ -114,22 +117,29 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ void deallocate(pointer ptr, size_type num) { - return _mr->deallocate(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream); + return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream); } /** - * @briefreturn{The device memory resource used by this} + * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] device_memory_resource* resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } /** * @briefreturn{The stream used by this allocator} */ [[nodiscard]] cuda_stream_view stream() const noexcept { return _stream; } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `thrust_allocator` provides device accessible memory + */ + friend void get_property(thrust_allocator const&, cuda::mr::device_accessible) noexcept {} + private: cuda_stream_view _stream{}; - device_memory_resource* _mr{rmm::mr::get_current_device_resource()}; + async_resource_ref _mr{rmm::mr::get_current_device_resource()}; }; /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index 3f6f90785..ce870287c 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -112,6 +114,37 @@ class host_memory_resource { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + + /** + * @brief Enables the `cuda::mr::host_accessible` property + * + * This property declares that a `host_memory_resource` provides host accessible memory + */ + friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {} + private: /** * @brief Allocates memory on the host of size at least `bytes` bytes. 
@@ -162,5 +195,7 @@ class host_memory_resource {
     return this == &other;
   }
 };
+static_assert(cuda::mr::resource_with<host_memory_resource, cuda::mr::host_accessible>);
 /** @} */  // end of group
+
 }  // namespace rmm::mr
diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp
index f8d08f66c..e49767faf 100644
--- a/include/rmm/mr/host/pinned_memory_resource.hpp
+++ b/include/rmm/mr/host/pinned_memory_resource.hpp
@@ -15,6 +15,7 @@
  */
 #pragma once

+#include <cuda/memory_resource>
 #include <rmm/detail/aligned.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/mr/host/host_memory_resource.hpp>
@@ -46,6 +47,83 @@ class pinned_memory_resource final : public host_memory_resource {
   pinned_memory_resource& operator=(pinned_memory_resource&&) =
     default;  ///< @default_move_assignment{pinned_memory_resource}

+  /**
+   * @brief Query whether the pinned_memory_resource supports use of non-null CUDA streams for
+   * allocation/deallocation.
+   *
+   * @returns bool false.
+   */
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return bool false.
+   */
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+  /**
+   * @brief Queries the amount of free and total memory for the resource.
+   *
+   * @param stream the stream whose memory manager we want to retrieve
+   *
+   * @returns a pair containing the free memory in bytes in .first and total amount of memory in
+   * .second
+   */
+  [[nodiscard]] std::pair<std::size_t, std::size_t> get_mem_info(cuda_stream_view stream) const
+  {
+    return std::make_pair(0, 0);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param alignment The expected alignment of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    return do_allocate(bytes, alignment);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, cuda_stream_view)
+  {
+    return do_allocate(bytes);
+  }
+
+  /**
+   * @brief Pretend to support the deallocate_async interface, falling back to stream 0
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param alignment The alignment that was passed to the `allocate` call that returned `p`
+   */
+  void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, alignment));
+  }
+
+  /**
+   * @brief Enables the `cuda::mr::device_accessible` property
+   *
+   * This property declares that a `pinned_memory_resource` provides device accessible memory
+   */
+  friend void get_property(pinned_memory_resource const&, cuda::mr::device_accessible) noexcept {}
+
  private:
   /**
    * @brief Allocates pinned memory on the host of size at least `bytes` bytes.
@@ -99,5 +177,8 @@ class pinned_memory_resource final : public host_memory_resource { ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } }; +static_assert(cuda::mr::async_resource_with); /** @} */ // end of group } // namespace rmm::mr diff --git a/python/docs/conf.py b/python/docs/conf.py index a063b52eb..ec6ddc70a 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -216,6 +216,18 @@ def on_missing_reference(app, env, node, contnode): "cudaStreamPerThread", "thrust", "spdlog", + "stream_ref", + # libcu++ names + "cuda", + "cuda::mr", + "resource", + "resource_ref", + "async_resource", + "async_resource_ref", + "device_accessible", + "host_accessible", + "forward_property", + "enable_if_t", # Unknown types "int64_t", "int8_t", diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index fd537749b..b5dc81c1f 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5000, + "current_bytes": 5040, "current_count": 5, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 10000, + "total_bytes": 10080, "total_count": 10, } @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2000, + "current_bytes": 2016, "current_count": 2, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7000, + "current_bytes": 7056, "current_count": 7, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5000 + assert mr.get_allocated_bytes() == 5040 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2000 - assert mr.get_allocated_bytes() == 7000 + assert mr2.get_allocated_bytes() == 2016 + assert mr.get_allocated_bytes() == 7056 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 752496279..a3d493e40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,10 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) +# device mr_ref tests +ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp + mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 100) + # general adaptor tests ConfigureTest(ADAPTOR_TEST 
mr/device/adaptor_tests.cpp) @@ -162,6 +166,12 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp) # host mr tests ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) +# host mr_ref tests +ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) + +# pinned pool mr tests +ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index e0d8e5555..f73be0201 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -58,6 +58,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -74,7 +75,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -85,30 +86,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -120,7 +119,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -136,7 +135,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, 
buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -149,7 +148,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -175,8 +174,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -188,7 +187,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -218,8 +217,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -244,7 +243,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -276,7 +274,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -292,7 +289,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -310,7 +307,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // 
NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -320,7 +316,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -339,7 +335,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -348,7 +343,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -367,7 +362,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -376,7 +370,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -395,7 +389,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -404,7 +397,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..3c042a437 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 44c14240b..98fc3a429 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ 
b/tests/mr/device/adaptor_tests.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -64,6 +66,23 @@ using adaptors = ::testing::Types, thread_safe_resource_adaptor, tracking_resource_adaptor>; +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); + template struct AdaptorTest : public ::testing::Test { using adaptor_type = MemoryResourceType; diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 37ed5c306..90c7b0ff9 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -24,6 +24,8 @@ namespace rmm::test { namespace { using cuda_async_mr = rmm::mr::cuda_async_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); class AsyncMRTest : public ::testing::Test { protected: diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 209429b4b..fe82431a9 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -18,12 +18,16 @@ #include #include +#include + #include namespace rmm::test { namespace { using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp new file mode 100644 index 000000000..76f9e6b61 --- /dev/null +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace rmm::test { +namespace { + +struct mr_ref_test_mt : public mr_ref_test {}; + +INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); + +template +void spawn_n(std::size_t num_threads, Task task, Arguments&&... 
args) +{ + std::vector threads; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread(task, std::forward(args)...)); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +template +void spawn(Task task, Arguments&&... args) +{ + spawn_n(4, task, std::forward(args)...); +} + +TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } + +TEST_P(mr_ref_test_mt, AllocateDefaultStream) +{ + spawn(test_various_async_allocations, this->ref, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, AllocateOnStream) +{ + spawn(test_various_async_allocations, this->ref, this->stream.view()); +} + +TEST_P(mr_ref_test_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->ref, default_num_allocations, default_max_size); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsDefaultStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + this->stream.view()); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->ref, default_max_size); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeDefaultStream) +{ + spawn( + test_mixed_random_async_allocation_free, this->ref, default_max_size, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_async_allocation_free, this->ref, default_max_size, this->stream.view()); +} + +void allocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + std::condition_variable& allocations_ready, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t size = size_distribution(generator); + void* ptr = ref.allocate_async(size, stream); + { + std::lock_guard lock(mtx); + RMM_CUDA_TRY(cudaEventRecord(event, stream.value())); + allocations.emplace_back(ptr, size); + } + allocations_ready.notify_one(); + } + + // Work around for threads going away before cudaEvent has finished async processing + cudaEventSynchronize(event); +} + +void deallocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + std::condition_variable& allocations_ready, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + for (std::size_t i = 0; i < num_allocations; i++) { + std::unique_lock lock(mtx); + allocations_ready.wait(lock, [&allocations] { return !allocations.empty(); }); + RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); + allocation alloc = allocations.front(); + allocations.pop_front(); + ref.deallocate_async(alloc.ptr, alloc.size, stream); + } + + // Work around for threads going away before cudaEvent has finished async processing + cudaEventSynchronize(event); +} + +void test_allocate_async_free_different_threads(async_resource_ref ref, + rmm::cuda_stream_view streamA, + rmm::cuda_stream_view streamB) +{ + constexpr std::size_t num_allocations{100}; + + std::mutex mtx; + std::condition_variable allocations_ready; + std::list allocations; + cudaEvent_t event; + + RMM_CUDA_TRY(cudaEventCreate(&event)); + 
+ std::thread producer(allocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(allocations_ready), + std::ref(event), + streamA); + + std::thread consumer(deallocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(allocations_ready), + std::ref(event), + streamB); + + producer.join(); + consumer.join(); + + RMM_CUDA_TRY(cudaEventDestroy(event)); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_default, rmm::cuda_stream_default); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsPerThreadDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_per_thread, rmm::cuda_stream_per_thread); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_async_free_different_threads(this->ref, this->stream, this->stream); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDifferentStream) +{ + rmm::cuda_stream streamB; + test_allocate_async_free_different_threads(this->ref, this->stream, streamB); + streamB.synchronize(); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp new file mode 100644 index 000000000..804c710a5 --- /dev/null +++ b/tests/mr/device/mr_ref_test.hpp @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using resource_ref = cuda::mr::resource_ref; +using async_resource_ref = cuda::mr::async_resource_ref; + +namespace rmm::test { + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. + */ +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +} + +enum size_in_bytes : size_t {}; + +constexpr auto default_num_allocations{100}; +constexpr size_in_bytes default_max_size{5_MiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; + +// Various test functions, shared between single-threaded and multithreaded tests. 
+inline void test_allocate(resource_ref ref, std::size_t bytes) +{ + try { + void* ptr = ref.allocate(bytes); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate(ptr, bytes); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +inline void test_allocate_async(async_resource_ref ref, + std::size_t bytes, + cuda_stream_view stream = {}) +{ + try { + void* ptr = ref.allocate_async(bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate_async(ptr, bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +inline void concurrent_allocations_are_different(resource_ref ref) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate(size); + void* ptr2 = ref.allocate(size); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate(ptr1, size); + ref.deallocate(ptr2, size); +} + +inline void concurrent_async_allocations_are_different(async_resource_ref ref, + cuda_stream_view stream) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate_async(size, stream); + void* ptr2 = ref.allocate_async(size, stream); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate_async(ptr1, size, stream); + ref.deallocate_async(ptr2, size, stream); +} + +inline void test_various_allocations(resource_ref ref) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate(0); + EXPECT_NO_THROW(ref.deallocate(ptr, 0)); + } + + test_allocate(ref, 4_B); + test_allocate(ref, 1_KiB); + test_allocate(ref, 1_MiB); + test_allocate(ref, 1_GiB); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate(1_PiB), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate(1_PiB); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_various_async_allocations(async_resource_ref ref, cuda_stream_view stream) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate_async(0, stream); + stream.synchronize(); + EXPECT_NO_THROW(ref.deallocate_async(ptr, 0, stream)); + stream.synchronize(); + } + + test_allocate_async(ref, 4_B, stream); + test_allocate_async(ref, 1_KiB, stream); + test_allocate_async(ref, 1_MiB, stream); + test_allocate_async(ref, 1_GiB, stream); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate_async(1_PiB, stream), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate_async(1_PiB, stream); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_random_allocations(resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each( + allocations.begin(), 
+
+inline void test_random_allocations(resource_ref ref,
+                                    std::size_t num_allocations = default_num_allocations,
+                                    size_in_bytes max_size      = default_max_size)
+{
+  std::vector<allocation> allocations(num_allocations);
+
+  std::default_random_engine generator;
+  std::uniform_int_distribution<std::size_t> distribution(1, max_size);
+
+  // num_allocations allocations from [1, max_size]
+  std::for_each(
+    allocations.begin(), allocations.end(), [&generator, &distribution, &ref](allocation& alloc) {
+      alloc.size = distribution(generator);
+      EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size));
+      EXPECT_NE(nullptr, alloc.ptr);
+      EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr));
+    });
+
+  std::for_each(allocations.begin(), allocations.end(), [&ref](allocation& alloc) {
+    EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size));
+  });
+}
+
+inline void test_random_async_allocations(async_resource_ref ref,
+                                          std::size_t num_allocations = default_num_allocations,
+                                          size_in_bytes max_size      = default_max_size,
+                                          cuda_stream_view stream     = {})
+{
+  std::vector<allocation> allocations(num_allocations);
+
+  std::default_random_engine generator;
+  std::uniform_int_distribution<std::size_t> distribution(1, max_size);
+
+  // num_allocations allocations from [1, max_size]
+  std::for_each(allocations.begin(),
+                allocations.end(),
+                [&generator, &distribution, &ref, stream](allocation& alloc) {
+                  alloc.size = distribution(generator);
+                  EXPECT_NO_THROW(alloc.ptr = ref.allocate_async(alloc.size, stream));
+                  if (not stream.is_default()) { stream.synchronize(); }
+                  EXPECT_NE(nullptr, alloc.ptr);
+                  EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr));
+                });
+
+  std::for_each(allocations.begin(), allocations.end(), [stream, &ref](allocation& alloc) {
+    EXPECT_NO_THROW(ref.deallocate_async(alloc.ptr, alloc.size, stream));
+    if (not stream.is_default()) { stream.synchronize(); }
+  });
+}
+
+inline void test_mixed_random_allocation_free(resource_ref ref,
+                                              size_in_bytes max_size = default_max_size)
+{
+  std::default_random_engine generator;
+  constexpr std::size_t num_allocations{100};
+
+  std::uniform_int_distribution<std::size_t> size_distribution(1, max_size);
+
+  constexpr int allocation_probability{53};  // percent
+  constexpr int max_probability{99};
+  std::uniform_int_distribution<int> op_distribution(0, max_probability);
+  std::uniform_int_distribution<int> index_distribution(0, num_allocations - 1);
+
+  std::size_t active_allocations{0};
+  std::size_t allocation_count{0};
+
+  std::vector<allocation> allocations;
+
+  for (std::size_t i = 0; i < num_allocations * 2; ++i) {
+    bool do_alloc = true;
+    if (active_allocations > 0) {
+      int chance = op_distribution(generator);
+      do_alloc   = (chance < allocation_probability) && (allocation_count < num_allocations);
+    }
+
+    if (do_alloc) {
+      std::size_t size = size_distribution(generator);
+      active_allocations++;
+      allocation_count++;
+      EXPECT_NO_THROW(allocations.emplace_back(ref.allocate(size), size));
+      auto new_allocation = allocations.back();
+      EXPECT_NE(nullptr, new_allocation.ptr);
+      EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr));
+    } else {
+      auto const index = static_cast<int>(index_distribution(generator) % active_allocations);
+      active_allocations--;
+      allocation to_free = allocations[index];
+      allocations.erase(std::next(allocations.begin(), index));
+      EXPECT_NO_THROW(ref.deallocate(to_free.ptr, to_free.size));
+    }
+  }
+
+  EXPECT_EQ(active_allocations, 0);
+  EXPECT_EQ(allocations.size(), active_allocations);
+}
+
+inline void test_mixed_random_async_allocation_free(async_resource_ref ref,
+                                                    size_in_bytes max_size  = default_max_size,
+                                                    cuda_stream_view stream = {})
+{
+  std::default_random_engine generator;
+  constexpr std::size_t num_allocations{100};
+
+  std::uniform_int_distribution<std::size_t> size_distribution(1, max_size);
+
+  constexpr int allocation_probability{53};  // percent
+  constexpr int max_probability{99};
+  std::uniform_int_distribution<int> op_distribution(0, max_probability);
+  std::uniform_int_distribution<int> index_distribution(0, num_allocations - 1);
+
+  std::size_t active_allocations{0};
+  std::size_t allocation_count{0};
+
+  std::vector<allocation> allocations;
+
+  for (std::size_t i = 0; i < num_allocations * 2; ++i) {
+    bool do_alloc = true;
+    if (active_allocations > 0) {
+      int chance = op_distribution(generator);
+      do_alloc   = (chance < allocation_probability) && (allocation_count < num_allocations);
+    }
+
+    if (do_alloc) {
+      std::size_t size = size_distribution(generator);
+      active_allocations++;
+      allocation_count++;
+      EXPECT_NO_THROW(allocations.emplace_back(ref.allocate_async(size, stream), size));
+      auto new_allocation = allocations.back();
+      EXPECT_NE(nullptr, new_allocation.ptr);
+      EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr));
+    } else {
+      auto const index = static_cast<int>(index_distribution(generator) % active_allocations);
+      active_allocations--;
+      allocation to_free = allocations[index];
+      allocations.erase(std::next(allocations.begin(), index));
+      EXPECT_NO_THROW(ref.deallocate_async(to_free.ptr, to_free.size, stream));
+    }
+  }
+
+  EXPECT_EQ(active_allocations, 0);
+  EXPECT_EQ(allocations.size(), active_allocations);
+}
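+
+// Each factory below is paired with a human-readable name; gtest uses the name
+// to label the parameterized test instantiations in mr_ref_tests.cpp.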
+
+using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
+
+/// Encapsulates a `device_memory_resource` factory function and associated name
+struct mr_factory {
+  mr_factory(std::string name, MRFactoryFunc factory)
+    : name{std::move(name)}, factory{std::move(factory)}
+  {
+  }
+
+  std::string name;       ///< Name to associate with tests that use this factory
+  MRFactoryFunc factory;  ///< Factory function that returns shared_ptr to `device_memory_resource`
+                          ///< instance to use in test
+};
+
+/// Test fixture class value-parameterized on different `mr_factory`s
+struct mr_ref_test : public ::testing::TestWithParam<mr_factory> {
+  void SetUp() override
+  {
+    auto factory = GetParam().factory;
+    mr           = factory();
+    if (mr == nullptr) {
+      GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA "
+                   << "driver/runtime version";
+    }
+    ref = async_resource_ref{*mr};
+  }
+
+  std::shared_ptr<rmm::mr::device_memory_resource> mr;  ///< Pointer to resource to use in tests
+  async_resource_ref ref{*mr};
+  rmm::cuda_stream stream{};
+};
+
+struct mr_ref_allocation_test : public mr_ref_test {};
+
+/// MR factory functions
+inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+inline auto make_cuda_async()
+{
+  if (rmm::detail::async_alloc::is_supported()) {
+    return std::make_shared<rmm::mr::cuda_async_memory_resource>();
+  }
+  return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
+}
+
+inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
+
+inline auto make_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+}
+
+inline auto make_arena()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
+}
+
+inline auto make_fixed_size()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::fixed_size_memory_resource>(make_cuda());
+}
+
+inline auto make_binning()
+{
+  auto pool = make_pool();
+  // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096 KiB
+  // Larger allocations will use the pool resource
+  auto const bin_range_start{18};
+  auto const bin_range_end{22};
+
+  auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(
+    pool, bin_range_start, bin_range_end);
+  return mr;
+}
+
+}  // namespace rmm::test
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp
new file mode 100644
index 000000000..a9a94696a
--- /dev/null
+++ b/tests/mr/device/mr_ref_tests.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mr_ref_test.hpp"
+
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <type_traits>
+
+namespace rmm::test {
+namespace {
+
+INSTANTIATE_TEST_SUITE_P(ResourceTests,
+                         mr_ref_test,
+                         ::testing::Values(mr_factory{"CUDA", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                           mr_factory{"CUDA_Async", &make_cuda_async},
+#endif
+                                           mr_factory{"Managed", &make_managed},
+                                           mr_factory{"Pool", &make_pool},
+                                           mr_factory{"Arena", &make_arena},
+                                           mr_factory{"Binning", &make_binning},
+                                           mr_factory{"Fixed_Size", &make_fixed_size}),
+                         [](auto const& info) { return info.param.name; });
+
+// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes
+INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests,
+                         mr_ref_allocation_test,
+                         ::testing::Values(mr_factory{"CUDA", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                           mr_factory{"CUDA_Async", &make_cuda_async},
+#endif
+                                           mr_factory{"Managed", &make_managed},
+                                           mr_factory{"Pool", &make_pool},
+                                           mr_factory{"Arena", &make_arena},
+                                           mr_factory{"Binning", &make_binning}),
+                         [](auto const& info) { return info.param.name; });
+
+TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); }
+
+// Simple reproducer for https://github.com/rapidsai/rmm/issues/861
+TEST_P(mr_ref_test, AllocationsAreDifferent) { concurrent_allocations_are_different(this->ref); }
+
+TEST_P(mr_ref_test, AsyncAllocationsAreDifferentDefaultStream)
+{
+  concurrent_async_allocations_are_different(this->ref, cuda_stream_view{});
+}
+
+TEST_P(mr_ref_test, AsyncAllocationsAreDifferent)
+{
+  concurrent_async_allocations_are_different(this->ref, this->stream);
+}
+
+TEST_P(mr_ref_allocation_test, AllocateDefault) { test_various_allocations(this->ref); }
+
+TEST_P(mr_ref_allocation_test, AllocateDefaultStream)
+{
+  test_various_async_allocations(this->ref, cuda_stream_view{});
+}
+
+TEST_P(mr_ref_allocation_test, AllocateOnStream)
+{
+  test_various_async_allocations(this->ref, this->stream);
+}
+
+TEST_P(mr_ref_allocation_test, RandomAllocations) { test_random_allocations(this->ref); }
+
+TEST_P(mr_ref_allocation_test, RandomAllocationsDefaultStream)
+{
+  test_random_async_allocations(
+    this->ref, default_num_allocations, default_max_size, cuda_stream_view{});
+}
+
+TEST_P(mr_ref_allocation_test, RandomAllocationsStream)
+{
+  test_random_async_allocations(this->ref, default_num_allocations, default_max_size, this->stream);
+}
+
+TEST_P(mr_ref_allocation_test, MixedRandomAllocationFree)
+{
+  test_mixed_random_allocation_free(this->ref, default_max_size);
+}
+
+TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeDefaultStream)
+{
+  test_mixed_random_async_allocation_free(this->ref, default_max_size, cuda_stream_view{});
+}
+
+TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeStream)
+{
+  test_mixed_random_async_allocation_free(this->ref, default_max_size, this->stream);
+}
+
+}  // namespace
+}  // namespace rmm::test
diff --git a/tests/mr/device/mr_test.hpp b/tests/mr/device/mr_test.hpp
index 8c69df215..03f880e72 100644
--- a/tests/mr/device/mr_test.hpp
+++ b/tests/mr/device/mr_test.hpp
@@ -52,11 +52,7 @@ inline bool is_device_memory(void* ptr)
 {
   cudaPointerAttributes attributes{};
   if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-#if CUDART_VERSION < 10000  // memoryType is deprecated in CUDA 10
-  return attributes.memoryType == cudaMemoryTypeDevice;
-#else
   return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
-#endif
 }
 
 enum size_in_bytes : size_t {};
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index 4a234d2f9..2f32889d0 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -55,6 +55,18 @@ TEST(PoolTest, ThrowMaxLessThanInitial)
   EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
 }
 
+TEST(PoolTest, ReferenceThrowMaxLessThanInitial)
+{
+  // Make sure first argument is enough larger than the second that alignment rounding doesn't
+  // make them equal
+  auto max_less_than_initial = []() {
+    const auto initial{1024};
+    const auto maximum{256};
+    pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum};
+  };
+  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+}
+
 TEST(PoolTest, AllocateNinetyPercent)
 {
   auto allocate_ninety = []() {
@@ -190,4 +202,43 @@ TEST(PoolTest, MultidevicePool)
 }
 
 }  // namespace
+
+namespace test_properties {
+class fake_async_resource {
+ public:
+  // To model `async_resource`
+  void* allocate(std::size_t, std::size_t) { return nullptr; }
+  void deallocate(void* ptr, std::size_t, std::size_t) {}
+  void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; }
+  void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {}
+
+  bool operator==(const fake_async_resource& other) const { return true; }
+  bool operator!=(const fake_async_resource& other) const { return false; }
+
+  // To model stream_resource
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+ private:
+  void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; }
+  void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {}
+  [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; }
+};
+static_assert(!cuda::has_property<fake_async_resource, cuda::mr::device_accessible>);
+static_assert(!cuda::has_property<rmm::mr::pool_memory_resource<fake_async_resource>,
+                                  cuda::mr::device_accessible>);
+
+// Ensure that we forward the property if it is there
+class fake_async_resource_device_accessible : public fake_async_resource {
+  friend void get_property(const fake_async_resource_device_accessible&,
+                           cuda::mr::device_accessible)
+  {
+  }
+};
+static_assert(
+  cuda::has_property<fake_async_resource_device_accessible, cuda::mr::device_accessible>);
+static_assert(
+  cuda::has_property<rmm::mr::pool_memory_resource<fake_async_resource_device_accessible>,
+                     cuda::mr::device_accessible>);
+}  // namespace test_properties
 }  // namespace rmm::test
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 41fb15973..ed8875cbe 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -32,6 +32,7 @@ namespace rmm::test {
 namespace {
 
 struct allocator_test : public mr_test {};
+using async_resource_ref = cuda::mr::async_resource_ref<cuda::mr::device_accessible>;
 
 TEST_P(allocator_test, first)
 {
@@ -44,7 +45,8 @@ TEST_P(allocator_test, defaults)
 {
   rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
   EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
-  EXPECT_EQ(allocator.resource(), rmm::mr::get_current_device_resource());
+  EXPECT_EQ(allocator.memory_resource(),
+            async_resource_ref{rmm::mr::get_current_device_resource()});
 }
 
 INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp
new file mode 100644
index 000000000..6563eb635
--- /dev/null
+++ b/tests/mr/host/mr_ref_tests.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../byte_literals.hpp"
+
+#include <rmm/detail/aligned.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/mr/host/new_delete_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+#include <cuda/memory_resource>
+
+#include <gtest/gtest.h>
+
+#include <cuda_runtime_api.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <random>
+#include <vector>
+
+namespace rmm::test {
+namespace {
+inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t))
+{
+  return rmm::detail::is_pointer_aligned(ptr, alignment);
+}
+
+// Returns true if a pointer points to a device memory or managed memory allocation.
+inline bool is_device_memory(void* ptr)
+{
+  cudaPointerAttributes attributes{};
+  if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
+  return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
+}
+
+/**
+ * @brief Returns whether a pointer `ptr` points to pinned host memory.
+ */
+inline bool is_pinned_memory(void* ptr)
+{
+  cudaPointerAttributes attributes{};
+  if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
+  return attributes.type == cudaMemoryTypeHost;
+}
+
+constexpr std::size_t size_word{4_B};
+constexpr std::size_t size_kb{1_KiB};
+constexpr std::size_t size_mb{1_MiB};
+constexpr std::size_t size_gb{1_GiB};
+constexpr std::size_t size_pb{1_PiB};
+
+struct allocation {
+  void* ptr{nullptr};
+  std::size_t size{0};
+  allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {}
+  allocation() = default;
+};
+}  // namespace
+
+template <typename MemoryResourceType>
+struct MRRefTest : public ::testing::Test {
+  MemoryResourceType mr;
+  cuda::mr::resource_ref<cuda::mr::host_accessible> ref;
+
+  MRRefTest() : mr{}, ref{mr} {}
+};
+
+using resources = ::testing::Types<rmm::mr::new_delete_resource, rmm::mr::pinned_memory_resource>;
+static_assert(cuda::mr::resource_with<rmm::mr::new_delete_resource, cuda::mr::host_accessible>);
+static_assert(
+  cuda::mr::resource_with<rmm::mr::pinned_memory_resource, cuda::mr::host_accessible>);
+
+TYPED_TEST_CASE(MRRefTest, resources);
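+
+// Each typed test below runs once per resource type in `resources`; `this->ref`
+// is a host_accessible resource_ref bound to the fixture's concrete resource.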
+
+TYPED_TEST(MRRefTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); }
+
+TYPED_TEST(MRRefTest, AllocateZeroBytes)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(0));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, 0));
+}
+
+TYPED_TEST(MRRefTest, AllocateWord)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_word));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word));
+}
+
+TYPED_TEST(MRRefTest, AllocateKB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb));
+}
+
+TYPED_TEST(MRRefTest, AllocateMB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb));
+}
+
+TYPED_TEST(MRRefTest, AllocateGB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb));
+}
+
+TYPED_TEST(MRRefTest, AllocateTooMuch)
+{
+  void* ptr{nullptr};
+  EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc);
+  EXPECT_EQ(nullptr, ptr);
+}
+
+TYPED_TEST(MRRefTest, RandomAllocations)
+{
+  constexpr std::size_t num_allocations{100};
+  std::vector<allocation> allocations(num_allocations);
+
+  constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb};
+
+  std::default_random_engine generator;
+  std::uniform_int_distribution<std::size_t> distribution(1, MAX_ALLOCATION_SIZE);
+
+  // 100 allocations from [1, 5MB]
+  std::for_each(
+    allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) {
+      alloc.size = distribution(generator);
+      EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size));
+      EXPECT_NE(nullptr, alloc.ptr);
+      EXPECT_TRUE(is_aligned(alloc.ptr));
+    });
+
+  std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) {
+    EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size));
+  });
+}
+
+TYPED_TEST(MRRefTest, MixedRandomAllocationFree)
+{
+  std::default_random_engine generator;
+
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  // How often a free will occur. For example, if `1`, then every allocation
+  // will immediately be freed. Or, if 4, on average, a free will occur after
+  // every 4th allocation.
+  constexpr std::size_t FREE_FREQUENCY{4};
+  std::uniform_int_distribution<int> free_distribution(1, FREE_FREQUENCY);
+
+  std::deque<allocation> allocations;
+
+  constexpr std::size_t num_allocations{100};
+  for (std::size_t i = 0; i < num_allocations; ++i) {
+    std::size_t allocation_size = size_distribution(generator);
+    EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size));
+    auto new_allocation = allocations.back();
+    EXPECT_NE(nullptr, new_allocation.ptr);
+    EXPECT_TRUE(is_aligned(new_allocation.ptr));
+
+    bool const free_front{free_distribution(generator) == free_distribution.max()};
+
+    if (free_front) {
+      auto front = allocations.front();
+      EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size));
+      allocations.pop_front();
+    }
+  }
+  // free any remaining allocations, popping each entry before touching the next
+  // so the deque is never mutated while being iterated
+  while (not allocations.empty()) {
+    auto alloc = allocations.front();
+    allocations.pop_front();
+    EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size));
+  }
+}
+
+static constexpr std::size_t MinTestedAlignment{16};
+static constexpr std::size_t MaxTestedAlignment{4096};
+static constexpr std::size_t TestedAlignmentMultiplier{2};
+static constexpr std::size_t NUM_TRIALS{100};
+
+TYPED_TEST(MRRefTest, AlignmentTest)
+{
+  std::default_random_engine generator(0);
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
+    for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+         alignment *= TestedAlignmentMultiplier) {
+      auto allocation_size = size_distribution(generator);
+      void* ptr{nullptr};
+      EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment));
+      EXPECT_TRUE(is_aligned(ptr, alignment));
+      EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment));
+    }
+  }
+}
+
+TYPED_TEST(MRRefTest, UnsupportedAlignmentTest)
+{
+  std::default_random_engine generator(0);
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
+    for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+         alignment *= TestedAlignmentMultiplier) {
+      auto allocation_size = size_distribution(generator);
+      void* ptr{nullptr};
+      // An unsupported alignment (like an odd number) should result in an
+      // alignment of `alignof(std::max_align_t)`
+      auto const bad_alignment = alignment + 1;
+      EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment));
+      EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t)));
+      EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment));
+    }
+  }
+}
+
+TEST(PinnedResource, isPinned)
+{
+  rmm::mr::pinned_memory_resource mr;
+  cuda::mr::resource_ref<cuda::mr::host_accessible> ref{mr};
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = ref.allocate(100));
+  EXPECT_TRUE(is_pinned_memory(ptr));
+  EXPECT_NO_THROW(ref.deallocate(ptr, 100));
+}
+}  // namespace rmm::test
diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp
index 1cd59f5a6..678d6aeb8 100644
--- a/tests/mr/host/mr_tests.cpp
+++ b/tests/mr/host/mr_tests.cpp
@@ -23,6 +23,8 @@
 
 #include <gtest/gtest.h>
 
+#include <cuda/memory_resource>
+
 #include <cstddef>
 #include <deque>
 #include <random>
@@ -41,11 +43,7 @@ inline bool is_device_memory(void* ptr)
 {
   cudaPointerAttributes attributes{};
   if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-#if CUDART_VERSION < 10000  // memoryType is deprecated in CUDA 10
-  return attributes.memoryType == cudaMemoryTypeDevice;
-#else
   return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
-#endif
 }
 
 /**
@@ -80,6 +78,8 @@ struct MRTest : public ::testing::Test {
 };
 
 using resources = ::testing::Types<rmm::mr::new_delete_resource, rmm::mr::pinned_memory_resource>;
+static_assert(cuda::mr::resource_with<rmm::mr::new_delete_resource, cuda::mr::host_accessible>);
+static_assert(
+  cuda::mr::resource_with<rmm::mr::pinned_memory_resource, cuda::mr::host_accessible>);
 
 TYPED_TEST_CASE(MRTest, resources);
diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp
new file mode 100644
index 000000000..dcdae37fa
--- /dev/null
+++ b/tests/mr/host/pinned_pool_mr_tests.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/aligned.hpp>
+#include <rmm/detail/error.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+// explicit instantiation for test coverage purposes
+template class rmm::mr::pool_memory_resource<rmm::mr::pinned_memory_resource>;
+
+namespace rmm::test {
+namespace {
+using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_memory_resource>;
+
+TEST(PinnedPoolTest, ThrowOnNullUpstream)
+{
+  auto construct_nullptr = []() { pool_mr mr{nullptr}; };
+  EXPECT_THROW(construct_nullptr(), rmm::logic_error);
+}
+
+TEST(PinnedPoolTest, ThrowMaxLessThanInitial)
+{
+  // Make sure first argument is enough larger than the second that alignment rounding doesn't
+  // make them equal
+  auto max_less_than_initial = []() {
+    rmm::mr::pinned_memory_resource pinned_mr{};
+    const auto initial{1024};
+    const auto maximum{256};
+    pool_mr mr{&pinned_mr, initial, maximum};
+  };
+  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+}
+
+TEST(PinnedPoolTest, ReferenceThrowMaxLessThanInitial)
+{
+  // Make sure first argument is enough larger than the second that alignment rounding doesn't
+  // make them equal
+  auto max_less_than_initial = []() {
+    rmm::mr::pinned_memory_resource pinned_mr{};
+    const auto initial{1024};
+    const auto maximum{256};
+    pool_mr mr{pinned_mr, initial, maximum};
+  };
+  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+}
+
+// Issue #527
+TEST(PinnedPoolTest, InitialAndMaxPoolSizeEqual)
+{
+  EXPECT_NO_THROW([]() {
+    rmm::mr::pinned_memory_resource pinned_mr{};
+    pool_mr mr(pinned_mr, 1000192, 1000192);
+    mr.allocate(1000);
+  }());
+}
+
+TEST(PinnedPoolTest, NonAlignedPoolSize)
+{
+  EXPECT_THROW(
+    []() {
+      rmm::mr::pinned_memory_resource pinned_mr{};
+      pool_mr mr(pinned_mr, 1000031, 1000192);
+      mr.allocate(1000);
+    }(),
+    rmm::logic_error);
+
+  EXPECT_THROW(
+    []() {
+      rmm::mr::pinned_memory_resource pinned_mr{};
+      pool_mr mr(pinned_mr, 1000192, 1000200);
+      mr.allocate(1000);
+    }(),
+    rmm::logic_error);
+}
+
+}  // namespace
+}  // namespace rmm::test
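
For readers skimming the tests, the host-side pattern they exercise reduces to a few lines. The sketch below mirrors the `PinnedResource.isPinned` test from `tests/mr/host/mr_ref_tests.cpp`; it is a minimal illustration, not part of the patch, and assumes a build where the experimental `cuda::mr` interfaces are enabled, as this patch's build configuration arranges.

```cpp
// Minimal sketch (not part of the patch): hand a concrete host resource to a
// property-checked, type-erased resource_ref and allocate through it.
#include <rmm/mr/host/pinned_memory_resource.hpp>

#include <cuda/memory_resource>

int main()
{
  rmm::mr::pinned_memory_resource mr;

  // Binding compiles only because pinned_memory_resource is tagged with
  // cuda::mr::host_accessible (see the static_asserts added in the host tests).
  cuda::mr::resource_ref<cuda::mr::host_accessible> ref{mr};

  void* ptr = ref.allocate(100);
  ref.deallocate(ptr, 100);
  return 0;
}
```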