Require explicit pool size in pool_memory_resource and move some things out of detail namespace (#1417)

Fixes #1416. 

 - ~Deprecates existing ctors of `pool_memory_resource` that provide optional parameter for the initial pool size.~
 - Adds new ctors that require an explicit initial pool size.
 - We don't yet deprecate anything in this PR because that would break builds of some RAPIDS libraries. We will follow up with PRs to cuDF, cuGraph and anything else needed to remove deprecated usages after this PR is merged.
 - Adds a new utility `percent_of_free_device_memory` that computes the specified percentage of free memory on the current CUDA device. It is now used in tests to provide an explicit pool size, and consumers of the library can use it to reproduce the previous default behavior of `pool_memory_resource`.
 - Moves `available_device_memory` from a detail header to `cuda_device.hpp` so it is now publicly usable, along with the above utility.
 - Temporarily adds `detail::available_device_memory` as an alias of the above in order to keep cudf and cugraph building until we can update them.
 - Duplicates the alignment functions in `rmm::detail` that are commonly used externally into the public `rmm` namespace. The detail versions will be removed after cuDF and cuGraph are updated to no longer use them.

Authors:
  - Mark Harris (https://github.com/harrism)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Lawrence Mitchell (https://github.com/wence-)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #1417
harrism authored Jan 15, 2024
1 parent 40ce295 commit 64aa941
Showing 41 changed files with 420 additions and 218 deletions.
4 changes: 2 additions & 2 deletions .clang-tidy
@@ -62,8 +62,8 @@ CheckOptions:
value: 'alignment'
- key: cppcoreguidelines-avoid-magic-numbers.IgnorePowersOf2IntegerValues
value: '1'
- key: readability-magic-numbers.IgnorePowersOf2IntegerValues
value: '1'
- key: cppcoreguidelines-avoid-magic-numbers.IgnoredIntegerValues
value: "0;1;2;3;4;50;100"
- key: cppcoreguidelines-avoid-do-while.IgnoreMacros
value: 'true'
...
10 changes: 7 additions & 3 deletions README.md
@@ -332,7 +332,9 @@ Accessing and modifying the default resource is done through two functions:
```c++
rmm::mr::cuda_memory_resource cuda_mr;
// Construct a resource that uses a coalescing best-fit pool allocator
-rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
+// With the pool initially half of available device memory
+auto initial_size = rmm::percent_of_free_device_memory(50);
+rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr, initial_size};
rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device resource pointer to `pool_mr`
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); // Points to `pool_mr`
```
@@ -351,11 +353,13 @@ per-device resources. Here is an example loop that creates `unique_ptr`s to `poo
objects for each device and sets them as the per-device resource for that device.
```c++
-std::vector<unique_ptr<pool_memory_resource>> per_device_pools;
+using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;
+std::vector<unique_ptr<pool_mr>> per_device_pools;
for(int i = 0; i < N; ++i) {
cudaSetDevice(i); // set device i before creating MR
// Use a vector of unique_ptr to maintain the lifetime of the MRs
-  per_device_pools.push_back(std::make_unique<pool_memory_resource>());
+  // Note: for brevity, omitting creation of upstream and computing initial_size
+  per_device_pools.push_back(std::make_unique<pool_mr>(upstream, initial_size));
// Set the per-device resource for device i
set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
}
9 changes: 6 additions & 3 deletions benchmarks/device_uvector/device_uvector_bench.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

#include "../synchronization/synchronization.hpp"

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_uvector.hpp>
@@ -38,7 +39,8 @@
void BM_UvectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cuda_memory_resource cuda_mr{};
-rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
+rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
+  &cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
@@ -59,7 +61,8 @@ BENCHMARK(BM_UvectorSizeConstruction)
void BM_ThrustVectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cuda_memory_resource cuda_mr{};
-rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
+rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
+  &cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

#include <benchmarks/utilities/cxxopts.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/device_uvector.hpp>
@@ -100,7 +101,8 @@ inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memo

inline auto make_pool()
{
-return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+  make_cuda(), rmm::percent_of_free_device_memory(50));
}

inline auto make_arena()
8 changes: 5 additions & 3 deletions benchmarks/random_allocations/random_allocations.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

#include <benchmarks/utilities/cxxopts.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
@@ -165,12 +166,13 @@ inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memo

inline auto make_pool()
{
-return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+  make_cuda(), rmm::percent_of_free_device_memory(50));
}

inline auto make_arena()
{
-auto free = rmm::detail::available_device_memory().first;
+auto free = rmm::available_device_memory().first;
constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead.
return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda(), free - reserve);
}
4 changes: 2 additions & 2 deletions benchmarks/replay/replay.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -61,7 +61,7 @@ inline auto make_pool(std::size_t simulated_size)
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
make_simulated(simulated_size), simulated_size, simulated_size);
}
-return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda(), 0);
}

inline auto make_arena(std::size_t simulated_size)
2 changes: 1 addition & 1 deletion doxygen/Doxyfile
@@ -504,7 +504,7 @@ EXTRACT_PACKAGE = NO
# included in the documentation.
# The default value is: NO.

-EXTRACT_STATIC = NO
+EXTRACT_STATIC = YES

# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
# locally in source files will be included in the documentation. If set to NO,
3 changes: 2 additions & 1 deletion include/doxygen_groups.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,4 +41,5 @@
* @defgroup errors Errors
* @defgroup logging Logging
* @defgroup thrust_integrations Thrust Integrations
* @defgroup utilities Utilities
*/
119 changes: 119 additions & 0 deletions include/rmm/aligned.hpp
@@ -0,0 +1,119 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cassert>
#include <cstddef>
#include <cstdint>

namespace rmm {

/**
* @addtogroup utilities
* @{
* @file
*/

/**
* @brief Default alignment used for host memory allocated by RMM.
*
*/
static constexpr std::size_t RMM_DEFAULT_HOST_ALIGNMENT{alignof(std::max_align_t)};

/**
* @brief Default alignment used for CUDA memory allocation.
*
*/
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};

/**
* @brief Returns whether or not `value` is a power of 2.
*
 * @param[in] value the value to check
 *
 * @return Whether the input is a power of two
*/
constexpr bool is_pow2(std::size_t value) { return (value != 0U) && ((value & (value - 1)) == 0U); }

/**
* @brief Returns whether or not `alignment` is a valid memory alignment.
*
* @param[in] alignment to check
*
* @return Whether the alignment is valid
*/
constexpr bool is_supported_alignment(std::size_t alignment) { return is_pow2(alignment); }

/**
* @brief Align up to nearest multiple of specified power of 2
*
* @param[in] value value to align
* @param[in] alignment amount, in bytes, must be a power of 2
*
 * @return The aligned value
*/
constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
{
assert(is_supported_alignment(alignment));
return (value + (alignment - 1)) & ~(alignment - 1);
}

/**
* @brief Align down to the nearest multiple of specified power of 2
*
* @param[in] value value to align
* @param[in] alignment amount, in bytes, must be a power of 2
*
 * @return The aligned value
*/
constexpr std::size_t align_down(std::size_t value, std::size_t alignment) noexcept
{
assert(is_supported_alignment(alignment));
return value & ~(alignment - 1);
}

/**
* @brief Checks whether a value is aligned to a multiple of a specified power of 2
*
* @param[in] value value to check for alignment
* @param[in] alignment amount, in bytes, must be a power of 2
*
* @return true if aligned
*/
constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
{
assert(is_supported_alignment(alignment));
return value == align_down(value, alignment);
}

/**
* @brief Checks whether the provided pointer is aligned to a specified @p alignment
*
* @param[in] ptr pointer to check for alignment
* @param[in] alignment required alignment in bytes, must be a power of 2
*
* @return true if the pointer is aligned
*/
inline bool is_pointer_aligned(void* ptr, std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT)
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
return is_aligned(reinterpret_cast<std::uintptr_t>(ptr), alignment);
}

/** @} */ // end of group

} // namespace rmm
46 changes: 45 additions & 1 deletion include/rmm/cuda_device.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
*/
#pragma once

#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>
@@ -102,6 +103,49 @@ inline int get_num_cuda_devices()
return num_dev;
}

/**
* @brief Returns the available and total device memory in bytes for the current device
*
* @return The available and total device memory in bytes for the current device as a std::pair.
*/
inline std::pair<std::size_t, std::size_t> available_device_memory()
{
std::size_t free{};
std::size_t total{};
RMM_CUDA_TRY(cudaMemGetInfo(&free, &total));
return {free, total};
}

namespace detail {

/**
* @brief Returns the available and total device memory in bytes for the current device
*
* @deprecated Use rmm::available_device_memory() instead.
*
* @return The available and total device memory in bytes for the current device as a std::pair.
*/
//[[deprecated("Use `rmm::available_device_memory` instead.")]] //
const auto available_device_memory = rmm::available_device_memory;

} // namespace detail

/**
* @brief Returns the approximate specified percent of available device memory on the current CUDA
* device, aligned (down) to the nearest CUDA allocation size.
*
* @param percent The percent of free memory to return.
*
* @return The recommended initial device memory pool size in bytes.
*/
inline std::size_t percent_of_free_device_memory(int percent)
{
[[maybe_unused]] auto const [free, total] = rmm::available_device_memory();
auto fraction = static_cast<double>(percent) / 100.0;
return rmm::align_down(static_cast<std::size_t>(static_cast<double>(free) * fraction),
rmm::CUDA_ALLOCATION_ALIGNMENT);
}

/**
* @brief RAII class that sets the current CUDA device to the specified device on construction
* and restores the previous device on destruction.
