Add default pinned pool that falls back to new pinned allocations #15665

Merged: 44 commits, merged May 20, 2024. Changes shown from 37 commits.

Commits:
163ad97
pool with fallback
vuule May 6, 2024
1e850d6
don't use default pool
vuule May 6, 2024
3be42ba
fix allocator copy assignment
vuule May 6, 2024
395dcf1
fix ver2
vuule May 6, 2024
70ae74e
Merge branch 'branch-24.06' into bug-allocator-copy-wrong-stream
vuule May 6, 2024
503d170
Merge branch 'bug-allocator-copy-wrong-stream' into perf-defaul-piine…
vuule May 6, 2024
0873b1f
copyright
vuule May 6, 2024
ff18a21
fix operator==
vuule May 7, 2024
f5a735c
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 7, 2024
5766805
Merge branch 'bug-allocator-copy-wrong-stream' of https://github.com/…
vuule May 7, 2024
0bd92bf
Merge branch 'bug-allocator-copy-wrong-stream' into perf-defaul-piine…
vuule May 7, 2024
854c0ab
simplify pool creation
vuule May 7, 2024
5bf0ce4
namespace; comments
vuule May 7, 2024
284654d
Merge branch 'branch-24.06' into perf-defaul-piined-pool
vuule May 7, 2024
ff4d7f6
clean up
vuule May 7, 2024
f5b2c84
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 7, 2024
cf3f8a3
Merge branch 'perf-defaul-piined-pool' of https://github.com/vuule/cu…
vuule May 7, 2024
80b5963
mild polish
vuule May 7, 2024
d23684d
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 8, 2024
1828e05
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 9, 2024
0122038
remove inline
vuule May 9, 2024
60030da
scoped_lock
vuule May 9, 2024
a62377e
try
vuule May 9, 2024
6733c45
clean up
vuule May 9, 2024
abf40a8
clarify try-catch fallback
vuule May 9, 2024
fa7dce7
remove export
vuule May 9, 2024
a244d7c
non-indexed stream
vuule May 9, 2024
7076e73
Update cpp/src/io/utilities/config_utils.cpp
ttnghia May 9, 2024
0b8aa44
Merge branch 'branch-24.06' into perf-defaul-piined-pool
vuule May 9, 2024
3db44a3
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 13, 2024
27d30c8
make default_pinned_mr cheap to call multiple times
vuule May 13, 2024
224e68f
static_assert fixed_pinned_pool_memory_resource
vuule May 13, 2024
382e7b3
Merge branch 'perf-defaul-piined-pool' of https://github.com/vuule/cu…
vuule May 13, 2024
b2fd734
fix host_mr
vuule May 13, 2024
0eccf9a
add config function
vuule May 13, 2024
ecd6481
align config size; add missing header
vuule May 13, 2024
709123f
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 13, 2024
01b1bdb
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 14, 2024
ecb5f5a
CUDF_EXPORT
vuule May 14, 2024
f0d0bf0
fail config if resource is already created
vuule May 14, 2024
f989a56
fix config check
vuule May 15, 2024
d0e6dd7
Merge branch 'branch-24.06' of https://github.com/rapidsai/cudf into …
vuule May 15, 2024
2b4952a
docs
vuule May 15, 2024
fdcfad3
Merge branch 'branch-24.06' into perf-defaul-piined-pool
vuule May 16, 2024
7 changes: 6 additions & 1 deletion cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -81,6 +81,11 @@ class cuda_stream_pool {
*/
cuda_stream_pool* create_global_cuda_stream_pool();

/**
* @brief Get the global stream pool.
*/
cuda_stream_pool& global_cuda_stream_pool();
Contributor Author commented:
had to expose the pool to get a stream from it without forking


/**
* @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another
* stream.
9 changes: 9 additions & 0 deletions cpp/include/cudf/io/memory_resource.hpp
@@ -41,4 +41,13 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r
*/
rmm::host_async_resource_ref get_host_memory_resource();

/**
* @brief Configure the size of the default host memory resource.
*
* Must be called before any other function in this header.
ttnghia (Contributor) commented on May 13, 2024:

This is a dangerous requirement, and it may not be satisfied. How about making the static resource re-configurable?

ttnghia (Contributor) continued, May 13, 2024:

To do so:

  1. Static variable is declared outside of function scope (but in an anonymous namespace, so it is static inside just this TU). In addition, it can be a smart pointer.
  2. host_mr will initialize it with std::nullopt size if it is nullptr, otherwise just derefs the current pointer and returns.
  3. User can specify a size parameter to recompute and overwrite that static variable with a new mr.
  4. All these ops should be thread safe.
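The four steps above could be sketched roughly like this (a hypothetical stand-in type, not the real rmm resource; all names are illustrative):

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <mutex>
#include <optional>

// Hypothetical stand-in for the pooled host resource (not the real rmm type).
struct pooled_mr {
  std::size_t size;
};

namespace {
std::mutex mr_mutex;                     // step 4: serialize all accesses
std::unique_ptr<pooled_mr> mr_instance;  // step 1: TU-local static smart pointer
}  // namespace

// Step 2: a call with no size creates the resource with a default size on
// first use. Step 3: a caller-provided size recreates (overwrites) it.
pooled_mr& host_mr(std::optional<std::size_t> size = std::nullopt) {
  std::scoped_lock lock{mr_mutex};
  if (size.has_value()) {
    mr_instance = std::make_unique<pooled_mr>(pooled_mr{*size});
  } else if (!mr_instance) {
    mr_instance = std::make_unique<pooled_mr>(pooled_mr{1024});  // default size
  }
  return *mr_instance;
}
```

This is only a sketch of the suggested re-configurable design, not what the PR ultimately merged (the PR instead throws if config comes after creation).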

Contributor Author replied:

To clarify, the issue with calling config after get/set is that it would have no effect.

Allowing this opens another can of worms, e.g. what is the intended effect of calling config after set?

ttnghia (Contributor) replied on May 14, 2024:

If we don't allow this, let's add a validity check to prevent it from being accidentally misused. It sounds unsafe to just rely on an assumption.

Contributor Author replied:

@abellina what behavior do you suggest when config is called after the first resource use? I'm not sure if we should throw or just warn.

Contributor replied:

I think we should throw, I agree with @ttnghia that we should do something in that case.

Contributor Author replied:

Added a mechanism to throw if config is called after the default resource has already been created.
@abellina might be good to test your branch with this change.

*
* @param size The size of the default host memory resource
*/
void config_host_memory_resource(size_t size);

} // namespace cudf::io
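The call-order contract above (config before the first get/set, throw otherwise) can be modeled with a toy sketch; the names below are purely illustrative, not the actual cudf implementation:

```cpp
#include <cassert>
#include <cstddef>
#include <optional>
#include <stdexcept>

// Toy model of the call-order contract; all names are illustrative.
namespace toy {
std::optional<std::size_t> configured_size;
bool resource_created = false;

// Throws if called after the resource was already created, mirroring the
// "fail config if resource is already created" behavior added in this PR.
void config_host_memory_resource(std::size_t size) {
  if (resource_created) {
    throw std::logic_error("config called after the resource was created");
  }
  configured_size = size;
}

// Stands in for get/set: the first call fixes the configuration.
std::size_t get_host_memory_resource() {
  resource_created = true;
  return configured_size.value_or(1024);  // default size when unconfigured
}
}  // namespace toy
```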
183 changes: 168 additions & 15 deletions cpp/src/io/utilities/config_utils.cpp
@@ -16,10 +16,13 @@

#include "config_utils.hpp"

#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/io/memory_resource.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/export.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

@@ -87,38 +90,188 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_

} // namespace nvcomp_integration

-inline std::mutex& host_mr_lock()
} // namespace detail

namespace {
class fixed_pinned_pool_memory_resource {
using upstream_mr = rmm::mr::pinned_host_memory_resource;
using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;

private:
upstream_mr upstream_mr_{};
size_t pool_size_{0};
// Raw pointer to avoid a segfault when the pool is destroyed on exit
host_pooled_mr* pool_{nullptr};
void* pool_begin_{nullptr};
void* pool_end_{nullptr};
cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()};

public:
fixed_pinned_pool_memory_resource(size_t size)
Contributor commented:

Do we need to check and throw if size == 0?

Contributor Author replied:

I don't think so; the resource works fine with a zero-capacity pool. I used this when benchmarking, basically to verify that the performance is the same as the non-pooled resource. So zero is a valid value for the size, IMO.

: pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
{
if (pool_size_ == 0) { return; }

// Allocate full size from the pinned pool to figure out the beginning and end address
pool_begin_ = pool_->allocate_async(pool_size_, stream_);
pool_end_ = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_);
pool_->deallocate_async(pool_begin_, pool_size_, stream_);
}

void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
Member commented:

I'm late, but these do_ versions should probably be protected/private?

{
if (bytes <= pool_size_) {
try {
return pool_->allocate_async(bytes, alignment, stream);
} catch (...) {
// If the pool is exhausted, fall back to the upstream memory resource
}
}

return upstream_mr_.allocate_async(bytes, alignment, stream);
}

void do_deallocate_async(void* ptr,
std::size_t bytes,
std::size_t alignment,
cuda::stream_ref stream) noexcept
{
if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) {
pool_->deallocate_async(ptr, bytes, alignment, stream);
} else {
upstream_mr_.deallocate_async(ptr, bytes, alignment, stream);
}
}

void* allocate_async(std::size_t bytes, cuda::stream_ref stream)
{
return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
}

void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
{
return do_allocate_async(bytes, alignment, stream);
}

void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
{
auto const result = do_allocate_async(bytes, alignment, stream_);
stream_.wait();
return result;
}

void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
{
return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
}

void deallocate_async(void* ptr,
std::size_t bytes,
std::size_t alignment,
cuda::stream_ref stream) noexcept
{
return do_deallocate_async(ptr, bytes, alignment, stream);
}

void deallocate(void* ptr,
std::size_t bytes,
std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
{
deallocate_async(ptr, bytes, alignment, stream_);
stream_.wait();
}

bool operator==(fixed_pinned_pool_memory_resource const& other) const
{
return pool_ == other.pool_ and stream_ == other.stream_;
}

bool operator!=(fixed_pinned_pool_memory_resource const& other) const
{
return !operator==(other);
}

[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
cuda::mr::device_accessible) noexcept
{
}

[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
cuda::mr::host_accessible) noexcept
{
}
};

static_assert(cuda::mr::resource_with<fixed_pinned_pool_memory_resource,
cuda::mr::device_accessible,
cuda::mr::host_accessible>,
"");
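The allocate path of the resource above tries the fixed pool first and falls back to the upstream allocator when the pool is exhausted. A minimal plain-malloc sketch of the same pattern (hypothetical, not the real pinned resource):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <vector>

// Minimal sketch of the pool-with-fallback pattern: try the fixed-size pool
// first; on exhaustion, fall back to the upstream allocator (plain malloc
// here, pinned host memory in the real resource).
class fallback_pool {
  std::vector<std::byte> pool_;  // the fixed-size pool buffer
  std::size_t used_{0};          // bump-allocator offset (no reuse, for brevity)

 public:
  explicit fallback_pool(std::size_t size) : pool_(size) {}

  void* allocate(std::size_t bytes) {
    if (used_ + bytes <= pool_.size()) {  // request fits in the pool
      void* p = pool_.data() + used_;
      used_ += bytes;
      return p;
    }
    return std::malloc(bytes);  // pool exhausted: upstream fallback
  }

  // Mirrors the pool_begin_/pool_end_ range check used for deallocation.
  bool from_pool(void const* p) const {
    auto const* b = static_cast<std::byte const*>(p);
    return b >= pool_.data() && b < pool_.data() + pool_.size();
  }
};
```

The range check is what lets deallocation route each pointer back to the allocator that produced it, exactly as `do_deallocate_async` does above.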

rmm::host_async_resource_ref make_default_pinned_mr(std::optional<size_t> config_size)
{
-static std::mutex map_lock;
-return map_lock;
static fixed_pinned_pool_memory_resource mr = [config_size]() {
auto const size = [&config_size]() -> size_t {
if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) {
return std::atol(env_val);
}

if (config_size.has_value()) { return *config_size; }

size_t free{}, total{};
CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total));
// 0.5% of the total device memory, capped at 100MB
return std::min(total / 200, size_t{100} * 1024 * 1024);
Comment on lines +222 to +225
Member commented:

I'm late, but this should use rmm::percent_of_free_device_memory. That function only takes an integer percent. If you need a decimal percent, please file an issue. Or you can just use 1% and then divide by 2.

Member replied:

Or at least use rmm::available_device_memory().

Contributor Author replied:

Yes, we could be using rmm::available_device_memory() to get the memory capacity. I'll address this in 24.08.
If there's a plan to add percent_of_total_device_memory, that would be even better.

Member replied:

It already exists.

}();

// rmm requires the pool size to be a multiple of 256 bytes
auto const aligned_size = (size + 255) & ~255;
CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);

// make the pool with max size equal to the initial size
return fixed_pinned_pool_memory_resource{aligned_size};
}();

return mr;
}
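The `(size + 255) & ~255` expression above rounds up to the next multiple of 256, which works because 256 is a power of two (`~255` clears the low 8 bits). A standalone check of the trick:

```cpp
#include <cassert>
#include <cstddef>

// Round up to the next multiple of 256 (the rmm pool size requirement).
// Valid because 256 is a power of two: ~255 clears the low 8 bits after
// adding 255, so any non-multiple is bumped to the next boundary.
constexpr std::size_t align_256(std::size_t size) {
  return (size + 255) & ~std::size_t{255};
}
```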

-inline rmm::host_async_resource_ref default_pinned_mr()
rmm::host_async_resource_ref make_host_mr(std::optional<size_t> size)
{
-static rmm::mr::pinned_host_memory_resource default_mr{};
-return default_mr;
static rmm::host_async_resource_ref mr_ref = make_default_pinned_mr(size);
Contributor commented:

Perhaps this was asked before but I'm curious when/how this object is destroyed?
Is it destroyed automatically when the process ends i.e. after main() completes?
Are there any CUDA API calls in the destructor(s)?
Maybe this is ok for host memory resources.

Contributor Author replied:

Great question. Currently the pool itself is not destroyed, as destroying it caused a segfault at the end of some tests; presumably because of the call to cudaFreeHost after main(). But this is something I should revisit to verify what exactly the issue was.

Contributor Author replied:

Yeah, can't destroy a static pool resource object. Open to suggestions to avoid the pool leak.

return mr_ref;
}
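The destruction-order problem discussed in this thread is often sidestepped by intentionally leaking the resource: hold it through a raw pointer, as the `host_pooled_mr*` member in this diff does, so no destructor (and no CUDA call) runs during static teardown. A minimal sketch with a hypothetical type:

```cpp
#include <cassert>
#include <cstdio>

// Hypothetical resource whose destructor would call into the CUDA runtime
// (think cudaFreeHost); running it during static teardown can crash.
struct pinned_pool {
  ~pinned_pool() { std::puts("releasing pinned memory"); }
};

// Intentionally leaked: the function-local static is a raw pointer that is
// never deleted, so the destructor never runs after main() ends. Same idiom
// as the raw host_pooled_mr* member in this diff.
pinned_pool& leaked_pool() {
  static pinned_pool* pool = new pinned_pool{};
  return *pool;
}
```

The leak is bounded (one pool per process) and the OS reclaims the pinned memory at process exit, which is why this trade-off is commonly accepted.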

-CUDF_EXPORT inline auto& host_mr()
std::mutex& host_mr_mutex()
{
-static rmm::host_async_resource_ref host_mr = default_pinned_mr();
-return host_mr;
static std::mutex map_lock;
return map_lock;
}

-} // namespace detail
rmm::host_async_resource_ref& host_mr()
{
static rmm::host_async_resource_ref mr_ref = make_host_mr(std::nullopt);
return mr_ref;
}

} // namespace

rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr)
{
-std::lock_guard lock{detail::host_mr_lock()};
-auto last_mr = detail::host_mr();
-detail::host_mr() = mr;
std::scoped_lock lock{host_mr_mutex()};
auto last_mr = host_mr();
host_mr() = mr;
return last_mr;
}

rmm::host_async_resource_ref get_host_memory_resource()
{
-std::lock_guard lock{detail::host_mr_lock()};
-return detail::host_mr();
std::scoped_lock lock{host_mr_mutex()};
return host_mr();
}

void config_host_memory_resource(size_t size)
{
std::scoped_lock lock{host_mr_mutex()};
make_host_mr(size);
}

} // namespace cudf::io