-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add default pinned pool that falls back to new pinned allocations #15665
Changes from 29 commits
163ad97
1e850d6
3be42ba
395dcf1
70ae74e
503d170
0873b1f
ff18a21
f5a735c
5766805
0bd92bf
854c0ab
5bf0ce4
284654d
ff4d7f6
f5b2c84
cf3f8a3
80b5963
d23684d
1828e05
0122038
60030da
a62377e
6733c45
abf40a8
fa7dce7
a244d7c
7076e73
0b8aa44
3db44a3
27d30c8
224e68f
382e7b3
b2fd734
0eccf9a
ecd6481
709123f
01b1bdb
ecb5f5a
f0d0bf0
f989a56
d0e6dd7
2b4952a
fdcfad3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,10 +16,12 @@ | |
|
||
#include "config_utils.hpp" | ||
|
||
#include <cudf/detail/utilities/stream_pool.hpp> | ||
#include <cudf/utilities/error.hpp> | ||
#include <cudf/utilities/export.hpp> | ||
|
||
#include <rmm/cuda_device.hpp> | ||
#include <rmm/mr/device/pool_memory_resource.hpp> | ||
#include <rmm/mr/pinned_host_memory_resource.hpp> | ||
#include <rmm/resource_ref.hpp> | ||
|
||
|
@@ -87,38 +89,165 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ | |
|
||
} // namespace nvcomp_integration | ||
|
||
inline std::mutex& host_mr_lock() | ||
} // namespace detail | ||
|
||
namespace { | ||
class fixed_pinned_pool_memory_resource { | ||
using upstream_mr = rmm::mr::pinned_host_memory_resource; | ||
using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>; | ||
|
||
private: | ||
upstream_mr upstream_mr_{}; | ||
size_t pool_size_{0}; | ||
// Raw pointer to avoid a segfault when the pool is destroyed on exit | ||
host_pooled_mr* pool_{nullptr}; | ||
void* pool_begin_{nullptr}; | ||
void* pool_end_{nullptr}; | ||
cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; | ||
|
||
public: | ||
fixed_pinned_pool_memory_resource(size_t size) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to check and throw if `size` is zero? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think so; the resource works fine with a zero-capacity pool. I used this when benchmarking, basically to verify that the performance is the same as with the non-pooled resource. So zero is a valid value for the size IMO. |
||
: pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} | ||
{ | ||
if (pool_size_ == 0) { return; } | ||
|
||
// Allocate full size from the pinned pool to figure out the beginning and end address | ||
pool_begin_ = pool_->allocate_async(pool_size_, stream_); | ||
pool_end_ = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_); | ||
pool_->deallocate_async(pool_begin_, pool_size_, stream_); | ||
} | ||
|
||
void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm late, but these |
||
{ | ||
if (bytes <= pool_size_) { | ||
try { | ||
return pool_->allocate_async(bytes, alignment, stream); | ||
} catch (...) { | ||
// If the pool is exhausted, fall back to the upstream memory resource | ||
} | ||
} | ||
|
||
return upstream_mr_.allocate_async(bytes, alignment, stream); | ||
} | ||
|
||
void do_deallocate_async(void* ptr, | ||
ttnghia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
std::size_t bytes, | ||
std::size_t alignment, | ||
cuda::stream_ref stream) noexcept | ||
{ | ||
if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { | ||
pool_->deallocate_async(ptr, bytes, alignment, stream); | ||
} else { | ||
upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); | ||
} | ||
} | ||
|
||
// Allocates `bytes` on `stream` using the default host alignment; forwards to | ||
// do_allocate_async with rmm::RMM_DEFAULT_HOST_ALIGNMENT. | ||
void* allocate_async(std::size_t bytes, cuda::stream_ref stream) | ||
{ | ||
return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); | ||
} | ||
|
||
// Allocates `bytes` on `stream` with a caller-supplied alignment; thin forwarder to | ||
// do_allocate_async. | ||
void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) | ||
{ | ||
return do_allocate_async(bytes, alignment, stream); | ||
} | ||
|
||
// Blocking allocation: allocates through do_allocate_async on this resource's own stream_, | ||
// then waits on stream_ before returning the pointer. | ||
void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) | ||
{ | ||
auto const result = do_allocate_async(bytes, alignment, stream_); | ||
stream_.wait(); | ||
return result; | ||
} | ||
|
||
// Releases `ptr` on `stream` assuming the default host alignment; forwards to | ||
// do_deallocate_async with rmm::RMM_DEFAULT_HOST_ALIGNMENT. | ||
void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept | ||
{ | ||
return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); | ||
} | ||
|
||
// Releases `ptr` on `stream` with a caller-supplied alignment; thin forwarder to | ||
// do_deallocate_async. | ||
void deallocate_async(void* ptr, | ||
std::size_t bytes, | ||
std::size_t alignment, | ||
cuda::stream_ref stream) noexcept | ||
{ | ||
return do_deallocate_async(ptr, bytes, alignment, stream); | ||
} | ||
|
||
// Blocking deallocation: releases `ptr` through deallocate_async on this resource's own | ||
// stream_, then waits on stream_ before returning. | ||
void deallocate(void* ptr, | ||
std::size_t bytes, | ||
std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept | ||
{ | ||
deallocate_async(ptr, bytes, alignment, stream_); | ||
stream_.wait(); | ||
} | ||
|
||
// Two resources compare equal when they hold the same pool pointer (pool_ is a raw | ||
// pointer, so this is an address comparison) and the same stream. | ||
bool operator==(fixed_pinned_pool_memory_resource const& other) const | ||
{ | ||
return pool_ == other.pool_ and stream_ == other.stream_; | ||
} | ||
|
||
// Inequality is defined as the negation of operator==. | ||
bool operator!=(fixed_pinned_pool_memory_resource const& other) const | ||
{ | ||
return !operator==(other); | ||
} | ||
|
||
// Empty get_property overload for the cuda::mr::device_accessible tag. | ||
// NOTE(review): presumably this is how the resource advertises device-accessible memory | ||
// to the cuda::mr property machinery - confirm. | ||
[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, | ||
cuda::mr::device_accessible) noexcept | ||
{ | ||
} | ||
|
||
// Empty get_property overload for the cuda::mr::host_accessible tag. | ||
// NOTE(review): presumably this is how the resource advertises host-accessible memory | ||
// to the cuda::mr property machinery - confirm. | ||
[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, | ||
cuda::mr::host_accessible) noexcept | ||
{ | ||
} | ||
}; | ||
vuule marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
rmm::host_async_resource_ref default_pinned_mr() | ||
vuule marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
static std::mutex map_lock; | ||
return map_lock; | ||
auto const size = []() -> size_t { | ||
if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { | ||
return std::atol(env_val); | ||
} | ||
|
||
size_t free{}, total{}; | ||
CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); | ||
// 0.5% of the total device memory, capped at 100MB | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Allocating a 100MB pool takes 30ms on my system. This is smaller than CUDA runtime init (~60ms) and cuFile/kvikio init (180ms), so IMO this won't significantly impact user experience. |
||
return std::min((total / 200 + 255) & ~255, size_t{100} * 1024 * 1024); | ||
}(); | ||
vuule marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
CUDF_LOG_INFO("Pinned pool size = {}", size); | ||
|
||
// make the pool with max size equal to the initial size | ||
static fixed_pinned_pool_memory_resource mr{size}; | ||
|
||
return mr; | ||
} | ||
|
||
inline rmm::host_async_resource_ref default_pinned_mr() | ||
std::mutex& host_mr_mutex() | ||
{ | ||
static rmm::mr::pinned_host_memory_resource default_mr{}; | ||
return default_mr; | ||
static std::mutex map_lock; | ||
return map_lock; | ||
} | ||
|
||
CUDF_EXPORT inline auto& host_mr() | ||
auto& host_mr() | ||
{ | ||
static rmm::host_async_resource_ref host_mr = default_pinned_mr(); | ||
return host_mr; | ||
} | ||
|
||
} // namespace detail | ||
} // namespace | ||
|
||
rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr) | ||
{ | ||
std::lock_guard lock{detail::host_mr_lock()}; | ||
auto last_mr = detail::host_mr(); | ||
detail::host_mr() = mr; | ||
std::scoped_lock lock{host_mr_mutex()}; | ||
auto last_mr = host_mr(); | ||
host_mr() = mr; | ||
return last_mr; | ||
} | ||
|
||
rmm::host_async_resource_ref get_host_memory_resource() | ||
{ | ||
std::lock_guard lock{detail::host_mr_lock()}; | ||
return detail::host_mr(); | ||
std::scoped_lock lock{host_mr_mutex()}; | ||
return host_mr(); | ||
} | ||
|
||
} // namespace cudf::io |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
had to expose the pool to get a stream from it without forking