Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.08 into branch-24.10 #16571

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions docs/cudf/source/cudf_pandas/how-it-works.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,20 @@ allocation may be a bottleneck depending on the workload. Managed memory
enables oversubscribing GPU memory. This allows cudf.pandas to process
data larger than GPU memory in many cases, without CPU (Pandas) fallback.

```{note}
CUDA Managed Memory on Windows, and more specifically Windows Subsystem for
Linux (WSL2), [does not support oversubscription](
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory),
only unified addressing. Furthermore, managed memory on WSL2 has undesirable
performance characteristics. Therefore, `cudf.pandas` uses a non-managed pool
allocator on WSL2, so `cudf.pandas` is limited to the physical size of GPU memory.
```

Other memory allocators can be used by changing the environment
variable `CUDF_PANDAS_RMM_MODE` to one of the following.
variable `CUDF_PANDAS_RMM_MODE` to one of the following:

1. "managed_pool" (default): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
2. "managed": CUDA Unified Memory, (managed memory) with no pool allocator.
3. "async": CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
4. "pool": RMM's asynchronous pool allocator with normal CUDA device memory.
5. "cuda": normal CUDA device memory with no pool allocator.
1. `"managed_pool"` (default, if supported): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
2. `"managed"`: CUDA Unified Memory (managed memory) with no pool allocator.
3. `"async"`: CUDA's built-in asynchronous pool allocator with normal CUDA device memory.
4. `"pool"` (default if `"managed_pool"` is not supported): RMM's asynchronous pool allocator with normal CUDA device memory.
5. `"cuda"`: normal CUDA device memory with no pool allocator.
22 changes: 22 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ from libc.stdint cimport uintptr_t
from libcpp.functional cimport reference_wrapper
from libcpp.vector cimport vector

from cuda import cudart

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type

Expand Down Expand Up @@ -34,3 +36,23 @@ cdef vector[reference_wrapper[const scalar]] _as_vector(list source):
c_scalars.push_back(
reference_wrapper[constscalar](dereference((<Scalar?>slr).c_obj)))
return c_scalars


def _is_concurrent_managed_access_supported():
    """Report whether device 0 advertises concurrent managed access (UVM).

    The CUDA runtime is queried for ``cudaDevAttrConcurrentManagedAccess``
    on device ordinal 0, and the result is ``True`` when the reported
    attribute value is non-zero.

    WSL2 is called out by the original author as a platform where managed
    memory is not supported, so this is expected to return ``False`` there.

    Raises
    ------
    RuntimeError
        If the attribute query does not return ``cudaSuccess``.
    """
    # Issue a no-op free first so that CUDA is initialized before
    # cudaDevAttrConcurrentManagedAccess is queried. Its status is not
    # inspected here; a failure surfaces through the query below.
    cudart.cudaFree(0)

    queried_attribute = (
        cudart.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess
    )
    err, attribute_value = cudart.cudaDeviceGetAttribute(
        queried_attribute, 0
    )
    if err == cudart.cudaError_t.cudaSuccess:
        return attribute_value != 0
    raise RuntimeError(
        f"Failed to check cudaDevAttrConcurrentManagedAccess with error {err}"
    )
24 changes: 20 additions & 4 deletions python/cudf/cudf/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
}


def _enable_managed_prefetching(rmm_mode):
if "managed" in rmm_mode:
def _enable_managed_prefetching(rmm_mode, managed_memory_is_supported):
    """Turn on prefetching for every supported key when managed memory is used.

    Nothing happens unless ``managed_memory_is_supported`` is truthy and the
    substring ``"managed"`` occurs in ``rmm_mode``. Otherwise
    ``pylibcudf.experimental.enable_prefetching`` is called once for each
    entry of ``_SUPPORTED_PREFETCHES``.
    """
    # Guard clause: same short-circuit order as checking support first,
    # then the mode string.
    if not managed_memory_is_supported or "managed" not in rmm_mode:
        return
    for prefetch_key in _SUPPORTED_PREFETCHES:
        pylibcudf.experimental.enable_prefetching(prefetch_key)

Expand All @@ -40,7 +40,20 @@ def install():
global LOADED
LOADED = loader is not None

rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", "managed_pool")
# The default mode is "managed_pool" if UVM is supported, otherwise "pool"
managed_memory_is_supported = (
pylibcudf.utils._is_concurrent_managed_access_supported()
)
default_rmm_mode = (
"managed_pool" if managed_memory_is_supported else "pool"
)
rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", default_rmm_mode)

if "managed" in rmm_mode and not managed_memory_is_supported:
raise ValueError(
f"Managed memory is not supported on this system, so the requested {rmm_mode=} is invalid."
)

# Check if a non-default memory resource is set
current_mr = rmm.mr.get_current_device_resource()
if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
Expand All @@ -53,6 +66,7 @@ def install():
free_memory, _ = rmm.mr.available_device_memory()
free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
new_mr = current_mr

if rmm_mode == "pool":
new_mr = rmm.mr.PoolMemoryResource(
current_mr,
Expand All @@ -71,8 +85,10 @@ def install():
)
elif rmm_mode != "cuda":
raise ValueError(f"Unsupported {rmm_mode=}")

rmm.mr.set_current_device_resource(new_mr)
_enable_managed_prefetching(rmm_mode)

_enable_managed_prefetching(rmm_mode, managed_memory_is_supported)


def pytest_load_initial_conftests(early_config, parser, args):
Expand Down
Loading