Merge pull request #16571 from bdice/branch-24.10-merge-24.08
Forward-merge branch-24.08 into branch-24.10
AyodeAwe authored Aug 15, 2024
2 parents ac42bc8 + ed31523 commit 6912246
Showing 3 changed files with 57 additions and 10 deletions.
21 changes: 15 additions & 6 deletions docs/cudf/source/cudf_pandas/how-it-works.md
@@ -44,11 +44,20 @@
allocation may be a bottleneck depending on the workload. Managed memory
enables oversubscribing GPU memory. This allows cudf.pandas to process
data larger than GPU memory in many cases, without CPU (Pandas) fallback.

+```{note}
+CUDA Managed Memory on Windows, including Windows Subsystem for Linux
+(WSL2), [does not support oversubscription](
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory),
+only unified addressing. Furthermore, managed memory on WSL2 has undesirable
+performance characteristics. Therefore, `cudf.pandas` uses a non-managed pool
+allocator on WSL2, so it is limited to the physical size of GPU memory.
+```
+
Other memory allocators can be used by changing the environment
-variable `CUDF_PANDAS_RMM_MODE` to one of the following.
+variable `CUDF_PANDAS_RMM_MODE` to one of the following:

1. "managed_pool" (default): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
2. "managed": CUDA Unified Memory, (managed memory) with no pool allocator.
3. "async": CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
4. "pool": RMM's asynchronous pool allocator with normal CUDA device memory.
5. "cuda": normal CUDA device memory with no pool allocator.
1. `"managed_pool"` (default, if supported): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
2. `"managed"`: CUDA Unified Memory, (managed memory) with no pool allocator.
3. `"async"`: CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
4. `"pool"` (default if `"managed_pool"` is not supported): RMM's asynchronous pool allocator with normal CUDA device memory.
5. `"cuda"`: normal CUDA device memory with no pool allocator.
22 changes: 22 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/utils.pyx
@@ -6,6 +6,8 @@
from libc.stdint cimport uintptr_t
from libcpp.functional cimport reference_wrapper
from libcpp.vector cimport vector

+from cuda import cudart
+
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type

@@ -34,3 +36,23 @@ cdef vector[reference_wrapper[const scalar]] _as_vector(list source):
        c_scalars.push_back(
            reference_wrapper[constscalar](dereference((<Scalar?>slr).c_obj)))
    return c_scalars
+
+
+def _is_concurrent_managed_access_supported():
+    """Check the availability of concurrent managed access (UVM).
+    Note that WSL2 does not support managed memory.
+    """
+
+    # Ensure CUDA is initialized before checking cudaDevAttrConcurrentManagedAccess
+    cudart.cudaFree(0)
+
+    device_id = 0
+    err, supports_managed_access = cudart.cudaDeviceGetAttribute(
+        cudart.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess, device_id
+    )
+    if err != cudart.cudaError_t.cudaSuccess:
+        raise RuntimeError(
+            f"Failed to check cudaDevAttrConcurrentManagedAccess with error {err}"
+        )
+    return supports_managed_access != 0
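
For context, the same probe can be run from plain Python with the `cuda-python` bindings that the Cython helper above uses. This is an illustrative sketch, not code from the commit; on WSL2 it should return False, which is why `cudf.pandas` falls back to the `"pool"` mode there:

```python
from cuda import cudart

def supports_concurrent_managed_access(device_id: int = 0) -> bool:
    """Return True if the device supports concurrent managed access (UVM)."""
    cudart.cudaFree(0)  # force CUDA context initialization first
    err, value = cudart.cudaDeviceGetAttribute(
        cudart.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess, device_id
    )
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaDeviceGetAttribute failed: {err}")
    return value != 0

print("UVM supported:", supports_concurrent_managed_access())
```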
24 changes: 20 additions & 4 deletions python/cudf/cudf/pandas/__init__.py
@@ -26,8 +26,8 @@
}


-def _enable_managed_prefetching(rmm_mode):
-    if "managed" in rmm_mode:
+def _enable_managed_prefetching(rmm_mode, managed_memory_is_supported):
+    if managed_memory_is_supported and "managed" in rmm_mode:
        for key in _SUPPORTED_PREFETCHES:
            pylibcudf.experimental.enable_prefetching(key)

@@ -40,7 +40,20 @@ def install():
    global LOADED
    LOADED = loader is not None

-    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", "managed_pool")
+    # The default mode is "managed_pool" if UVM is supported, otherwise "pool"
+    managed_memory_is_supported = (
+        pylibcudf.utils._is_concurrent_managed_access_supported()
+    )
+    default_rmm_mode = (
+        "managed_pool" if managed_memory_is_supported else "pool"
+    )
+    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", default_rmm_mode)
+
+    if "managed" in rmm_mode and not managed_memory_is_supported:
+        raise ValueError(
+            f"Managed memory is not supported on this system, so the requested {rmm_mode=} is invalid."
+        )
+
    # Check if a non-default memory resource is set
    current_mr = rmm.mr.get_current_device_resource()
    if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
@@ -53,6 +66,7 @@ def install():
    free_memory, _ = rmm.mr.available_device_memory()
    free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
    new_mr = current_mr
+
    if rmm_mode == "pool":
        new_mr = rmm.mr.PoolMemoryResource(
            current_mr,
@@ -71,8 +85,10 @@
        )
    elif rmm_mode != "cuda":
        raise ValueError(f"Unsupported {rmm_mode=}")
+
    rmm.mr.set_current_device_resource(new_mr)
-    _enable_managed_prefetching(rmm_mode)
+
+    _enable_managed_prefetching(rmm_mode, managed_memory_is_supported)


def pytest_load_initial_conftests(early_config, parser, args):
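
Taken together, `install()` now probes for UVM support, picks `"managed_pool"` or `"pool"` as the default accordingly, and rejects any explicitly requested managed mode on systems without UVM. A standalone sketch of that selection logic plus the pool sizing (the helper names here are hypothetical; only the logic mirrors the diff):

```python
import os

def choose_rmm_mode(managed_memory_is_supported: bool) -> str:
    # Default follows the hardware: managed pool where UVM works, else pool.
    default_rmm_mode = "managed_pool" if managed_memory_is_supported else "pool"
    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", default_rmm_mode)
    if "managed" in rmm_mode and not managed_memory_is_supported:
        raise ValueError(
            f"Managed memory is not supported on this system, "
            f"so the requested {rmm_mode=} is invalid."
        )
    return rmm_mode

def initial_pool_size(free_memory: int) -> int:
    # ~80% of currently free device memory, rounded to a multiple of
    # 256 bytes (RMM's allocation alignment), as in install().
    return int(round(float(free_memory) * 0.80 / 256) * 256)
```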
