Merge pull request #17417 from rapidsai/branch-24.12
Forward-merge branch-24.12 into branch-25.02
GPUtester authored Nov 22, 2024
2 parents b2419dd + 305182e commit 2827a03
Showing 4 changed files with 84 additions and 5 deletions.
7 changes: 7 additions & 0 deletions docs/cudf/source/cudf_polars/engine_options.md
@@ -23,3 +23,10 @@ engine = GPUEngine(
result = query.collect(engine=engine)
```
Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.

## Disabling CUDA Managed Memory

By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool allocator is used instead.
Managed memory can be turned off by setting the `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable to `0`. The system requirements for managed memory are listed [here](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).
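
As a minimal sketch (illustrative only, not part of the documented example above): the variable can be set from Python, since it is read when the default memory resource is selected at GPU collect time.

```python
import os

import polars as pl

# Disable CUDA managed memory for cudf-polars; the variable is consulted when
# the default memory resource is chosen during the first GPU collect.
os.environ["POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY"] = "0"

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
result = q.collect(engine=pl.GPUEngine())
```
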
6 changes: 6 additions & 0 deletions docs/cudf/source/cudf_polars/index.rst
@@ -9,6 +9,12 @@ and run on the CPU.

Benchmark
---------

.. note::
   The following benchmarks were performed with the `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`.
   Using managed memory (the default) imposes a performance cost in order to avoid out-of-memory errors.
   Peak performance can still be attained by disabling managed memory, i.e. setting the environment variable to `"0"`.

We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:

.. figure:: ../_static/pds_benchmark_polars.png
56 changes: 51 additions & 5 deletions python/cudf_polars/cudf_polars/callback.py
@@ -15,6 +15,7 @@

from polars.exceptions import ComputeError, PerformanceWarning

import pylibcudf
import rmm
from rmm._cuda import gpu

@@ -32,8 +33,26 @@
__all__: list[str] = ["execute_with_cudf"]


_SUPPORTED_PREFETCHES = {
"column_view::get_data",
"mutable_column_view::get_data",
"gather",
"hash_join",
}


def _env_get_int(name, default):
    try:
        return int(os.getenv(name, default))
    except (ValueError, TypeError):  # pragma: no cover
        return default  # pragma: no cover


@cache
def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
def default_memory_resource(
    device: int,
    cuda_managed_memory: bool,  # noqa: FBT001
) -> rmm.mr.DeviceMemoryResource:
    """
    Return the default memory resource for cudf-polars.
@@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
    device
        Disambiguating device id when selecting the device. Must be
        the active device when this function is called.
    cuda_managed_memory
        Whether to use managed memory or not.

    Returns
    -------
    rmm.mr.DeviceMemoryResource
        The default memory resource that cudf-polars uses. Currently
        an async pool resource.
        a managed memory resource, if `cuda_managed_memory` is `True`.
        else, an async pool resource is returned.
    """
    try:
        return rmm.mr.CudaAsyncMemoryResource()
        if (
            cuda_managed_memory
            and pylibcudf.utils._is_concurrent_managed_access_supported()
        ):
            # Allocating 80% of the available memory for the pool.
            # Leaving a 20% headroom to avoid OOM errors.
            free_memory, _ = rmm.mr.available_device_memory()
            free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
            for key in _SUPPORTED_PREFETCHES:
                pylibcudf.experimental.enable_prefetching(key)
            mr = rmm.mr.PrefetchResourceAdaptor(
                rmm.mr.PoolMemoryResource(
                    rmm.mr.ManagedMemoryResource(),
                    initial_pool_size=free_memory,
                )
            )
        else:
            mr = rmm.mr.CudaAsyncMemoryResource()
    except RuntimeError as e:  # pragma: no cover
        msg, *_ = e.args
        if (
@@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
            ) from None
        else:
            raise
    else:
        return mr


@contextlib.contextmanager
@@ -89,10 +130,15 @@ def set_memory_resource(
    at entry. If a memory resource is provided, it must be valid to
    use with the currently active device.
    """
    previous = rmm.mr.get_current_device_resource()
    if mr is None:
        device: int = gpu.getDevice()
        mr = default_memory_resource(device)
    previous = rmm.mr.get_current_device_resource()
        mr = default_memory_resource(
            device=device,
            cuda_managed_memory=bool(
                _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
            ),
        )
    rmm.mr.set_current_device_resource(mr)
    try:
        yield mr
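
For orientation (an illustrative sketch, not part of the change): when managed memory is enabled, `default_memory_resource` builds roughly the following RMM stack — a managed-memory pool sized to about 80% of the currently free device memory, wrapped in a prefetch adaptor — before it is installed as the current device resource. The real function additionally enables libcudf prefetching for the hooks in `_SUPPORTED_PREFETCHES` and falls back to `CudaAsyncMemoryResource` when managed memory is unsupported or disabled.

```python
import rmm

# Size the pool to ~80% of currently free device memory, rounded to a
# 256-byte multiple (mirroring the arithmetic in default_memory_resource).
free_memory, _ = rmm.mr.available_device_memory()
initial_pool_size = int(round(float(free_memory) * 0.80 / 256) * 256)

# Managed-memory pool wrapped in a prefetch adaptor.
mr = rmm.mr.PrefetchResourceAdaptor(
    rmm.mr.PoolMemoryResource(
        rmm.mr.ManagedMemoryResource(),
        initial_pool_size=initial_pool_size,
    )
)
rmm.mr.set_current_device_resource(mr)
```
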
20 changes: 20 additions & 0 deletions python/cudf_polars/tests/test_config.py
@@ -10,6 +10,7 @@

import rmm

from cudf_polars.callback import default_memory_resource
from cudf_polars.dsl.ir import DataFrameScan
from cudf_polars.testing.asserts import (
    assert_gpu_result_equal,
@@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr):
        q.collect(engine=pl.GPUEngine(memory_resource=mr))


@pytest.mark.parametrize("enable_managed_memory", ["1", "0"])
def test_cudf_polars_enable_disable_managed_memory(monkeypatch, enable_managed_memory):
    q = pl.LazyFrame({"a": [1, 2, 3]})

    with monkeypatch.context() as monkeycontext:
        monkeycontext.setenv(
            "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", enable_managed_memory
        )
        result = q.collect(engine=pl.GPUEngine())
        mr = default_memory_resource(0, bool(enable_managed_memory == "1"))
        if enable_managed_memory == "1":
            assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor)
            assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource)
        else:
            assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource)
        monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY")
    assert_frame_equal(q.collect(), result)


def test_explicit_device_zero():
    q = pl.LazyFrame({"a": [1, 2, 3]})

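
As the `memory_resource=` keyword used in `test_invalid_memory_resource_raises` suggests, the environment variable only controls the default selection; a sketch of bypassing it by handing `GPUEngine` an explicit RMM resource:

```python
import polars as pl
import rmm

# An explicitly provided memory resource is used as-is, so the
# POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY-based default is never consulted.
mr = rmm.mr.CudaAsyncMemoryResource()

q = pl.LazyFrame({"a": [1, 2, 3]})
result = q.collect(engine=pl.GPUEngine(memory_resource=mr))
```
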
