Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.12 into branch-25.02 #17417

Merged
merged 1 commit into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/cudf/source/cudf_polars/engine_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,10 @@ engine = GPUEngine(
result = query.collect(engine=engine)
```
Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.

## Disabling CUDA Managed Memory

By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool
allocator is used.
Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here](
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).
6 changes: 6 additions & 0 deletions docs/cudf/source/cudf_polars/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ and run on the CPU.

Benchmark
---------

.. note::
The following benchmarks were performed with the `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`.
Using managed memory (the default) imposes a performance cost in order to avoid out-of-memory errors.
Peak performance can still be attained by setting the environment variable to `"1"`.

We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:

.. figure:: ../_static/pds_benchmark_polars.png
Expand Down
56 changes: 51 additions & 5 deletions python/cudf_polars/cudf_polars/callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from polars.exceptions import ComputeError, PerformanceWarning

import pylibcudf
import rmm
from rmm._cuda import gpu

Expand All @@ -32,8 +33,26 @@
__all__: list[str] = ["execute_with_cudf"]


# libcudf prefetch keys that are enabled (via
# pylibcudf.experimental.enable_prefetching) when the managed-memory
# resource is selected in default_memory_resource.
_SUPPORTED_PREFETCHES = {
    "column_view::get_data",
    "mutable_column_view::get_data",
    "gather",
    "hash_join",
}


def _env_get_int(name, default):
try:
return int(os.getenv(name, default))
except (ValueError, TypeError): # pragma: no cover
return default # pragma: no cover


@cache
def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
def default_memory_resource(
device: int,
cuda_managed_memory: bool, # noqa: FBT001
) -> rmm.mr.DeviceMemoryResource:
"""
Return the default memory resource for cudf-polars.

Expand All @@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
device
Disambiguating device id when selecting the device. Must be
the active device when this function is called.
cuda_managed_memory
Whether to use managed memory or not.

Returns
-------
rmm.mr.DeviceMemoryResource
The default memory resource that cudf-polars uses. Currently
an async pool resource.
a managed memory resource, if `cuda_managed_memory` is `True`.
else, an async pool resource is returned.
"""
try:
return rmm.mr.CudaAsyncMemoryResource()
if (
cuda_managed_memory
and pylibcudf.utils._is_concurrent_managed_access_supported()
):
# Allocating 80% of the available memory for the pool.
# Leaving a 20% headroom to avoid OOM errors.
free_memory, _ = rmm.mr.available_device_memory()
free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
for key in _SUPPORTED_PREFETCHES:
pylibcudf.experimental.enable_prefetching(key)
mr = rmm.mr.PrefetchResourceAdaptor(
rmm.mr.PoolMemoryResource(
rmm.mr.ManagedMemoryResource(),
initial_pool_size=free_memory,
)
)
else:
mr = rmm.mr.CudaAsyncMemoryResource()
except RuntimeError as e: # pragma: no cover
msg, *_ = e.args
if (
Expand All @@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
) from None
else:
raise
else:
return mr


@contextlib.contextmanager
Expand All @@ -89,10 +130,15 @@ def set_memory_resource(
at entry. If a memory resource is provided, it must be valid to
use with the currently active device.
"""
previous = rmm.mr.get_current_device_resource()
if mr is None:
device: int = gpu.getDevice()
mr = default_memory_resource(device)
previous = rmm.mr.get_current_device_resource()
mr = default_memory_resource(
device=device,
cuda_managed_memory=bool(
_env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
),
)
rmm.mr.set_current_device_resource(mr)
try:
yield mr
Expand Down
20 changes: 20 additions & 0 deletions python/cudf_polars/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import rmm

from cudf_polars.callback import default_memory_resource
from cudf_polars.dsl.ir import DataFrameScan
from cudf_polars.testing.asserts import (
assert_gpu_result_equal,
Expand Down Expand Up @@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr):
q.collect(engine=pl.GPUEngine(memory_resource=mr))


@pytest.mark.parametrize("enable_managed_memory", ["1", "0"])
def test_cudf_polars_enable_disable_managed_memory(monkeypatch, enable_managed_memory):
    # NOTE(review): the parameter was previously named ``disable_managed_memory``,
    # which was inverted — "1" *enables* managed memory (matching the semantics
    # of POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY). Renamed for clarity; behavior
    # is unchanged.
    q = pl.LazyFrame({"a": [1, 2, 3]})

    with monkeypatch.context() as monkeycontext:
        monkeycontext.setenv(
            "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", enable_managed_memory
        )
        result = q.collect(engine=pl.GPUEngine())
        # Rebuild the resource the callback selects for this setting and
        # verify its type matches the managed / non-managed expectation.
        mr = default_memory_resource(0, enable_managed_memory == "1")
        if enable_managed_memory == "1":
            # Managed memory: prefetch adaptor wrapping a managed pool.
            assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor)
            assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource)
        else:
            # Managed memory off: plain async pool resource.
            assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource)
        monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY")
    assert_frame_equal(q.collect(), result)


def test_explicit_device_zero():
q = pl.LazyFrame({"a": [1, 2, 3]})

Expand Down
Loading