Skip to content

Commit

Permalink
Improve device error messages (#315)
Browse files Browse the repository at this point in the history
  • Loading branch information
PicoCentauri authored Jul 28, 2024
1 parent 80c50d3 commit 96f2c74
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 94 deletions.
100 changes: 58 additions & 42 deletions src/metatrain/utils/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,9 @@
import torch


def _get_available_devices() -> List[str]:
    """Collect the names of the devices torch reports as usable.

    ``"cpu"`` is always listed first. ``"cuda"`` follows when CUDA is available,
    ``"multi-cuda"`` when more than one CUDA device is counted, and ``"mps"`` when
    the MPS backend is both built and available.

    :return: list of device names
    """
    found = ["cpu"]

    if torch.cuda.is_available():
        found.append("cuda")
        if torch.cuda.device_count() > 1:
            found.append("multi-cuda")

    # for torch<2.0 `torch.backends.mps.is_available()` is required for a reasonable
    # check.
    mps_backend = torch.backends.mps
    if mps_backend.is_built() and mps_backend.is_available():
        found.append("mps")

    return found
def _mps_is_available() -> bool:
    """Report whether torch's MPS backend is both built and available."""
    backend = torch.backends.mps
    built = backend.is_built()
    # require `torch.backends.mps.is_available()` for a reasonable check in torch<2.0
    return backend.is_available() if built else built


def pick_devices(
Expand All @@ -31,10 +22,17 @@ def pick_devices(
:param architecture_devices: Devices supported by the architecture. The list should
be sorted by the preference of the architecture, with the most preferred device
first and the least preferred one last.
:param desired_device: desired device by the user
:param desired_device: desired device by the user. For example, ``"cpu"``,
``"cuda"``, ``"multi-gpu"``, etc.
"""

available_devices = _get_available_devices()
available_devices = ["cpu"]
if torch.cuda.is_available():
available_devices.append("cuda")
if torch.cuda.device_count() > 1:
available_devices.append("multi-cuda")
if _mps_is_available():
available_devices.append("mps")

# intersect between available and architecture's devices. keep order of architecture
possible_devices = [d for d in architecture_devices if d in available_devices]
Expand All @@ -52,37 +50,55 @@ def pick_devices(
else:
desired_device = desired_device.lower()

# convert "gpu" and "multi-gpu" to "cuda" or "mps" if available
if desired_device == "gpu":
if torch.cuda.is_available():
desired_device = "cuda"
elif torch.backends.mps.is_built() and torch.backends.mps.is_available():
desired_device = "mps"
else:
raise ValueError(
"Requested 'gpu' device, but found no GPU (CUDA or MPS) devices."
)
if desired_device == "multi-gpu":
desired_device = "multi-cuda"

if desired_device not in possible_devices:
# convert "gpu" and "multi-gpu" to "cuda" or "mps" if available
if desired_device == "gpu":
if torch.cuda.is_available():
desired_device = "cuda"
elif _mps_is_available():
desired_device = "mps"
else:
raise ValueError(
f"Unsupported desired device {desired_device!r}. "
f"Please choose from {', '.join(possible_devices)}."
)
if desired_device == "multi-cuda" and torch.cuda.device_count() < 2:
raise ValueError(
"Requested device 'multi-gpu' or 'multi-cuda', but found only one CUDA "
"device. If you want to run on a single GPU, please use 'gpu' or "
"'cuda' instead."
"Requested 'gpu' device, but found no GPU (CUDA or MPS) devices."
)
elif desired_device == "cuda" and not torch.cuda.is_available():
raise ValueError("Requested 'cuda' device, but cuda is not available.")
elif desired_device == "mps" and not _mps_is_available():
raise ValueError("Requested 'mps' device, but mps is not available.")

if possible_devices.index(desired_device) > 0:
warnings.warn(
f"Device {desired_device!r} requested, but {possible_devices[0]!r} is "
"prefferred by the architecture and available on current system.",
stacklevel=2,
)
if desired_device == "multi-gpu":
desired_device = "multi-cuda"

if desired_device not in architecture_devices:
raise ValueError(
f"Desired device {desired_device!r} is not supported by the selected "
f"architecture. Please choose from {', '.join(possible_devices)}."
)

if desired_device not in available_devices:
raise ValueError(
f"Desired device {desired_device!r} is not supported on "
f"your current system. Please choose from {', '.join(possible_devices)}."
)

if possible_devices.index(desired_device) > 0:
warnings.warn(
f"Device {desired_device!r} requested, but {possible_devices[0]!r} is "
"prefferred by the architecture and available on current system.",
stacklevel=2,
)

if (
desired_device == "cuda"
and torch.cuda.device_count() > 1
and any(d in possible_devices for d in ["multi-cuda", "multi_gpu"])
):
warnings.warn(
"Requested single 'cuda' device but current system has "
f"{torch.cuda.device_count()} cuda devices and architecture supports "
"multi-gpu training. Consider using 'multi-gpu' to accelerate "
"training.",
stacklevel=2,
)

# convert the requested device to a list of torch devices
if desired_device == "multi-cuda":
Expand Down
158 changes: 106 additions & 52 deletions tests/utils/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,20 @@
file.
"""

from typing import List

import pytest
import torch

from metatrain.utils import devices
from metatrain.utils.devices import pick_devices


def is_true() -> bool:
    """Always return ``True``; used as a stand-in for torch availability checks."""
    return True


def is_false() -> bool:
    """Always return ``False``; used as a stand-in for torch availability checks."""
    return False


@pytest.mark.parametrize("desired_device", ["cpu", None])
def test_pick_devices(desired_device):
picked_devices = pick_devices(["cpu"], desired_device)
Expand All @@ -24,10 +29,7 @@ def test_pick_devices(desired_device):

@pytest.mark.parametrize("desired_device", ["cuda", None])
def test_pick_devices_cuda(desired_device, monkeypatch):
def _get_available_devices() -> List[str]:
return ["cuda", "cpu"]

monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
monkeypatch.setattr(torch.cuda, "is_available", is_true)

picked_devices = pick_devices(["cuda", "cpu"], desired_device)

Expand All @@ -36,11 +38,9 @@ def _get_available_devices() -> List[str]:

def test_pick_devices_prefer_architecture(monkeypatch):
"""Use architecture's preferred device if several matching devices are available."""

def _get_available_devices() -> List[str]:
return ["mps", "cpu", "cuda"]

monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
monkeypatch.setattr(torch.cuda, "is_available", is_true)
monkeypatch.setattr(torch.backends.mps, "is_built", is_true)
monkeypatch.setattr(torch.backends.mps, "is_available", is_true)

picked_devices = pick_devices(["cuda", "cpu"])

Expand All @@ -49,21 +49,17 @@ def _get_available_devices() -> List[str]:

@pytest.mark.parametrize("desired_device", ["mps", None])
def test_pick_devices_mps(desired_device, monkeypatch):
def _get_available_devices() -> List[str]:
return ["mps", "cpu"]

monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
monkeypatch.setattr(torch.backends.mps, "is_built", is_true)
monkeypatch.setattr(torch.backends.mps, "is_available", is_true)

picked_devices = pick_devices(["mps", "cpu"], desired_device)

assert picked_devices == [torch.device("mps")]


def test_no_matching_device(monkeypatch):
def _get_available_devices() -> List[str]:
return ["cpu"]

monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
monkeypatch.setattr(torch.backends.mps, "is_built", is_false)
monkeypatch.setattr(torch.backends.mps, "is_available", is_false)

match = (
"No matching device found! The architecture requires cuda, mps; but your "
Expand All @@ -73,64 +69,122 @@ def _get_available_devices() -> List[str]:
pick_devices(["cuda", "mps"])


def test_pick_devices_unsoprted():
match = "Unsupported desired device 'cuda'. Please choose from cpu."
def test_pick_devices_unsupported_by_architecture(monkeypatch):
monkeypatch.setattr(torch.cuda, "is_available", is_true)
match = (
"Desired device 'cuda' is not supported by the selected architecture. "
"Please choose from cpu."
)
with pytest.raises(ValueError, match=match):
pick_devices(["cpu"], "cuda")


def test_pick_devices_preferred_warning(monkeypatch):
def _get_available_devices() -> List[str]:
return ["mps", "cpu"]
@pytest.mark.parametrize("desired_device", ["multi-cuda", "multi-gpu"])
def test_pick_devices_multi_error(desired_device, monkeypatch):
    """A multi-GPU request raises when only a single CUDA device is counted."""
    # report CUDA as available, but with exactly one device
    monkeypatch.setattr(torch.cuda, "is_available", is_true)
    monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)

    expected_message = (
        "Desired device 'multi-cuda' is not supported on your current system. "
        "Please choose from cpu."
    )
    with pytest.raises(ValueError, match=expected_message):
        pick_devices(["multi-cuda", "cpu"], desired_device=desired_device)


monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
def test_pick_devices_preferred_warning(monkeypatch):
    """Requesting a device the architecture ranks lower emits a warning."""
    # report MPS as built and available so it outranks the requested "cpu"
    for check_name in ("is_built", "is_available"):
        monkeypatch.setattr(torch.backends.mps, check_name, is_true)

    expected_warning = "Device 'cpu' requested, but 'mps' is prefferred"
    with pytest.warns(UserWarning, match=expected_warning):
        pick_devices(["mps", "cpu", "cuda"], desired_device="cpu")


@pytest.mark.parametrize("desired_device", ["multi-cuda", "multi-gpu"])
def test_pick_devices_multi_error(desired_device, monkeypatch):
def _get_available_devices() -> List[str]:
return ["multi-cuda", "cuda", "cpu"]
def test_pick_devices_gpu_cuda_map(monkeypatch):
monkeypatch.setattr(torch.cuda, "is_available", is_true)

monkeypatch.setattr(devices, "_get_available_devices", _get_available_devices)
picked_devices = pick_devices(["cuda", "cpu"], "gpu")
assert picked_devices == [torch.device("cuda")]

with pytest.raises(ValueError, match="Requested device 'multi-gpu'"):
pick_devices(["multi-cuda", "cpu"], desired_device=desired_device)

def test_pick_devices_no_cuda(monkeypatch):
monkeypatch.setattr(torch.cuda, "is_available", is_false)

# Below tests that require specific devices to be present
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
def test_pick_devices_gpu_cuda_map():
picked_devices = pick_devices(["cuda", "cpu"], "gpu")
assert picked_devices == [torch.device("cuda")]
match = "Requested 'cuda' device, but cuda is not available."
with pytest.raises(ValueError, match=match):
pick_devices(["cuda", "cpu"], "cuda")


@pytest.mark.skipif(
not (torch.backends.mps.is_built() and torch.backends.mps.is_available()),
reason="MPS is not available",
)
def test_pick_devices_gpu_mps_map():
def test_pick_devices_gpu_mps_map(monkeypatch):
    """The generic ``"gpu"`` request maps to ``"mps"`` when only MPS is usable."""
    # `pick_devices` checks CUDA before MPS when resolving "gpu". Without forcing
    # CUDA off, this test would resolve to "cuda" on a CUDA host and fail because
    # the architecture list below does not contain it.
    monkeypatch.setattr(torch.cuda, "is_available", is_false)
    monkeypatch.setattr(torch.backends.mps, "is_built", is_true)
    monkeypatch.setattr(torch.backends.mps, "is_available", is_true)

    picked_devices = pick_devices(["mps", "cpu"], "gpu")
    assert picked_devices == [torch.device("mps")]


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="less than 2 CUDA devices")
@pytest.mark.parametrize(
    "is_built, is_available", [(is_true, is_false), (is_false, is_true)]
)
def test_pick_devices_no_mps(monkeypatch, is_built, is_available):
    """An explicit ``"mps"`` request raises unless MPS is both built and available."""
    mps_backend = torch.backends.mps
    monkeypatch.setattr(mps_backend, "is_built", is_built)
    monkeypatch.setattr(mps_backend, "is_available", is_available)

    expected_message = "Requested 'mps' device, but mps is not available."
    with pytest.raises(ValueError, match=expected_message):
        pick_devices(["mps", "cpu"], "mps")


@pytest.mark.parametrize("desired_device", ["multi-cuda", "multi-gpu"])
def test_pick_devices_multi_cuda(desired_device):
picked_devices = pick_devices(["cpu", "cuda", "multi-cuda"], desired_device)
def test_pick_devices_multi_cuda(desired_device, monkeypatch):
def device_count() -> int:
return 2

monkeypatch.setattr(torch.cuda, "is_available", is_true)
monkeypatch.setattr(torch.cuda, "device_count", device_count)

picked_devices = pick_devices(["multi-cuda", "cpu", "cuda"], desired_device)
assert picked_devices == [
torch.device(f"cuda:{i}") for i in range(torch.cuda.device_count())
]


@pytest.mark.skipif(
torch.cuda.is_available()
or (torch.backends.mps.is_built() and torch.backends.mps.is_available()),
reason="GPU device available",
@pytest.mark.parametrize(
"cuda_is_available, mps_is_build, mps_is_available",
[
(is_false, is_false, is_false),
(is_false, is_true, is_false),
(is_false, is_false, is_true),
],
)
def test_pick_devices_gpu_not_available():
def test_pick_devices_gpu_not_available(
cuda_is_available, mps_is_build, mps_is_available, monkeypatch
):
monkeypatch.setattr(torch.cuda, "is_available", cuda_is_available)
monkeypatch.setattr(torch.backends.mps, "is_built", mps_is_build)
monkeypatch.setattr(torch.backends.mps, "is_available", mps_is_available)

with pytest.raises(ValueError, match="Requested 'gpu' device, but found no GPU"):
pick_devices(["cuda", "cpu"], "gpu")
pick_devices(["mps", "cpu"], "gpu")


def test_multi_gpu_warning(monkeypatch):
    """A single-``"cuda"`` request warns when multi-GPU training would be possible."""
    # report two CUDA devices so that "multi-cuda" counts as available
    monkeypatch.setattr(torch.cuda, "is_available", is_true)
    monkeypatch.setattr(torch.cuda, "device_count", lambda: 2)

    expected_warning = (
        "Requested single 'cuda' device but current system has 2 cuda devices and "
        "architecture supports multi-gpu training. Consider using 'multi-gpu' to "
        "accelerate training."
    )
    with pytest.warns(UserWarning, match=expected_warning):
        picked_devices = pick_devices(["cuda", "multi-cuda", "cpu"], "cuda")

    # the request itself is still honoured: one plain CUDA device is returned
    assert picked_devices == [torch.device("cuda")]

0 comments on commit 96f2c74

Please sign in to comment.