Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat] amdsmi bindings integration #132

Merged
merged 22 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ dependencies = [
"pydantic", # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases.
"rich",
"tyro",
"httpx"
"httpx",
"amdsmi"
]
dynamic = ["version"]

Expand Down
99 changes: 71 additions & 28 deletions zeus/device/gpu/amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
from __future__ import annotations
import functools
import os
import concurrent.futures
import contextlib
import time
from typing import Sequence
from functools import lru_cache

try:
import amdsmi # type: ignore
except ImportError:
# Must catch all exceptions, since ImportError is not the only exception that can be raised (e.g., OSError on version mismatch).
# Specific exceptions are handled when import and initialization are retested in `amdsmi_is_available`.
except Exception:

class MockAMDSMI:
"""Mock class for AMD SMI library."""
Expand Down Expand Up @@ -41,6 +45,15 @@ def amdsmi_is_available() -> bool:
except ImportError:
logger.info("amdsmi is not available.")
return False
# usually thrown if amdsmi can't find libamd_smi.so
except OSError:
if os.getenv("ROCM_PATH") is None:
logger.warning("`ROCM_PATH` is not set. Do you have ROCm installed?")
return False
# usually thrown if versions of amdsmi and ROCm are incompatible.
except AttributeError:
logger.warning("Do you have the correct version of ROCm and amdsmi installed?")
parthraut marked this conversation as resolved.
Show resolved Hide resolved
return False
try:
amdsmi.amdsmi_init()
logger.info("amdsmi is available and initialized")
Expand Down Expand Up @@ -71,10 +84,6 @@ def __init__(self, gpu_index: int) -> None:
"""Initialize the GPU object."""
super().__init__(gpu_index)
self._get_handle()
# XXX(Jae-Won): Right now, the energy API's unit is broken (either the
# `power` field or the `counter_resolution` field). Before that, we're
# disabling the energy API.
self._supportsGetTotalEnergyConsumption = False

_exception_map = {
1: gpu_common.ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
Expand Down Expand Up @@ -230,7 +239,8 @@ def getInstantPowerUsage(self) -> int:
"""Return the current power draw of the GPU. Units: mW."""
# returns in W, convert to mW
return int(
amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
int(amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"])
* 1000
)

@_handle_amdsmi_errors
Expand All @@ -241,29 +251,49 @@ def getAverageMemoryPowerUsage(self) -> int:
)

@_handle_amdsmi_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
"""Check if the GPU supports retrieving total energy consumption."""
if self._supportsGetTotalEnergyConsumption is None:
try:
_ = amdsmi.amdsmi_get_energy_count(self.handle)
self._supportsGetTotalEnergyConsumption = True
except amdsmi.AmdSmiLibraryException as e:
if (
e.get_error_code() == 2
): # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED
self._supportsGetTotalEnergyConsumption = False
else:
raise e

def supportsGetTotalEnergyConsumption(
self,
) -> bool:
parthraut marked this conversation as resolved.
Show resolved Hide resolved
"""Check if the GPU supports retrieving total energy consumption. Returns a future object of the result."""
wait_time = 0.5 # seconds
threshold = 0.8 # 80% threshold
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to first check whether self._supportsGetTotalEnergyConsumption is not None and return the cached value for future invocations of this method?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the current structure, it's never called more than once. But returning the cached value would be better, so I'll add it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is part of the public API of GPU and GPUs, meaning people can always call it in their code. Thanks!


power = self.getInstantPowerUsage()
initial_energy = self.getTotalEnergyConsumption()
time.sleep(wait_time)
final_energy = self.getTotalEnergyConsumption()

measured_energy = final_energy - initial_energy
expected_energy = power * wait_time # power is in mW, wait_time is in seconds

# if the difference between measured and expected energy is less than `threshold` (80%) of the expected energy, then the API is supported
if abs(measured_energy - expected_energy) < threshold * expected_energy:
parthraut marked this conversation as resolved.
Show resolved Hide resolved
self._supportsGetTotalEnergyConsumption = True
else:
self._supportsGetTotalEnergyConsumption = False
logger.info(
"Disabling `getTotalEnergyConsumption` for device %d. The result of `amdsmi.amdsmi_get_energy_count` is not accurate. Expected energy: %d mJ, Measured energy: %d mJ"
"This is a known issue with some AMD GPUs, please see https://github.com/ROCm/amdsmi/issues/38 for more information."
"Energy metrics will still be available and measured through polling of `getInstantPowerUsage` method.",
self.gpu_index,
expected_energy,
measured_energy,
)
return self._supportsGetTotalEnergyConsumption

@_handle_amdsmi_errors
def getTotalEnergyConsumption(self) -> int:
"""Return the total energy consumption of the GPU since driver load. Units: mJ."""
info = amdsmi.amdsmi_get_energy_count(self.handle)
return int(
info["power"] / 1e3
) # returns in micro Joules, convert to mili Joules
energy_dict = amdsmi.amdsmi_get_energy_count(self.handle)
parthraut marked this conversation as resolved.
Show resolved Hide resolved
if "energy_accumulator" in energy_dict: # New API
parthraut marked this conversation as resolved.
Show resolved Hide resolved
energy = (
energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
)
else:
# Old API: assume has key "power". If not, exception will be handled by _handle_amdsmi_errors.
energy = energy_dict["power"] * energy_dict["counter_resolution"]

return int(energy / 1e3) # returns in microjoules, convert to millijoules


class AMDGPUs(gpu_common.GPUs):
Expand Down Expand Up @@ -292,11 +322,11 @@ def __init__(self, ensure_homogeneous: bool = False) -> None:
self._init_gpus()
if ensure_homogeneous:
self._ensure_homogeneous()
except amdsmi.AmdSmiException as e:
except amdsmi.AmdSmiLibraryException as e:
exception_class = AMDGPU._exception_map.get(
e.value, gpu_common.ZeusBaseGPUError
e.get_error_code(), gpu_common.ZeusBaseGPUError
)
raise exception_class(e.msg) from e
raise exception_class(e.get_error_info()) from e

@property
def gpus(self) -> Sequence[AMDGPU]:
Expand All @@ -318,7 +348,20 @@ def _init_gpus(self) -> None:
else:
visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles())))

self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices]
# create a threadpool with the number of visible GPUs
with concurrent.futures.ThreadPoolExecutor(
max_workers=len(visible_indices)
) as executor:
self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices]

# check if GPUs support getTotalEnergyConsumption. Returns a future object of the result.
futures = [
executor.submit(gpu.supportsGetTotalEnergyConsumption)
for gpu in self._gpus
]
jaywonchung marked this conversation as resolved.
Show resolved Hide resolved

# wait for all futures to complete
concurrent.futures.wait(futures)

def __del__(self) -> None:
"""Shut down AMDSMI."""
Expand Down
Loading