diff --git a/docs/measure/index.md b/docs/measure/index.md index 92512440..52698bd0 100644 --- a/docs/measure/index.md +++ b/docs/measure/index.md @@ -149,3 +149,23 @@ Total time (s): 4.421529293060303 Total energy (J): {'GPU0': 198.52566362297537, 'GPU1': 206.22215216255188, 'GPU2': 201.08565518283845, 'GPU3': 201.79834523367884} ``` + +## Hardware Support +We currently support both NVIDIA (via NVML) and AMD GPUs (via AMDSMI, with ROCm 6.1 or later). + +### `get_gpus` +The [`get_gpus`][zeus.device.get_gpus] function returns a [`GPUs`][zeus.device.gpu.GPUs] object, which can be either an [`NVIDIAGPUs`][zeus.device.gpu.NVIDIAGPUs] or [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object depending on the availability of `nvml` or `amdsmi`. Each [`GPUs`][zeus.device.gpu.GPUs] object contains one or more [`GPU`][zeus.device.gpu.common.GPU] instances, which are specifically [`NVIDIAGPU`][zeus.device.gpu.nvidia.NVIDIAGPU] or [`AMDGPU`][zeus.device.gpu.amd.AMDGPU] objects. + +These [`GPU`][zeus.device.gpu.common.GPU] objects directly call respective `nvml` or `amdsmi` methods, providing a one-to-one mapping of methods for seamless GPU abstraction and support for multiple GPU types. For example: +- [`NVIDIAGPU.getName`][zeus.device.gpu.nvidia.NVIDIAGPU.getName] calls `pynvml.nvmlDeviceGetName`. +- [`AMDGPU.getName`][zeus.device.gpu.amd.AMDGPU.getName] calls `amdsmi.amdsmi_get_gpu_asic_info`. + +### Notes on AMD GPUs + +#### AMD GPUs Initialization +`amdsmi.amdsmi_get_energy_count` sometimes returns invalid values on certain GPUs or ROCm versions (e.g., MI100 on ROCm 6.2). See [ROCm issue #38](https://github.com/ROCm/amdsmi/issues/38) for more details. During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, we call `amdsmi.amdsmi_get_energy_count` twice for each GPU, with a 0.5-second delay between calls. The difference between the two readings is compared to the energy expected from the average power measured over the same interval to determine if `amdsmi.amdsmi_get_energy_count` is stable and reliable. 
Initialization takes 0.5 seconds regardless of the number of AMD GPUs. + +`amdsmi.amdsmi_get_power_info` provides "average_socket_power" and "current_socket_power" fields, but the "current_socket_power" field is sometimes not supported and returns "N/A." During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, this method is checked, and if "N/A" is returned, the [`AMDGPU.getInstantPowerUsage`][zeus.device.gpu.amd.AMDGPU.getInstantPowerUsage] method is disabled. Instead, [`AMDGPU.getAveragePowerUsage`][zeus.device.gpu.amd.AMDGPU.getAveragePowerUsage] needs to be used. + +#### Supported AMD SMI Versions +Only ROCm >= 6.1 is supported, as the AMDSMI APIs for power and energy return wrong values in earlier versions. For more information, see [ROCm issue #22](https://github.com/ROCm/amdsmi/issues/22). Ensure your `amdsmi` and ROCm versions are up to date. diff --git a/pyproject.toml b/pyproject.toml index 7a050e89..c6ec8722 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "pydantic", # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases. "rich", "tyro", - "httpx" + "httpx", + "amdsmi" ] dynamic = ["version"] diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py index 34299e7d..8024420c 100644 --- a/zeus/device/gpu/amd.py +++ b/zeus/device/gpu/amd.py @@ -4,12 +4,15 @@ import functools import os import contextlib +import time from typing import Sequence from functools import lru_cache try: import amdsmi # type: ignore -except ImportError: +# must catch all exceptions, since ImportError is not the only exception that can be raised (ex. OSError on version mismatch). 
+# Specific exceptions are handled when import and initialization are retested in `amdsmi_is_available` +except Exception: class MockAMDSMI: """Mock class for AMD SMI library.""" @@ -41,6 +44,18 @@ def amdsmi_is_available() -> bool: except ImportError: logger.info("amdsmi is not available.") return False + # usually thrown if amdsmi can't find libamd_smi.so + except OSError: + if os.getenv("ROCM_PATH") is None: + logger.warning("`ROCM_PATH` is not set. Do you have ROCm installed?") + return False + # usually thrown if versions of amdsmi and ROCm are incompatible. + except AttributeError: + logger.warning( + "Failed to import amdsmi. " + "Ensure amdsmi's version is at least as high as the current ROCm version." + ) + return False try: amdsmi.amdsmi_init() logger.info("amdsmi is available and initialized") @@ -71,10 +86,10 @@ def __init__(self, gpu_index: int) -> None: """Initialize the GPU object.""" super().__init__(gpu_index) self._get_handle() - # XXX(Jae-Won): Right now, the energy API's unit is broken (either the - # `power` field or the `counter_resolution` field). Before that, we're - # disabling the energy API. - self._supportsGetTotalEnergyConsumption = False + + # These values are updated in AMDGPUs constructor + self._supportsGetTotalEnergyConsumption = True + self._supportsInstantPowerUsage = True _exception_map = { 1: gpu_common.ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL @@ -225,12 +240,28 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: clk_type=amdsmi.AmdSmiClkType.GFX, ) # expects MHz + @_handle_amdsmi_errors + def getAveragePowerUsage(self) -> int: + """Return the average power draw of the GPU. Units: mW.""" + # returns in W, convert to mW + return ( + int(amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"]) + * 1000 + ) + @_handle_amdsmi_errors def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. 
Units: mW.""" + if not self._supportsInstantPowerUsage: + raise gpu_common.ZeusGPUNotSupportedError( + "Instant power usage is not supported on this AMD GPU. " + "This is because amdsmi.amdsmi_get_power_info does not return a valid 'current_socket_power'. " + "Please use `getAveragePowerUsage` instead." + ) # returns in W, convert to mW - return int( - amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000 + return ( + int(amdsmi.amdsmi_get_power_info(self.handle)["current_socket_power"]) + * 1000 ) @_handle_amdsmi_errors @@ -242,28 +273,28 @@ def getAverageMemoryPowerUsage(self) -> int: @_handle_amdsmi_errors def supportsGetTotalEnergyConsumption(self) -> bool: - """Check if the GPU supports retrieving total energy consumption.""" - if self._supportsGetTotalEnergyConsumption is None: - try: - _ = amdsmi.amdsmi_get_energy_count(self.handle) - self._supportsGetTotalEnergyConsumption = True - except amdsmi.AmdSmiLibraryException as e: - if ( - e.get_error_code() == 2 - ): # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED - self._supportsGetTotalEnergyConsumption = False - else: - raise e - + """Check if the GPU supports retrieving total energy consumption. The result is determined once, during `AMDGPUs` initialization.""" return self._supportsGetTotalEnergyConsumption @_handle_amdsmi_errors def getTotalEnergyConsumption(self) -> int: """Return the total energy consumption of the GPU since driver load. Units: mJ.""" - info = amdsmi.amdsmi_get_energy_count(self.handle) - return int( - info["power"] / 1e3 - ) # returns in micro Joules, convert to mili Joules + if not self._supportsGetTotalEnergyConsumption: + raise gpu_common.ZeusGPUNotSupportedError( + "Total energy consumption is not supported on this AMD GPU. " + "This is because the result of `amdsmi.amdsmi_get_energy_count` is not accurate. " + "Please use `getAveragePowerUsage` or `getInstantPowerUsage` to calculate energy usage." 
+ ) + energy_dict = amdsmi.amdsmi_get_energy_count(self.handle) + if "energy_accumulator" in energy_dict: # Changed since amdsmi 6.2.1 + energy = ( + energy_dict["energy_accumulator"] * energy_dict["counter_resolution"] + ) + else: + # Old API: assume has key "power". If not, exception will be handled by _handle_amdsmi_errors. + energy = energy_dict["power"] * energy_dict["counter_resolution"] + + return int(energy / 1e3) # returns in micro Joules, convert to mili Joules class AMDGPUs(gpu_common.GPUs): @@ -292,11 +323,11 @@ def __init__(self, ensure_homogeneous: bool = False) -> None: self._init_gpus() if ensure_homogeneous: self._ensure_homogeneous() - except amdsmi.AmdSmiException as e: + except amdsmi.AmdSmiLibraryException as e: exception_class = AMDGPU._exception_map.get( - e.value, gpu_common.ZeusBaseGPUError + e.get_error_code(), gpu_common.ZeusBaseGPUError ) - raise exception_class(e.msg) from e + raise exception_class(e.get_error_info()) from e @property def gpus(self) -> Sequence[AMDGPU]: @@ -318,8 +349,46 @@ def _init_gpus(self) -> None: else: visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles()))) + # create the number of visible GPUs self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices] + # set _supportsInstantPowerUsage for all GPUs + for gpu in self._gpus: + gpu._supportsInstantPowerUsage = isinstance( + amdsmi.amdsmi_get_power_info(gpu.handle)["current_socket_power"], + int, + ) # amdsmi.amdsmi_get_power_info["current_socket_power"] returns "N/A" if not supported + + # set _supportsGetTotalEnergyConsumption for all GPUs + wait_time = 0.5 # seconds + powers = [gpu.getAveragePowerUsage() for gpu in self._gpus] + initial_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus] + time.sleep(wait_time) + final_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus] + measured_energies = [ + final - initial for final, initial in zip(final_energies, initial_energies) + ] + expected_energies = [ + power * 
wait_time for power in powers + ] # energy = power * time + + for gpu, measured_energy, expected_energy in zip( + self._gpus, measured_energies, expected_energies + ): + # Loose bound to rule out very obvious counter problems + if 0.1 < measured_energy / expected_energy < 10: + gpu._supportsGetTotalEnergyConsumption = True + else: + gpu._supportsGetTotalEnergyConsumption = False + logger.info( + "Disabling `getTotalEnergyConsumption` for device %d. The result of `amdsmi.amdsmi_get_energy_count` is not accurate. Expected energy: %d mJ, Measured energy: %d mJ. " + "This is a known issue with some AMD GPUs, please see https://github.com/ROCm/amdsmi/issues/38 for more information. " + "You can still measure energy by polling either `getInstantPowerUsage` or `getAveragePowerUsage` and integrating over time.", + gpu.gpu_index, + expected_energy, + measured_energy, + ) + def __del__(self) -> None: """Shut down AMDSMI.""" with contextlib.suppress(amdsmi.AmdSmiException): diff --git a/zeus/device/gpu/common.py b/zeus/device/gpu/common.py index 87a5c28a..89db5871 100644 --- a/zeus/device/gpu/common.py +++ b/zeus/device/gpu/common.py @@ -96,6 +96,11 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: """Reset the locked GPU clocks to the default.""" pass + @abc.abstractmethod + def getAveragePowerUsage(self) -> int: + """Return the average power usage of the GPU. Units: mW.""" + pass + @abc.abstractmethod def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. Units: mW.""" diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py index 134696de..82500d11 100644 --- a/zeus/device/gpu/nvidia.py +++ b/zeus/device/gpu/nvidia.py @@ -189,6 +189,16 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None: """Reset the locked GPU clocks to the default.""" pynvml.nvmlDeviceResetGpuLockedClocks(self.handle) + @_handle_nvml_errors + def getAveragePowerUsage(self) -> int: + """Return the average power draw of the GPU. 
Units: mW.""" + metric = pynvml.nvmlDeviceGetFieldValues( + self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE] + )[0] + if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS: + raise pynvml.NVMLError(ret) + return metric.value.uiVal + @_handle_nvml_errors def getInstantPowerUsage(self) -> int: """Return the current power draw of the GPU. Units: mW."""