From 07a353958113a2ee2ff74121fb4454b0d6e365b1 Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Thu, 30 May 2024 00:42:09 -0400 Subject: [PATCH] Zeusd doc comments, debugging outputs --- zeus/device/gpu/nvidia.py | 9 +++++++++ zeusd/src/devices/gpu/mod.rs | 13 +++++++++++-- zeusd/src/devices/mod.rs | 2 ++ zeusd/src/routes/gpu.rs | 1 + zeusd/src/routes/mod.rs | 2 ++ 5 files changed, 25 insertions(+), 2 deletions(-) diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py index 3301ec91..b87ef66c 100644 --- a/zeus/device/gpu/nvidia.py +++ b/zeus/device/gpu/nvidia.py @@ -256,6 +256,7 @@ def setPowerManagementLimit(self, power_limit_mw: int, block: bool = True) -> No ) if resp.status_code != 200: raise ZeusdError(f"Failed to set power management limit: {resp.text}") + logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000) @_handle_nvml_errors def resetPowerManagementLimit(self, block: bool = True) -> None: @@ -273,6 +274,9 @@ def setPersistenceMode(self, enabled: bool, block: bool = False) -> None: ) if resp.status_code != 200: raise ZeusdError(f"Failed to set persistence mode: {resp.text}") + logger.debug( + "Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000 + ) def setMemoryLockedClocks( self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True @@ -286,6 +290,9 @@ def setMemoryLockedClocks( ) if resp.status_code != 200: raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}") + logger.debug( + "Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000 + ) def resetMemoryLockedClocks(self, block: bool = True) -> None: """Reset the locked memory clocks to the default.""" @@ -369,6 +376,8 @@ def _init_gpus(self) -> None: raise ZeusdError( f"ZEUSD_SOCK_PATH points to non-existent file: {sock_path}" ) + if not Path(sock_path).is_socket(): + raise ZeusdError(f"ZEUSD_SOCK_PATH is not a socket: {sock_path}") if not os.access(sock_path, os.W_OK): raise ZeusdError(f"ZEUSD_SOCK_PATH is not writable: {sock_path}") self._gpus = [ diff --git a/zeusd/src/devices/gpu/mod.rs b/zeusd/src/devices/gpu/mod.rs index 7c8a4933..281f54ba 100644 --- a/zeusd/src/devices/gpu/mod.rs +++ b/zeusd/src/devices/gpu/mod.rs @@ -27,22 +27,29 @@ use crate::error::ZeusdError; /// This trait can be used to abstract over different GPU management libraries. /// Currently, this was done to facilitate testing. pub trait GpuManager { + /// Get the number of GPUs visible in the node. fn device_count() -> Result where Self: Sized; + /// Set the persistence mode of the GPU. fn set_persistence_mode(&mut self, enabled: bool) -> Result<(), ZeusdError>; + /// Set the power management limit in milliwatts. fn set_power_management_limit(&mut self, power_limit: u32) -> Result<(), ZeusdError>; + /// Set the GPU's locked clock range in MHz. fn set_gpu_locked_clocks( &mut self, min_clock_mhz: u32, max_clock_mhz: u32, ) -> Result<(), ZeusdError>; + /// Reset the GPU's locked clocks. fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError>; + /// Set the memory locked clock range in MHz. fn set_mem_locked_clocks( &mut self, min_clock_mhz: u32, max_clock_mhz: u32, ) -> Result<(), ZeusdError>; + /// Reset the memory locked clocks. fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>; } @@ -50,8 +57,10 @@ pub trait GpuManager { /// /// This is the type that is sent to the GPU management background task. /// The optional `Sender` is used to send a response back to the caller if the -/// user wanted to block until the command is executed. -/// The `Span` is used to propagate tracing context starting from the request. +/// user wanted to block until the command is done executing. +/// The `Instant` object is when the request was received by the server. +/// It's used to log how long it took until the command was executed on the GPU. +/// The `Span` object is used to propagate tracing context starting from the request. pub type GpuCommandRequest = ( GpuCommand, Option>>, diff --git a/zeusd/src/devices/mod.rs b/zeusd/src/devices/mod.rs index 8ed41297..eaeb673a 100644 --- a/zeusd/src/devices/mod.rs +++ b/zeusd/src/devices/mod.rs @@ -1 +1,3 @@ +//! Interfaces for interacting with devices + pub mod gpu; diff --git a/zeusd/src/routes/gpu.rs b/zeusd/src/routes/gpu.rs index 314f1f19..b0d5bf75 100644 --- a/zeusd/src/routes/gpu.rs +++ b/zeusd/src/routes/gpu.rs @@ -115,6 +115,7 @@ impl_handler_for_gpu_command!( post("/{gpu_id}/reset_mem_locked_clocks"), ); +/// Register GPU routes with the Actix web server. pub fn gpu_routes(cfg: &mut web::ServiceConfig) { cfg.service(set_persistence_mode_handler) .service(set_power_limit_handler) diff --git a/zeusd/src/routes/mod.rs b/zeusd/src/routes/mod.rs index 67783626..fae1350b 100644 --- a/zeusd/src/routes/mod.rs +++ b/zeusd/src/routes/mod.rs @@ -1,3 +1,5 @@ +//! Routes and handlers for interacting with devices + pub mod gpu; pub use gpu::gpu_routes;