Skip to content

Commit

Permalink
Zeusd doc comments, debugging outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung committed May 30, 2024
1 parent c5d8dad commit 07a3539
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 2 deletions.
9 changes: 9 additions & 0 deletions zeus/device/gpu/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def setPowerManagementLimit(self, power_limit_mw: int, block: bool = True) -> No
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set power management limit: {resp.text}")
logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

@_handle_nvml_errors
def resetPowerManagementLimit(self, block: bool = True) -> None:
Expand All @@ -273,6 +274,9 @@ def setPersistenceMode(self, enabled: bool, block: bool = False) -> None:
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
logger.debug(
"Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000
)

def setMemoryLockedClocks(
self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
Expand All @@ -286,6 +290,9 @@ def setMemoryLockedClocks(
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
logger.debug(
"Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000
)

def resetMemoryLockedClocks(self, block: bool = True) -> None:
"""Reset the locked memory clocks to the default."""
Expand Down Expand Up @@ -369,6 +376,8 @@ def _init_gpus(self) -> None:
raise ZeusdError(
f"ZEUSD_SOCK_PATH points to non-existent file: {sock_path}"
)
if not Path(sock_path).is_socket():
raise ZeusdError(f"ZEUSD_SOCK_PATH is not a socket: {sock_path}")
if not os.access(sock_path, os.W_OK):
raise ZeusdError(f"ZEUSD_SOCK_PATH is not writable: {sock_path}")
self._gpus = [
Expand Down
13 changes: 11 additions & 2 deletions zeusd/src/devices/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,31 +27,40 @@ use crate::error::ZeusdError;
/// This trait can be used to abstract over different GPU management libraries.
/// Currently, its main purpose is to facilitate testing.
pub trait GpuManager {
    /// Report how many GPUs are visible in the node.
    ///
    /// This is an associated function (no `self` receiver); the `Self: Sized`
    /// bound means it is not callable through a trait object.
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn device_count() -> Result<u32, ZeusdError>
    where
        Self: Sized;

    /// Turn the GPU's persistence mode on (`true`) or off (`false`).
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn set_persistence_mode(&mut self, enabled: bool) -> Result<(), ZeusdError>;

    /// Apply a new power management limit, given in milliwatts.
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn set_power_management_limit(&mut self, power_limit: u32) -> Result<(), ZeusdError>;

    /// Lock the GPU's clock to the range `min_clock_mhz..=max_clock_mhz` (MHz).
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn set_gpu_locked_clocks(
        &mut self,
        min_clock_mhz: u32,
        max_clock_mhz: u32,
    ) -> Result<(), ZeusdError>;

    /// Undo any locked clock range previously set on the GPU.
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError>;

    /// Lock the memory clock to the range `min_clock_mhz..=max_clock_mhz` (MHz).
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn set_mem_locked_clocks(
        &mut self,
        min_clock_mhz: u32,
        max_clock_mhz: u32,
    ) -> Result<(), ZeusdError>;

    /// Undo any locked memory clock range previously set.
    ///
    /// # Errors
    ///
    /// Failures are reported as a `ZeusdError`.
    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>;
}

/// A request to execute a GPU command.
///
/// This is the type that is sent to the GPU management background task.
/// The optional `Sender` is used to send a response back to the caller if the
/// user wanted to block until the command is executed.
/// The `Span` is used to propagate tracing context starting from the request.
/// user wanted to block until the command is done executing.
/// The `Instant` object is when the request was received by the server.
/// It's used to log how long it took until the command was executed on the GPU.
/// The `Span` object is used to propagate tracing context starting from the request.
pub type GpuCommandRequest = (
GpuCommand,
Option<Sender<Result<(), ZeusdError>>>,
Expand Down
2 changes: 2 additions & 0 deletions zeusd/src/devices/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
//! Interfaces for interacting with devices
pub mod gpu;
1 change: 1 addition & 0 deletions zeusd/src/routes/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ impl_handler_for_gpu_command!(
post("/{gpu_id}/reset_mem_locked_clocks"),
);

/// Register GPU routes with the Actix web server.
pub fn gpu_routes(cfg: &mut web::ServiceConfig) {
cfg.service(set_persistence_mode_handler)
.service(set_power_limit_handler)
Expand Down
2 changes: 2 additions & 0 deletions zeusd/src/routes/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Routes and handlers for interacting with devices
pub mod gpu;

pub use gpu::gpu_routes;

0 comments on commit 07a3539

Please sign in to comment.