diff --git a/zeusd/Cargo.toml b/zeusd/Cargo.toml
index 7ab93151..c4451c87 100644
--- a/zeusd/Cargo.toml
+++ b/zeusd/Cargo.toml
@@ -15,7 +15,6 @@ name = "zeusd"
 
 [dependencies]
 actix-web = "4"
-nvml-wrapper = "0.10"
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
 thiserror = "1"
 clap = { version = "4.5.4", features = ["derive"] }
@@ -28,6 +27,9 @@ tracing-actix-web = "0.7.10"
 nix = { version = "0.29", default-features = false, features = ["user"] }
 paste = "1"
 
+[target.'cfg(target_os = "linux")'.dependencies]
+nvml-wrapper = "0.10"
+
 [dev-dependencies]
 once_cell = "1.7.2"
 reqwest = { version = "0.11", default-features = false, features = ["json"] }
diff --git a/zeusd/src/devices/gpu/linux.rs b/zeusd/src/devices/gpu/linux.rs
new file mode 100644
index 00000000..1a879763
--- /dev/null
+++ b/zeusd/src/devices/gpu/linux.rs
@@ -0,0 +1,75 @@
+use nvml_wrapper::enums::device::GpuLockedClocksSetting;
+use nvml_wrapper::{Device, Nvml};
+
+use crate::devices::gpu::GpuManager;
+use crate::error::ZeusdError;
+
+#[cfg(target_os = "linux")]
+pub struct NvmlGpu<'n> {
+    _nvml: &'static Nvml,
+    device: Device<'n>,
+}
+
+#[cfg(target_os = "linux")]
+impl NvmlGpu<'static> {
+    pub fn init(index: u32) -> Result<Self, ZeusdError> {
+        // `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
+        // We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
+        // `Nvml` will actually live until the server terminates inside the GPU management task.
+        let _nvml = Box::leak(Box::new(Nvml::init()?));
+        let device = _nvml.device_by_index(index)?;
+        Ok(Self { _nvml, device })
+    }
+}
+
+#[cfg(target_os = "linux")]
+impl GpuManager for NvmlGpu<'static> {
+    fn device_count() -> Result<u32, ZeusdError> {
+        let nvml = Nvml::init()?;
+        Ok(nvml.device_count()?)
+    }
+
+    #[inline]
+    fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
+        Ok(self.device.set_persistent(enabled)?)
+    }
+
+    #[inline]
+    fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
+        Ok(self.device.set_power_management_limit(power_limit_mw)?)
+    }
+
+    #[inline]
+    fn set_gpu_locked_clocks(
+        &mut self,
+        min_clock_mhz: u32,
+        max_clock_mhz: u32,
+    ) -> Result<(), ZeusdError> {
+        let setting = GpuLockedClocksSetting::Numeric {
+            min_clock_mhz,
+            max_clock_mhz,
+        };
+        Ok(self.device.set_gpu_locked_clocks(setting)?)
+    }
+
+    #[inline]
+    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
+        Ok(self.device.reset_gpu_locked_clocks()?)
+    }
+
+    #[inline]
+    fn set_mem_locked_clocks(
+        &mut self,
+        min_clock_mhz: u32,
+        max_clock_mhz: u32,
+    ) -> Result<(), ZeusdError> {
+        Ok(self
+            .device
+            .set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
+    }
+
+    #[inline]
+    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
+        Ok(self.device.reset_mem_locked_clocks()?)
+    }
+}
diff --git a/zeusd/src/devices/gpu/macos.rs b/zeusd/src/devices/gpu/macos.rs
new file mode 100644
index 00000000..4bf98650
--- /dev/null
+++ b/zeusd/src/devices/gpu/macos.rs
@@ -0,0 +1,48 @@
+use crate::devices::gpu::GpuManager;
+use crate::error::ZeusdError;
+
+pub struct NvmlGpu;
+
+impl NvmlGpu {
+    pub fn init(_index: u32) -> Result<Self, ZeusdError> {
+        Ok(Self)
+    }
+}
+
+impl GpuManager for NvmlGpu {
+    fn device_count() -> Result<u32, ZeusdError> {
+        Ok(1)
+    }
+
+    fn set_persistent_mode(&mut self, _enabled: bool) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+
+    fn set_power_management_limit(&mut self, _power_limit_mw: u32) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+
+    fn set_gpu_locked_clocks(
+        &mut self,
+        _min_clock_mhz: u32,
+        _max_clock_mhz: u32,
+    ) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+
+    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+
+    fn set_mem_locked_clocks(
+        &mut self,
+        _min_clock_mhz: u32,
+        _max_clock_mhz: u32,
+    ) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+
+    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
+        Ok(())
+    }
+}
diff --git a/zeusd/src/devices/gpu.rs b/zeusd/src/devices/gpu/mod.rs
similarity index 82%
rename from zeusd/src/devices/gpu.rs
rename to zeusd/src/devices/gpu/mod.rs
index 9ee77a35..4745f2e3 100644
--- a/zeusd/src/devices/gpu.rs
+++ b/zeusd/src/devices/gpu/mod.rs
@@ -1,9 +1,18 @@
 //! GPU management module that interfaces with NVML
 
-use std::time::Instant;
+#[cfg(target_os = "linux")]
+mod linux;
+
+#[cfg(target_os = "linux")]
+pub use linux::NvmlGpu;
+
+#[cfg(target_os = "macos")]
+mod macos;
 
-use nvml_wrapper::enums::device::GpuLockedClocksSetting;
-use nvml_wrapper::{Device, Nvml};
+#[cfg(target_os = "macos")]
+pub use macos::NvmlGpu;
+
+use std::time::Instant;
 use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender};
 use tracing::Span;
 
@@ -33,73 +42,6 @@ pub trait GpuManager {
     fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>;
 }
 
-pub struct NvmlGpu<'n> {
-    _nvml: &'static Nvml,
-    device: Device<'n>,
-}
-
-impl NvmlGpu<'static> {
-    pub fn init(index: u32) -> Result<Self, ZeusdError> {
-        // `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
-        // We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
-        // `Nvml` will actually live until the server terminates inside the GPU management task.
-        let _nvml = Box::leak(Box::new(Nvml::init()?));
-        let device = _nvml.device_by_index(index)?;
-        Ok(Self { _nvml, device })
-    }
-}
-
-impl GpuManager for NvmlGpu<'static> {
-    fn device_count() -> Result<u32, ZeusdError> {
-        let nvml = Nvml::init()?;
-        Ok(nvml.device_count()?)
-    }
-
-    #[inline]
-    fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
-        Ok(self.device.set_persistent(enabled)?)
-    }
-
-    #[inline]
-    fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
-        Ok(self.device.set_power_management_limit(power_limit_mw)?)
-    }
-
-    #[inline]
-    fn set_gpu_locked_clocks(
-        &mut self,
-        min_clock_mhz: u32,
-        max_clock_mhz: u32,
-    ) -> Result<(), ZeusdError> {
-        let setting = GpuLockedClocksSetting::Numeric {
-            min_clock_mhz,
-            max_clock_mhz,
-        };
-        Ok(self.device.set_gpu_locked_clocks(setting)?)
-    }
-
-    #[inline]
-    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
-        Ok(self.device.reset_gpu_locked_clocks()?)
-    }
-
-    #[inline]
-    fn set_mem_locked_clocks(
-        &mut self,
-        min_clock_mhz: u32,
-        max_clock_mhz: u32,
-    ) -> Result<(), ZeusdError> {
-        Ok(self
-            .device
-            .set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
-    }
-
-    #[inline]
-    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
-        Ok(self.device.reset_mem_locked_clocks()?)
-    }
-}
-
 /// A request to execute a GPU command.
 ///
 /// This is the type that is sent to the GPU management background task.