Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow zeusd dev and testing on MacOS #82

Merged
merged 3 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion zeusd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ name = "zeusd"

[dependencies]
actix-web = "4"
nvml-wrapper = "0.10"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
thiserror = "1"
clap = { version = "4.5.4", features = ["derive"] }
Expand All @@ -28,6 +27,9 @@ tracing-actix-web = "0.7.10"
nix = { version = "0.29", default-features = false, features = ["user"] }
paste = "1"

[target.'cfg(target_os = "linux")'.dependencies]
nvml-wrapper = "0.10"

[dev-dependencies]
once_cell = "1.7.2"
reqwest = { version = "0.11", default-features = false, features = ["json"] }
Expand Down
75 changes: 75 additions & 0 deletions zeusd/src/devices/gpu/linux.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
use nvml_wrapper::enums::device::GpuLockedClocksSetting;
use nvml_wrapper::{Device, Nvml};

use crate::devices::gpu::GpuManager;
use crate::error::ZeusdError;

/// Handle to a single GPU, backed by an NVML `Device`.
#[cfg(target_os = "linux")]
pub struct NvmlGpu<'n> {
// Reference to the NVML instance; `init` obtains it by leaking a `Box` so that it outlives `device`.
_nvml: &'static Nvml,
// Device handle that the `GpuManager` methods of this type forward to.
device: Device<'n>,
}

#[cfg(target_os = "linux")]
impl NvmlGpu<'static> {
pub fn init(index: u32) -> Result<Self, ZeusdError> {
// `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
// We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
// `Nvml` will actually live until the server terminates inside the GPU management task.
let _nvml = Box::leak(Box::new(Nvml::init()?));
let device = _nvml.device_by_index(index)?;
Ok(Self { _nvml, device })
}
}

#[cfg(target_os = "linux")]
impl GpuManager for NvmlGpu<'static> {
fn device_count() -> Result<u32, ZeusdError> {
let nvml = Nvml::init()?;
Ok(nvml.device_count()?)
}

#[inline]
fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
Ok(self.device.set_persistent(enabled)?)
}

#[inline]
fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
Ok(self.device.set_power_management_limit(power_limit_mw)?)
}

#[inline]
fn set_gpu_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
let setting = GpuLockedClocksSetting::Numeric {
min_clock_mhz,
max_clock_mhz,
};
Ok(self.device.set_gpu_locked_clocks(setting)?)
}

#[inline]
fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_gpu_locked_clocks()?)
}

#[inline]
fn set_mem_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
Ok(self
.device
.set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
}

#[inline]
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_mem_locked_clocks()?)
}
}
48 changes: 48 additions & 0 deletions zeusd/src/devices/gpu/macos.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use crate::devices::gpu::GpuManager;
use crate::error::ZeusdError;

/// Stand-in GPU handle for macOS builds. It holds no state and talks to no hardware;
/// every `GpuManager` method implemented for it in this file simply reports success.
pub struct NvmlGpu;

impl NvmlGpu {
pub fn init(_index: u32) -> Result<Self, ZeusdError> {
Ok(Self)
}
}

impl GpuManager for NvmlGpu {
    /// Reports a fixed count of one device.
    fn device_count() -> Result<u32, ZeusdError> {
        // The stub always pretends exactly one device is present.
        const STUB_DEVICE_COUNT: u32 = 1;
        Ok(STUB_DEVICE_COUNT)
    }

    /// No-op: ignores `_enabled` and reports success.
    fn set_persistent_mode(&mut self, _enabled: bool) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores `_power_limit_mw` and reports success.
    fn set_power_management_limit(&mut self, _power_limit_mw: u32) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores both clock bounds and reports success.
    fn set_gpu_locked_clocks(
        &mut self,
        _min_clock_mhz: u32,
        _max_clock_mhz: u32,
    ) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: reports success.
    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores both clock bounds and reports success.
    fn set_mem_locked_clocks(
        &mut self,
        _min_clock_mhz: u32,
        _max_clock_mhz: u32,
    ) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: reports success.
    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
        Ok(())
    }
}
82 changes: 12 additions & 70 deletions zeusd/src/devices/gpu.rs → zeusd/src/devices/gpu/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
//! GPU management module that interfaces with NVML

use std::time::Instant;
#[cfg(target_os = "linux")]
mod linux;

#[cfg(target_os = "linux")]
pub use linux::NvmlGpu;

#[cfg(target_os = "macos")]
mod macos;

use nvml_wrapper::enums::device::GpuLockedClocksSetting;
use nvml_wrapper::{Device, Nvml};
#[cfg(target_os = "macos")]
pub use macos::NvmlGpu;

use std::time::Instant;
use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender};
use tracing::Span;

Expand Down Expand Up @@ -33,73 +42,6 @@ pub trait GpuManager {
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>;
}

/// Handle to a single GPU, backed by an NVML `Device`.
pub struct NvmlGpu<'n> {
// Reference to the NVML instance; `init` obtains it by leaking a `Box` so that it outlives `device`.
_nvml: &'static Nvml,
// Device handle that the `GpuManager` methods of this type forward to.
device: Device<'n>,
}

impl NvmlGpu<'static> {
pub fn init(index: u32) -> Result<Self, ZeusdError> {
// `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
// We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
// `Nvml` will actually live until the server terminates inside the GPU management task.
let _nvml = Box::leak(Box::new(Nvml::init()?));
let device = _nvml.device_by_index(index)?;
Ok(Self { _nvml, device })
}
}

impl GpuManager for NvmlGpu<'static> {
fn device_count() -> Result<u32, ZeusdError> {
let nvml = Nvml::init()?;
Ok(nvml.device_count()?)
}

#[inline]
fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
Ok(self.device.set_persistent(enabled)?)
}

#[inline]
fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
Ok(self.device.set_power_management_limit(power_limit_mw)?)
}

#[inline]
fn set_gpu_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
let setting = GpuLockedClocksSetting::Numeric {
min_clock_mhz,
max_clock_mhz,
};
Ok(self.device.set_gpu_locked_clocks(setting)?)
}

#[inline]
fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_gpu_locked_clocks()?)
}

#[inline]
fn set_mem_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
Ok(self
.device
.set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
}

#[inline]
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_mem_locked_clocks()?)
}
}

/// A request to execute a GPU command.
///
/// This is the type that is sent to the GPU management background task.
Expand Down
Loading