Skip to content

Commit

Permalink
Update CI, fix GPU index error in zeusd
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung committed May 29, 2024
1 parent 2d1d4e8 commit 889c034
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy_homepage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ env:
jobs:
deploy:
runs-on: ubuntu-latest
if: github.event.repository.fork == false
if: github.repository_owner == 'ml-energy'
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand Down
24 changes: 24 additions & 0 deletions .github/workflows/publish_crates_io.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: Release

on:
push:
tags:
- zeusd-v*

jobs:
cargo-publish:
if: github.repository_owner == 'ml-energy'
runs-on: ubuntu-latest
env:
CARGO_TERM_COLOR: always
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
sparse-checkout: zeusd
- name: Publish to crates.io
uses: katyo/publish-crates@v2
with:
path: zeusd
registry-token: ${{ secrets.CRATES_IO_TOKEN }}
check-repo: false
4 changes: 2 additions & 2 deletions .github/workflows/publish_pypi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ name: Publish Python package to PyPI
on:
push:
tags:
- v*
- zeus-v*

jobs:
publish:
runs-on: ubuntu-latest
if: github.event.repository.fork == false
if: github.repository_owner == 'ml-energy'
steps:
- name: Checkout repository
uses: actions/checkout@v3
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/push_docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches:
- master
tags:
- v*
- zeus-v*
paths:
- '.github/workflows/push_docker.yaml'
- 'capriccio/**'
Expand All @@ -21,6 +21,7 @@ on:

jobs:
build_and_push:
if: github.repository_owner == 'ml-energy'
runs-on: ubuntu-latest
steps:
- name: Remove unnecessary files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:

# Jobs initiated by previous pushes get cancelled by a new push.
concurrency:
group: ${{ github.ref }}-lint-and-test
group: ${{ github.ref }}-zeus-lint-and-test
cancel-in-progress: true

jobs:
Expand Down
4 changes: 2 additions & 2 deletions zeusd/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions zeusd/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "zeus"
version = "0.1.0"
name = "zeusd"
version = "0.1.1"
authors = ["Jae-Won Chung <[email protected]>"]
description = "Zeus daemon"
license = "Apache-2.0"
Expand Down
6 changes: 5 additions & 1 deletion zeusd/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Now maintained at [`zeusd`](https://crates.io/crates/zeusd)

Please contact @jaywonchung if you'd like to take ownership of this handle.

# Zeus daemon (`zeusd`)

`zeusd` is a daemon designed to run with admin privileges and expose API endpoints that wrap privileged NVML methods.
Expand All @@ -18,7 +22,7 @@ To make this as low latency as possible, `zeusd` was written with Rust.
To install `zeusd`:

```sh
cargo install zeus
cargo install zeusd
```

With the following, `zeusd` will listen to a unix domain socket at `/var/run/zeusd.sock`, which is writable to anyone (since file permission is 666).
Expand Down
2 changes: 1 addition & 1 deletion zeusd/src/devices/gpu/macos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ impl NvmlGpu {

impl GpuManager for NvmlGpu {
fn device_count() -> Result<u32, ZeusdError> {
Ok(1)
Ok(0)
}

fn set_persistent_mode(&mut self, _enabled: bool) -> Result<(), ZeusdError> {
Expand Down
6 changes: 6 additions & 0 deletions zeusd/src/devices/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ impl GpuManagementTasks {
command: GpuCommand,
request_start_time: Instant,
) -> Result<(), ZeusdError> {
if gpu_id >= self.senders.len() {
return Err(ZeusdError::GpuNotFoundError(gpu_id));
}
Expand All @@ -114,6 +117,9 @@ impl GpuManagementTasks {
command: GpuCommand,
request_start_time: Instant,
) -> Result<(), ZeusdError> {
if gpu_id >= self.senders.len() {
return Err(ZeusdError::GpuNotFoundError(gpu_id));
}
let (tx, mut rx) = tokio::sync::mpsc::channel(1);
self.senders[gpu_id]
.send((command, Some(tx), request_start_time, Span::current()))
Expand Down
2 changes: 1 addition & 1 deletion zeusd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async fn main() -> anyhow::Result<()> {
}
ConnectionMode::TCP => {
let listener = TcpListener::bind(&config.tcp_bind_address)?;
tracing::info!("Listening on {}", &config.tcp_bind_address);
tracing::info!("Listening on {}", &listener.local_addr()?);

start_server_tcp(listener, device_tasks, num_workers)?.await?;
}
Expand Down
54 changes: 54 additions & 0 deletions zeusd/tests/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,20 @@ async fn test_set_persistent_mode_invalid() {
.await
.expect("Failed to read response")
.contains("invalid type"));

let url = SetPersistentMode::build_url(&app, 5); // Invalid GPU ID
let resp = client
.post(url)
.json(&serde_json::json!(
{
"enabled": true,
"block": true
}
))
.send()
.await
.expect("Failed to send request");
assert_eq!(resp.status(), 400);
}

#[tokio::test]
Expand Down Expand Up @@ -279,6 +293,20 @@ async fn test_set_power_limit_invalid() {
.await
.expect("Failed to read response")
.contains("missing field"));

let url = SetPowerLimit::build_url(&app, 5); // Invalid GPU ID
let resp = client
.post(url)
.json(&serde_json::json!(
{
"power_limit_mw": 100_000,
"block": true
}
))
.send()
.await
.expect("Failed to send request");
assert_eq!(resp.status(), 400);
}

#[tokio::test]
Expand Down Expand Up @@ -466,6 +494,19 @@ async fn test_gpu_locked_clocks_invalid() {
.await
.expect("Failed to read response")
.contains("missing field"));

let url = ResetGpuLockedClocks::build_url(&app, 5); // Invalid GPU ID
let resp = client
.post(url)
.json(&serde_json::json!(
{
"block": true
}
))
.send()
.await
.expect("Failed to send request");
assert_eq!(resp.status(), 400);
}

#[tokio::test]
Expand Down Expand Up @@ -655,6 +696,19 @@ async fn test_mem_locked_clocks_invalid() {
.await
.expect("Failed to read response")
.contains("missing field"));

let url = ResetMemLockedClocks::build_url(&app, 5); // Invalid GPU ID
let resp = client
.post(url)
.json(&serde_json::json!(
{
"block": true
}
))
.send()
.await
.expect("Failed to send request");
assert_eq!(resp.status(), 400);
}

#[tokio::test]
Expand Down

0 comments on commit 889c034

Please sign in to comment.