Skip to content

Commit

Permalink
The changes include:
Browse files Browse the repository at this point in the history
  configuration to work on Jetson

  Scaphandre-specific
    Getting the process ID (PID) of a container and its target server
    Using a single listener thread approach to find child thread PIDs

  tegrastats invocation for energy monitoring

  GPU functions
    Avoiding the use of nvidia-smi on Tegra
    Adding Nvidia runtime flags for container invocation
    Creating a new folder for Jetson functions
  • Loading branch information
Abdul Rehman authored and abrehman94 committed Dec 8, 2023
1 parent e6545ed commit 4319bfc
Show file tree
Hide file tree
Showing 135 changed files with 9,082 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/Ilúvatar/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ format:
format-check:
@cargo fmt --all -- --check
clippy:
@cargo clippy -- -Dclippy::suspicious -Dclippy::correctness -Dclippy::perf -Aclippy::single_match -Aclippy::new_without_default -Aclippy::too_many_arguments -Aclippy::type-complexity -Dclippy::from_over_into -Aclippy::redundant-field-names -Dwarnings
@cargo clippy --fix -- -Dclippy::suspicious -Dclippy::correctness -Dclippy::perf -Aclippy::single_match -Aclippy::new_without_default -Aclippy::too_many_arguments -Aclippy::type-complexity -Dclippy::from_over_into -Aclippy::redundant-field-names -Dwarnings
4 changes: 4 additions & 0 deletions src/Ilúvatar/ansible/worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@
# Energy config
"ILUVATAR_WORKER__energy__rapl_freq_ms" : "{{ worker_rapl_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__perf_freq_ms" : "{{ worker_perf_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__tegra_freq_ms" : "{{ worker_tegra_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__process_freq_ms" : "{{ worker_process_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__kernel_cpu_frequencies_freq_ms" : "{{ worker_kernel_cpu_frequencies_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__hardware_cpu_frequencies_freq_ms" : "{{ worker_hardware_cpu_frequencies_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__ipmi_freq_ms" : "{{ worker_ipmi_log_freq_ms | default(0) }}"
"ILUVATAR_WORKER__energy__ipmi_pass_file" : "{{ worker_ipmi_pass_file | default('') }}"
"ILUVATAR_WORKER__energy__ipmi_ip_addr" : "{{ worker_ipmi_ip_addr | default(servers[ansible_host].ipmi_ip) }}"
Expand Down
6 changes: 6 additions & 0 deletions src/Ilúvatar/iluvatar_library/src/energy/energy_layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ pub struct DataExtractorVisitor {
pub fqdn: Option<String>,
}

impl Default for DataExtractorVisitor {
fn default() -> Self {
Self::new()
}
}

impl DataExtractorVisitor {
pub fn new() -> Self {
DataExtractorVisitor {
Expand Down
32 changes: 28 additions & 4 deletions src/Ilúvatar/iluvatar_library/src/energy/energy_logging.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use super::{ipmi::IPMIMonitor, process_pct::ProcessMonitor, rapl::RaplMonitor};
use crate::{
cpu_interaction::CpuFreqMonitor,
energy::{perf::start_perf_stat, EnergyConfig},
energy::{perf::start_perf_stat, tegrastats::start_tegrastats, EnergyConfig},
transaction::TransactionId,
};
use anyhow::Result;
Expand All @@ -17,13 +17,14 @@ pub struct EnergyLogger {
ipmi: Option<Arc<IPMIMonitor>>,
proc: Option<Arc<ProcessMonitor>>,
_perf_child: Option<std::process::Child>,
_tegra_child: Option<std::process::Child>,
cpu: Option<Arc<CpuFreqMonitor>>,
config: Option<Arc<EnergyConfig>>,
}

impl EnergyLogger {
pub async fn boxed(config: Option<&Arc<EnergyConfig>>, tid: &TransactionId) -> Result<Arc<Self>> {
let (perf_child, ipmi, rapl, proc, cpu) = match config {
let (perf_child, tegra_child, ipmi, rapl, proc, cpu) = match config {
Some(config) => {
let perf_child = match config.perf_enabled() {
true => {
Expand All @@ -47,6 +48,28 @@ impl EnergyLogger {
false => None,
};

let tegra_child = match config.tegra_enabled() {
true => {
let tegra_file = Path::new(&config.log_folder);
let tegra_file = tegra_file.join("tegrastats.log");
debug!(tid=%tid, "Starting tegra monitoring");
let f = match tegra_file.to_str() {
Some(f) => f,
None => {
anyhow::bail!(
"Failed to start tegra because the log file could not be formatted properly"
);
}
};
if let Some(ms) = config.tegra_freq_ms {
Some(start_tegrastats(&f, tid, ms).await?)
} else {
None
}
}
false => None,
};

let ipmi = match config.ipmi_enabled() {
true => {
debug!(tid=%tid, "Starting IPMI energy monitoring");
Expand Down Expand Up @@ -82,16 +105,17 @@ impl EnergyLogger {
}
false => None,
};
(perf_child, ipmi, rapl, proc, cpu_mon)
(perf_child, tegra_child, ipmi, rapl, proc, cpu_mon)
}
None => (None, None, None, None, None),
None => (None, None, None, None, None, None),
};

Ok(Arc::new(EnergyLogger {
rapl,
ipmi,
proc,
_perf_child: perf_child,
_tegra_child: tegra_child,
cpu,
config: config.cloned(),
}))
Expand Down
12 changes: 12 additions & 0 deletions src/Ilúvatar/iluvatar_library/src/energy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod ipmi;
pub mod perf;
pub mod process_pct;
pub mod rapl;
pub mod tegrastats;

#[derive(Debug, serde::Deserialize, clap::Parser)]
#[clap(author, version, about)]
Expand All @@ -28,6 +29,14 @@ pub struct EnergyConfig {
#[clap(long, action)]
pub perf_freq_ms: Option<u64>,

/// Log energy usage as monitored via `tegrastats`.
/// If 0 then tegrastats logging is disabled.
/// Currently tegrastats is not killed on worker shutdown; it must be killed manually and externally.
/// It is also hard to guarantee that tegrastats will be removed, as the mode of the main process exiting can vary.
/// Executing `kill -9 $(ps -ax | grep tegra | awk '{print $1}')` on the host should work.
#[clap(long, action)]
pub tegra_freq_ms: Option<u64>,

/// Log instantaneous cpu utilization of this process.
/// If 0 then logging is disabled.
#[clap(long, action)]
Expand Down Expand Up @@ -67,6 +76,9 @@ impl EnergyConfig {
/// Reports whether `perf` logging is switched on, as decided by `Self::enabled`
/// applied to the configured `perf_freq_ms`.
pub fn perf_enabled(&self) -> bool {
    let freq = &self.perf_freq_ms;
    Self::enabled(freq)
}
/// Reports whether `tegrastats` logging is switched on, as decided by `Self::enabled`
/// applied to the configured `tegra_freq_ms`.
pub fn tegra_enabled(&self) -> bool {
    let freq = &self.tegra_freq_ms;
    Self::enabled(freq)
}
/// Reports whether RAPL logging is switched on, as decided by `Self::enabled`
/// applied to the configured `rapl_freq_ms`.
pub fn rapl_enabled(&self) -> bool {
    let freq = &self.rapl_freq_ms;
    Self::enabled(freq)
}
Expand Down
65 changes: 65 additions & 0 deletions src/Ilúvatar/iluvatar_library/src/energy/tegrastats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use crate::{bail_error, transaction::TransactionId, utils::execute_cmd_nonblocking};
use anyhow::Result;
use std::{
process::Child,
time::{Duration, SystemTime},
};
use tracing::{info, warn};

/// Start tegrastat tracking of several metrics:
/// tegrastats --interval 1000 --logfile temp.log
/*
Usage: tegrastats [-option]
Options:
--help : print this help screen
--interval <millisec> : sample the information in <milliseconds>
--logfile <filename> : dump the output of tegrastats to <filename>
--load_cfg <filename> : load the information from <filename>
--readall : collect all stats including performance intensive stats
--save_cfg <filename> : save the information to <filename>
--start : run tegrastats as a daemon process in the background
--stop : stop any running instances of tegrastats
--verbose : print verbose message
*/
pub async fn start_tegrastats<S>(outfile: &S, tid: &TransactionId, stat_duration_ms: u64) -> Result<Child>
where
S: AsRef<str> + ?Sized + std::fmt::Display,
{
let st = stat_duration_ms.to_string();
let args = vec!["--interval", &st.as_str(), "--logfile", outfile.as_ref()];
test_args(tid, &args).await?;
info!(tid=%tid, tegrastats=?args, "tegrastat arguments");
execute_cmd_nonblocking("/usr/bin/tegrastats", &args, None, tid)
}

async fn test_args(tid: &TransactionId, args: &Vec<&str>) -> Result<bool> {
let mut child = execute_cmd_nonblocking("/usr/bin/tegrastats", args, None, tid)?;
let start = SystemTime::now();

let timeout = Duration::from_secs(1);
while start.elapsed()? < timeout {
match child.try_wait() {
Ok(exit) => match exit {
// an exit means the metric doesn't exist
Some(_) => return Ok(false),
None => {
// didn't exit yet
tokio::time::sleep(Duration::from_millis(5)).await;
continue;
}
},
Err(e) => {
warn!(tid=%tid, error=%e, "Checking if `{:?}` args existed encountered an error", args);
return Ok(false);
}
};
}
// probably would have errored out after a second
// safe to assume metric exists
match child.kill() {
Ok(_) => (),
Err(e) => bail_error!(tid=%tid, error=%e, "Failed to kill perf child when testing args"),
};
Ok(true)
}
6 changes: 6 additions & 0 deletions src/Ilúvatar/iluvatar_library/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ pub struct FunctionInvocationTimings {
/// cold invocation latency time recorded on worker
pub cold_invoke_duration_us: Vec<u128>,
}
impl Default for FunctionInvocationTimings {
fn default() -> Self {
Self::new()
}
}

impl FunctionInvocationTimings {
pub fn new() -> Self {
FunctionInvocationTimings {
Expand Down
31 changes: 31 additions & 0 deletions src/Ilúvatar/iluvatar_library/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,41 @@ use crate::utils::port::Port;
use anyhow::Result;
use async_process::Command as AsyncCommand;
use std::collections::HashMap;
use std::num::ParseIntError;
use std::process::{Child, Command, Output, Stdio};
use std::{str, thread, time};
use tokio::signal::unix::{signal, Signal, SignalKind};
use tracing::{debug, info};

/// Look up the child process of `ppid` by running `pgrep -P <ppid>` and parsing its output.
///
/// Returns the PID when `pgrep` prints exactly one number. Every other outcome is reported as a
/// [`ParseIntError`] rather than a panic:
/// * `pgrep` could not be executed, or printed nothing (no child yet) — handled as empty output;
/// * the output is not valid UTF-8 — also handled as empty output;
/// * the output is not a single integer (for example several PIDs on separate lines).
///
/// The underlying I/O error is not preserved because the return type can only carry a parse error.
pub fn get_child_pid(ppid: u32) -> Result<u32, ParseIntError> {
    let stdout = Command::new("pgrep")
        .arg("-P")
        .arg(ppid.to_string())
        .output()
        .map(|out| out.stdout)
        .unwrap_or_default();

    str::from_utf8(&stdout).unwrap_or("").trim().parse::<u32>()
}

/// Poll for the child process of `ppid`, making at most `tries` attempts.
///
/// `timeout_ms` is the pause, in milliseconds, between two consecutive attempts. The calling
/// thread is blocked with `thread::sleep` during each pause. No pause is taken after the final
/// attempt, so a failed lookup is reported as soon as the last attempt is done.
///
/// Returns the child PID from the first successful lookup, or `0` when `tries` is `0` or every
/// attempt failed (a reported PID of `0` is treated as a failed attempt).
pub fn try_get_child_pid(ppid: u32, timeout_ms: u64, tries: u32) -> u32 {
    let pause = time::Duration::from_millis(timeout_ms);

    for attempt in 0..tries {
        // Wait only between attempts, never before the first or after the last one.
        if attempt > 0 {
            thread::sleep(pause);
        }
        match get_child_pid(ppid) {
            Ok(pid) if pid != 0 => return pid,
            _ => {}
        }
    }

    0
}

// Process-wide boolean flag, created lazily on first access and initialised to `false`.
// All reads and writes go through the `parking_lot::Mutex` that wraps it.
// NOTE(review): the name suggests it records whether a simulation run is active — confirm
// against the code that sets it.
lazy_static::lazy_static! {
pub static ref SIMULATION_CHECK: parking_lot::Mutex<bool> = parking_lot::Mutex::new(false);
}
Expand Down
18 changes: 9 additions & 9 deletions src/Ilúvatar/iluvatar_load_gen/src/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::trace::prepare_function_args;
use crate::utils::*;
use anyhow::Result;
use clap::Parser;
use iluvatar_library::types::{Compute, FunctionInvocationTimings, Isolation, MemSizeMb, ResourceTimings};
use iluvatar_library::types::{Compute, Isolation, MemSizeMb, ResourceTimings};
use iluvatar_library::utils::config::args_to_json;
use iluvatar_library::{logging::LocalTime, transaction::gen_tid, utils::port_utils::Port};
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -34,6 +34,12 @@ pub struct BenchmarkStore {
/// map of function name to data
pub data: HashMap<String, FunctionStore>,
}
impl Default for BenchmarkStore {
fn default() -> Self {
Self::new()
}
}

impl BenchmarkStore {
pub fn new() -> Self {
BenchmarkStore { data: HashMap::new() }
Expand Down Expand Up @@ -171,10 +177,7 @@ pub async fn benchmark_controller(
let compute = Compute::CPU; // TODO: update when controller returns more details
let resource_entry = match func_data.resource_data.get_mut(&compute.try_into()?) {
Some(r) => r,
None => func_data
.resource_data
.entry(compute.try_into()?)
.or_insert_with(FunctionInvocationTimings::new),
None => func_data.resource_data.entry(compute.try_into()?).or_default(),
};
if invoke_result.function_output.body.cold {
resource_entry
Expand Down Expand Up @@ -363,10 +366,7 @@ pub fn benchmark_worker(threaded_rt: &Runtime, functions: Vec<ToBenchmarkFunctio
if invoke.worker_response.success {
let resource_entry = match d.resource_data.get_mut(&compute.try_into()?) {
Some(r) => r,
None => d
.resource_data
.entry(compute.try_into()?)
.or_insert_with(FunctionInvocationTimings::new),
None => d.resource_data.entry(compute.try_into()?).or_default(),
};
if invoke.function_output.body.cold {
resource_entry
Expand Down
1 change: 0 additions & 1 deletion src/Ilúvatar/iluvatar_load_gen/src/trace/trace_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,6 @@ pub fn save_controller_results(results: Vec<CompletedControllerInvocation>, args
anyhow::bail!("Failed to write json of result because {}", e);
}
};

for r in results {
let to_write = format!(
"{},{},{},{},{},{},{}\n",
Expand Down
4 changes: 2 additions & 2 deletions src/Ilúvatar/iluvatar_worker_library/src/rpc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ impl WorkerAPI for RPCWorkerAPI {
tid: TransactionId,
) -> Result<InvokeResponse> {
let request = Request::new(InvokeRequest {
function_name: function_name,
function_name,
function_version: version,
json_args: args,
transaction_id: tid,
Expand Down Expand Up @@ -182,7 +182,7 @@ impl WorkerAPI for RPCWorkerAPI {
compute: Compute,
) -> Result<String> {
let request = Request::new(PrewarmRequest {
function_name: function_name,
function_name,
function_version: version,
transaction_id: tid.clone(),
compute: compute.bits(),
Expand Down
Loading

0 comments on commit 4319bfc

Please sign in to comment.