(WIP) Put process core dumps onto the U.2 debug zvol
lif committed Jul 18, 2023
1 parent 4cf90f3 commit 07cce93
Showing 4 changed files with 228 additions and 69 deletions.
62 changes: 62 additions & 0 deletions illumos-utils/src/coreadm.rs
@@ -0,0 +1,62 @@
use camino::Utf8PathBuf;
use std::ffi::OsString;
use std::os::unix::ffi::OsStringExt;
use std::process::Command;

#[derive(thiserror::Error, Debug)]
pub enum CoreAdmError {
    #[error("Error obtaining or modifying coreadm configuration. core_dir: {core_dir:?}")]
    Execution { core_dir: Utf8PathBuf },

    #[error("Invalid invocation of coreadm: {0:?} {1:?}")]
    InvalidCommand(Vec<String>, OsString),

    #[error("coreadm process was terminated by a signal.")]
    TerminatedBySignal,

    #[error("coreadm invocation exited with unexpected return code {0}")]
    UnexpectedExitCode(i32),

#[error("Failed to execute dumpadm process: {0}")]
    Exec(std::io::Error),
}

const COREADM: &str = "/usr/bin/coreadm";

pub fn coreadm(core_dir: &Utf8PathBuf) -> Result<(), CoreAdmError> {
    let mut cmd = Command::new(COREADM);
    cmd.env_clear();

    // disable per-process core patterns
    cmd.arg("-d").arg("process");
    cmd.arg("-d").arg("process-setid");

    // use the global core pattern
    cmd.arg("-e").arg("global");
    cmd.arg("-e").arg("global-setid");

    // set the global pattern to place all cores into core_dir,
    // with filenames of "core.[zone-name].[exe-filename].[pid].[time]"
    cmd.arg("-g").arg(core_dir.join("core.%z.%f.%p.%t"));

    // also collect DWARF data from the exe and its library deps
    cmd.arg("-G").arg("default+debug");

    let out = cmd.output().map_err(CoreAdmError::Exec)?;

    match out.status.code() {
        Some(0) => Ok(()),
        Some(1) => Err(CoreAdmError::Execution { core_dir: core_dir.clone() }),
        Some(2) => {
            // unwrap: every arg we've provided in this function is UTF-8
            let mut args =
                vec![cmd.get_program().to_str().unwrap().to_string()];
            cmd.get_args()
                .for_each(|arg| args.push(arg.to_str().unwrap().to_string()));
            let stderr = OsString::from_vec(out.stderr);
            Err(CoreAdmError::InvalidCommand(args, stderr))
        }
        Some(n) => Err(CoreAdmError::UnexpectedExitCode(n)),
        None => Err(CoreAdmError::TerminatedBySignal),
    }
}
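
For illustration (not part of this commit): a minimal sketch of how a caller might invoke the new helper, assuming root privileges on an illumos host and a hypothetical debug-dataset mountpoint. With the pattern configured above, coreadm(8) expands %z to the zone name, %f to the executable's file name, %p to the process ID, and %t to the time in seconds, so a core from pid 1234 of a program named nexus in the global zone would land at something like core.global.nexus.1234.1689694800.

use camino::Utf8PathBuf;
use illumos_utils::coreadm::coreadm;

fn configure_core_dumps() {
    // Hypothetical mountpoint of whichever U.2 debug dataset was chosen.
    let core_dir = Utf8PathBuf::from("/pool/ext/example/crash");
    // coreadm(8) must run with enough privilege to change global settings.
    if let Err(err) = coreadm(&core_dir) {
        eprintln!("failed to configure global core pattern: {err}");
    }
}
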
4 changes: 2 additions & 2 deletions illumos-utils/src/dumpadm.rs
@@ -84,7 +84,7 @@ pub enum DumpAdmError
Execution { dump_slice: Utf8PathBuf, savecore_dir: Option<Utf8PathBuf> },

#[error("Invalid invocation of dumpadm: {0:?} {1:?}")]
InvalidCommand(Vec<String>, std::ffi::OsString),
InvalidCommand(Vec<String>, OsString),

#[error("dumpadm process was terminated by a signal.")]
TerminatedBySignal,
@@ -98,7 +98,7 @@ pub enum DumpAdmError
Mkdir(std::io::Error),

#[error("savecore failed: {0:?}")]
SavecoreFailure(std::ffi::OsString),
SavecoreFailure(OsString),

#[error("Failed to execute dumpadm process: {0}")]
ExecDumpadm(std::io::Error),
1 change: 1 addition & 0 deletions illumos-utils/src/lib.rs
@@ -7,6 +7,7 @@
use cfg_if::cfg_if;

pub mod addrobj;
pub mod coreadm;
pub mod destructor;
pub mod dkio;
pub mod dladm;
230 changes: 163 additions & 67 deletions sled-agent/src/storage/dump_setup.rs
@@ -1,17 +1,27 @@
use crate::storage_manager::DiskWrapper;
use camino::Utf8PathBuf;
use illumos_utils::dumpadm::DumpAdmError;
use illumos_utils::zpool::ZpoolHealth;
use omicron_common::disk::DiskIdentity;
use sled_hardware::DiskVariant;
use slog::Logger;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::ffi::OsString;
use std::sync::Arc;
use tokio::sync::MutexGuard;

#[derive(Default)]
pub struct DumpSetup {
// prevent us from running savecore concurrently.
savecore_lock: Arc<std::sync::Mutex<()>>,
worker: Arc<std::sync::Mutex<DumpSetupWorker>>,
}

#[derive(Default)]
struct DumpSetupWorker {
chosen_dump_slice: Option<Utf8PathBuf>,
chosen_dump_dir: Option<Utf8PathBuf>,
known_dump_slices: Vec<Utf8PathBuf>,
known_dump_dirs: Vec<Utf8PathBuf>,
savecored_slices: HashSet<Utf8PathBuf>,
}

impl DumpSetup {
@@ -43,7 +53,7 @@ impl DumpSetup
sled_hardware::disk::DUMP_DATASET,
));
} else {
warn!(log, "Zpool {name:?} not online, won't attempt to savecore dumps there");
warn!(log, "Zpool {name:?} not online, won't attempt to save core dumps there");
}
}
}
@@ -52,65 +62,58 @@ impl DumpSetup
}
}

dump_slices.sort();
u2_dump_dirs.sort();

let savecore_lock = self.savecore_lock.clone();
tokio::task::spawn_blocking(move || {
// TODO: a more reasonable way of deduplicating the effort.
let _guard = savecore_lock.lock();
Self::run_dumpadm_and_savecore(log, dump_slices, u2_dump_dirs);
let savecore_lock = self.worker.clone();
tokio::task::spawn_blocking(move || match savecore_lock.lock() {
Ok(mut guard) => {
guard.update_disk_loadout(log, dump_slices, u2_dump_dirs);
}
Err(err) => {
error!(log, "DumpSetup mutex poisoned: {err:?}");
}
});
}
}

fn run_dumpadm_and_savecore(
impl DumpSetupWorker {
fn update_disk_loadout(
&mut self,
log: Logger,
dump_slices: Vec<Utf8PathBuf>,
u2_dump_dirs: Vec<Utf8PathBuf>,
dump_dirs: Vec<Utf8PathBuf>,
) {
for dump_slice in dump_slices {
// NOTE: because of the need to have dumpadm change the global
// state of which slice the system is using for dumps in order
// for savecore to behave the way we want (i.e. clear the flag
// after succeeding), we could hypothetically miss a dump if
// the kernel crashes again while savecore is still running.
if u2_dump_dirs.is_empty() {
// Don't risk overwriting an existing dump if there's
// already one there until we can attempt to savecore(8)
// it away and clear the flag to make room.
match illumos_utils::dumpadm::dump_flag_is_valid(&dump_slice) {
Ok(false) => {
// Have dumpadm write the config for crash dumps to be
// on this slice, at least, until a U.2 comes along.
match illumos_utils::dumpadm::dumpadm(&dump_slice, None)
{
Ok(_) => {
info!(log, "Using dump device {dump_slice:?} with no savecore destination (no U.2 debug zvol yet)");
}
Err(err) => {
warn!(log, "Could not configure {dump_slice:?} as dump device: {err:?}");
}
}
}
Ok(true) => {
warn!(log, "Not configuring {dump_slice:?} as it appears to contain a dump we cannot yet send to a U.2 debug zvol");
}
Err(err) => {
debug!(
log,
"Dump slice {dump_slice:?} appears to be unused : {err:?}",
);
}
}
} else {
// Try each U.2 until we succeed once
for mountpoint in &u2_dump_dirs {
// Let's try to see if it appears to have a dump already
self.known_dump_slices = dump_slices;
self.known_dump_dirs = dump_dirs;

self.known_dump_slices.sort();
self.known_dump_dirs.sort();

if let Some(x) = &self.chosen_dump_dir {
if !self.known_dump_dirs.contains(x) {
warn!(log, "Previously-chosen dump dir {x:?} no longer exists in our view of reality");
self.chosen_dump_dir = None;
}
}
if let Some(x) = &self.chosen_dump_slice {
if !self.known_dump_slices.contains(x) {
warn!(log, "Previously-chosen dump slice {x:?} no longer exists in our view of reality");
self.chosen_dump_slice = None;
}
}

if self.chosen_dump_dir.is_none() {
// TODO: decide based on factors more meaningful than disk ID sort order
self.chosen_dump_dir = self.known_dump_dirs.first().cloned();
}

if self.chosen_dump_slice.is_none() {
if self.chosen_dump_dir.is_some() {
for dump_slice in self.known_dump_slices.clone() {
// Let's try to see if it appears to have a kernel dump already
match illumos_utils::dumpadm::dump_flag_is_valid(
&dump_slice,
) {
Ok(true) => {
debug!(log, "Dump slice {dump_slice:?} appears to have a valid header; will attempt to savecore to {mountpoint:?}");
debug!(log, "Dump slice {dump_slice:?} appears to have a valid header; will attempt to savecore");
}
Ok(false) => {
info!(log, "Dump slice {dump_slice:?} appears to have already been saved");
@@ -119,36 +122,129 @@ impl DumpSetup
debug!(log, "Dump slice {dump_slice:?} appears to be unused: {err:?}");
}
}
// Have dumpadm write the config for crash dumps to be
// on this slice, and invoke savecore(8) to save any
// dump that's already present there.
match illumos_utils::dumpadm::dumpadm(
&dump_slice,
Some(mountpoint),
) {
if let Ok(_) = self.dumpadm_and_savecore(&dump_slice) {
self.chosen_dump_slice = Some(dump_slice);
break;
}
}
} else {
// Don't risk overwriting an existing kernel dump if there's
// already one there until we can attempt to savecore(8)
// it away and clear the flag to make room.
for dump_slice in &self.known_dump_slices {
match illumos_utils::dumpadm::dump_flag_is_valid(dump_slice)
{
Ok(false) => {
// Have dumpadm write the config for crash dumps to be
// on this slice, at least, until a U.2 comes along.
match illumos_utils::dumpadm::dumpadm(
dump_slice, None,
) {
Ok(_) => {
info!(log, "Using dump device {dump_slice:?} with no savecore destination (no U.2 debug zvol yet)");
self.chosen_dump_slice =
Some(dump_slice.clone());
break;
}
Err(err) => {
warn!(log, "Could not configure {dump_slice:?} as dump device: {err:?}");
}
}
}
Ok(true) => {
warn!(log, "Not configuring {dump_slice:?} as it appears to contain a dump we cannot yet send to a U.2 debug zvol");
}
Err(err) => {
warn!(log, "Could not configure {dump_slice:?} as dump device with {mountpoint:?} as savecore destination: {err:?}");
debug!(
log,
"Dump slice {dump_slice:?} appears to be unused : {err:?}",
);
}
}
}
}
}

if let Some(dump_dir) = self.chosen_dump_dir.clone() {
// tell the system to write *userspace process* cores here.
match illumos_utils::coreadm::coreadm(&dump_dir) {
Ok(()) => {
info!(
log,
"Set process core dump directory to {dump_dir:?}"
);
}
Err(err) => {
error!(log, "Couldn't configure process core dump directory to {dump_dir:?}: {err:?}");
}
}

let mut changed_slice = false;
for dump_slice in self.known_dump_slices.clone() {
if !self.savecored_slices.contains(&dump_slice) {
changed_slice = true;
// temporarily changes the system's dump slice so savecore(8)
// can update the header in the slice when it finishes...
match self.dumpadm_and_savecore(&dump_slice) {
Ok(saved) => {
if let Some(stdout) = saved {
if let Some(stdout) = &saved {
info!(
log,
"Saved dump from {dump_slice:?} to {mountpoint:?}: {stdout:?}"
"Saved dump from {dump_slice:?} to {dump_dir:?}: {stdout:?}"
);
} else {
info!(
log,
"Set {dump_slice:?} as system dump slice",
);
}
// If there was one present, we successfully
// compressed it onto a U.2's pool, no need to
// try others.
break;
}
Err(err) => {
warn!(log, "Could not configure {dump_slice:?} as dump device with {dump_dir:?} as savecore destination: {err:?}");
}
}
}
}

// ...so then we restore the chosen dump slice for the system to use
// in the event of a kernel crash
if changed_slice {
if let Some(dump_slice) = &self.chosen_dump_slice {
if let Err(err) =
illumos_utils::dumpadm::dumpadm(dump_slice, None)
{
error!(log, "Could not restore dump slice to {dump_slice:?}: {err:?}");
}
}
}
}
}

// Have dumpadm write the config for crash dumps to be
// on this slice, and then invoke savecore(8) to save any
// dump that's already present there.
//
// NOTE: because of the need to have dumpadm change the global
// state of which slice the system is using for dumps in order
// for savecore to behave the way we want (i.e. clear the flag
// after succeeding), we could hypothetically miss a dump if
// the kernel crashes again while savecore is still running.
fn dumpadm_and_savecore(
&mut self,
dump_slice: &Utf8PathBuf,
) -> Result<Option<OsString>, DumpAdmError> {
// TODO: untangle savecore from illumos_utils::dumpadm
assert!(self.chosen_dump_dir.is_some());

match illumos_utils::dumpadm::dumpadm(
&dump_slice,
self.chosen_dump_dir.as_ref(),
) {
Ok(saved) => {
self.savecored_slices.insert(dump_slice.clone());
Ok(saved)
}
Err(err) => Err(err),
}
}
}
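
For illustration (not part of this commit): dumpadm_and_savecore drives the wrappers in illumos-utils, but the underlying mechanism is stock dumpadm(8) and savecore(8). A rough standalone sketch of the same sequence, with hypothetical slice and directory paths (the real worker derives them from the disk inventory) and assuming root privileges:

use std::process::Command;

fn drain_one_dump_slice() -> std::io::Result<()> {
    // Hypothetical paths, for illustration only.
    let unsaved_slice = "/dev/dsk/c1t0d0s1"; // slice that may hold a kernel dump
    let chosen_slice = "/dev/dsk/c2t0d0s1"; // slice we want armed afterwards
    let debug_dir = "/pool/ext/example/crash"; // U.2 debug dataset mountpoint

    // Temporarily make the unsaved slice the system dump device and point
    // savecore at the U.2 directory; dumpadm(8) records both settings.
    Command::new("/usr/sbin/dumpadm").args(["-d", unsaved_slice]).status()?;
    Command::new("/usr/sbin/dumpadm").args(["-s", debug_dir]).status()?;

    // savecore(8) copies out any dump present and clears the slice's
    // valid-dump flag once it succeeds.
    Command::new("/usr/bin/savecore").arg("-v").status()?;

    // Restore the slice we actually want to catch the next kernel crash.
    Command::new("/usr/sbin/dumpadm").args(["-d", chosen_slice]).status()?;
    Ok(())
}

As the NOTE above says, a kernel crash that happens while savecore is still draining a different slice could be missed; the worker narrows that window by restoring the chosen dump slice as soon as the loop finishes.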
