Skip to content

Commit

Permalink
sled-agent performs archival of rotated logs for all zones onto U.2 debug dataset (#3713)

Browse files Browse the repository at this point in the history

This periodically moves logs rotated by logadm in cron
(oxidecomputer/helios#107) into the crypt/debug
zfs dataset on the U.2 chosen by the logic in #3677. It replaces the
rotated number (*.log.0, *.log.1) with the unix epoch timestamp of the
rotated log's modification time such that they don't collide when
collected repeatedly (logadm will reset numbering when the previous ones
are moved away).

(for #2478)
  • Loading branch information
lifning authored and Alan Hanson committed Jul 24, 2023
1 parent 53296f5 commit 2a1fbb6
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 25 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ gateway-client = { path = "gateway-client" }
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false, features = ["std"], rev = "146a687f7413bfe580869bb6017f3bfe8b4710b1" }
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "146a687f7413bfe580869bb6017f3bfe8b4710b1" }
gateway-test-utils = { path = "gateway-test-utils" }
glob = "0.3.1"
headers = "0.3.8"
heck = "0.4"
hex = "0.4.3"
Expand Down
1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dpd-client.workspace = true
dropshot.workspace = true
flate2.workspace = true
futures.workspace = true
glob.workspace = true
http.workspace = true
hyper-staticfile.workspace = true
gateway-client.workspace = true
Expand Down
178 changes: 153 additions & 25 deletions sled-agent/src/storage/dump_setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ use crate::storage_manager::DiskWrapper;
use camino::Utf8PathBuf;
use derive_more::{AsRef, Deref};
use illumos_utils::dumpadm::DumpAdmError;
use illumos_utils::zone::{AdmError, Zones};
use illumos_utils::zpool::ZpoolHealth;
use omicron_common::disk::DiskIdentity;
use sled_hardware::DiskVariant;
use slog::Logger;
use std::collections::{HashMap, HashSet};
use std::ffi::OsString;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Weak};
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::sync::MutexGuard;

pub struct DumpSetup {
Expand All @@ -23,9 +26,9 @@ impl DumpSetup {
log.new(o!("component" => "DumpSetup-worker")),
)));
let worker_weak = Arc::downgrade(&worker);
let log_poll = log.new(o!("component" => "DumpSetup-rotation"));
let log_poll = log.new(o!("component" => "DumpSetup-archival"));
let _poller = std::thread::spawn(move || {
Self::poll_file_rotation(worker_weak, log_poll)
Self::poll_file_archival(worker_weak, log_poll)
});
let log = log.new(o!("component" => "DumpSetup"));
Self { worker, _poller, log }
Expand Down Expand Up @@ -54,7 +57,7 @@ struct DumpSetupWorker {
log: Logger,
}

const ROTATION_DURATION: std::time::Duration =
const ARCHIVAL_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(300);

impl DumpSetup {
Expand Down Expand Up @@ -130,7 +133,7 @@ impl DumpSetup {
});
}

fn poll_file_rotation(
fn poll_file_archival(
worker: Weak<std::sync::Mutex<DumpSetupWorker>>,
log: Logger,
) {
Expand All @@ -140,10 +143,10 @@ impl DumpSetup {
match mutex.lock() {
Ok(mut guard) => {
guard.reevaluate_choices();
if let Err(err) = guard.rotate_files(&log) {
if let Err(err) = guard.archive_files() {
error!(
log,
"Failed to rotate debug/dump files: {err:?}"
"Failed to archive debug/dump files: {err:?}"
);
}
}
Expand All @@ -162,7 +165,7 @@ impl DumpSetup {
);
break;
}
std::thread::sleep(ROTATION_DURATION);
std::thread::sleep(ARCHIVAL_INTERVAL);
}
}
}
Expand Down Expand Up @@ -247,7 +250,7 @@ impl DumpSetupWorker {
},
);
self.known_core_dirs.sort_by_cached_key(|mnt| {
// these get rotated out periodically anyway, pick one with room
// these get archived periodically anyway, pick one with room
let available = zfs_get_integer(&**mnt, "available").unwrap_or(0);
(u64::MAX - available, mnt.clone())
});
Expand Down Expand Up @@ -411,43 +414,150 @@ impl DumpSetupWorker {
}
}

fn rotate_files(&self, log: &Logger) -> Result<(), std::io::Error> {
fn archive_files(&self) -> std::io::Result<()> {
if let Some(debug_dir) = &self.chosen_debug_dir {
if self.known_core_dirs.is_empty() {
info!(log, "No core dump locations yet known.");
info!(self.log, "No core dump locations yet known.");
}
for core_dir in &self.known_core_dirs {
if let Ok(dir) = core_dir.read_dir() {
for entry in dir.flatten() {
if let Some(path) = entry.file_name().to_str() {
let dest = debug_dir.join(path);

let mut dest_f = std::fs::File::create(&dest)?;
let mut src_f = std::fs::File::open(&entry.path())?;

std::io::copy(&mut src_f, &mut dest_f)?;
dest_f.sync_all()?;

drop(src_f);
drop(dest_f);

if let Err(err) = std::fs::remove_file(entry.path())
if let Err(err) =
Self::copy_sync_and_remove(&entry.path(), &dest)
{
warn!(log, "Could not remove {entry:?} after copying it to {dest:?}: {err:?}");
error!(
self.log,
"Failed to archive {entry:?}: {err:?}"
);
} else {
info!(
log,
"Relocated core {entry:?} to {dest:?}"
self.log,
"Relocated {entry:?} to {dest:?}"
);
}
} else {
error!(log, "Non-UTF8 path found while rotating core dumps: {entry:?}");
error!(self.log, "Non-UTF8 path found while archiving core dumps: {entry:?}");
}
}
}
}
} else {
info!(log, "No rotation destination for crash dumps yet chosen.");
info!(
self.log,
"No archival destination for crash dumps yet chosen."
);
}

if let Err(err) = self.archive_logs() {
if !matches!(err, ArchiveLogsError::NoDebugDirYet) {
error!(
self.log,
"Failure while trying to archive logs to debug dataset: {err:?}"
);
}
}

Ok(())
}

/// Copies `source` to `dest`, syncs the copy to stable storage, and
/// only then deletes the original. Any I/O error is returned to the
/// caller; on error the source file is left in place.
fn copy_sync_and_remove(
    source: impl AsRef<Path>,
    dest: impl AsRef<Path>,
) -> std::io::Result<()> {
    let source = source.as_ref();
    let dest = dest.as_ref();

    // Create the destination first, then stream the source into it.
    let mut writer = std::fs::File::create(dest)?;
    let mut reader = std::fs::File::open(source)?;

    std::io::copy(&mut reader, &mut writer)?;

    // Make sure the copied bytes are durable before unlinking the
    // original, so a crash in between cannot lose the file entirely.
    writer.sync_all()?;

    // Close both handles before removing the source file.
    drop(reader);
    drop(writer);

    std::fs::remove_file(source)
}

/// Archives rotated SMF logs from the global zone and every zone
/// reported by zoneadm into the chosen debug dataset, one subdirectory
/// per zone. Errors out with `NoDebugDirYet` until a destination has
/// been selected.
fn archive_logs(&self) -> Result<(), ArchiveLogsError> {
    let Some(debug_dir) = self.chosen_debug_dir.as_ref() else {
        return Err(ArchiveLogsError::NoDebugDirYet);
    };
    // The zone crate's 'deprecated' functions collide if its 'sync'
    // and 'async' features are enabled simultaneously, so drive the
    // async zone-listing API from a local runtime instead. :(
    let runtime =
        tokio::runtime::Runtime::new().map_err(ArchiveLogsError::Tokio)?;
    let zones = runtime.block_on(Zones::get())?;
    // The global zone's service logs live at a fixed path.
    self.archive_logs_inner(
        debug_dir,
        PathBuf::from("/var/svc/log"),
        "global",
    )?;
    // Each other zone keeps its service logs under its zone root.
    for zone in zones {
        self.archive_logs_inner(
            debug_dir,
            zone.path().join("root/var/svc/log"),
            zone.name(),
        )?;
    }
    Ok(())
}

/// Sweeps one zone's log directory for logadm-rotated files
/// (`*.log.<digits>`) and moves them into `<debug_dir>/<zone_name>/`,
/// replacing the rotation suffix with a unix-epoch timestamp so that
/// repeated sweeps never collide even though logadm restarts its
/// numbering at 0 once the rotated files are moved away.
fn archive_logs_inner(
    &self,
    debug_dir: &DebugDirPath,
    logdir: PathBuf,
    zone_name: &str,
) -> Result<(), ArchiveLogsError> {
    // Gather rotated logs whose numeric suffix is 1 to 8 digits long
    // (e.g. foo.log.3, foo.log.17, ...).
    let mut rotated_log_files = Vec::new();
    for digits in 1..9 {
        let suffix = "[0-9]".repeat(digits);
        let pattern = logdir
            .join(format!("*.log.{}", suffix))
            .to_str()
            .ok_or_else(|| ArchiveLogsError::Utf8(zone_name.to_string()))?
            .to_string();
        rotated_log_files.extend(glob::glob(&pattern)?.flatten());
    }
    let dest_dir = debug_dir.join(zone_name).into_std_path_buf();
    if !rotated_log_files.is_empty() {
        std::fs::create_dir_all(&dest_dir)?;
        let count = rotated_log_files.len();
        info!(
            self.log,
            "Archiving {count} log files from {zone_name} zone"
        );
    }
    for entry in rotated_log_files {
        let src_name = entry.file_name().unwrap();
        // Use the rotated file's mtime as the archived suffix; fall
        // back to the time of archival if that can't be read, and to
        // counting up from 0 if even that somehow fails.
        let mut n = entry
            .metadata()
            .and_then(|m| m.modified())
            .unwrap_or_else(|_| SystemTime::now())
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);
        // Bump the suffix until it names a file that doesn't exist yet
        // in the destination dataset.
        let dest = loop {
            let candidate =
                dest_dir.join(src_name).with_extension(format!("{n}"));
            if candidate.exists() {
                n += 1;
            } else {
                break candidate;
            }
        };
        if let Err(err) = Self::copy_sync_and_remove(&entry, dest) {
            warn!(self.log, "Failed to archive {entry:?}: {err:?}");
        }
    }
    Ok(())
}
Expand Down Expand Up @@ -480,3 +590,21 @@ impl DumpSetupWorker {
}
}
}

/// Failure modes encountered while archiving rotated zone logs into the
/// chosen debug dataset.
#[derive(thiserror::Error, Debug)]
enum ArchiveLogsError {
    /// Creating the temporary tokio runtime used to call the async
    /// zone-listing API failed.
    #[error("Couldn't make an async runtime to get zone info: {0}")]
    Tokio(std::io::Error),
    /// A filesystem operation (creating the destination directory,
    /// copying/removing a log file) failed.
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),
    /// Listing zones via zoneadm failed.
    #[error("Error calling zoneadm: {0}")]
    Zoneadm(#[from] AdmError),
    /// A zone's log path was not valid UTF-8; carries the zone name.
    #[error("Non-UTF8 zone path for zone {0}")]
    Utf8(String),
    /// The constructed glob pattern for rotated logs was invalid.
    #[error("Glob pattern invalid: {0}")]
    Glob(#[from] glob::PatternError),
    /// No debug dataset has been selected yet; callers treat this as
    /// benign rather than an error worth reporting.
    #[error(
        "No debug dir into which we should archive logs has yet been chosen"
    )]
    NoDebugDirYet,
}

0 comments on commit 2a1fbb6

Please sign in to comment.