Back /var/fm/fmd with a dataset from the boot M.2
/var/fm/fmd is where the illumos fault management system records data.
We want to preserve this data across system reboots, and in real time
rather than via periodic copying, so that the information is still
available should the system panic shortly after it is recorded.

Fixes: #4211
citrus-it committed Oct 5, 2023
1 parent bb4e0cc commit e29f4be
Showing 8 changed files with 231 additions and 20 deletions.
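
In outline, the new plumbing makes sled-agent do the equivalent of the following at startup, using the two APIs added in this commit. This is a minimal sketch: the pool name oxi_<uuid> stands in for the internal zpool of the current boot M.2, the quota/compression settings and the SMF service stop/start bracketing are omitted, and errors just propagate.

use camino::Utf8PathBuf;
use illumos_utils::zfs::{EnsureFilesystemError, Mountpoint, Zfs};

fn back_fmd_with_boot_m2() -> Result<(), EnsureFilesystemError> {
    // Child dataset under the boot M.2's "backing" dataset (name illustrative).
    let dataset = "oxi_<uuid>/backing/fmd";
    let mountpoint = Mountpoint::Path(Utf8PathBuf::from("/var/fm/fmd"));

    // canmount=noauto keeps the copies on the two M.2s from racing to mount
    // at boot; the boot disk's copy is mounted explicitly below.
    Zfs::ensure_filesystem(
        dataset,
        mountpoint.clone(),
        false, // zoned
        true,  // do_format
        None,  // encryption_details
        None,  // size_details
        Some(vec!["canmount=noauto".to_string()]), // options
    )?;

    // An overlay mount succeeds even though the ramdisk image has already
    // populated /var/fm/fmd.
    Zfs::mount_overlay_dataset(dataset, &mountpoint)
}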
41 changes: 33 additions & 8 deletions illumos-utils/src/zfs.rs
@@ -61,6 +61,9 @@ enum EnsureFilesystemErrorRaw {

#[error("Failed to mount encrypted filesystem: {0}")]
MountEncryptedFsFailed(crate::ExecutionError),

#[error("Failed to mount overlay filesystem: {0}")]
MountOverlayFsFailed(crate::ExecutionError),
}

/// Error returned by [`Zfs::ensure_filesystem`].
@@ -202,13 +205,15 @@ impl Zfs {
/// Creates a new ZFS filesystem named `name`, unless one already exists.
///
/// Applies an optional quota, provided _in bytes_.
#[allow(clippy::too_many_arguments)]
pub fn ensure_filesystem(
name: &str,
mountpoint: Mountpoint,
zoned: bool,
do_format: bool,
encryption_details: Option<EncryptionDetails>,
size_details: Option<SizeDetails>,
options: Option<Vec<String>>,
) -> Result<(), EnsureFilesystemError> {
let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?;
if exists {
@@ -261,7 +266,14 @@ impl Zfs {
]);
}

if let Some(opts) = options {
for o in &opts {
cmd.args(&["-o", &o]);
}
}

cmd.args(&["-o", &format!("mountpoint={}", mountpoint), name]);

execute(cmd).map_err(|err| EnsureFilesystemError {
name: name.to_string(),
mountpoint: mountpoint.clone(),
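
Each entry in the new `options` vector becomes its own -o pair on the zfs create command line, ahead of the final mountpoint option. A standalone sketch of that expansion (all values hypothetical):

fn main() {
    // Mirrors the loop above: every option becomes a "-o key=value" pair.
    let options = vec!["canmount=noauto".to_string()];
    let mut args: Vec<String> = vec!["create".into()];
    for o in &options {
        args.extend(["-o".into(), o.clone()]);
    }
    args.extend([
        "-o".into(),
        "mountpoint=/var/fm/fmd".into(),
        "oxi_<uuid>/backing/fmd".into(),
    ]);
    assert_eq!(
        args.join(" "),
        "create -o canmount=noauto -o mountpoint=/var/fm/fmd oxi_<uuid>/backing/fmd"
    );
}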
@@ -322,6 +334,20 @@ impl Zfs {
Ok(())
}

pub fn mount_overlay_dataset(
name: &str,
mountpoint: &Mountpoint,
) -> Result<(), EnsureFilesystemError> {
let mut command = std::process::Command::new(PFEXEC);
let cmd = command.args(&[ZFS, "mount", "-O", name]);
execute(cmd).map_err(|err| EnsureFilesystemError {
name: name.to_string(),
mountpoint: mountpoint.clone(),
err: EnsureFilesystemErrorRaw::MountOverlayFsFailed(err),
})?;
Ok(())
}
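
A note on this helper: the -O flag asks the illumos zfs mount command for an overlay mount, which succeeds even when the target directory already contains files (they are shadowed, not destroyed), and the mountpoint argument is used only to annotate errors. A minimal caller sketch with a hypothetical dataset name:

use camino::Utf8PathBuf;
use illumos_utils::zfs::{EnsureFilesystemError, Mountpoint, Zfs};

fn overlay_mount_fmd() -> Result<(), EnsureFilesystemError> {
    // Effectively: pfexec zfs mount -O oxi_<uuid>/backing/fmd
    let mp = Mountpoint::Path(Utf8PathBuf::from("/var/fm/fmd"));
    Zfs::mount_overlay_dataset("oxi_<uuid>/backing/fmd", &mp)
}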

// Return (true, mounted) if the dataset exists, (false, false) otherwise,
// where mounted is if the dataset is mounted.
fn dataset_exists(
@@ -385,7 +411,7 @@ impl Zfs {
Zfs::get_value(filesystem_name, &format!("oxide:{}", name))
}

- fn get_value(
+ pub fn get_value(
filesystem_name: &str,
name: &str,
) -> Result<String, GetValueError> {
@@ -422,13 +448,12 @@ pub fn get_all_omicron_datasets_for_delete() -> anyhow::Result<Vec<String>> {
let internal = pool.kind() == crate::zpool::ZpoolKind::Internal;
let pool = pool.to_string();
for dataset in &Zfs::list_datasets(&pool)? {
- // Avoid erasing crashdump datasets on internal pools
- if dataset == "crash" && internal {
- continue;
- }
-
- // The swap device might be in use, so don't assert that it can be deleted.
- if dataset == "swap" && internal {
+ // Avoid erasing crashdump, backing data and swap datasets on
+ // internal pools. The swap device may be in use.
+ if internal
+ && (["crash", "backing", "swap"].contains(&dataset.as_str())
+ || dataset.starts_with("backing/"))
+ {
continue;
}
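
Restated as a standalone predicate for clarity (this helper does not exist in the code; it simply mirrors the condition above):

// Datasets on internal (M.2) pools that must never be auto-deleted:
// crashdumps, the backing store and its children, and swap, which may be
// an active swap device.
fn keep_on_internal_pool(dataset: &str) -> bool {
    ["crash", "backing", "swap"].contains(&dataset)
        || dataset.starts_with("backing/")
}

fn main() {
    assert!(keep_on_internal_pool("backing/fmd"));
    assert!(keep_on_internal_pool("swap"));
    assert!(!keep_on_internal_pool("debug"));
}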

167 changes: 167 additions & 0 deletions sled-agent/src/backingfs.rs
@@ -0,0 +1,167 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Operations for dealing with persistent backing mounts for OS data
// On Oxide hardware, the root filesystem is backed by a ramdisk and
// non-persistent. However, there are several things within the root filesystem
// which are useful to preserve across reboots, and these are backed by
// persistent datasets on the boot disk.
//
// Each boot disk contains a dataset sled_hardware::disk::M2_BACKING_DATASET
// and for each backing mount, a child dataset is created under there that
// is configured with the desired mountpoint in the root filesystem. Since
// there are multiple disks which can be used to boot, these datasets are also
// marked with the "canmount=noauto" attribute so that they do not all try to
// mount automatically and race -- only one could ever succeed. This allows us
// to come along later and specifically mount the one that we want (the one from
// the current boot disk) and also perform an overlay mount so that it succeeds
// even if there is content from the ramdisk image or early boot services
// present underneath. The overlay mount action is optionally bracketed with a
// service stop/start.

use camino::Utf8PathBuf;
use illumos_utils::zfs::{
EnsureFilesystemError, GetValueError, Mountpoint, SizeDetails, Zfs,
};

#[derive(Debug, thiserror::Error)]
pub enum BackingFsError {
#[error("Error administering service: {0}")]
Adm(#[from] smf::AdmError),

#[error("Error retrieving dataset property: {0}")]
DatasetProperty(#[from] GetValueError),

#[error("Error initializing dataset: {0}")]
Mount(#[from] EnsureFilesystemError),
}

struct BackingFs {
// Dataset name
name: &'static str,
// Mountpoint
mountpoint: &'static str,
// Optional quota, in _bytes_
quota: Option<usize>,
// Optional compression mode
compression: Option<&'static str>,
// Linked service
service: Option<&'static str>,
}

impl BackingFs {
const fn new(name: &'static str) -> Self {
Self {
name,
mountpoint: "legacy",
quota: None,
compression: None,
service: None,
}
}

const fn mountpoint(mut self, mountpoint: &'static str) -> Self {
self.mountpoint = mountpoint;
self
}

const fn quota(mut self, quota: usize) -> Self {
self.quota = Some(quota);
self
}

const fn compression(mut self, compression: &'static str) -> Self {
self.compression = Some(compression);
self
}

const fn service(mut self, service: &'static str) -> Self {
self.service = Some(service);
self
}
}

const BACKING_FMD_DATASET: &'static str = "fmd";
const BACKING_FMD_MOUNTPOINT: &'static str = "/var/fm/fmd";
const BACKING_FMD_SERVICE: &'static str = "svc:/system/fmd:default";
const BACKING_FMD_QUOTA: usize = 500 * (1 << 20); // 500 MiB

const BACKING_COMPRESSION: &'static str = "on";

const BACKINGFS_COUNT: usize = 1;
static BACKINGFS: [BackingFs; BACKINGFS_COUNT] =
[BackingFs::new(BACKING_FMD_DATASET)
.mountpoint(BACKING_FMD_MOUNTPOINT)
.quota(BACKING_FMD_QUOTA)
.compression(BACKING_COMPRESSION)
.service(BACKING_FMD_SERVICE)];

/// Ensure that the backing filesystems are mounted.
/// If the underlying dataset for a backing fs does not exist on the specified
/// boot disk then it will be created.
pub(crate) fn ensure_backing_fs(
log: &slog::Logger,
boot_zpool_name: &illumos_utils::zpool::ZpoolName,
) -> Result<(), BackingFsError> {
let log = log.new(o!(
"component" => "BackingFs",
));
for bfs in BACKINGFS.iter() {
info!(log, "Processing {}", bfs.name);

let dataset = format!(
"{}/{}/{}",
boot_zpool_name,
sled_hardware::disk::M2_BACKING_DATASET,
bfs.name
);
let mountpoint = Mountpoint::Path(Utf8PathBuf::from(bfs.mountpoint));

info!(log, "Ensuring dataset {}", dataset);

let size_details = Some(SizeDetails {
quota: bfs.quota,
compression: bfs.compression,
});

Zfs::ensure_filesystem(
&dataset,
mountpoint.clone(),
false, // zoned
true, // do_format
None, // encryption_details,
size_details,
Some(vec!["canmount=noauto".to_string()]), // options
)?;

if Zfs::get_value(&bfs.mountpoint, "mountpoint")? == bfs.mountpoint {
info!(log, "{} is already mounted", bfs.mountpoint);
continue;
}

if let Some(service) = bfs.service {
info!(log, "Stopping service {}", service);
smf::Adm::new()
.disable()
.temporary()
.synchronous()
.run(smf::AdmSelection::ByPattern(&[service]))?;
}

info!(log, "Mounting {} on {}", dataset, mountpoint);

Zfs::mount_overlay_dataset(&dataset, &mountpoint)?;

if let Some(service) = bfs.service {
info!(log, "Starting service {}", service);
smf::Adm::new()
.enable()
.synchronous()
.run(smf::AdmSelection::ByPattern(&[service]))?;
}
}

Ok(())
}
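
The const builder methods keep the table of backing filesystems declarative. Adding a second backing mount would be one more entry along these lines (everything below is hypothetical and not part of this change):

// Hypothetical: also back /var/adm with a 100 MiB compressed dataset
// that has no associated SMF service to stop and start around the mount.
static BACKINGFS_EXAMPLE: [BackingFs; 2] = [
    BackingFs::new(BACKING_FMD_DATASET)
        .mountpoint(BACKING_FMD_MOUNTPOINT)
        .quota(BACKING_FMD_QUOTA)
        .compression(BACKING_COMPRESSION)
        .service(BACKING_FMD_SERVICE),
    BackingFs::new("adm")
        .mountpoint("/var/adm")
        .quota(100 * (1 << 20))
        .compression(BACKING_COMPRESSION),
];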
1 change: 1 addition & 0 deletions sled-agent/src/bootstrap/pre_server.rs
@@ -381,6 +381,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> {
do_format,
encryption_details,
quota,
None,
)
.map_err(StartError::EnsureZfsRamdiskDataset)
}
1 change: 1 addition & 0 deletions sled-agent/src/lib.rs
@@ -17,6 +17,7 @@ pub mod sim;
pub mod common;

// Modules for the non-simulated sled agent.
mod backingfs;
pub mod bootstrap;
pub mod config;
mod http_entrypoints;
22 changes: 17 additions & 5 deletions sled-agent/src/sled_agent.rs
@@ -59,9 +59,15 @@ use illumos_utils::{dladm::MockDladm as Dladm, zone::MockZones as Zones};

#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("Could not find boot disk")]
BootDiskNotFound,

#[error("Configuration error: {0}")]
Config(#[from] crate::config::ConfigError),

#[error("Error setting up backing filesystems: {0}")]
BackingFs(#[from] crate::backingfs::BackingFsError),

#[error("Error setting up swap device: {0}")]
SwapDevice(#[from] crate::swap_device::SwapDeviceError),

@@ -268,14 +274,17 @@ impl SledAgent {
));
info!(&log, "SledAgent::new(..) starting");

- // Configure a swap device of the configured size before other system setup.
+ let boot_disk = storage
+ .resources()
+ .boot_disk()
+ .await
+ .ok_or_else(|| Error::BootDiskNotFound)?;
+
+ // Configure a swap device of the configured size before other system
+ // setup.
match config.swap_device_size_gb {
Some(sz) if sz > 0 => {
info!(log, "Requested swap device of size {} GiB", sz);
- let boot_disk =
- storage.resources().boot_disk().await.ok_or_else(|| {
- crate::swap_device::SwapDeviceError::BootDiskNotFound
- })?;
crate::swap_device::ensure_swap_device(
&parent_log,
&boot_disk.1,
@@ -290,6 +299,9 @@ impl SledAgent {
}
}

info!(log, "Mounting backing filesystems");
crate::backingfs::ensure_backing_fs(&parent_log, &boot_disk.1)?;

// Ensure we have a thread that automatically reaps process contracts
// when they become empty. See the comments in
// illumos-utils/src/running_zone.rs for more detail.
1 change: 1 addition & 0 deletions sled-agent/src/storage_manager.rs
@@ -417,6 +417,7 @@ impl StorageWorker {
do_format,
encryption_details,
size_details,
None,
)?;
// Ensure the dataset has a usable UUID.
if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") {
3 changes: 0 additions & 3 deletions sled-agent/src/swap_device.rs
@@ -9,9 +9,6 @@ use zeroize::Zeroize;

#[derive(Debug, thiserror::Error)]
pub enum SwapDeviceError {
#[error("Could not find boot disk")]
BootDiskNotFound,

#[error("Error running ZFS command: {0}")]
Zfs(illumos_utils::ExecutionError),

15 changes: 11 additions & 4 deletions sled-hardware/src/disk.rs
@@ -256,6 +256,7 @@ pub const CRASH_DATASET: &'static str = "crash";
pub const CLUSTER_DATASET: &'static str = "cluster";
pub const CONFIG_DATASET: &'static str = "config";
pub const M2_DEBUG_DATASET: &'static str = "debug";
pub const M2_BACKING_DATASET: &'static str = "backing";
// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be
// tuned as needed.
pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30);
@@ -282,15 +283,19 @@ static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [
.compression(DUMP_DATASET_COMPRESSION),
];

- const M2_EXPECTED_DATASET_COUNT: usize = 5;
+ const M2_EXPECTED_DATASET_COUNT: usize = 6;
static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
// Stores software images.
//
// Should be duplicated to both M.2s.
ExpectedDataset::new(INSTALL_DATASET),
// Stores crash dumps.
ExpectedDataset::new(CRASH_DATASET),
- // Stores cluter configuration information.
+ // Backing store for OS data that should be persisted across reboots.
+ // Its children are selectively overlay mounted onto parts of the ramdisk
+ // root.
+ ExpectedDataset::new(M2_BACKING_DATASET),
+ // Stores cluster configuration information.
//
// Should be duplicated to both M.2s.
ExpectedDataset::new(CLUSTER_DATASET),
@@ -524,6 +529,7 @@ impl Disk {
do_format,
Some(encryption_details),
None,
None,
);

keyfile.zero_and_unlink().await.map_err(|error| {
@@ -562,8 +568,8 @@
"Automatically destroying dataset: {}", name
);
Zfs::destroy_dataset(name).or_else(|err| {
- // If we can't find the dataset, that's fine -- it might
- // not have been formatted yet.
+ // If we can't find the dataset, that's fine -- it
+ // might not have been formatted yet.
if let DestroyDatasetErrorVariant::NotFound =
err.err
{
@@ -588,6 +594,7 @@
do_format,
encryption_details,
size_details,
None,
)?;

if dataset.wipe {
